diff options
160 files changed, 8656 insertions, 5556 deletions
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index f2a93c8679e8..e43f2e1f2958 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -356,7 +356,7 @@ shot down by NMI autoconf= [IPV6] - See Documentation/networking/ipv6.txt. + See Documentation/networking/ipv6.rst. show_lapic= [APIC,X86] Advanced Programmable Interrupt Controller Limit apic dumping. The parameter defines the maximal @@ -831,7 +831,7 @@ decnet.addr= [HW,NET] Format: <area>[,<node>] - See also Documentation/networking/decnet.txt. + See also Documentation/networking/decnet.rst. default_hugepagesz= [same as hugepagesz=] The size of the default @@ -872,7 +872,7 @@ miss to occur. disable= [IPV6] - See Documentation/networking/ipv6.txt. + See Documentation/networking/ipv6.rst. hardened_usercopy= [KNL] Under CONFIG_HARDENED_USERCOPY, whether @@ -912,7 +912,7 @@ to workaround buggy firmware. disable_ipv6= [IPV6] - See Documentation/networking/ipv6.txt. + See Documentation/networking/ipv6.rst. disable_mtrr_cleanup [X86] The kernel tries to adjust MTRR layout from continuous @@ -4910,7 +4910,7 @@ Set the number of tcp_metrics_hash slots. Default value is 8192 or 16384 depending on total ram pages. This is used to specify the TCP metrics - cache size. See Documentation/networking/ip-sysctl.txt + cache size. See Documentation/networking/ip-sysctl.rst "tcp_no_metrics_save" section for more details. tdfx= [HW,DRM] diff --git a/Documentation/admin-guide/sysctl/net.rst b/Documentation/admin-guide/sysctl/net.rst index e043c9213388..2ad1b77a7182 100644 --- a/Documentation/admin-guide/sysctl/net.rst +++ b/Documentation/admin-guide/sysctl/net.rst @@ -353,8 +353,8 @@ socket's buffer. It will not take effect unless PF_UNIX flag is specified. 3. 
/proc/sys/net/ipv4 - IPV4 settings ------------------------------------- -Please see: Documentation/networking/ip-sysctl.txt and ipvs-sysctl.txt for -descriptions of these entries. +Please see: Documentation/networking/ip-sysctl.rst and +Documentation/admin-guide/sysctl/net.rst for descriptions of these entries. 4. Appletalk diff --git a/Documentation/bpf/index.rst b/Documentation/bpf/index.rst index f99677f3572f..38b4db8be7a2 100644 --- a/Documentation/bpf/index.rst +++ b/Documentation/bpf/index.rst @@ -7,7 +7,7 @@ Filter) facility, with a focus on the extended BPF version (eBPF). This kernel side documentation is still work in progress. The main textual documentation is (for historical reasons) described in -`Documentation/networking/filter.txt`_, which describe both classical +`Documentation/networking/filter.rst`_, which describe both classical and extended BPF instruction-set. The Cilium project also maintains a `BPF and XDP Reference Guide`_ that goes into great technical depth about the BPF Architecture. @@ -59,7 +59,7 @@ Testing and debugging BPF .. Links: -.. _Documentation/networking/filter.txt: ../networking/filter.txt +.. _Documentation/networking/filter.rst: ../networking/filter.txt .. _man-pages: https://www.kernel.org/doc/man-pages/ .. _bpf(2): http://man7.org/linux/man-pages/man2/bpf.2.html .. _BPF and XDP Reference Guide: http://cilium.readthedocs.io/en/latest/bpf/ diff --git a/Documentation/dev-tools/kselftest.rst b/Documentation/dev-tools/kselftest.rst index 61ae13c44f91..5d1f56fcd2e7 100644 --- a/Documentation/dev-tools/kselftest.rst +++ b/Documentation/dev-tools/kselftest.rst @@ -301,7 +301,8 @@ Helpers .. 
kernel-doc:: tools/testing/selftests/kselftest_harness.h :functions: TH_LOG TEST TEST_SIGNAL FIXTURE FIXTURE_DATA FIXTURE_SETUP - FIXTURE_TEARDOWN TEST_F TEST_HARNESS_MAIN + FIXTURE_TEARDOWN TEST_F TEST_HARNESS_MAIN FIXTURE_VARIANT + FIXTURE_VARIANT_ADD Operators --------- diff --git a/Documentation/devicetree/bindings/net/qca,ar71xx.txt b/Documentation/devicetree/bindings/net/qca,ar71xx.txt deleted file mode 100644 index 2a33e71ba72b..000000000000 --- a/Documentation/devicetree/bindings/net/qca,ar71xx.txt +++ /dev/null @@ -1,45 +0,0 @@ -Required properties: -- compatible: Should be "qca,<soc>-eth". Currently support compatibles are: - qca,ar7100-eth - Atheros AR7100 - qca,ar7240-eth - Atheros AR7240 - qca,ar7241-eth - Atheros AR7241 - qca,ar7242-eth - Atheros AR7242 - qca,ar9130-eth - Atheros AR9130 - qca,ar9330-eth - Atheros AR9330 - qca,ar9340-eth - Atheros AR9340 - qca,qca9530-eth - Qualcomm Atheros QCA9530 - qca,qca9550-eth - Qualcomm Atheros QCA9550 - qca,qca9560-eth - Qualcomm Atheros QCA9560 - -- reg : Address and length of the register set for the device -- interrupts : Should contain eth interrupt -- phy-mode : See ethernet.txt file in the same directory -- clocks: the clock used by the core -- clock-names: the names of the clock listed in the clocks property. These are - "eth" and "mdio". -- resets: Should contain phandles to the reset signals -- reset-names: Should contain the names of reset signal listed in the resets - property. These are "mac" and "mdio" - -Optional properties: -- phy-handle : phandle to the PHY device connected to this device. -- fixed-link : Assume a fixed link. See fixed-link.txt in the same directory. - Use instead of phy-handle. 
- -Optional subnodes: -- mdio : specifies the mdio bus, used as a container for phy nodes - according to phy.txt in the same directory - -Example: - -ethernet@1a000000 { - compatible = "qca,ar9330-eth"; - reg = <0x1a000000 0x200>; - interrupts = <5>; - resets = <&rst 13>, <&rst 23>; - reset-names = "mac", "mdio"; - clocks = <&pll ATH79_CLK_AHB>, <&pll ATH79_CLK_MDIO>; - clock-names = "eth", "mdio"; - - phy-mode = "gmii"; -}; diff --git a/Documentation/devicetree/bindings/net/qca,ar71xx.yaml b/Documentation/devicetree/bindings/net/qca,ar71xx.yaml new file mode 100644 index 000000000000..f99a5aabe923 --- /dev/null +++ b/Documentation/devicetree/bindings/net/qca,ar71xx.yaml @@ -0,0 +1,216 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/net/qca,ar71xx.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: QCA AR71XX MAC + +allOf: + - $ref: ethernet-controller.yaml# + +maintainers: + - Oleksij Rempel <o.rempel@pengutronix.de> + +properties: + compatible: + oneOf: + - items: + - enum: + - qca,ar7100-eth # Atheros AR7100 + - qca,ar7240-eth # Atheros AR7240 + - qca,ar7241-eth # Atheros AR7241 + - qca,ar7242-eth # Atheros AR7242 + - qca,ar9130-eth # Atheros AR9130 + - qca,ar9330-eth # Atheros AR9330 + - qca,ar9340-eth # Atheros AR9340 + - qca,qca9530-eth # Qualcomm Atheros QCA9530 + - qca,qca9550-eth # Qualcomm Atheros QCA9550 + - qca,qca9560-eth # Qualcomm Atheros QCA9560 + + reg: + maxItems: 1 + + interrupts: + maxItems: 1 + + '#address-cells': + description: number of address cells for the MDIO bus + const: 1 + + '#size-cells': + description: number of size cells on the MDIO bus + const: 0 + + clocks: + items: + - description: MAC main clock + - description: MDIO clock + + clock-names: + items: + - const: eth + - const: mdio + + resets: + items: + - description: MAC reset + - description: MDIO reset + + reset-names: + items: + - const: mac + - const: mdio + +required: + - compatible + - 
reg + - interrupts + - phy-mode + - clocks + - clock-names + - resets + - reset-names + +examples: + # Lager board + - | + eth0: ethernet@19000000 { + compatible = "qca,ar9330-eth"; + reg = <0x19000000 0x200>; + interrupts = <4>; + resets = <&rst 9>, <&rst 22>; + reset-names = "mac", "mdio"; + clocks = <&pll 1>, <&pll 2>; + clock-names = "eth", "mdio"; + qca,ethcfg = <&ethcfg>; + phy-mode = "mii"; + phy-handle = <&phy_port4>; + }; + + eth1: ethernet@1a000000 { + compatible = "qca,ar9330-eth"; + reg = <0x1a000000 0x200>; + interrupts = <5>; + resets = <&rst 13>, <&rst 23>; + reset-names = "mac", "mdio"; + clocks = <&pll 1>, <&pll 2>; + clock-names = "eth", "mdio"; + + phy-mode = "gmii"; + + status = "disabled"; + + fixed-link { + speed = <1000>; + full-duplex; + }; + + mdio { + #address-cells = <1>; + #size-cells = <0>; + + switch10: switch@10 { + #address-cells = <1>; + #size-cells = <0>; + + compatible = "qca,ar9331-switch"; + reg = <0x10>; + resets = <&rst 8>; + reset-names = "switch"; + + interrupt-parent = <&miscintc>; + interrupts = <12>; + + interrupt-controller; + #interrupt-cells = <1>; + + ports { + #address-cells = <1>; + #size-cells = <0>; + + switch_port0: port@0 { + reg = <0x0>; + label = "cpu"; + ethernet = <&eth1>; + + phy-mode = "gmii"; + + fixed-link { + speed = <1000>; + full-duplex; + }; + }; + + switch_port1: port@1 { + reg = <0x1>; + phy-handle = <&phy_port0>; + phy-mode = "internal"; + + status = "disabled"; + }; + + switch_port2: port@2 { + reg = <0x2>; + phy-handle = <&phy_port1>; + phy-mode = "internal"; + + status = "disabled"; + }; + + switch_port3: port@3 { + reg = <0x3>; + phy-handle = <&phy_port2>; + phy-mode = "internal"; + + status = "disabled"; + }; + + switch_port4: port@4 { + reg = <0x4>; + phy-handle = <&phy_port3>; + phy-mode = "internal"; + + status = "disabled"; + }; + }; + + mdio { + #address-cells = <1>; + #size-cells = <0>; + + interrupt-parent = <&switch10>; + + phy_port0: phy@0 { + reg = <0x0>; + interrupts = <0>; + status =
"disabled"; + }; + + phy_port1: phy@1 { + reg = <0x1>; + interrupts = <0>; + status = "disabled"; + }; + + phy_port2: phy@2 { + reg = <0x2>; + interrupts = <0>; + status = "disabled"; + }; + + phy_port3: phy@3 { + reg = <0x3>; + interrupts = <0>; + status = "disabled"; + }; + + phy_port4: phy@4 { + reg = <0x4>; + interrupts = <0>; + status = "disabled"; + }; + }; + }; + }; + }; diff --git a/Documentation/networking/6pack.txt b/Documentation/networking/6pack.rst index 8f339428fdf4..bc5bf1f1a98f 100644 --- a/Documentation/networking/6pack.txt +++ b/Documentation/networking/6pack.rst @@ -1,27 +1,36 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============== +6pack Protocol +============== + This is the 6pack-mini-HOWTO, written by Andreas Könsgen DG3KQ -Internet: ajk@comnets.uni-bremen.de -AMPR-net: dg3kq@db0pra.ampr.org -AX.25: dg3kq@db0ach.#nrw.deu.eu + +:Internet: ajk@comnets.uni-bremen.de +:AMPR-net: dg3kq@db0pra.ampr.org +:AX.25: dg3kq@db0ach.#nrw.deu.eu Last update: April 7, 1998 1. What is 6pack, and what are the advantages to KISS? +====================================================== 6pack is a transmission protocol for data exchange between the PC and the TNC over a serial line. It can be used as an alternative to KISS. 6pack has two major advantages: + - The PC is given full control over the radio channel. Special control data is exchanged between the PC and the TNC so that the PC knows at any time if the TNC is receiving data, if a TNC buffer underrun or overrun has occurred, if the PTT is set and so on. This control data is processed at a higher priority than normal data, so a data stream can be interrupted at any time to issue an - important event. This helps to improve the channel access and timing - algorithms as everything is computed in the PC. It would even be possible - to experiment with something completely different from the known CSMA and + important event. 
This helps to improve the channel access and timing + algorithms as everything is computed in the PC. It would even be possible + to experiment with something completely different from the known CSMA and DAMA channel access methods. This kind of real-time control is especially important to supply several TNCs that are connected between each other and the PC by a daisy chain @@ -36,6 +45,7 @@ More details about 6pack are described in the file 6pack.ps that is located in the doc directory of the AX.25 utilities package. 2. Who has developed the 6pack protocol? +======================================== The 6pack protocol has been developed by Ekki Plicht DF4OR, Henning Rech DF9IC and Gunter Jost DK7WJ. A driver for 6pack, written by Gunter Jost and @@ -44,12 +54,14 @@ They have also written a firmware for TNCs to perform the 6pack protocol (see section 4 below). 3. Where can I get the latest version of 6pack for LinuX? +========================================================= At the moment, the 6pack stuff can obtained via anonymous ftp from db0bm.automation.fh-aachen.de. In the directory /incoming/dg3kq, there is a file named 6pack.tgz. 4. Preparing the TNC for 6pack operation +======================================== To be able to use 6pack, a special firmware for the TNC is needed. The EPROM of a newly bought TNC does not contain 6pack, so you will have to @@ -75,12 +87,14 @@ and the status LED are lit for about a second if the firmware initialises the TNC correctly. 5. Building and installing the 6pack driver +=========================================== The driver has been tested with kernel version 2.1.90. Use with older kernels may lead to a compilation error because the interface to a kernel function has been changed in the 2.1.8x kernels. How to turn on 6pack support: +============================= - In the linux kernel configuration program, select the code maturity level options menu and turn on the prompting for development drivers. 
@@ -94,27 +108,28 @@ To use the driver, the kissattach program delivered with the AX.25 utilities has to be modified. - Do a cd to the directory that holds the kissattach sources. Edit the - kissattach.c file. At the top, insert the following lines: + kissattach.c file. At the top, insert the following lines:: + + #ifndef N_6PACK + #define N_6PACK (N_AX25+1) + #endif - #ifndef N_6PACK - #define N_6PACK (N_AX25+1) - #endif + Then find the line: - Then find the line - - int disc = N_AX25; + int disc = N_AX25; and replace N_AX25 by N_6PACK. - Recompile kissattach. Rename it to spattach to avoid confusions. Installing the driver: +---------------------- -- Do an insmod 6pack. Look at your /var/log/messages file to check if the +- Do an insmod 6pack. Look at your /var/log/messages file to check if the module has printed its initialization message. - Do a spattach as you would launch kissattach when starting a KISS port. - Check if the kernel prints the message '6pack: TNC found'. + Check if the kernel prints the message '6pack: TNC found'. - From here, everything should work as if you were setting up a KISS port. The only difference is that the network device that represents @@ -138,6 +153,7 @@ from the PC to the TNC over the serial line, the status LED if data is sent to the PC. 6. Known problems +================= When testing the driver with 2.0.3x kernels and operating with data rates on the radio channel of 9600 Baud or higher, diff --git a/Documentation/networking/altera_tse.txt b/Documentation/networking/altera_tse.rst index 50b8589d12fd..7a7040072e58 100644 --- a/Documentation/networking/altera_tse.txt +++ b/Documentation/networking/altera_tse.rst @@ -1,6 +1,12 @@ - Altera Triple-Speed Ethernet MAC driver +.. SPDX-License-Identifier: GPL-2.0 -Copyright (C) 2008-2014 Altera Corporation +.. 
include:: <isonum.txt> + +======================================= +Altera Triple-Speed Ethernet MAC driver +======================================= + +Copyright |copy| 2008-2014 Altera Corporation This is the driver for the Altera Triple-Speed Ethernet (TSE) controllers using the SGDMA and MSGDMA soft DMA IP components. The driver uses the @@ -46,23 +52,33 @@ Jumbo frames are not supported at this time. The driver limits PHY operations to 10/100Mbps, and has not yet been fully tested for 1Gbps. This support will be added in a future maintenance update. -1) Kernel Configuration +1. Kernel Configuration +======================= + The kernel configuration option is ALTERA_TSE: + Device Drivers ---> Network device support ---> Ethernet driver support ---> Altera Triple-Speed Ethernet MAC support (ALTERA_TSE) -2) Driver parameters list: - debug: message level (0: no output, 16: all); - dma_rx_num: Number of descriptors in the RX list (default is 64); - dma_tx_num: Number of descriptors in the TX list (default is 64). +2. Driver parameters list +========================= + + - debug: message level (0: no output, 16: all); + - dma_rx_num: Number of descriptors in the RX list (default is 64); + - dma_tx_num: Number of descriptors in the TX list (default is 64). + +3. Command line options +======================= + +Driver parameters can be also passed in command line by using:: -3) Command line options -Driver parameters can be also passed in command line by using: altera_tse=dma_rx_num:128,dma_tx_num:512 -4) Driver information and notes +4. Driver information and notes +=============================== -4.1) Transmit process +4.1. Transmit process +--------------------- When the driver's transmit routine is called by the kernel, it sets up a transmit descriptor by calling the underlying DMA transmit routine (SGDMA or MSGDMA), and initiates a transmit operation. Once the transmit is complete, an @@ -70,7 +86,8 @@ interrupt is driven by the transmit DMA logic. 
The driver handles the transmit completion in the context of the interrupt handling chain by recycling resource required to send and track the requested transmit operation. -4.2) Receive process +4.2. Receive process +-------------------- The driver will post receive buffers to the receive DMA logic during driver initialization. Receive buffers may or may not be queued depending upon the underlying DMA logic (MSGDMA is able queue receive buffers, SGDMA is not able @@ -79,34 +96,39 @@ received, the DMA logic generates an interrupt. The driver handles a receive interrupt by obtaining the DMA receive logic status, reaping receive completions until no more receive completions are available. -4.3) Interrupt Mitigation +4.3. Interrupt Mitigation +------------------------- The driver is able to mitigate the number of its DMA interrupts using NAPI for receive operations. Interrupt mitigation is not yet supported for transmit operations, but will be added in a future maintenance release. 4.4) Ethtool support +-------------------- Ethtool is supported. Driver statistics and internal errors can be taken using: ethtool -S ethX command. It is possible to dump registers etc. 4.5) PHY Support +---------------- The driver is compatible with PAL to work with PHY and GPHY devices. 
4.7) List of source files: - o Kconfig - o Makefile - o altera_tse_main.c: main network device driver - o altera_tse_ethtool.c: ethtool support - o altera_tse.h: private driver structure and common definitions - o altera_msgdma.h: MSGDMA implementation function definitions - o altera_sgdma.h: SGDMA implementation function definitions - o altera_msgdma.c: MSGDMA implementation - o altera_sgdma.c: SGDMA implementation - o altera_sgdmahw.h: SGDMA register and descriptor definitions - o altera_msgdmahw.h: MSGDMA register and descriptor definitions - o altera_utils.c: Driver utility functions - o altera_utils.h: Driver utility function definitions - -5) Debug Information +-------------------------- + - Kconfig + - Makefile + - altera_tse_main.c: main network device driver + - altera_tse_ethtool.c: ethtool support + - altera_tse.h: private driver structure and common definitions + - altera_msgdma.h: MSGDMA implementation function definitions + - altera_sgdma.h: SGDMA implementation function definitions + - altera_msgdma.c: MSGDMA implementation + - altera_sgdma.c: SGDMA implementation + - altera_sgdmahw.h: SGDMA register and descriptor definitions + - altera_msgdmahw.h: MSGDMA register and descriptor definitions + - altera_utils.c: Driver utility functions + - altera_utils.h: Driver utility function definitions + +5. Debug Information +==================== The driver exports debug information such as internal statistics, debug information, MAC and DMA registers etc. @@ -118,17 +140,18 @@ or sees the MAC registers: e.g. using: ethtool -d ethX The developer can also use the "debug" module parameter to get further debug information. -6) Statistics Support +6. Statistics Support +===================== The controller and driver support a mix of IEEE standard defined statistics, RFC defined statistics, and driver or Altera defined statistics. 
The four specifications containing the standard definitions for these statistics are as follows: - o IEEE 802.3-2012 - IEEE Standard for Ethernet. - o RFC 2863 found at http://www.rfc-editor.org/rfc/rfc2863.txt. - o RFC 2819 found at http://www.rfc-editor.org/rfc/rfc2819.txt. - o Altera Triple Speed Ethernet User Guide, found at http://www.altera.com + - IEEE 802.3-2012 - IEEE Standard for Ethernet. + - RFC 2863 found at http://www.rfc-editor.org/rfc/rfc2863.txt. + - RFC 2819 found at http://www.rfc-editor.org/rfc/rfc2819.txt. + - Altera Triple Speed Ethernet User Guide, found at http://www.altera.com The statistics supported by the TSE and the device driver are as follows: diff --git a/Documentation/networking/arcnet-hardware.txt b/Documentation/networking/arcnet-hardware.rst index 731de411513c..b5a1a020c824 100644 --- a/Documentation/networking/arcnet-hardware.txt +++ b/Documentation/networking/arcnet-hardware.rst @@ -1,11 +1,15 @@ - ------------------------------------------------------------------------------ -1) This file is a supplement to arcnet.txt. Please read that for general - driver configuration help. ------------------------------------------------------------------------------ -2) This file is no longer Linux-specific. It should probably be moved out of - the kernel sources. Ideas? ------------------------------------------------------------------------------ +.. SPDX-License-Identifier: GPL-2.0 + +=============== +ARCnet Hardware +=============== + +.. note:: + + 1) This file is a supplement to arcnet.txt. Please read that for general + driver configuration help. + 2) This file is no longer Linux-specific. It should probably be moved out + of the kernel sources. Ideas? Because so many people (myself included) seem to have obtained ARCnet cards without manuals, this file contains a quick introduction to ARCnet hardware, @@ -14,8 +18,8 @@ e-mail apenwarr@worldvisions.ca with any settings for your particular card, or any other information you have! 
-INTRODUCTION TO ARCNET ----------------------- +Introduction to ARCnet +====================== ARCnet is a network type which works in a way similar to popular Ethernet networks but which is also different in some very important ways. @@ -30,7 +34,7 @@ since I only have the 2.5 Mbps variety. It is probably not going to saturate your 100 Mbps card. Stop complaining. :) You also cannot connect an ARCnet card to any kind of Ethernet card and -expect it to work. +expect it to work. There are two "types" of ARCnet - STAR topology and BUS topology. This refers to how the cards are meant to be wired together. According to most @@ -71,19 +75,24 @@ although they are generally kept down to the Ethernet-style 1500 bytes. For more information on the advantages and disadvantages (mostly the advantages) of ARCnet networks, you might try the "ARCnet Trade Association" WWW page: + http://www.arcnet.com -CABLING ARCNET NETWORKS ------------------------ +Cabling ARCnet Networks +======================= + +This section was rewritten by + + Vojtech Pavlik <vojtech@suse.cz> -This section was rewritten by - Vojtech Pavlik <vojtech@suse.cz> using information from several people, including: - Avery Pennraun <apenwarr@worldvisions.ca> - Stephen A. Wood <saw@hallc1.cebaf.gov> - John Paul Morrison <jmorriso@bogomips.ee.ubc.ca> - Joachim Koenig <jojo@repas.de> + + - Avery Pennraun <apenwarr@worldvisions.ca> + - Stephen A. Wood <saw@hallc1.cebaf.gov> + - John Paul Morrison <jmorriso@bogomips.ee.ubc.ca> + - Joachim Koenig <jojo@repas.de> + and Avery touched it up a bit, at Vojtech's request. ARCnet (the classic 2.5 Mbps version) can be connected by two different @@ -103,13 +112,13 @@ equal to a high impedance one with a terminator installed. Usually, the ARCnet networks are built up from STAR cards and hubs. There are two types of hubs - active and passive. 
Passive hubs are small boxes -with four BNC connectors containing four 47 Ohm resistors: +with four BNC connectors containing four 47 Ohm resistors:: - | | wires - R + junction --R-+-R- R 47 Ohm resistors - R - | + | | wires + R + junction + -R-+-R- R 47 Ohm resistors + R + | The shielding is connected together. Active hubs are much more complicated; they are powered and contain electronics to amplify the signal and send it @@ -127,14 +136,15 @@ And now to the cabling. What you can connect together: 2. A card to a passive hub. Remember that all unused connectors on the hub must be properly terminated with 93 Ohm (or something else if you don't have the right ones) terminators. - (Avery's note: oops, I didn't know that. Mine (TV cable) works + + (Avery's note: oops, I didn't know that. Mine (TV cable) works anyway, though.) 3. A card to an active hub. Here is no need to terminate the unused connectors except some kind of aesthetic feeling. But, there may not be more than eleven active hubs between any two computers. That of course doesn't limit the number of active hubs on the network. - + 4. An active hub to another. 5. An active hub to passive hub. @@ -142,22 +152,22 @@ And now to the cabling. What you can connect together: Remember that you cannot connect two passive hubs together. The power loss implied by such a connection is too high for the net to operate reliably. -An example of a typical ARCnet network: +An example of a typical ARCnet network:: - R S - STAR type card + R S - STAR type card S------H--------A-------S R - Terminator - | | H - Hub - | | A - Active hub - | S----H----S - S | - | - S - + | | H - Hub + | | A - Active hub + | S----H----S + S | + | + S + The BUS topology is very similar to the one used by Ethernet. The only difference is in cable and terminators: they should be 93 Ohm. Ethernet uses 50 Ohm impedance. You use T connectors to put the computers on a single line of cable, the bus. You have to put terminators at both ends of the -cable. 
A typical BUS ARCnet network looks like: +cable. A typical BUS ARCnet network looks like:: RT----T------T------T------T------TR B B B B B B @@ -168,63 +178,63 @@ cable. A typical BUS ARCnet network looks like: But that is not all! The two types can be connected together. According to the official documentation the only way of connecting them is using an active -hub: +hub:: - A------T------T------TR - | B B B + A------T------T------TR + | B B B S---H---S - | - S + | + S The official docs also state that you can use STAR cards at the ends of -BUS network in place of a BUS card and a terminator: +BUS network in place of a BUS card and a terminator:: S------T------T------S - B B + B B But, according to my own experiments, you can simply hang a BUS type card anywhere in middle of a cable in a STAR topology network. And more - you can use the bus card in place of any star card if you use a terminator. Then you can build very complicated networks fulfilling all your needs! An -example: - - S - | - RT------T-------T------H------S - B B B | - | R - S------A------T-------T-------A-------H------TR - | B B | | B - | S BT | - | | | S----A-----S - S------H---A----S | | - | | S------T----H---S | - S S B R S - +example:: + + S + | + RT------T-------T------H------S + B B B | + | R + S------A------T-------T-------A-------H------TR + | B B | | B + | S BT | + | | | S----A-----S + S------H---A----S | | + | | S------T----H---S | + S S B R S + A basically different cabling scheme is used with Twisted Pair cabling. Each of the TP cards has two RJ (phone-cord style) connectors. The cards are then daisy-chained together using a cable connecting every two neighboring cards. The ends are terminated with RJ 93 Ohm terminators which plug into -the empty connectors of cards on the ends of the chain. An example: +the empty connectors of cards on the ends of the chain. 
An example:: - ___________ ___________ - _R_|_ _|_|_ _|_R_ - | | | | | | - |Card | |Card | |Card | - |_____| |_____| |_____| + ___________ ___________ + _R_|_ _|_|_ _|_R_ + | | | | | | + |Card | |Card | |Card | + |_____| |_____| |_____| There are also hubs for the TP topology. There is nothing difficult involved in using them; you just connect a TP chain to a hub on any end or -even at both. This way you can create almost any network configuration. +even at both. This way you can create almost any network configuration. The maximum of 11 hubs between any two computers on the net applies here as -well. An example: +well. An example:: RP-------P--------P--------H-----P------P-----PR - | + | RP-----H--------P--------H-----P------PR - | | - PR PR + | | + PR PR R - RJ Terminator P - TP Card @@ -234,11 +244,13 @@ Like any network, ARCnet has a limited cable length. These are the maximum cable lengths between two active ends (an active end being an active hub or a STAR card). + ========== ======= =========== RG-62 93 Ohm up to 650 m RG-59/U 75 Ohm up to 457 m RG-11/U 75 Ohm up to 533 m IBM Type 1 150 Ohm up to 200 m IBM Type 3 100 Ohm up to 100 m + ========== ======= =========== The maximum length of all cables connected to a passive hub is limited to 65 meters for RG-62 cabling; less for others. You can see that using passive @@ -248,8 +260,8 @@ most distant points of the net is limited to 3000 meters. The maximum length of a TP cable between two cards/hubs is 650 meters. -SETTING THE JUMPERS -------------------- +Setting the Jumpers +=================== All ARCnet cards should have a total of four or five different settings: @@ -261,43 +273,51 @@ All ARCnet cards should have a total of four or five different settings: eating net connections on my system (at least) otherwise. My guess is this may be because, if your card is at 0x2E0, probing for a serial port at 0x2E8 will reset the card and probably mess things up royally. + - Avery's favourite: 0x300. 
- the IRQ: on 8-bit cards, it might be 2 (9), 3, 4, 5, or 7. - on 16-bit cards, it might be 2 (9), 3, 4, 5, 7, or 10-15. - + on 16-bit cards, it might be 2 (9), 3, 4, 5, 7, or 10-15. + Make sure this is different from any other card on your system. Note that IRQ2 is the same as IRQ9, as far as Linux is concerned. You can "cat /proc/interrupts" for a somewhat complete list of which ones are in use at any given time. Here is a list of common usages from Vojtech Pavlik <vojtech@suse.cz>: - ("Not on bus" means there is no way for a card to generate this + + ("Not on bus" means there is no way for a card to generate this interrupt) - IRQ 0 - Timer 0 (Not on bus) - IRQ 1 - Keyboard (Not on bus) - IRQ 2 - IRQ Controller 2 (Not on bus, nor does interrupt the CPU) - IRQ 3 - COM2 - IRQ 4 - COM1 - IRQ 5 - FREE (LPT2 if you have it; sometimes COM3; maybe PLIP) - IRQ 6 - Floppy disk controller - IRQ 7 - FREE (LPT1 if you don't use the polling driver; PLIP) - IRQ 8 - Realtime Clock Interrupt (Not on bus) - IRQ 9 - FREE (VGA vertical sync interrupt if enabled) - IRQ 10 - FREE - IRQ 11 - FREE - IRQ 12 - FREE - IRQ 13 - Numeric Coprocessor (Not on bus) - IRQ 14 - Fixed Disk Controller - IRQ 15 - FREE (Fixed Disk Controller 2 if you have it) - - Note: IRQ 9 is used on some video cards for the "vertical retrace" - interrupt. This interrupt would have been handy for things like - video games, as it occurs exactly once per screen refresh, but - unfortunately IBM cancelled this feature starting with the original - VGA and thus many VGA/SVGA cards do not support it. For this - reason, no modern software uses this interrupt and it can almost - always be safely disabled, if your video card supports it at all. 
- + + ====== ========================================================= + IRQ 0 Timer 0 (Not on bus) + IRQ 1 Keyboard (Not on bus) + IRQ 2 IRQ Controller 2 (Not on bus, nor does interrupt the CPU) + IRQ 3 COM2 + IRQ 4 COM1 + IRQ 5 FREE (LPT2 if you have it; sometimes COM3; maybe PLIP) + IRQ 6 Floppy disk controller + IRQ 7 FREE (LPT1 if you don't use the polling driver; PLIP) + IRQ 8 Realtime Clock Interrupt (Not on bus) + IRQ 9 FREE (VGA vertical sync interrupt if enabled) + IRQ 10 FREE + IRQ 11 FREE + IRQ 12 FREE + IRQ 13 Numeric Coprocessor (Not on bus) + IRQ 14 Fixed Disk Controller + IRQ 15 FREE (Fixed Disk Controller 2 if you have it) + ====== ========================================================= + + + .. note:: + + IRQ 9 is used on some video cards for the "vertical retrace" + interrupt. This interrupt would have been handy for things like + video games, as it occurs exactly once per screen refresh, but + unfortunately IBM cancelled this feature starting with the original + VGA and thus many VGA/SVGA cards do not support it. For this + reason, no modern software uses this interrupt and it can almost + always be safely disabled, if your video card supports it at all. + If your card for some reason CANNOT disable this IRQ (usually there is a jumper), one solution would be to clip the printed circuit contact on the board: it's the fourth contact from the left on the @@ -308,14 +328,18 @@ All ARCnet cards should have a total of four or five different settings: - the memory address: Unlike most cards, ARCnets use "shared memory" for copying buffers around. Make SURE it doesn't conflict with any other used memory in your system! + + :: + A0000 - VGA graphics memory (ok if you don't have VGA) - B0000 - Monochrome text mode - C0000 \ One of these is your VGA BIOS - usually C0000. - E0000 / - F0000 - System BIOS + B0000 - Monochrome text mode + C0000 \ One of these is your VGA BIOS - usually C0000. 
+ E0000 / + F0000 - System BIOS Anything less than 0xA0000 is, well, a BAD idea since it isn't above 640k. + - Avery's favourite: 0xD0000 - the station address: Every ARCnet card has its own "unique" network @@ -326,6 +350,7 @@ All ARCnet cards should have a total of four or five different settings: neat stuff will probably happen if you DO use them). By the way, if you haven't already guessed, don't set this the same as any other ARCnet on your network! + - Avery's favourite: 3 and 4. Not that it matters. - There may be ETS1 and ETS2 settings. These may or may not make a @@ -336,28 +361,34 @@ All ARCnet cards should have a total of four or five different settings: requirement here is that all cards on the network with ETS1 and ETS2 jumpers have them in the same position. Chris Hindy <chrish@io.org> sent in a chart with actual values for this: + + ======= ======= =============== ==================== ET1 ET2 Response Time Reconfiguration Time - --- --- ------------- -------------------- + ======= ======= =============== ==================== open open 74.7us 840us open closed 283.4us 1680us closed open 561.8us 1680us closed closed 1118.6us 1680us - + ======= ======= =============== ==================== + Make sure you set ETS1 and ETS2 to the SAME VALUE for all cards on your network. - -Also, on many cards (not mine, though) there are red and green LED's. + +Also, on many cards (not mine, though) there are red and green LED's. 
Vojtech Pavlik <vojtech@suse.cz> tells me this is what they mean: + + =============== =============== ===================================== GREEN RED Status - ----- --- ------ + =============== =============== ===================================== OFF OFF Power off OFF Short flashes Cabling problems (broken cable or not - terminated) + terminated) OFF (short) ON Card init ON ON Normal state - everything OK, nothing - happens + happens ON Long flashes Data transfer ON OFF Never happens (maybe when wrong ID) + =============== =============== ===================================== The following is all the specific information people have sent me about @@ -366,7 +397,7 @@ huge amounts of duplicated information. I have no time to fix it. If you want to, PLEASE DO! Just send me a 'diff -u' of all your changes. The model # is listed right above specifics for that card, so you should be -able to use your text viewer's "search" function to find the entry you want. +able to use your text viewer's "search" function to find the entry you want. If you don't KNOW what kind of card you have, try looking through the various diagrams to see if you can tell. @@ -378,8 +409,9 @@ model that is, please e-mail me to say so. Cards Listed in this file (in this order, mostly): + =============== ======================= ==== Manufacturer Model # Bits - ------------ ------- ---- + =============== ======================= ==== SMC PC100 8 SMC PC110 8 SMC PC120 8 @@ -404,17 +436,19 @@ Cards Listed in this file (in this order, mostly): No Name Taiwan R.O.C? 8 No Name Model 9058 8 Tiara Tiara Lancard? 8 - + =============== ======================= ==== -** SMC = Standard Microsystems Corp. -** CNet Tech = CNet Technology, Inc. +* SMC = Standard Microsystems Corp. +* CNet Tech = CNet Technology, Inc. Unclassified Stuff ------------------- +================== + - Please send any other information you can find. 
- - - And some other stuff (more info is welcome!): + + - And some other stuff (more info is welcome!):: + From: root@ultraworld.xs4all.nl (Timo Hilbrink) To: apenwarr@foxnet.net (Avery Pennarun) Date: Wed, 26 Oct 1994 02:10:32 +0000 (GMT) @@ -423,7 +457,7 @@ Unclassified Stuff [...parts deleted...] About the jumpers: On my PC130 there is one more jumper, located near the - cable-connector and it's for changing to star or bus topology; + cable-connector and it's for changing to star or bus topology; closed: star - open: bus On the PC500 are some more jumper-pins, one block labeled with RX,PDN,TXI and another with ALE,LA17,LA18,LA19 these are undocumented.. @@ -432,136 +466,130 @@ Unclassified Stuff --- CUT --- +Standard Microsystems Corp (SMC) +================================ + +PC100, PC110, PC120, PC130 (8-bit cards) and PC500, PC600 (16-bit cards) +------------------------------------------------------------------------ -** Standard Microsystems Corp (SMC) ** -PC100, PC110, PC120, PC130 (8-bit cards) -PC500, PC600 (16-bit cards) ---------------------------------- - mainly from Avery Pennarun <apenwarr@worldvisions.ca>. Values depicted are from Avery's setup. - special thanks to Timo Hilbrink <timoh@xs4all.nl> for noting that PC120, - 130, 500, and 600 all have the same switches as Avery's PC100. + 130, 500, and 600 all have the same switches as Avery's PC100. PC500/600 have several extra, undocumented pins though. (?) - PC110 settings were verified by Stephen A. Wood <saw@cebaf.gov> - Also, the JP- and S-numbers probably don't match your card exactly. Try to find jumpers/switches with the same number of settings - it's probably more reliable. - - - JP5 [|] : : : : -(IRQ Setting) IRQ2 IRQ3 IRQ4 IRQ5 IRQ7 - Put exactly one jumper on exactly one set of pins. 
- - - 1 2 3 4 5 6 7 8 9 10 - S1 /----------------------------------\ -(I/O and Memory | 1 1 * 0 0 0 0 * 1 1 0 1 | - addresses) \----------------------------------/ - |--| |--------| |--------| - (a) (b) (m) - - WARNING. It's very important when setting these which way - you're holding the card, and which way you think is '1'! - - If you suspect that your settings are not being made - correctly, try reversing the direction or inverting the - switch positions. - - a: The first digit of the I/O address. - Setting Value - ------- ----- - 00 0 - 01 1 - 10 2 - 11 3 - - b: The second digit of the I/O address. - Setting Value - ------- ----- - 0000 0 - 0001 1 - 0010 2 - ... ... - 1110 E - 1111 F - - The I/O address is in the form ab0. For example, if - a is 0x2 and b is 0xE, the address will be 0x2E0. - - DO NOT SET THIS LESS THAN 0x200!!!!! - - - m: The first digit of the memory address. - Setting Value - ------- ----- - 0000 0 - 0001 1 - 0010 2 - ... ... - 1110 E - 1111 F - - The memory address is in the form m0000. For example, if - m is D, the address will be 0xD0000. - - DO NOT SET THIS TO C0000, F0000, OR LESS THAN A0000! - - 1 2 3 4 5 6 7 8 - S2 /--------------------------\ -(Station Address) | 1 1 0 0 0 0 0 0 | - \--------------------------/ - - Setting Value - ------- ----- - 00000000 00 - 10000000 01 - 01000000 02 - ... - 01111111 FE - 11111111 FF - - Note that this is binary with the digits reversed! - - DO NOT SET THIS TO 0 OR 255 (0xFF)! +:: + + JP5 [|] : : : : + (IRQ Setting) IRQ2 IRQ3 IRQ4 IRQ5 IRQ7 + Put exactly one jumper on exactly one set of pins. + + + 1 2 3 4 5 6 7 8 9 10 + S1 /----------------------------------\ + (I/O and Memory | 1 1 * 0 0 0 0 * 1 1 0 1 | + addresses) \----------------------------------/ + |--| |--------| |--------| + (a) (b) (m) + + WARNING. It's very important when setting these which way + you're holding the card, and which way you think is '1'! 
+ + If you suspect that your settings are not being made + correctly, try reversing the direction or inverting the + switch positions. + + a: The first digit of the I/O address. + Setting Value + ------- ----- + 00 0 + 01 1 + 10 2 + 11 3 + + b: The second digit of the I/O address. + Setting Value + ------- ----- + 0000 0 + 0001 1 + 0010 2 + ... ... + 1110 E + 1111 F + + The I/O address is in the form ab0. For example, if + a is 0x2 and b is 0xE, the address will be 0x2E0. + + DO NOT SET THIS LESS THAN 0x200!!!!! + + + m: The first digit of the memory address. + Setting Value + ------- ----- + 0000 0 + 0001 1 + 0010 2 + ... ... + 1110 E + 1111 F + + The memory address is in the form m0000. For example, if + m is D, the address will be 0xD0000. + + DO NOT SET THIS TO C0000, F0000, OR LESS THAN A0000! + + 1 2 3 4 5 6 7 8 + S2 /--------------------------\ + (Station Address) | 1 1 0 0 0 0 0 0 | + \--------------------------/ + + Setting Value + ------- ----- + 00000000 00 + 10000000 01 + 01000000 02 + ... + 01111111 FE + 11111111 FF + + Note that this is binary with the digits reversed! + + DO NOT SET THIS TO 0 OR 255 (0xFF)! -***************************************************************************** -** Standard Microsystems Corp (SMC) ** PC130E/PC270E (8-bit cards) --------------------------- - - from Juergen Seifert <seifert@htwm.de> - -STANDARD MICROSYSTEMS CORPORATION (SMC) ARCNET(R)-PC130E/PC270E -=============================================================== + - from Juergen Seifert <seifert@htwm.de> This description has been written by Juergen Seifert <seifert@htwm.de> -using information from the following Original SMC Manual +using information from the following Original SMC Manual - "Configuration Guide for - ARCNET(R)-PC130E/PC270 - Network Controller Boards - Pub. # 900.044A - June, 1989" + "Configuration Guide for ARCNET(R)-PC130E/PC270 Network + Controller Boards Pub. 
# 900.044A June, 1989" ARCNET is a registered trademark of the Datapoint Corporation -SMC is a registered trademark of the Standard Microsystems Corporation +SMC is a registered trademark of the Standard Microsystems Corporation -The PC130E is an enhanced version of the PC130 board, is equipped with a +The PC130E is an enhanced version of the PC130 board, is equipped with a standard BNC female connector for connection to RG-62/U coax cable. Since this board is designed both for point-to-point connection in star -networks and for connection to bus networks, it is downwardly compatible +networks and for connection to bus networks, it is downwardly compatible with all the other standard boards designed for coax networks (that is, -the PC120, PC110 and PC100 star topology boards and the PC220, PC210 and +the PC120, PC110 and PC100 star topology boards and the PC220, PC210 and PC200 bus topology boards). -The PC270E is an enhanced version of the PC260 board, is equipped with two +The PC270E is an enhanced version of the PC260 board, is equipped with two modular RJ11-type jacks for connection to twisted pair wiring. It can be used in a star or a daisy-chained network. +:: - 8 7 6 5 4 3 2 1 + 8 7 6 5 4 3 2 1 ________________________________________________________________ | | S1 | | | |_________________| | @@ -587,27 +615,27 @@ It can be used in a star or a daisy-chained network. 
| | |_____________________________________________| -Legend: +Legend:: -SMC 90C63 ARCNET Controller / Transceiver /Logic -S1 1-3: I/O Base Address Select + SMC 90C63 ARCNET Controller / Transceiver /Logic + S1 1-3: I/O Base Address Select 4-6: Memory Base Address Select 7-8: RAM Offset Select -S2 1-8: Node ID Select -EXT Extended Timeout Select -ROM ROM Enable Select -STAR Selected - Star Topology (PC130E only) + S2 1-8: Node ID Select + EXT Extended Timeout Select + ROM ROM Enable Select + STAR Selected - Star Topology (PC130E only) Deselected - Bus Topology (PC130E only) -CR3/CR4 Diagnostic LEDs -J1 BNC RG62/U Connector (PC130E only) -J1 6-position Telephone Jack (PC270E only) -J2 6-position Telephone Jack (PC270E only) + CR3/CR4 Diagnostic LEDs + J1 BNC RG62/U Connector (PC130E only) + J1 6-position Telephone Jack (PC270E only) + J2 6-position Telephone Jack (PC270E only) Setting one of the switches to Off/Open means "1", On/Closed means "0". Setting the Node ID -------------------- +^^^^^^^^^^^^^^^^^^^ The eight switches in group S2 are used to set the node ID. These switches work in a way similar to the PC100-series cards; see that @@ -615,10 +643,10 @@ entry for more information. Setting the I/O Base Address ----------------------------- +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The first three switches in switch group S1 are used to select one -of eight possible I/O Base addresses using the following table +of eight possible I/O Base addresses using the following table:: Switch | Hex I/O @@ -635,14 +663,16 @@ of eight possible I/O Base addresses using the following table Setting the Base Memory (RAM) buffer Address --------------------------------------------- +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The memory buffer requires 2K of a 16K block of RAM. The base of this 16K block can be located in any of eight positions. Switches 4-6 of switch group S1 select the Base of the 16K block. 
-Within that 16K address space, the buffer may be assigned any one of four +Within that 16K address space, the buffer may be assigned any one of four positions, determined by the offset, switches 7 and 8 of group S1. +:: + Switch | Hex RAM | Hex ROM 4 5 6 7 8 | Address | Address *) -----------|---------|----------- @@ -650,115 +680,111 @@ positions, determined by the offset, switches 7 and 8 of group S1. 0 0 0 0 1 | C0800 | C2000 0 0 0 1 0 | C1000 | C2000 0 0 0 1 1 | C1800 | C2000 - | | + | | 0 0 1 0 0 | C4000 | C6000 0 0 1 0 1 | C4800 | C6000 0 0 1 1 0 | C5000 | C6000 0 0 1 1 1 | C5800 | C6000 - | | + | | 0 1 0 0 0 | CC000 | CE000 0 1 0 0 1 | CC800 | CE000 0 1 0 1 0 | CD000 | CE000 0 1 0 1 1 | CD800 | CE000 - | | + | | 0 1 1 0 0 | D0000 | D2000 (Manufacturer's default) 0 1 1 0 1 | D0800 | D2000 0 1 1 1 0 | D1000 | D2000 0 1 1 1 1 | D1800 | D2000 - | | + | | 1 0 0 0 0 | D4000 | D6000 1 0 0 0 1 | D4800 | D6000 1 0 0 1 0 | D5000 | D6000 1 0 0 1 1 | D5800 | D6000 - | | + | | 1 0 1 0 0 | D8000 | DA000 1 0 1 0 1 | D8800 | DA000 1 0 1 1 0 | D9000 | DA000 1 0 1 1 1 | D9800 | DA000 - | | + | | 1 1 0 0 0 | DC000 | DE000 1 1 0 0 1 | DC800 | DE000 1 1 0 1 0 | DD000 | DE000 1 1 0 1 1 | DD800 | DE000 - | | + | | 1 1 1 0 0 | E0000 | E2000 1 1 1 0 1 | E0800 | E2000 1 1 1 1 0 | E1000 | E2000 1 1 1 1 1 | E1800 | E2000 - -*) To enable the 8K Boot PROM install the jumper ROM. - The default is jumper ROM not installed. + + *) To enable the 8K Boot PROM install the jumper ROM. + The default is jumper ROM not installed. Setting the Timeouts and Interrupt ----------------------------------- +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -The jumpers labeled EXT1 and EXT2 are used to determine the timeout +The jumpers labeled EXT1 and EXT2 are used to determine the timeout parameters. These two jumpers are normally left open. To select a hardware interrupt level set one (only one!) of the jumpers IRQ2, IRQ3, IRQ4, IRQ5, IRQ7. The Manufacturer's default is IRQ2. 
- + Configuring the PC130E for Star or Bus Topology ------------------------------------------------ +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -The single jumper labeled STAR is used to configure the PC130E board for +The single jumper labeled STAR is used to configure the PC130E board for star or bus topology. -When the jumper is installed, the board may be used in a star network, when +When the jumper is installed, the board may be used in a star network, when it is removed, the board can be used in a bus topology. Diagnostic LEDs ---------------- +^^^^^^^^^^^^^^^ Two diagnostic LEDs are visible on the rear bracket of the board. The green LED monitors the network activity: the red one shows the -board activity: +board activity:: Green | Status Red | Status -------|------------------- ---------|------------------- on | normal activity flash/on | data transfer blink | reconfiguration off | no data transfer; off | defective board or | incorrect memory or - | node ID is zero | I/O address - + | node ID is zero | I/O address -***************************************************************************** -** Standard Microsystems Corp (SMC) ** PC500/PC550 Longboard (16-bit cards) -------------------------------------- +------------------------------------ + - from Juergen Seifert <seifert@htwm.de> -STANDARD MICROSYSTEMS CORPORATION (SMC) ARCNET-PC500/PC550 Long Board -===================================================================== + .. note:: -Note: There is another Version of the PC500 called Short Version, which + There is another Version of the PC500 called Short Version, which is different in hard- and software! The most important differences are: + - The long board has no Shared memory. - On the long board the selection of the interrupt is done by binary - coded switch, on the short board directly by jumper. - + coded switch, on the short board directly by jumper. + [Avery's note: pay special attention to that: the long board HAS NO SHARED -MEMORY. 
This means the current Linux-ARCnet driver can't use these cards. +MEMORY. This means the current Linux-ARCnet driver can't use these cards. I have obtained a PC500Longboard and will be doing some experiments on it in the future, but don't hold your breath. Thanks again to Juergen Seifert for his advice about this!] This description has been written by Juergen Seifert <seifert@htwm.de> -using information from the following Original SMC Manual +using information from the following Original SMC Manual - "Configuration Guide for - SMC ARCNET-PC500/PC550 - Series Network Controller Boards - Pub. # 900.033 Rev. A - November, 1989" + "Configuration Guide for SMC ARCNET-PC500/PC550 + Series Network Controller Boards Pub. # 900.033 Rev. A + November, 1989" ARCNET is a registered trademark of the Datapoint Corporation -SMC is a registered trademark of the Standard Microsystems Corporation +SMC is a registered trademark of the Standard Microsystems Corporation The PC500 is equipped with a standard BNC female connector for connection to RG-62/U coax cable. @@ -769,7 +795,9 @@ The PC550 is equipped with two modular RJ11-type jacks for connection to twisted pair wiring. It can be used in a star or a daisy-chained (BUS) network. - 1 +:: + + 1 0 9 8 7 6 5 4 3 2 1 6 5 4 3 2 1 ____________________________________________________________________ < | SW1 | | SW2 | | @@ -796,34 +824,34 @@ It can be used in a star or a daisy-chained (BUS) network. 
> | | | <____| |_____________________________________________| -Legend: +Legend:: -SW1 1-6: I/O Base Address Select + SW1 1-6: I/O Base Address Select 7-10: Interrupt Select -SW2 1-6: Reserved for Future Use -SW3 1-8: Node ID Select -JP2 1-4: Extended Timeout Select -JP6 Selected - Star Topology (PC500 only) + SW2 1-6: Reserved for Future Use + SW3 1-8: Node ID Select + JP2 1-4: Extended Timeout Select + JP6 Selected - Star Topology (PC500 only) Deselected - Bus Topology (PC500 only) -CR3 Green Monitors Network Activity -CR4 Red Monitors Board Activity -J1 BNC RG62/U Connector (PC500 only) -J1 6-position Telephone Jack (PC550 only) -J2 6-position Telephone Jack (PC550 only) + CR3 Green Monitors Network Activity + CR4 Red Monitors Board Activity + J1 BNC RG62/U Connector (PC500 only) + J1 6-position Telephone Jack (PC550 only) + J2 6-position Telephone Jack (PC550 only) Setting one of the switches to Off/Open means "1", On/Closed means "0". Setting the Node ID -------------------- +^^^^^^^^^^^^^^^^^^^ The eight switches in group SW3 are used to set the node ID. Each node -attached to the network must have an unique node ID which must be +attached to the network must have an unique node ID which must be different from 0. Switch 1 serves as the least significant bit (LSB). -The node ID is the sum of the values of all switches set to "1" -These values are: +The node ID is the sum of the values of all switches set to "1" +These values are:: Switch | Value -------|------- @@ -836,30 +864,30 @@ These values are: 7 | 64 8 | 128 -Some Examples: +Some Examples:: - Switch | Hex | Decimal + Switch | Hex | Decimal 8 7 6 5 4 3 2 1 | Node ID | Node ID ----------------|---------|--------- 0 0 0 0 0 0 0 0 | not allowed - 0 0 0 0 0 0 0 1 | 1 | 1 + 0 0 0 0 0 0 0 1 | 1 | 1 0 0 0 0 0 0 1 0 | 2 | 2 0 0 0 0 0 0 1 1 | 3 | 3 . . . | | 0 1 0 1 0 1 0 1 | 55 | 85 . . . | | 1 0 1 0 1 0 1 0 | AA | 170 - . . . | | + . . . 
| | 1 1 1 1 1 1 0 1 | FD | 253 1 1 1 1 1 1 1 0 | FE | 254 - 1 1 1 1 1 1 1 1 | FF | 255 + 1 1 1 1 1 1 1 1 | FF | 255 Setting the I/O Base Address ----------------------------- +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The first six switches in switch group SW1 are used to select one -of 32 possible I/O Base addresses using the following table +of 32 possible I/O Base addresses using the following table:: Switch | Hex I/O 6 5 4 3 2 1 | Address @@ -899,16 +927,18 @@ of 32 possible I/O Base addresses using the following table Setting the Interrupt ---------------------- +^^^^^^^^^^^^^^^^^^^^^ -Switches seven through ten of switch group SW1 are used to select the -interrupt level. The interrupt level is binary coded, so selections +Switches seven through ten of switch group SW1 are used to select the +interrupt level. The interrupt level is binary coded, so selections from 0 to 15 would be possible, but only the following eight values will be supported: 3, 4, 5, 7, 9, 10, 11, 12. +:: + Switch | IRQ - 10 9 8 7 | - ---------|-------- + 10 9 8 7 | + ---------|-------- 0 0 1 1 | 3 0 1 0 0 | 4 0 1 0 1 | 5 @@ -919,52 +949,50 @@ be supported: 3, 4, 5, 7, 9, 10, 11, 12. 1 1 0 0 | 12 -Setting the Timeouts --------------------- +Setting the Timeouts +^^^^^^^^^^^^^^^^^^^^ -The two jumpers JP2 (1-4) are used to determine the timeout parameters. +The two jumpers JP2 (1-4) are used to determine the timeout parameters. These two jumpers are normally left open. Refer to the COM9026 Data Sheet for alternate configurations. Configuring the PC500 for Star or Bus Topology ----------------------------------------------- +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -The single jumper labeled JP6 is used to configure the PC500 board for +The single jumper labeled JP6 is used to configure the PC500 board for star or bus topology. 
-When the jumper is installed, the board may be used in a star network, when +When the jumper is installed, the board may be used in a star network, when it is removed, the board can be used in a bus topology. Diagnostic LEDs ---------------- +^^^^^^^^^^^^^^^ Two diagnostic LEDs are visible on the rear bracket of the board. The green LED monitors the network activity: the red one shows the -board activity: +board activity:: Green | Status Red | Status -------|------------------- ---------|------------------- on | normal activity flash/on | data transfer blink | reconfiguration off | no data transfer; off | defective board or | incorrect memory or - | node ID is zero | I/O address - + | node ID is zero | I/O address -***************************************************************************** -** SMC ** PC710 (8-bit card) ------------------ + - from J.S. van Oosten <jvoosten@compiler.tdcnet.nl> - + Note: this data is gathered by experimenting and looking at info of other cards. However, I'm sure I got 99% of the settings right. The SMC710 card resembles the PC270 card, but is much more basic (i.e. no -LEDs, RJ11 jacks, etc.) and 8 bit. Here's a little drawing: +LEDs, RJ11 jacks, etc.) and 8 bit. Here's a little drawing:: - _______________________________________ + _______________________________________ | +---------+ +---------+ |____ | | S2 | | S1 | | | +---------+ +---------+ | @@ -976,12 +1004,12 @@ LEDs, RJ11 jacks, etc.) and 8 bit. Here's a little drawing: | +===+ | | | | .. JP1 +----------+ | - | .. | big chip | | + | .. | big chip | | | .. | 90C63 | | | .. | | | | .. +----------+ | ------- ----------- - ||||||||||||||||||||| + ||||||||||||||||||||| The row of jumpers at JP1 actually consists of 8 jumpers, (sometimes labelled) the same as on the PC270, from top to bottom: EXT2, EXT1, ROM, @@ -992,71 +1020,76 @@ are swapped (S1 is the nodeaddress, S2 sets IO- and RAM-address). I know it works when connected to a PC110 type ARCnet board. 
- + ***************************************************************************** -** Possibly SMC ** +Possibly SMC +============ + LCS-8830(-T) (8 and 16-bit cards) --------------------------------- + - from Mathias Katzer <mkatzer@HRZ.Uni-Bielefeld.DE> - Marek Michalkiewicz <marekm@i17linuxb.ists.pwr.wroc.pl> says the LCS-8830 is slightly different from LCS-8830-T. These are 8 bit, BUS only (the JP0 jumper is hardwired), and BNC only. - + This is a LCS-8830-T made by SMC, I think ('SMC' only appears on one PLCC, nowhere else, not even on the few Xeroxed sheets from the manual). -SMC ARCnet Board Type LCS-8830-T +SMC ARCnet Board Type LCS-8830-T:: - ------------------------------------ - | | - | JP3 88 8 JP2 | - | ##### | \ | - | ##### ET1 ET2 ###| - | 8 ###| - | U3 SW 1 JP0 ###| Phone Jacks - | -- ###| - | | | | - | | | SW2 | - | | | | - | | | ##### | - | -- ##### #### BNC Connector - | #### - | 888888 JP1 | - | 234567 | - -- ------- - ||||||||||||||||||||||||||| - -------------------------- - - -SW1: DIP-Switches for Station Address -SW2: DIP-Switches for Memory Base and I/O Base addresses - -JP0: If closed, internal termination on (default open) -JP1: IRQ Jumpers -JP2: Boot-ROM enabled if closed -JP3: Jumpers for response timeout - -U3: Boot-ROM Socket - - -ET1 ET2 Response Time Idle Time Reconfiguration Time - - 78 86 840 - X 285 316 1680 - X 563 624 1680 - X X 1130 1237 1680 - -(X means closed jumper) - -(DIP-Switch downwards means "0") + ------------------------------------ + | | + | JP3 88 8 JP2 | + | ##### | \ | + | ##### ET1 ET2 ###| + | 8 ###| + | U3 SW 1 JP0 ###| Phone Jacks + | -- ###| + | | | | + | | | SW2 | + | | | | + | | | ##### | + | -- ##### #### BNC Connector + | #### + | 888888 JP1 | + | 234567 | + -- ------- + ||||||||||||||||||||||||||| + -------------------------- + + + SW1: DIP-Switches for Station Address + SW2: DIP-Switches for Memory Base and I/O Base addresses + + JP0: If closed, internal termination on (default open) + JP1: IRQ Jumpers 
+ JP2: Boot-ROM enabled if closed + JP3: Jumpers for response timeout + + U3: Boot-ROM Socket + + + ET1 ET2 Response Time Idle Time Reconfiguration Time + + 78 86 840 + X 285 316 1680 + X 563 624 1680 + X X 1130 1237 1680 + + (X means closed jumper) + + (DIP-Switch downwards means "0") The station address is binary-coded with SW1. The I/O base address is coded with DIP-Switches 6,7 and 8 of SW2: +======== ======== Switches Base 678 Address +======== ======== 000 260-26f 100 290-29f 010 2e0-2ef @@ -1065,19 +1098,22 @@ Switches Base 101 350-35f 011 380-38f 111 3e0-3ef +======== ======== DIP Switches 1-5 of SW2 encode the RAM and ROM Address Range: +======== ============= ================ Switches RAM ROM 12345 Address Range Address Range +======== ============= ================ 00000 C:0000-C:07ff C:2000-C:3fff 10000 C:0800-C:0fff 01000 C:1000-C:17ff 11000 C:1800-C:1fff 00100 C:4000-C:47ff C:6000-C:7fff 10100 C:4800-C:4fff -01100 C:5000-C:57ff +01100 C:5000-C:57ff 11100 C:5800-C:5fff 00010 C:C000-C:C7ff C:E000-C:ffff 10010 C:C800-C:Cfff @@ -1094,7 +1130,7 @@ Switches RAM ROM 00101 D:8000-D:87ff D:A000-D:bfff 10101 D:8800-D:8fff 01101 D:9000-D:97ff -11101 D:9800-D:9fff +11101 D:9800-D:9fff 00011 D:C000-D:c7ff D:E000-D:ffff 10011 D:C800-D:cfff 01011 D:D000-D:d7ff @@ -1103,34 +1139,37 @@ Switches RAM ROM 10111 E:0800-E:0fff 01111 E:1000-E:17ff 11111 E:1800-E:1fff +======== ============= ================ -***************************************************************************** +PureData Corp +============= -** PureData Corp ** PDI507 (8-bit card) -------------------- + - from Mark Rejhon <mdrejhon@magi.com> (slight modifications by Avery) - Avery's note: I think PDI508 cards (but definitely NOT PDI508Plus cards) are mostly the same as this. PDI508Plus cards appear to be mainly software-configured. Jumpers: + There is a jumper array at the bottom of the card, near the edge - connector. This array is labelled J1. They control the IRQs and - something else. 
Put only one jumper on the IRQ pins. + connector. This array is labelled J1. They control the IRQs and + something else. Put only one jumper on the IRQ pins. ETS1, ETS2 are for timing on very long distance networks. See the more general information near the top of this file. There is a J2 jumper on two pins. A jumper should be put on them, - since it was already there when I got the card. I don't know what - this jumper is for though. + since it was already there when I got the card. I don't know what + this jumper is for though. There is a two-jumper array for J3. I don't know what it is for, - but there were already two jumpers on it when I got the card. It's - a six pin grid in a two-by-three fashion. The jumpers were - configured as follows: + but there were already two jumpers on it when I got the card. It's + a six pin grid in a two-by-three fashion. The jumpers were + configured as follows:: .-------. o | o o | @@ -1140,28 +1179,28 @@ Jumpers: Carl de Billy <CARL@carainfo.com> explains J3 and J4: - J3 Diagram: + J3 Diagram:: - .-------. - o | o o | - :-------: TWIST Technology - o | o o | - `-------' - .-------. - | o o | o - :-------: COAX Technology - | o o | o - `-------' + .-------. + o | o o | + :-------: TWIST Technology + o | o o | + `-------' + .-------. + | o o | o + :-------: COAX Technology + | o o | o + `-------' - If using coax cable in a bus topology the J4 jumper must be removed; place it on one pin. - - If using bus topology with twisted pair wiring move the J3 + - If using bus topology with twisted pair wiring move the J3 jumpers so they connect the middle pin and the pins closest to the RJ11 Connectors. Also the J4 jumper must be removed; place it on one pin of J4 jumper for storage. - - If using star topology with twisted pair wiring move the J3 + - If using star topology with twisted pair wiring move the J3 jumpers so they connect the middle pin and the pins closest to the RJ11 connectors. 
@@ -1169,40 +1208,43 @@ Carl de Billy <CARL@carainfo.com> explains J3 and J4: DIP Switches: The DIP switches accessible on the accessible end of the card while - it is installed, is used to set the ARCnet address. There are 8 - switches. Use an address from 1 to 254. + it is installed, is used to set the ARCnet address. There are 8 + switches. Use an address from 1 to 254. - Switch No. - 12345678 ARCnet address - ----------------------------------------- + ========== ========================= + Switch No. ARCnet address + 12345678 + ========== ========================= 00000000 FF (Don't use this!) 00000001 FE 00000010 FD - .... - 11111101 2 + ... + 11111101 2 11111110 1 11111111 0 (Don't use this!) + ========== ========================= There is another array of eight DIP switches at the top of the - card. There are five labelled MS0-MS4 which seem to control the - memory address, and another three labelled IO0-IO2 which seem to - control the base I/O address of the card. + card. There are five labelled MS0-MS4 which seem to control the + memory address, and another three labelled IO0-IO2 which seem to + control the base I/O address of the card. This was difficult to test by trial and error, and the I/O addresses - are in a weird order. This was tested by setting the DIP switches, - rebooting the computer, and attempting to load ARCETHER at various - addresses (mostly between 0x200 and 0x400). The address that caused - the red transmit LED to blink, is the one that I thought works. + are in a weird order. This was tested by setting the DIP switches, + rebooting the computer, and attempting to load ARCETHER at various + addresses (mostly between 0x200 and 0x400). The address that caused + the red transmit LED to blink, is the one that I thought works. Also, the address 0x3D0 seems to have a special meaning, since the - ARCETHER packet driver loaded fine, but without the red LED - blinking. I don't know what 0x3D0 is for though.
I recommend using - an address of 0x300 since Windows may not like addresses below - 0x300. - - IO Switch No. - 210 I/O address - ------------------------------- + ARCETHER packet driver loaded fine, but without the red LED + blinking. I don't know what 0x3D0 is for though. I recommend using + an address of 0x300 since Windows may not like addresses below + 0x300. + + ============= =========== + IO Switch No. I/O address + 210 + ============= =========== 111 0x260 110 0x290 101 0x2E0 @@ -1211,29 +1253,31 @@ DIP Switches: 010 0x350 001 0x380 000 0x3E0 + ============= =========== The memory switches set a reserved address space of 0x1000 bytes - (0x100 segment units, or 4k). For example if I set an address of - 0xD000, it will use up addresses 0xD000 to 0xD100. + (0x100 segment units, or 4k). For example if I set an address of + 0xD000, it will use up addresses 0xD000 to 0xD100. The memory switches were tested by booting using QEMM386 stealth, - and using LOADHI to see what address automatically became excluded - from the upper memory regions, and then attempting to load ARCETHER - using these addresses. + and using LOADHI to see what address automatically became excluded + from the upper memory regions, and then attempting to load ARCETHER + using these addresses. I recommend using an ARCnet memory address of 0xD000, and putting - the EMS page frame at 0xC000 while using QEMM stealth mode. That - way, you get contiguous high memory from 0xD100 almost all the way - the end of the megabyte. + the EMS page frame at 0xC000 while using QEMM stealth mode. That + way, you get contiguous high memory from 0xD100 almost all the way + the end of the megabyte. Memory Switch 0 (MS0) didn't seem to work properly when set to OFF - on my card. It could be malfunctioning on my card. Experiment with - it ON first, and if it doesn't work, set it to OFF. (It may be a - modifier for the 0x200 bit?) + on my card. It could be malfunctioning on my card. 
Experiment with + it ON first, and if it doesn't work, set it to OFF. (It may be a + modifier for the 0x200 bit?) + ============= ============================================ MS Switch No. 43210 Memory address - -------------------------------- + ============= ============================================ 00001 0xE100 (guessed - was not detected by QEMM) 00011 0xE000 (guessed - was not detected by QEMM) 00101 0xDD00 @@ -1250,40 +1294,36 @@ DIP Switches: 11011 0xC800 (guessed - crashes tested system) 11101 0xC500 (guessed - crashes tested system) 11111 0xC400 (guessed - crashes tested system) - - -***************************************************************************** + ============= ============================================ + +CNet Technology Inc. +==================== -** CNet Technology Inc. ** 120 Series (8-bit cards) ------------------------ - from Juergen Seifert <seifert@htwm.de> - -CNET TECHNOLOGY INC. (CNet) ARCNET 120A SERIES -============================================== - This description has been written by Juergen Seifert <seifert@htwm.de> -using information from the following Original CNet Manual - - "ARCNET - USER'S MANUAL - for - CN120A - CN120AB - CN120TP - CN120ST - CN120SBT - P/N:12-01-0007 - Revision 3.00" +using information from the following Original CNet Manual + + "ARCNET USER'S MANUAL for + CN120A + CN120AB + CN120TP + CN120ST + CN120SBT + P/N:12-01-0007 + Revision 3.00" ARCNET is a registered trademark of the Datapoint Corporation -P/N 120A ARCNET 8 bit XT/AT Star -P/N 120AB ARCNET 8 bit XT/AT Bus -P/N 120TP ARCNET 8 bit XT/AT Twisted Pair -P/N 120ST ARCNET 8 bit XT/AT Star, Twisted Pair -P/N 120SBT ARCNET 8 bit XT/AT Star, Bus, Twisted Pair +- P/N 120A ARCNET 8 bit XT/AT Star +- P/N 120AB ARCNET 8 bit XT/AT Bus +- P/N 120TP ARCNET 8 bit XT/AT Twisted Pair +- P/N 120ST ARCNET 8 bit XT/AT Star, Twisted Pair +- P/N 120SBT ARCNET 8 bit XT/AT Star, Bus, Twisted Pair + +:: 
__________________________________________________________________ | | @@ -1307,75 +1347,77 @@ P/N 120SBT ARCNET 8 bit XT/AT Star, Bus, Twisted Pair | > SOCKET | JP 6 5 4 3 2 |o|o|o| | J1 | | |______________| |o|o|o|o|o| |o|o|o| |_____| |_____ |o|o|o|o|o| ______________| - | | - |_____________________________________________| - -Legend: - -90C65 ARCNET Probe -S1 1-5: Base Memory Address Select - 6-8: Base I/O Address Select -S2 1-8: Node ID Select (ID0-ID7) -JP1 ROM Enable Select -JP2 IRQ2 -JP3 IRQ3 -JP4 IRQ4 -JP5 IRQ5 -JP6 IRQ7 -JP7/JP8 ET1, ET2 Timeout Parameters -JP10/JP11 Coax / Twisted Pair Select (CN120ST/SBT only) -JP12 Terminator Select (CN120AB/ST/SBT only) -J1 BNC RG62/U Connector (all except CN120TP) -J2 Two 6-position Telephone Jack (CN120TP/ST/SBT only) + | | + |_____________________________________________| + +Legend:: + + 90C65 ARCNET Probe + S1 1-5: Base Memory Address Select + 6-8: Base I/O Address Select + S2 1-8: Node ID Select (ID0-ID7) + JP1 ROM Enable Select + JP2 IRQ2 + JP3 IRQ3 + JP4 IRQ4 + JP5 IRQ5 + JP6 IRQ7 + JP7/JP8 ET1, ET2 Timeout Parameters + JP10/JP11 Coax / Twisted Pair Select (CN120ST/SBT only) + JP12 Terminator Select (CN120AB/ST/SBT only) + J1 BNC RG62/U Connector (all except CN120TP) + J2 Two 6-position Telephone Jack (CN120TP/ST/SBT only) Setting one of the switches to Off means "1", On means "0". Setting the Node ID -------------------- +^^^^^^^^^^^^^^^^^^^ The eight switches in SW2 are used to set the node ID. Each node attached to the network must have an unique node ID which must be different from 0. Switch 1 (ID0) serves as the least significant bit (LSB). 
-The node ID is the sum of the values of all switches set to "1" +The node ID is the sum of the values of all switches set to "1" These values are: - Switch | Label | Value - -------|-------|------- - 1 | ID0 | 1 - 2 | ID1 | 2 - 3 | ID2 | 4 - 4 | ID3 | 8 - 5 | ID4 | 16 - 6 | ID5 | 32 - 7 | ID6 | 64 - 8 | ID7 | 128 - -Some Examples: - - Switch | Hex | Decimal + ======= ====== ===== + Switch Label Value + ======= ====== ===== + 1 ID0 1 + 2 ID1 2 + 3 ID2 4 + 4 ID3 8 + 5 ID4 16 + 6 ID5 32 + 7 ID6 64 + 8 ID7 128 + ======= ====== ===== + +Some Examples:: + + Switch | Hex | Decimal 8 7 6 5 4 3 2 1 | Node ID | Node ID ----------------|---------|--------- 0 0 0 0 0 0 0 0 | not allowed - 0 0 0 0 0 0 0 1 | 1 | 1 + 0 0 0 0 0 0 0 1 | 1 | 1 0 0 0 0 0 0 1 0 | 2 | 2 0 0 0 0 0 0 1 1 | 3 | 3 . . . | | 0 1 0 1 0 1 0 1 | 55 | 85 . . . | | 1 0 1 0 1 0 1 0 | AA | 170 - . . . | | + . . . | | 1 1 1 1 1 1 0 1 | FD | 253 1 1 1 1 1 1 1 0 | FE | 254 1 1 1 1 1 1 1 1 | FF | 255 Setting the I/O Base Address ----------------------------- +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The last three switches in switch block SW1 are used to select one -of eight possible I/O Base addresses using the following table +of eight possible I/O Base addresses using the following table:: Switch | Hex I/O @@ -1392,13 +1434,15 @@ of eight possible I/O Base addresses using the following table Setting the Base Memory (RAM) buffer Address --------------------------------------------- +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -The memory buffer (RAM) requires 2K. The base of this buffer can be +The memory buffer (RAM) requires 2K. The base of this buffer can be located in any of eight positions. The address of the Boot Prom is memory base + 8K or memory base + 0x2000. Switches 1-5 of switch block SW1 select the Memory Base address. +:: + Switch | Hex RAM | Hex ROM 1 2 3 4 5 | Address | Address *) --------------------|---------|----------- @@ -1410,22 +1454,24 @@ Switches 1-5 of switch block SW1 select the Memory Base address. 
ON ON OFF ON OFF | D8000 | DA000 ON ON ON OFF OFF | DC000 | DE000 ON ON OFF OFF OFF | E0000 | E2000 - -*) To enable the Boot ROM install the jumper JP1 -Note: Since the switches 1 and 2 are always set to ON it may be possible + *) To enable the Boot ROM install the jumper JP1 + +.. note:: + + Since the switches 1 and 2 are always set to ON it may be possible that they can be used to add an offset of 2K, 4K or 6K to the base address, but this feature is not documented in the manual and I haven't tested it yet. Setting the Interrupt Line --------------------------- +^^^^^^^^^^^^^^^^^^^^^^^^^^ To select a hardware interrupt level install one (only one!) of the jumpers -JP2, JP3, JP4, JP5, JP6. JP2 is the default. +JP2, JP3, JP4, JP5, JP6. JP2 is the default:: - Jumper | IRQ + Jumper | IRQ -------|----- 2 | 2 3 | 3 @@ -1435,71 +1481,66 @@ JP2, JP3, JP4, JP5, JP6. JP2 is the default. Setting the Internal Terminator on CN120AB/TP/SBT --------------------------------------------------- +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -The jumper JP12 is used to enable the internal terminator. +The jumper JP12 is used to enable the internal terminator:: - ----- - 0 | 0 | + ----- + 0 | 0 | ----- ON | | ON | 0 | | 0 | | | OFF ----- OFF | 0 | 0 ----- - Terminator Terminator + Terminator Terminator disabled enabled - + Selecting the Connector Type on CN120ST/SBT -------------------------------------------- +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:: JP10 JP11 JP10 JP11 - ----- ----- - 0 0 | 0 | | 0 | + ----- ----- + 0 0 | 0 | | 0 | ----- ----- | | | | | 0 | | 0 | | 0 | | 0 | | | | | ----- ----- - | 0 | | 0 | 0 0 + | 0 | | 0 | 0 0 ----- ----- - Coaxial Cable Twisted Pair Cable + Coaxial Cable Twisted Pair Cable (Default) Setting the Timeout Parameters ------------------------------- +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -The jumpers labeled EXT1 and EXT2 are used to determine the timeout +The jumpers labeled EXT1 and EXT2 are used to determine the timeout parameters. 
These two jumpers are normally left open. +CNet Technology Inc. +==================== -***************************************************************************** - -** CNet Technology Inc. ** 160 Series (16-bit cards) ------------------------- - from Juergen Seifert <seifert@htwm.de> -CNET TECHNOLOGY INC. (CNet) ARCNET 160A SERIES -============================================== - This description has been written by Juergen Seifert <seifert@htwm.de> -using information from the following Original CNet Manual +using information from the following Original CNet Manual - "ARCNET - USER'S MANUAL - for - CN160A - CN160AB - CN160TP - P/N:12-01-0006 - Revision 3.00" + "ARCNET USER'S MANUAL for + CN160A CN160AB CN160TP + P/N:12-01-0006 Revision 3.00" ARCNET is a registered trademark of the Datapoint Corporation -P/N 160A ARCNET 16 bit XT/AT Star -P/N 160AB ARCNET 16 bit XT/AT Bus -P/N 160TP ARCNET 16 bit XT/AT Twisted Pair +- P/N 160A ARCNET 16 bit XT/AT Star +- P/N 160AB ARCNET 16 bit XT/AT Bus +- P/N 160TP ARCNET 16 bit XT/AT Twisted Pair + +:: ___________________________________________________________________ < _________________________ ___| @@ -1526,30 +1567,30 @@ P/N 160TP ARCNET 16 bit XT/AT Twisted Pair > | | | <____________| |_______________________________________| -Legend: +Legend:: -9026 ARCNET Probe -SW1 1-6: Base I/O Address Select - 7-10: Base Memory Address Select -SW2 1-8: Node ID Select (ID0-ID7) -JP1/JP2 ET1, ET2 Timeout Parameters -JP3-JP13 Interrupt Select -J1 BNC RG62/U Connector (CN160A/AB only) -J1 Two 6-position Telephone Jack (CN160TP only) -LED + 9026 ARCNET Probe + SW1 1-6: Base I/O Address Select + 7-10: Base Memory Address Select + SW2 1-8: Node ID Select (ID0-ID7) + JP1/JP2 ET1, ET2 Timeout Parameters + JP3-JP13 Interrupt Select + J1 BNC RG62/U Connector (CN160A/AB only) + J1 Two 6-position Telephone Jack (CN160TP only) + LED Setting one of the switches to Off means "1", On means "0". 
Setting the Node ID -------------------- +^^^^^^^^^^^^^^^^^^^ The eight switches in SW2 are used to set the node ID. Each node attached to the network must have an unique node ID which must be different from 0. Switch 1 (ID0) serves as the least significant bit (LSB). -The node ID is the sum of the values of all switches set to "1" -These values are: +The node ID is the sum of the values of all switches set to "1" +These values are:: Switch | Label | Value -------|-------|------- @@ -1562,32 +1603,32 @@ These values are: 7 | ID6 | 64 8 | ID7 | 128 -Some Examples: +Some Examples:: - Switch | Hex | Decimal + Switch | Hex | Decimal 8 7 6 5 4 3 2 1 | Node ID | Node ID ----------------|---------|--------- 0 0 0 0 0 0 0 0 | not allowed - 0 0 0 0 0 0 0 1 | 1 | 1 + 0 0 0 0 0 0 0 1 | 1 | 1 0 0 0 0 0 0 1 0 | 2 | 2 0 0 0 0 0 0 1 1 | 3 | 3 . . . | | 0 1 0 1 0 1 0 1 | 55 | 85 . . . | | 1 0 1 0 1 0 1 0 | AA | 170 - . . . | | + . . . | | 1 1 1 1 1 1 0 1 | FD | 253 1 1 1 1 1 1 1 0 | FE | 254 1 1 1 1 1 1 1 1 | FF | 255 Setting the I/O Base Address ----------------------------- +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The first six switches in switch block SW1 are used to select the I/O Base -address using the following table: +address using the following table:: - Switch | Hex I/O + Switch | Hex I/O 1 2 3 4 5 6 | Address ------------------------|-------- OFF ON ON OFF OFF ON | 260 @@ -1604,10 +1645,10 @@ Note: Other IO-Base addresses seem to be selectable, but only the above Setting the Base Memory (RAM) buffer Address --------------------------------------------- +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The switches 7-10 of switch block SW1 are used to select the Memory -Base address of the RAM (2K) and the PROM. +Base address of the RAM (2K) and the PROM:: Switch | Hex RAM | Hex ROM 7 8 9 10 | Address | Address @@ -1616,17 +1657,19 @@ Base address of the RAM (2K) and the PROM. 
OFF OFF ON OFF | D0000 | D8000 (Default) OFF OFF OFF ON | E0000 | E8000 -Note: Other MEM-Base addresses seem to be selectable, but only the above +.. note:: + + Other MEM-Base addresses seem to be selectable, but only the above combinations are documented. Setting the Interrupt Line --------------------------- +^^^^^^^^^^^^^^^^^^^^^^^^^^ To select a hardware interrupt level install one (only one!) of the jumpers -JP3 through JP13 using the following table: +JP3 through JP13 using the following table:: - Jumper | IRQ + Jumper | IRQ -------|----------------- 3 | 14 4 | 15 @@ -1640,10 +1683,12 @@ JP3 through JP13 using the following table: 12 | 7 13 | 2 (=9) Default! -Note: - Do not use JP11=IRQ6, it may conflict with your Floppy Disk - Controller +.. note:: + + - Do not use JP11=IRQ6, it may conflict with your Floppy Disk + Controller - Use JP3=IRQ14 only, if you don't have an IDE-, MFM-, or RLL- - Hard Disk, it may conflict with their controllers + Hard Disk, it may conflict with their controllers Setting the Timeout Parameters @@ -1653,14 +1698,16 @@ The jumpers labeled JP1 and JP2 are used to determine the timeout parameters. These two jumpers are normally left open. -***************************************************************************** +Lantech +======= -** Lantech ** 8-bit card, unknown model ------------------------- - from Vlad Lungu <vlungu@ugal.ro> - his e-mail address seemed broken at the time I tried to reach him. Sorry Vlad, if you didn't get my reply. +:: + ________________________________________________________________ | 1 8 | | ___________ __| @@ -1683,25 +1730,27 @@ parameters. These two jumpers are normally left open. 
| | PROM | |ooooo| JP6 | | |____________| |ooooo| | |_____________ _ _| - |____________________________________________| |__| + |____________________________________________| |__| UM9065L : ARCnet Controller SW 1 : Shared Memory Address and I/O Base - ON=0 +:: + + ON=0 - 12345|Memory Address - -----|-------------- - 00001| D4000 - 00010| CC000 - 00110| D0000 - 01110| D1000 - 01101| D9000 - 10010| CC800 - 10011| DC800 - 11110| D1800 + 12345|Memory Address + -----|-------------- + 00001| D4000 + 00010| CC000 + 00110| D0000 + 01110| D1000 + 01101| D9000 + 10010| CC800 + 10011| DC800 + 11110| D1800 It seems that the bits are considered in reverse order. Also, you must observe that some of those addresses are unusual and I didn't probe them; I @@ -1710,43 +1759,48 @@ some others that I didn't write here the card seems to conflict with the video card (an S3 GENDAC). I leave the full decoding of those addresses to you. - 678| I/O Address - ---|------------ - 000| 260 - 001| failed probe - 010| 2E0 - 011| 380 - 100| 290 - 101| 350 - 110| failed probe - 111| 3E0 +:: -SW 2 : Node ID (binary coded) + 678| I/O Address + ---|------------ + 000| 260 + 001| failed probe + 010| 2E0 + 011| 380 + 100| 290 + 101| 350 + 110| failed probe + 111| 3E0 -JP 4 : Boot PROM enable CLOSE - enabled - OPEN - disabled + SW 2 : Node ID (binary coded) -JP 6 : IRQ set (ONLY ONE jumper on 1-5 for IRQ 2-6) + JP 4 : Boot PROM enable CLOSE - enabled + OPEN - disabled + JP 6 : IRQ set (ONLY ONE jumper on 1-5 for IRQ 2-6) -***************************************************************************** -** Acer ** +Acer +==== + 8-bit card, Model 5210-003 -------------------------- + - from Vojtech Pavlik <vojtech@suse.cz> using portions of the existing arcnet-hardware file. This is a 90C26 based card. Its configuration seems similar to the SMC PC100, but has some additional jumpers I don't know the meaning of. 
- __ - | | +:: + + __ + | | ___________|__|_________________________ | | | | | | BNC | | | |______| ___| - | _____________________ |___ + | _____________________ |___ | | | | | | Hybrid IC | | | | | o|o J1 | @@ -1762,51 +1816,51 @@ PC100, but has some additional jumpers I don't know the meaning of. | _____ | | | | _____ | | | | | | ___| - | | | | | | - | _____ | ROM | | UFS | | - | | | | | | | | - | | | ___ | | | | | - | | | | | |__.__| |__.__| | - | | NCR | |XTL| _____ _____ | - | | | |___| | | | | | - | |90C26| | | | | | - | | | | RAM | | UFS | | - | | | J17 o|o | | | | | - | | | J16 o|o | | | | | - | |__.__| |__.__| |__.__| | - | ___ | - | | |8 | - | |SW2| | - | | | | - | |___|1 | - | ___ | - | | |10 J18 o|o | - | | | o|o | - | |SW1| o|o | - | | | J21 o|o | - | |___|1 | - | | - |____________________________________| - - -Legend: - -90C26 ARCNET Chip -XTL 20 MHz Crystal -SW1 1-6 Base I/O Address Select - 7-10 Memory Address Select -SW2 1-8 Node ID Select (ID0-ID7) -J1-J5 IRQ Select -J6-J21 Unknown (Probably extra timeouts & ROM enable ...) -LED1 Activity LED -BNC Coax connector (STAR ARCnet) -RAM 2k of SRAM -ROM Boot ROM socket -UFS Unidentified Flying Sockets + | | | | | | + | _____ | ROM | | UFS | | + | | | | | | | | + | | | ___ | | | | | + | | | | | |__.__| |__.__| | + | | NCR | |XTL| _____ _____ | + | | | |___| | | | | | + | |90C26| | | | | | + | | | | RAM | | UFS | | + | | | J17 o|o | | | | | + | | | J16 o|o | | | | | + | |__.__| |__.__| |__.__| | + | ___ | + | | |8 | + | |SW2| | + | | | | + | |___|1 | + | ___ | + | | |10 J18 o|o | + | | | o|o | + | |SW1| o|o | + | | | J21 o|o | + | |___|1 | + | | + |____________________________________| + + +Legend:: + + 90C26 ARCNET Chip + XTL 20 MHz Crystal + SW1 1-6 Base I/O Address Select + 7-10 Memory Address Select + SW2 1-8 Node ID Select (ID0-ID7) + J1-J5 IRQ Select + J6-J21 Unknown (Probably extra timeouts & ROM enable ...) 
+ LED1 Activity LED + BNC Coax connector (STAR ARCnet) + RAM 2k of SRAM + ROM Boot ROM socket + UFS Unidentified Flying Sockets Setting the Node ID -------------------- +^^^^^^^^^^^^^^^^^^^ The eight switches in SW2 are used to set the node ID. Each node attached to the network must have a unique node ID which must not be 0. @@ -1815,7 +1869,7 @@ Switch 1 (ID0) serves as the least significant bit (LSB). Setting one of the switches to OFF means "1", ON means "0". The node ID is the sum of the values of all switches set to "1" -These values are: +These values are:: Switch | Value -------|------- @@ -1832,40 +1886,40 @@ Don't set this to 0 or 255; these values are reserved. Setting the I/O Base Address ----------------------------- +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The switches 1 to 6 of switch block SW1 are used to select one -of 32 possible I/O Base addresses using the following tables - - | Hex +of 32 possible I/O Base addresses using the following tables:: + + | Hex Switch | Value -------|------- - 1 | 200 - 2 | 100 - 3 | 80 - 4 | 40 - 5 | 20 - 6 | 10 + 1 | 200 + 2 | 100 + 3 | 80 + 4 | 40 + 5 | 20 + 6 | 10 The I/O address is the sum of all switches set to "1". Remember that the I/O address space below 0x200 is RESERVED for the mainboard, so -switch 1 should be ALWAYS SET TO OFF. +switch 1 should be ALWAYS SET TO OFF. Setting the Base Memory (RAM) buffer Address --------------------------------------------- +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The memory buffer (RAM) requires 2K. The base of this buffer can be located in any of sixteen positions. However, the addresses below A0000 are likely to cause system hang because there's main RAM. -Jumpers 7-10 of switch block SW1 select the Memory Base address.
+Jumpers 7-10 of switch block SW1 select the Memory Base address:: Switch | Hex RAM 7 8 9 10 | Address ----------------|--------- OFF OFF OFF OFF | F0000 (conflicts with main BIOS) - OFF OFF OFF ON | E0000 + OFF OFF OFF ON | E0000 OFF OFF ON OFF | D0000 OFF OFF ON ON | C0000 (conflicts with video BIOS) OFF ON OFF OFF | B0000 (conflicts with mono video) @@ -1873,10 +1927,10 @@ Jumpers 7-10 of switch block SW1 select the Memory Base address. Setting the Interrupt Line --------------------------- +^^^^^^^^^^^^^^^^^^^^^^^^^^ -Jumpers 1-5 of the jumper block J1 control the IRQ level. ON means -shorted, OFF means open. +Jumpers 1-5 of the jumper block J1 control the IRQ level. ON means +shorted, OFF means open:: Jumper | IRQ 1 2 3 4 5 | @@ -1889,65 +1943,67 @@ shorted, OFF means open. Unknown jumpers & sockets -------------------------- +^^^^^^^^^^^^^^^^^^^^^^^^^ I know nothing about these. I just guess that J16&J17 are timeout jumpers and maybe one of J18-J21 selects ROM. Also J6-J10 and J11-J15 are connecting IRQ2-7 to some pins on the UFSs. I can't guess the purpose. +Datapoint? +========== -***************************************************************************** - -** Datapoint? ** LAN-ARC-8, an 8-bit card ------------------------ + - from Vojtech Pavlik <vojtech@suse.cz> This is another SMC 90C65-based ARCnet card. I couldn't identify the manufacturer, but it might be DataPoint, because the card has the original arcNet logo in its upper right corner. 
- _______________________________________________________ - | _________ | - | | SW2 | ON arcNet | - | |_________| OFF ___| - | _____________ 1 ______ 8 | | 8 - | | | SW1 | XTAL | ____________ | S | - | > RAM (2k) | |______|| | | W | - | |_____________| | H | | 3 | - | _________|_____ y | |___| 1 - | _________ | | |b | | - | |_________| | | |r | | - | | SMC | |i | | - | | 90C65| |d | | - | _________ | | | | | - | | SW1 | ON | | |I | | - | |_________| OFF |_________|_____/C | _____| - | 1 8 | | | |___ - | ______________ | | | BNC |___| - | | | |____________| |_____| - | > EPROM SOCKET | _____________ | - | |______________| |_____________| | - | ______________| - | | - |________________________________________| - -Legend: - -90C65 ARCNET Chip -SW1 1-5: Base Memory Address Select - 6-8: Base I/O Address Select -SW2 1-8: Node ID Select -SW3 1-5: IRQ Select - 6-7: Extra Timeout - 8 : ROM Enable -BNC Coax connector -XTAL 20 MHz Crystal +:: + + _______________________________________________________ + | _________ | + | | SW2 | ON arcNet | + | |_________| OFF ___| + | _____________ 1 ______ 8 | | 8 + | | | SW1 | XTAL | ____________ | S | + | > RAM (2k) | |______|| | | W | + | |_____________| | H | | 3 | + | _________|_____ y | |___| 1 + | _________ | | |b | | + | |_________| | | |r | | + | | SMC | |i | | + | | 90C65| |d | | + | _________ | | | | | + | | SW1 | ON | | |I | | + | |_________| OFF |_________|_____/C | _____| + | 1 8 | | | |___ + | ______________ | | | BNC |___| + | | | |____________| |_____| + | > EPROM SOCKET | _____________ | + | |______________| |_____________| | + | ______________| + | | + |________________________________________| + +Legend:: + + 90C65 ARCNET Chip + SW1 1-5: Base Memory Address Select + 6-8: Base I/O Address Select + SW2 1-8: Node ID Select + SW3 1-5: IRQ Select + 6-7: Extra Timeout + 8 : ROM Enable + BNC Coax connector + XTAL 20 MHz Crystal Setting the Node ID -------------------- +^^^^^^^^^^^^^^^^^^^ The eight switches in SW3 are used to 
set the node ID. Each node attached to the network must have an unique node ID which must not be 0. @@ -1955,8 +2011,8 @@ Switch 1 serves as the least significant bit (LSB). Setting one of the switches to Off means "1", On means "0". -The node ID is the sum of the values of all switches set to "1" -These values are: +The node ID is the sum of the values of all switches set to "1" +These values are:: Switch | Value -------|------- @@ -1971,10 +2027,10 @@ These values are: Setting the I/O Base Address ----------------------------- +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The last three switches in switch block SW1 are used to select one -of eight possible I/O Base addresses using the following table +of eight possible I/O Base addresses using the following table:: Switch | Hex I/O @@ -1991,13 +2047,16 @@ of eight possible I/O Base addresses using the following table Setting the Base Memory (RAM) buffer Address --------------------------------------------- +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -The memory buffer (RAM) requires 2K. The base of this buffer can be +The memory buffer (RAM) requires 2K. The base of this buffer can be located in any of eight positions. The address of the Boot Prom is memory base + 0x2000. + Jumpers 3-5 of switch block SW1 select the Memory Base address. +:: + Switch | Hex RAM | Hex ROM 1 2 3 4 5 | Address | Address *) --------------------|---------|----------- @@ -2009,16 +2068,16 @@ Jumpers 3-5 of switch block SW1 select the Memory Base address. ON ON OFF ON OFF | D8000 | DA000 ON ON ON OFF OFF | DC000 | DE000 ON ON OFF OFF OFF | E0000 | E2000 - -*) To enable the Boot ROM set the switch 8 of switch block SW3 to position ON. + + *) To enable the Boot ROM set the switch 8 of switch block SW3 to position ON. The switches 1 and 2 probably add 0x0800 and 0x1000 to RAM base address. Setting the Interrupt Line --------------------------- +^^^^^^^^^^^^^^^^^^^^^^^^^^ -Switches 1-5 of the switch block SW3 control the IRQ level. 
+Switches 1-5 of the switch block SW3 control the IRQ level:: Jumper | IRQ 1 2 3 4 5 | @@ -2031,64 +2090,67 @@ Switches 1-5 of the switch block SW3 control the IRQ level. Setting the Timeout Parameters ------------------------------- +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The switches 6-7 of the switch block SW3 are used to determine the timeout parameters. These two switches are normally left in the OFF position. -***************************************************************************** +Topware +======= -** Topware ** 8-bit card, TA-ARC/10 -------------------------- +--------------------- + - from Vojtech Pavlik <vojtech@suse.cz> This is another very similar 90C65 card. Most of the switches and jumpers are the same as on other clones. - _____________________________________________________________________ -| ___________ | | ______ | -| |SW2 NODE ID| | | | XTAL | | -| |___________| | Hybrid IC | |______| | -| ___________ | | __| -| |SW1 MEM+I/O| |_________________________| LED1|__|) -| |___________| 1 2 | -| J3 |o|o| TIMEOUT ______| -| ______________ |o|o| | | -| | | ___________________ | RJ | -| > EPROM SOCKET | | \ |------| -|J2 |______________| | | | | -||o| | | |______| -||o| ROM ENABLE | SMC | _________ | -| _____________ | 90C65 | |_________| _____| -| | | | | | |___ -| > RAM (2k) | | | | BNC |___| -| |_____________| | | |_____| -| |____________________| | -| ________ IRQ 2 3 4 5 7 ___________ | -||________| |o|o|o|o|o| |___________| | -|________ J1|o|o|o|o|o| ______________| - | | - |_____________________________________________| - -Legend: - -90C65 ARCNET Chip -XTAL 20 MHz Crystal -SW1 1-5 Base Memory Address Select - 6-8 Base I/O Address Select -SW2 1-8 Node ID Select (ID0-ID7) -J1 IRQ Select -J2 ROM Enable -J3 Extra Timeout -LED1 Activity LED -BNC Coax connector (BUS ARCnet) -RJ Twisted Pair Connector (daisy chain) +:: + + _____________________________________________________________________ + | ___________ | | ______ | + | |SW2 NODE ID| | | | XTAL | | + | 
|___________| | Hybrid IC | |______| | + | ___________ | | __| + | |SW1 MEM+I/O| |_________________________| LED1|__|) + | |___________| 1 2 | + | J3 |o|o| TIMEOUT ______| + | ______________ |o|o| | | + | | | ___________________ | RJ | + | > EPROM SOCKET | | \ |------| + |J2 |______________| | | | | + ||o| | | |______| + ||o| ROM ENABLE | SMC | _________ | + | _____________ | 90C65 | |_________| _____| + | | | | | | |___ + | > RAM (2k) | | | | BNC |___| + | |_____________| | | |_____| + | |____________________| | + | ________ IRQ 2 3 4 5 7 ___________ | + ||________| |o|o|o|o|o| |___________| | + |________ J1|o|o|o|o|o| ______________| + | | + |_____________________________________________| + +Legend:: + + 90C65 ARCNET Chip + XTAL 20 MHz Crystal + SW1 1-5 Base Memory Address Select + 6-8 Base I/O Address Select + SW2 1-8 Node ID Select (ID0-ID7) + J1 IRQ Select + J2 ROM Enable + J3 Extra Timeout + LED1 Activity LED + BNC Coax connector (BUS ARCnet) + RJ Twisted Pair Connector (daisy chain) Setting the Node ID -------------------- +^^^^^^^^^^^^^^^^^^^ The eight switches in SW2 are used to set the node ID. Each node attached to the network must have an unique node ID which must not be 0. Switch 1 (ID0) @@ -2097,7 +2159,7 @@ serves as the least significant bit (LSB). Setting one of the switches to Off means "1", On means "0". 
The node ID is the sum of the values of all switches set to "1" -These values are: +These values are:: Switch | Label | Value -------|-------|------- @@ -2111,10 +2173,10 @@ These values are: 8 | ID7 | 128 Setting the I/O Base Address ----------------------------- +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The last three switches in switch block SW1 are used to select one -of eight possible I/O Base addresses using the following table: +of eight possible I/O Base addresses using the following table:: Switch | Hex I/O @@ -2122,7 +2184,7 @@ of eight possible I/O Base addresses using the following table: ------------|-------- ON ON ON | 260 (Manufacturer's default) OFF ON ON | 290 - ON OFF ON | 2E0 + ON OFF ON | 2E0 OFF OFF ON | 2F0 ON ON OFF | 300 OFF ON OFF | 350 @@ -2131,35 +2193,38 @@ of eight possible I/O Base addresses using the following table: Setting the Base Memory (RAM) buffer Address --------------------------------------------- +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The memory buffer (RAM) requires 2K. The base of this buffer can be located in any of eight positions. The address of the Boot Prom is memory base + 0x2000. + Jumpers 3-5 of switch block SW1 select the Memory Base address. +:: + Switch | Hex RAM | Hex ROM 1 2 3 4 5 | Address | Address *) --------------------|---------|----------- ON ON ON ON ON | C0000 | C2000 - ON ON OFF ON ON | C4000 | C6000 (Manufacturer's default) + ON ON OFF ON ON | C4000 | C6000 (Manufacturer's default) ON ON ON OFF ON | CC000 | CE000 - ON ON OFF OFF ON | D0000 | D2000 + ON ON OFF OFF ON | D0000 | D2000 ON ON ON ON OFF | D4000 | D6000 ON ON OFF ON OFF | D8000 | DA000 ON ON ON OFF OFF | DC000 | DE000 ON ON OFF OFF OFF | E0000 | E2000 -*) To enable the Boot ROM short the jumper J2. + *) To enable the Boot ROM short the jumper J2. The jumpers 1 and 2 probably add 0x0800 and 0x1000 to RAM address. 
Setting the Interrupt Line --------------------------- +^^^^^^^^^^^^^^^^^^^^^^^^^^ Jumpers 1-5 of the jumper block J1 control the IRQ level. ON means -shorted, OFF means open. +shorted, OFF means open:: Jumper | IRQ 1 2 3 4 5 | @@ -2172,19 +2237,21 @@ shorted, OFF means open. Setting the Timeout Parameters ------------------------------- +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -The jumpers J3 are used to set the timeout parameters. These two +The jumpers J3 are used to set the timeout parameters. These two jumpers are normally left open. - -***************************************************************************** +Thomas-Conrad +============= -** Thomas-Conrad ** Model #500-6242-0097 REV A (8-bit card) --------------------------------------- + - from Lars Karlsson <100617.3473@compuserve.com> +:: + ________________________________________________________ | ________ ________ |_____ | |........| |........| | @@ -2194,11 +2261,11 @@ Model #500-6242-0097 REV A (8-bit card) | address | | | ______ switch | | | | | | | - | | | |___| + | | | |___| | | | ______ |___._ | |______| |______| ____| BNC | Jumper- _____| Connector - | Main chip block _ __| ' + | Main chip block _ __| ' | | | | RJ Connector | |_| | with 110 Ohm | |__ Terminator @@ -2208,46 +2275,49 @@ Model #500-6242-0097 REV A (8-bit card) | |___________| |_____| |__ | Boot PROM socket IRQ-jumpers |_ Diagnostic |________ __ _| LED (red) - | | | | | | | | | | | | | | | | | | | | | | - | | | | | | | | | | | | | | | | | | | | |________| - | - | + | | | | | | | | | | | | | | | | | | | | | | + | | | | | | | | | | | | | | | | | | | | |________| + | + | And here are the settings for some of the switches and jumpers on the cards. 
+:: - I/O + I/O - 1 2 3 4 5 6 7 8 + 1 2 3 4 5 6 7 8 -2E0----- 0 0 0 1 0 0 0 1 -2F0----- 0 0 0 1 0 0 0 0 -300----- 0 0 0 0 1 1 1 1 -350----- 0 0 0 0 1 1 1 0 + 2E0----- 0 0 0 1 0 0 0 1 + 2F0----- 0 0 0 1 0 0 0 0 + 300----- 0 0 0 0 1 1 1 1 + 350----- 0 0 0 0 1 1 1 0 "0" in the above example means switch is off "1" means that it is on. +:: - ShMem address. + ShMem address. - 1 2 3 4 5 6 7 8 + 1 2 3 4 5 6 7 8 -CX00--0 0 1 1 | | | -DX00--0 0 1 0 | -X000--------- 1 1 | -X400--------- 1 0 | -X800--------- 0 1 | -XC00--------- 0 0 -ENHANCED----------- 1 -COMPATIBLE--------- 0 + CX00--0 0 1 1 | | | + DX00--0 0 1 0 | + X000--------- 1 1 | + X400--------- 1 0 | + X800--------- 0 1 | + XC00--------- 0 0 + ENHANCED----------- 1 + COMPATIBLE--------- 0 +:: - IRQ + IRQ - 3 4 5 7 2 - . . . . . - . . . . . + 3 4 5 7 2 + . . . . . + . . . . . There is a DIP-switch with 8 switches, used to set the shared memory address @@ -2266,10 +2336,9 @@ varies by the type of card involved. I fail to see how either of these enhance anything. Send me more detailed information about this mode, or just use "compatible" mode instead.] +Waterloo Microsystems Inc. ?? +============================= -***************************************************************************** - -** Waterloo Microsystems Inc. ?? ** 8-bit card (C) 1985 ------------------- - from Robert Michael Best <rmb117@cs.usask.ca> @@ -2283,103 +2352,104 @@ e-mail me.] The probe has not been able to detect the card on any of the J2 settings, and I tried them again with the "Waterloo" chip removed. - - _____________________________________________________________________ -| \/ \/ ___ __ __ | -| C4 C4 |^| | M || ^ ||^| | -| -- -- |_| | 5 || || | C3 | -| \/ \/ C10 |___|| ||_| | -| C4 C4 _ _ | | ?? 
| -| -- -- | \/ || | | -| | || | | -| | || C1 | | -| | || | \/ _____| -| | C6 || | C9 | |___ -| | || | -- | BNC |___| -| | || | >C7| |_____| -| | || | | -| __ __ |____||_____| 1 2 3 6 | -|| ^ | >C4| |o|o|o|o|o|o| J2 >C4| | -|| | |o|o|o|o|o|o| | -|| C2 | >C4| >C4| | -|| | >C8| | -|| | 2 3 4 5 6 7 IRQ >C4| | -||_____| |o|o|o|o|o|o| J3 | -|_______ |o|o|o|o|o|o| _______________| - | | - |_____________________________________________| - -C1 -- "COM9026 - SMC 8638" - In a chip socket. - -C2 -- "@Copyright - Waterloo Microsystems Inc. - 1985" - In a chip Socket with info printed on a label covering a round window - showing the circuit inside. (The window indicates it is an EPROM chip.) - -C3 -- "COM9032 - SMC 8643" - In a chip socket. - -C4 -- "74LS" - 9 total no sockets. - -M5 -- "50006-136 - 20.000000 MHZ - MTQ-T1-S3 - 0 M-TRON 86-40" - Metallic case with 4 pins, no socket. - -C6 -- "MOSTEK@TC8643 - MK6116N-20 - MALAYSIA" - No socket. - -C7 -- No stamp or label but in a 20 pin chip socket. - -C8 -- "PAL10L8CN - 8623" - In a 20 pin socket. - -C9 -- "PAl16R4A-2CN - 8641" - In a 20 pin socket. - -C10 -- "M8640 - NMC - 9306N" - In an 8 pin socket. - -?? -- Some components on a smaller board and attached with 20 pins all - along the side closest to the BNC connector. The are coated in a dark - resin. - -On the board there are two jumper banks labeled J2 and J3. The -manufacturer didn't put a J1 on the board. The two boards I have both + +:: + + _____________________________________________________________________ + | \/ \/ ___ __ __ | + | C4 C4 |^| | M || ^ ||^| | + | -- -- |_| | 5 || || | C3 | + | \/ \/ C10 |___|| ||_| | + | C4 C4 _ _ | | ?? 
| + | -- -- | \/ || | | + | | || | | + | | || C1 | | + | | || | \/ _____| + | | C6 || | C9 | |___ + | | || | -- | BNC |___| + | | || | >C7| |_____| + | | || | | + | __ __ |____||_____| 1 2 3 6 | + || ^ | >C4| |o|o|o|o|o|o| J2 >C4| | + || | |o|o|o|o|o|o| | + || C2 | >C4| >C4| | + || | >C8| | + || | 2 3 4 5 6 7 IRQ >C4| | + ||_____| |o|o|o|o|o|o| J3 | + |_______ |o|o|o|o|o|o| _______________| + | | + |_____________________________________________| + + C1 -- "COM9026 + SMC 8638" + In a chip socket. + + C2 -- "@Copyright + Waterloo Microsystems Inc. + 1985" + In a chip Socket with info printed on a label covering a round window + showing the circuit inside. (The window indicates it is an EPROM chip.) + + C3 -- "COM9032 + SMC 8643" + In a chip socket. + + C4 -- "74LS" + 9 total no sockets. + + M5 -- "50006-136 + 20.000000 MHZ + MTQ-T1-S3 + 0 M-TRON 86-40" + Metallic case with 4 pins, no socket. + + C6 -- "MOSTEK@TC8643 + MK6116N-20 + MALAYSIA" + No socket. + + C7 -- No stamp or label but in a 20 pin chip socket. + + C8 -- "PAL10L8CN + 8623" + In a 20 pin socket. + + C9 -- "PAl16R4A-2CN + 8641" + In a 20 pin socket. + + C10 -- "M8640 + NMC + 9306N" + In an 8 pin socket. + + ?? -- Some components on a smaller board and attached with 20 pins all + along the side closest to the BNC connector. The are coated in a dark + resin. + +On the board there are two jumper banks labeled J2 and J3. The +manufacturer didn't put a J1 on the board. The two boards I have both came with a jumper box for each bank. -J2 -- Numbered 1 2 3 4 5 6. - 4 and 5 are not stamped due to solder points. - -J3 -- IRQ 2 3 4 5 6 7 +:: + + J2 -- Numbered 1 2 3 4 5 6. + 4 and 5 are not stamped due to solder points. + + J3 -- IRQ 2 3 4 5 6 7 -The board itself has a maple leaf stamped just above the irq jumpers -and "-2 46-86" beside C2. Between C1 and C6 "ASS 'Y 300163" and "@1986 +The board itself has a maple leaf stamped just above the irq jumpers +and "-2 46-86" beside C2. 
Between C1 and C6 "ASS 'Y 300163" and "@1986 CORMAN CUSTOM ELECTRONICS CORP." stamped just below the BNC connector. Below that "MADE IN CANADA" - -***************************************************************************** +No Name +======= -** No Name ** 8-bit cards, 16-bit cards ------------------------- + - from Juergen Seifert <seifert@htwm.de> - -NONAME 8-BIT ARCNET -=================== I have named this ARCnet card "NONAME", since there is no name of any manufacturer on the Installation manual nor on the shipping box. The only @@ -2388,8 +2458,10 @@ it is "Made in Taiwan" This description has been written by Juergen Seifert <seifert@htwm.de> using information from the Original - "ARCnet Installation Manual" + "ARCnet Installation Manual" + +:: ________________________________________________________________ | |STAR| BUS| T/P| | @@ -2416,32 +2488,32 @@ using information from the Original | \ IRQ / T T O | |__________________1_2_M______________________| -Legend: +Legend:: -COM90C65: ARCnet Probe -S1 1-8: Node ID Select -S2 1-3: I/O Base Address Select - 4-6: Memory Base Address Select - 7-8: RAM Offset Select -ET1, ET2 Extended Timeout Select -ROM ROM Enable Select -CN RG62 Coax Connector -STAR| BUS | T/P Three fields for placing a sign (colored circle) - indicating the topology of the card + COM90C65: ARCnet Probe + S1 1-8: Node ID Select + S2 1-3: I/O Base Address Select + 4-6: Memory Base Address Select + 7-8: RAM Offset Select + ET1, ET2 Extended Timeout Select + ROM ROM Enable Select + CN RG62 Coax Connector + STAR| BUS | T/P Three fields for placing a sign (colored circle) + indicating the topology of the card Setting one of the switches to Off means "1", On means "0". Setting the Node ID -------------------- +^^^^^^^^^^^^^^^^^^^ The eight switches in group SW1 are used to set the node ID. Each node attached to the network must have an unique node ID which must be different from 0. Switch 8 serves as the least significant bit (LSB). 
-The node ID is the sum of the values of all switches set to "1" -These values are: +The node ID is the sum of the values of all switches set to "1" +These values are:: Switch | Value -------|------- @@ -2454,30 +2526,30 @@ These values are: 2 | 64 1 | 128 -Some Examples: +Some Examples:: - Switch | Hex | Decimal + Switch | Hex | Decimal 1 2 3 4 5 6 7 8 | Node ID | Node ID ----------------|---------|--------- 0 0 0 0 0 0 0 0 | not allowed - 0 0 0 0 0 0 0 1 | 1 | 1 + 0 0 0 0 0 0 0 1 | 1 | 1 0 0 0 0 0 0 1 0 | 2 | 2 0 0 0 0 0 0 1 1 | 3 | 3 . . . | | 0 1 0 1 0 1 0 1 | 55 | 85 . . . | | 1 0 1 0 1 0 1 0 | AA | 170 - . . . | | + . . . | | 1 1 1 1 1 1 0 1 | FD | 253 1 1 1 1 1 1 1 0 | FE | 254 1 1 1 1 1 1 1 1 | FF | 255 Setting the I/O Base Address ----------------------------- +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The first three switches in switch group SW2 are used to select one -of eight possible I/O Base addresses using the following table +of eight possible I/O Base addresses using the following table:: Switch | Hex I/O 1 2 3 | Address @@ -2493,7 +2565,7 @@ of eight possible I/O Base addresses using the following table Setting the Base Memory (RAM) buffer Address --------------------------------------------- +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The memory buffer requires 2K of a 16K block of RAM. The base of this 16K block can be located in any of eight positions. @@ -2501,6 +2573,8 @@ Switches 4-6 of switch group SW2 select the Base of the 16K block. Within that 16K address space, the buffer may be assigned any one of four positions, determined by the offset, switches 7 and 8 of group SW2. +:: + Switch | Hex RAM | Hex ROM 4 5 6 7 8 | Address | Address *) -----------|---------|----------- @@ -2508,60 +2582,62 @@ positions, determined by the offset, switches 7 and 8 of group SW2. 
0 0 0 0 1 | C0800 | C2000 0 0 0 1 0 | C1000 | C2000 0 0 0 1 1 | C1800 | C2000 - | | + | | 0 0 1 0 0 | C4000 | C6000 0 0 1 0 1 | C4800 | C6000 0 0 1 1 0 | C5000 | C6000 0 0 1 1 1 | C5800 | C6000 - | | + | | 0 1 0 0 0 | CC000 | CE000 0 1 0 0 1 | CC800 | CE000 0 1 0 1 0 | CD000 | CE000 0 1 0 1 1 | CD800 | CE000 - | | + | | 0 1 1 0 0 | D0000 | D2000 (Manufacturer's default) 0 1 1 0 1 | D0800 | D2000 0 1 1 1 0 | D1000 | D2000 0 1 1 1 1 | D1800 | D2000 - | | + | | 1 0 0 0 0 | D4000 | D6000 1 0 0 0 1 | D4800 | D6000 1 0 0 1 0 | D5000 | D6000 1 0 0 1 1 | D5800 | D6000 - | | + | | 1 0 1 0 0 | D8000 | DA000 1 0 1 0 1 | D8800 | DA000 1 0 1 1 0 | D9000 | DA000 1 0 1 1 1 | D9800 | DA000 - | | + | | 1 1 0 0 0 | DC000 | DE000 1 1 0 0 1 | DC800 | DE000 1 1 0 1 0 | DD000 | DE000 1 1 0 1 1 | DD800 | DE000 - | | + | | 1 1 1 0 0 | E0000 | E2000 1 1 1 0 1 | E0800 | E2000 1 1 1 1 0 | E1000 | E2000 1 1 1 1 1 | E1800 | E2000 - -*) To enable the 8K Boot PROM install the jumper ROM. - The default is jumper ROM not installed. + + *) To enable the 8K Boot PROM install the jumper ROM. + The default is jumper ROM not installed. Setting Interrupt Request Lines (IRQ) -------------------------------------- +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ To select a hardware interrupt level set one (only one!) of the jumpers IRQ2, IRQ3, IRQ4, IRQ5 or IRQ7. The manufacturer's default is IRQ2. - + Setting the Timeouts --------------------- +^^^^^^^^^^^^^^^^^^^^ The two jumpers labeled ET1 and ET2 are used to determine the timeout parameters (response and reconfiguration time). Every node in a network must be set to the same timeout values. +:: + ET1 ET2 | Response Time (us) | Reconfiguration Time (ms) --------|--------------------|-------------------------- Off Off | 78 | 840 (Default) @@ -2572,8 +2648,8 @@ must be set to the same timeout values. 
On means jumper installed, Off means jumper not installed -NONAME 16-BIT ARCNET -==================== +16-BIT ARCNET +------------- The manual of my 8-Bit NONAME ARCnet Card contains another description of a 16-Bit Coax / Twisted Pair Card. This description is incomplete, @@ -2584,13 +2660,16 @@ the booklet there is a different way of counting ... 2-9, 2-10, A-1, Also the picture of the board layout is not as good as the picture of 8-Bit card, because there isn't any letter like "SW1" written to the picture. + Should somebody have such a board, please feel free to complete this description or to send a mail to me! This description has been written by Juergen Seifert <seifert@htwm.de> using information from the Original - "ARCnet Installation Manual" + "ARCnet Installation Manual" + +:: ___________________________________________________________________ < _________________ _________________ | @@ -2622,15 +2701,15 @@ Setting one of the switches to Off means "1", On means "0". Setting the Node ID -------------------- +^^^^^^^^^^^^^^^^^^^ The eight switches in group SW2 are used to set the node ID. Each node attached to the network must have an unique node ID which must be different from 0. Switch 8 serves as the least significant bit (LSB). -The node ID is the sum of the values of all switches set to "1" -These values are: +The node ID is the sum of the values of all switches set to "1" +These values are:: Switch | Value -------|------- @@ -2643,30 +2722,30 @@ These values are: 2 | 64 1 | 128 -Some Examples: +Some Examples:: - Switch | Hex | Decimal + Switch | Hex | Decimal 1 2 3 4 5 6 7 8 | Node ID | Node ID ----------------|---------|--------- 0 0 0 0 0 0 0 0 | not allowed - 0 0 0 0 0 0 0 1 | 1 | 1 + 0 0 0 0 0 0 0 1 | 1 | 1 0 0 0 0 0 0 1 0 | 2 | 2 0 0 0 0 0 0 1 1 | 3 | 3 . . . | | 0 1 0 1 0 1 0 1 | 55 | 85 . . . | | 1 0 1 0 1 0 1 0 | AA | 170 - . . . | | + . . . 
| | 1 1 1 1 1 1 0 1 | FD | 253 1 1 1 1 1 1 1 0 | FE | 254 1 1 1 1 1 1 1 1 | FF | 255 Setting the I/O Base Address ----------------------------- +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The first three switches in switch group SW1 are used to select one -of eight possible I/O Base addresses using the following table +of eight possible I/O Base addresses using the following table:: Switch | Hex I/O 3 2 1 | Address @@ -2682,13 +2761,13 @@ of eight possible I/O Base addresses using the following table Setting the Base Memory (RAM) buffer Address --------------------------------------------- +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The memory buffer requires 2K of a 16K block of RAM. The base of this 16K block can be located in any of eight positions. Switches 6-8 of switch group SW1 select the Base of the 16K block. Within that 16K address space, the buffer may be assigned any one of four -positions, determined by the offset, switches 4 and 5 of group SW1. +positions, determined by the offset, switches 4 and 5 of group SW1:: Switch | Hex RAM | Hex ROM 8 7 6 5 4 | Address | Address @@ -2697,111 +2776,111 @@ positions, determined by the offset, switches 4 and 5 of group SW1. 
0 0 0 0 1 | C0800 | C2000 0 0 0 1 0 | C1000 | C2000 0 0 0 1 1 | C1800 | C2000 - | | + | | 0 0 1 0 0 | C4000 | C6000 0 0 1 0 1 | C4800 | C6000 0 0 1 1 0 | C5000 | C6000 0 0 1 1 1 | C5800 | C6000 - | | + | | 0 1 0 0 0 | CC000 | CE000 0 1 0 0 1 | CC800 | CE000 0 1 0 1 0 | CD000 | CE000 0 1 0 1 1 | CD800 | CE000 - | | + | | 0 1 1 0 0 | D0000 | D2000 (Manufacturer's default) 0 1 1 0 1 | D0800 | D2000 0 1 1 1 0 | D1000 | D2000 0 1 1 1 1 | D1800 | D2000 - | | + | | 1 0 0 0 0 | D4000 | D6000 1 0 0 0 1 | D4800 | D6000 1 0 0 1 0 | D5000 | D6000 1 0 0 1 1 | D5800 | D6000 - | | + | | 1 0 1 0 0 | D8000 | DA000 1 0 1 0 1 | D8800 | DA000 1 0 1 1 0 | D9000 | DA000 1 0 1 1 1 | D9800 | DA000 - | | + | | 1 1 0 0 0 | DC000 | DE000 1 1 0 0 1 | DC800 | DE000 1 1 0 1 0 | DD000 | DE000 1 1 0 1 1 | DD800 | DE000 - | | + | | 1 1 1 0 0 | E0000 | E2000 1 1 1 0 1 | E0800 | E2000 1 1 1 1 0 | E1000 | E2000 1 1 1 1 1 | E1800 | E2000 - + Setting Interrupt Request Lines (IRQ) -------------------------------------- +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ?????????????????????????????????????? Setting the Timeouts --------------------- +^^^^^^^^^^^^^^^^^^^^ ?????????????????????????????????????? -***************************************************************************** - -** No Name ** 8-bit cards ("Made in Taiwan R.O.C.") ------------ +------------------------------------- + - from Vojtech Pavlik <vojtech@suse.cz> I have named this ARCnet card "NONAME", since I got only the card with -no manual at all and the only text identifying the manufacturer is +no manual at all and the only text identifying the manufacturer is "MADE IN TAIWAN R.O.C" printed on the card. 
- ____________________________________________________________ - | 1 2 3 4 5 6 7 8 | - | |o|o| JP1 o|o|o|o|o|o|o|o| ON | - | + o|o|o|o|o|o|o|o| ___| - | _____________ o|o|o|o|o|o|o|o| OFF _____ | | ID7 - | | | SW1 | | | | ID6 - | > RAM (2k) | ____________________ | H | | S | ID5 - | |_____________| | || y | | W | ID4 - | | || b | | 2 | ID3 - | | || r | | | ID2 - | | || i | | | ID1 - | | 90C65 || d | |___| ID0 - | SW3 | || | | - | |o|o|o|o|o|o|o|o| ON | || I | | - | |o|o|o|o|o|o|o|o| | || C | | - | |o|o|o|o|o|o|o|o| OFF |____________________|| | _____| - | 1 2 3 4 5 6 7 8 | | | |___ - | ______________ | | | BNC |___| - | | | |_____| |_____| - | > EPROM SOCKET | | - | |______________| | - | ______________| - | | - |_____________________________________________| - -Legend: - -90C65 ARCNET Chip -SW1 1-5: Base Memory Address Select - 6-8: Base I/O Address Select -SW2 1-8: Node ID Select (ID0-ID7) -SW3 1-5: IRQ Select - 6-7: Extra Timeout - 8 : ROM Enable -JP1 Led connector -BNC Coax connector - -Although the jumpers SW1 and SW3 are marked SW, not JP, they are jumpers, not +:: + + ____________________________________________________________ + | 1 2 3 4 5 6 7 8 | + | |o|o| JP1 o|o|o|o|o|o|o|o| ON | + | + o|o|o|o|o|o|o|o| ___| + | _____________ o|o|o|o|o|o|o|o| OFF _____ | | ID7 + | | | SW1 | | | | ID6 + | > RAM (2k) | ____________________ | H | | S | ID5 + | |_____________| | || y | | W | ID4 + | | || b | | 2 | ID3 + | | || r | | | ID2 + | | || i | | | ID1 + | | 90C65 || d | |___| ID0 + | SW3 | || | | + | |o|o|o|o|o|o|o|o| ON | || I | | + | |o|o|o|o|o|o|o|o| | || C | | + | |o|o|o|o|o|o|o|o| OFF |____________________|| | _____| + | 1 2 3 4 5 6 7 8 | | | |___ + | ______________ | | | BNC |___| + | | | |_____| |_____| + | > EPROM SOCKET | | + | |______________| | + | ______________| + | | + |_____________________________________________| + +Legend:: + + 90C65 ARCNET Chip + SW1 1-5: Base Memory Address Select + 6-8: Base I/O Address Select + SW2 1-8: Node ID Select (ID0-ID7) 
+ SW3 1-5: IRQ Select + 6-7: Extra Timeout + 8 : ROM Enable + JP1 Led connector + BNC Coax connector + +Although the jumpers SW1 and SW3 are marked SW, not JP, they are jumpers, not switches. -Setting the jumpers to ON means connecting the upper two pins, off the bottom +Setting the jumpers to ON means connecting the upper two pins, off the bottom two - or - in case of IRQ setting, connecting none of them at all. Setting the Node ID -------------------- +^^^^^^^^^^^^^^^^^^^ The eight switches in SW2 are used to set the node ID. Each node attached to the network must have an unique node ID which must not be 0. @@ -2809,8 +2888,8 @@ Switch 1 (ID0) serves as the least significant bit (LSB). Setting one of the switches to Off means "1", On means "0". -The node ID is the sum of the values of all switches set to "1" -These values are: +The node ID is the sum of the values of all switches set to "1" +These values are:: Switch | Label | Value -------|-------|------- @@ -2823,30 +2902,30 @@ These values are: 7 | ID6 | 64 8 | ID7 | 128 -Some Examples: +Some Examples:: - Switch | Hex | Decimal + Switch | Hex | Decimal 8 7 6 5 4 3 2 1 | Node ID | Node ID ----------------|---------|--------- 0 0 0 0 0 0 0 0 | not allowed - 0 0 0 0 0 0 0 1 | 1 | 1 + 0 0 0 0 0 0 0 1 | 1 | 1 0 0 0 0 0 0 1 0 | 2 | 2 0 0 0 0 0 0 1 1 | 3 | 3 . . . | | 0 1 0 1 0 1 0 1 | 55 | 85 . . . | | 1 0 1 0 1 0 1 0 | AA | 170 - . . . | | + . . . 
| | 1 1 1 1 1 1 0 1 | FD | 253 1 1 1 1 1 1 1 0 | FE | 254 1 1 1 1 1 1 1 1 | FF | 255 Setting the I/O Base Address ----------------------------- +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The last three switches in switch block SW1 are used to select one -of eight possible I/O Base addresses using the following table +of eight possible I/O Base addresses using the following table:: Switch | Hex I/O @@ -2863,13 +2942,16 @@ of eight possible I/O Base addresses using the following table Setting the Base Memory (RAM) buffer Address --------------------------------------------- +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -The memory buffer (RAM) requires 2K. The base of this buffer can be +The memory buffer (RAM) requires 2K. The base of this buffer can be located in any of eight positions. The address of the Boot Prom is memory base + 0x2000. + Jumpers 3-5 of jumper block SW1 select the Memory Base address. +:: + Switch | Hex RAM | Hex ROM 1 2 3 4 5 | Address | Address *) --------------------|---------|----------- @@ -2881,15 +2963,15 @@ Jumpers 3-5 of jumper block SW1 select the Memory Base address. ON ON OFF ON OFF | D8000 | DA000 ON ON ON OFF OFF | DC000 | DE000 ON ON OFF OFF OFF | E0000 | E2000 - -*) To enable the Boot ROM set the jumper 8 of jumper block SW3 to position ON. + + *) To enable the Boot ROM set the jumper 8 of jumper block SW3 to position ON. The jumpers 1 and 2 probably add 0x0800, 0x1000 and 0x1800 to RAM adders. Setting the Interrupt Line --------------------------- +^^^^^^^^^^^^^^^^^^^^^^^^^^ -Jumpers 1-5 of the jumper block SW3 control the IRQ level. +Jumpers 1-5 of the jumper block SW3 control the IRQ level:: Jumper | IRQ 1 2 3 4 5 | @@ -2902,23 +2984,24 @@ Jumpers 1-5 of the jumper block SW3 control the IRQ level. 
Setting the Timeout Parameters ------------------------------- +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -The jumpers 6-7 of the jumper block SW3 are used to determine the timeout +The jumpers 6-7 of the jumper block SW3 are used to determine the timeout parameters. These two jumpers are normally left in the OFF position. -***************************************************************************** -** No Name ** (Generic Model 9058) -------------------- - from Andrew J. Kroll <ag784@freenet.buffalo.edu> - Sorry this sat in my to-do box for so long, Andrew! (yikes - over a year!) - _____ - | < - | .---' + +:: + + _____ + | < + | .---' ________________________________________________________________ | | | | SW2 | | | | ___________ |_____________| | | @@ -2936,7 +3019,7 @@ parameters. These two jumpers are normally left in the OFF position. | |________________| | | : B |- | | | 1 2 3 4 5 6 7 8 | | : O |- | | | |_________o____|..../ A |- _______| | - | ____________________ | R |- | |------, + | ____________________ | R |- | |------, | | | | D |- | BNC | # | | > 2764 PROM SOCKET | |__________|- |_______|------' | |____________________| _________ | | @@ -2945,23 +3028,24 @@ parameters. These two jumpers are normally left in the OFF position. |___ ______________| | |H H H H H H H H H H H H H H H H H H H H H H H| | | |U_U_U_U_U_U_U_U_U_U_U_U_U_U_U_U_U_U_U_U_U_U_U| | | - \| -Legend: + \| + +Legend:: -SL90C65 ARCNET Controller / Transceiver /Logic -SW1 1-5: IRQ Select + SL90C65 ARCNET Controller / Transceiver /Logic + SW1 1-5: IRQ Select 6: ET1 7: ET2 - 8: ROM ENABLE -SW2 1-3: Memory Buffer/PROM Address + 8: ROM ENABLE + SW2 1-3: Memory Buffer/PROM Address 3-6: I/O Address Map -SW3 1-8: Node ID Select -BNC BNC RG62/U Connection + SW3 1-8: Node ID Select + BNC BNC RG62/U Connection *I* have had success using RG59B/U with *NO* terminators! What gives?! 
SW1: Timeouts, Interrupt and ROM ---------------------------------- +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ To select a hardware interrupt level set one (only one!) of the dip switches up (on) SW1...(switches 1-5) @@ -2976,10 +3060,10 @@ are normally left off (down). Setting the I/O Base Address ----------------------------- +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The last three switches in switch group SW2 are used to select one -of eight possible I/O Base addresses using the following table +of eight possible I/O Base addresses using the following table:: Switch | Hex I/O @@ -2996,7 +3080,7 @@ of eight possible I/O Base addresses using the following table Setting the Base Memory Address (RAM & ROM) -------------------------------------------- +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The memory buffer requires 2K of a 16K block of RAM. The base of this 16K block can be located in any of eight positions. @@ -3004,13 +3088,16 @@ Switches 1-3 of switch group SW2 select the Base of the 16K block. (0 = DOWN, 1 = UP) I could, however, only verify two settings... + +:: + Switch| Hex RAM | Hex ROM 1 2 3 | Address | Address ------|---------|----------- 0 0 0 | E0000 | E2000 0 0 1 | D0000 | D2000 (Manufacturer's default) 0 1 0 | ????? | ????? - 0 1 1 | ????? | ????? + 0 1 1 | ????? | ????? 1 0 0 | ????? | ????? 1 0 1 | ????? | ????? 1 1 0 | ????? | ????? @@ -3018,7 +3105,7 @@ I could, however, only verify two settings... Setting the Node ID -------------------- +^^^^^^^^^^^^^^^^^^^ The eight switches in group SW3 are used to set the node ID. Each node attached to the network must have an unique node ID which @@ -3026,8 +3113,9 @@ must be different from 0. Switch 1 serves as the least significant bit (LSB). 
switches in the DOWN position are OFF (0) and in the UP position are ON (1) -The node ID is the sum of the values of all switches set to "1" -These values are: +The node ID is the sum of the values of all switches set to "1" +These values are:: + Switch | Value -------|------- 1 | 1 @@ -3039,70 +3127,80 @@ These values are: 7 | 64 8 | 128 -Some Examples: - - Switch# | Hex | Decimal -8 7 6 5 4 3 2 1 | Node ID | Node ID -----------------|---------|--------- -0 0 0 0 0 0 0 0 | not allowed <-. -0 0 0 0 0 0 0 1 | 1 | 1 | -0 0 0 0 0 0 1 0 | 2 | 2 | -0 0 0 0 0 0 1 1 | 3 | 3 | - . . . | | | -0 1 0 1 0 1 0 1 | 55 | 85 | - . . . | | + Don't use 0 or 255! -1 0 1 0 1 0 1 0 | AA | 170 | - . . . | | | -1 1 1 1 1 1 0 1 | FD | 253 | -1 1 1 1 1 1 1 0 | FE | 254 | -1 1 1 1 1 1 1 1 | FF | 255 <-' - +Some Examples:: -***************************************************************************** + Switch# | Hex | Decimal + 8 7 6 5 4 3 2 1 | Node ID | Node ID + ----------------|---------|--------- + 0 0 0 0 0 0 0 0 | not allowed <-. + 0 0 0 0 0 0 0 1 | 1 | 1 | + 0 0 0 0 0 0 1 0 | 2 | 2 | + 0 0 0 0 0 0 1 1 | 3 | 3 | + . . . | | | + 0 1 0 1 0 1 0 1 | 55 | 85 | + . . . | | + Don't use 0 or 255! + 1 0 1 0 1 0 1 0 | AA | 170 | + . . . | | | + 1 1 1 1 1 1 0 1 | FD | 253 | + 1 1 1 1 1 1 1 0 | FE | 254 | + 1 1 1 1 1 1 1 1 | FF | 255 <-' + + +Tiara +===== -** Tiara ** (model unknown) -------------------------- +--------------- + - from Christoph Lameter <christoph@lameter.com> - - -Here is information about my card as far as I could figure it out: ------------------------------------------------ tiara -Tiara LanCard of Tiara Computer Systems. - -+----------------------------------------------+ -! ! Transmitter Unit ! ! -! +------------------+ ------- -! MEM Coax Connector -! ROM 7654321 <- I/O ------- -! : : +--------+ ! -! : : ! 90C66LJ! +++ -! : : ! ! !D Switch to set -! : : ! ! !I the Nodenumber -! : : +--------+ !P -! !++ -! 234567 <- IRQ ! 
-+------------!!!!!!!!!!!!!!!!!!!!!!!!--------+ - !!!!!!!!!!!!!!!!!!!!!!!! - -0 = Jumper Installed -1 = Open + + +Here is information about my card as far as I could figure it out:: + + + ----------------------------------------------- tiara + Tiara LanCard of Tiara Computer Systems. + + +----------------------------------------------+ + ! ! Transmitter Unit ! ! + ! +------------------+ ------- + ! MEM Coax Connector + ! ROM 7654321 <- I/O ------- + ! : : +--------+ ! + ! : : ! 90C66LJ! +++ + ! : : ! ! !D Switch to set + ! : : ! ! !I the Nodenumber + ! : : +--------+ !P + ! !++ + ! 234567 <- IRQ ! + +------------!!!!!!!!!!!!!!!!!!!!!!!!--------+ + !!!!!!!!!!!!!!!!!!!!!!!! + +- 0 = Jumper Installed +- 1 = Open Top Jumper line Bit 7 = ROM Enable 654=Memory location 321=I/O Settings for Memory Location (Top Jumper Line) + +=== ================ 456 Address selected +=== ================ 000 C0000 001 C4000 010 CC000 011 D0000 100 D4000 101 D8000 -110 DC000 +110 DC000 111 E0000 +=== ================ Settings for I/O Address (Top Jumper Line) + +=== ==== 123 Port +=== ==== 000 260 001 290 010 2E0 @@ -3111,23 +3209,26 @@ Settings for I/O Address (Top Jumper Line) 101 350 110 380 111 3E0 +=== ==== Settings for IRQ Selection (Lower Jumper Line) + +====== ===== 234567 +====== ===== 011111 IRQ 2 101111 IRQ 3 110111 IRQ 4 111011 IRQ 5 111110 IRQ 7 - -***************************************************************************** - +====== ===== Other Cards ------------ +=========== I have no information on other models of ARCnet cards at the moment. Please send any and all info to: + apenwarr@worldvisions.ca Thanks. 
diff --git a/Documentation/networking/arcnet.txt b/Documentation/networking/arcnet.rst index aff97f47c05c..e93d9820f0f1 100644 --- a/Documentation/networking/arcnet.txt +++ b/Documentation/networking/arcnet.rst @@ -1,11 +1,18 @@ ----------------------------------------------------------------------------- -NOTE: See also arcnet-hardware.txt in this directory for jumper-setting -and cabling information if you're like many of us and didn't happen to get a -manual with your ARCnet card. ----------------------------------------------------------------------------- +.. SPDX-License-Identifier: GPL-2.0 + +====== +ARCnet +====== + +.. note:: + + See also arcnet-hardware.rst in this directory for jumper-setting + and cabling information if you're like many of us and didn't happen to get a + manual with your ARCnet card. Since no one seems to listen to me otherwise, perhaps a poem will get your -attention: +attention:: + This driver's getting fat and beefy, But my cat is still named Fifi. @@ -24,28 +31,21 @@ Come on, be a sport! Send me a success report! (hey, that was even better than my original poem... this is getting bad!) --------- -WARNING: --------- - -If you don't e-mail me about your success/failure soon, I may be forced to -start SINGING. And we don't want that, do we? +.. warning:: -(You know, it might be argued that I'm pushing this point a little too much. -If you think so, why not flame me in a quick little e-mail? Please also -include the type of card(s) you're using, software, size of network, and -whether it's working or not.) + If you don't e-mail me about your success/failure soon, I may be forced to + start SINGING. And we don't want that, do we? -My e-mail address is: apenwarr@worldvisions.ca + (You know, it might be argued that I'm pushing this point a little too much. + If you think so, why not flame me in a quick little e-mail? Please also + include the type of card(s) you're using, software, size of network, and + whether it's working or not.)
+ My e-mail address is: apenwarr@worldvisions.ca ---------------------------------------------------------------------------- - - These are the ARCnet drivers for Linux. - -This new release (2.91) has been put together by David Woodhouse +This new release (2.91) has been put together by David Woodhouse <dwmw2@infradead.org>, in an attempt to tidy up the driver after adding support for yet another chipset. Now the generic support has been separated from the individual chipset drivers, and the source files aren't quite so packed with @@ -62,12 +62,13 @@ included and seems to be working fine! Where do I discuss these drivers? --------------------------------- -Tomasz has been so kind as to set up a new and improved mailing list. +Tomasz has been so kind as to set up a new and improved mailing list. Subscribe by sending a message with the BODY "subscribe linux-arcnet YOUR REAL NAME" to listserv@tichy.ch.uj.edu.pl. Then, to submit messages to the list, mail to linux-arcnet@tichy.ch.uj.edu.pl. There are archives of the mailing list at: + http://epistolary.org/mailman/listinfo.cgi/arcnet The people on linux-net@vger.kernel.org (now defunct, replaced by @@ -80,17 +81,20 @@ Other Drivers and Info ---------------------- You can try my ARCNET page on the World Wide Web at: - http://www.qis.net/~jschmitz/arcnet/ + + http://www.qis.net/~jschmitz/arcnet/ Also, SMC (one of the companies that makes ARCnet cards) has a WWW site you might be interested in, which includes several drivers for various cards including ARCnet. Try: + http://www.smc.com/ - + Performance Technologies makes various network software that supports ARCnet: + http://www.perftech.com/ or ftp to ftp.perftech.com. - + Novell makes a networking stack for DOS which includes ARCnet drivers. Try FTPing to ftp.novell.com. @@ -99,19 +103,20 @@ one you'll want to use with ARCnet cards) from oak.oakland.edu:/simtel/msdos/pktdrvr. 
It won't work perfectly on a 386+ without patches, though, and also doesn't like several cards. Fixed versions are available on my WWW page, or via e-mail if you don't have WWW -access. +access. Installing the Driver --------------------- -All you will need to do in order to install the driver is: +All you will need to do in order to install the driver is:: + make config - (be sure to choose ARCnet in the network devices + (be sure to choose ARCnet in the network devices and at least one chipset driver.) make clean make zImage - + If you obtained this ARCnet package as an upgrade to the ARCnet driver in your current kernel, you will need to first copy arcnet.c over the one in the linux/drivers/net directory. @@ -125,10 +130,12 @@ There are four chipset options: This is the normal ARCnet card, which you've probably got. This is the only chipset driver which will autoprobe if not told where the card is. -It following options on the command line: +It following options on the command line:: + com90xx=[<io>[,<irq>[,<shmem>]]][,<name>] | <name> -If you load the chipset support as a module, the options are: +If you load the chipset support as a module, the options are:: + io=<io> irq=<irq> shmem=<shmem> device=<name> To disable the autoprobe, just specify "com90xx=" on the kernel command line. @@ -136,14 +143,17 @@ To specify the name alone, but allow autoprobe, just put "com90xx=<name>" 2. ARCnet COM20020 chipset. -This is the new chipset from SMC with support for promiscuous mode (packet +This is the new chipset from SMC with support for promiscuous mode (packet sniffing), extra diagnostic information, etc. Unfortunately, there is no sensible method of autoprobing for these cards. You must specify the I/O address on the kernel command line. 
-The command line options are: + +The command line options are:: + com20020=<io>[,<irq>[,<node_ID>[,backplane[,CKP[,timeout]]]]][,name] -If you load the chipset support as a module, the options are: +If you load the chipset support as a module, the options are:: + io=<io> irq=<irq> node=<node_ID> backplane=<backplane> clock=<CKP> timeout=<timeout> device=<name> @@ -160,8 +170,10 @@ you have a card which doesn't support shared memory, or (strangely) in case you have so many ARCnet cards in your machine that you run out of shmem slots. If you don't give the IO address on the kernel command line, then the driver will not find the card. -The command line options are: - com90io=<io>[,<irq>][,<name>] + +The command line options are:: + + com90io=<io>[,<irq>][,<name>] If you load the chipset support as a module, the options are: io=<io> irq=<irq> device=<name> @@ -169,44 +181,49 @@ If you load the chipset support as a module, the options are: 4. ARCnet RIM I cards. These are COM90xx chips which are _completely_ memory mapped. The support for -these is not tested. If you have one, please mail the author with a success +these is not tested. If you have one, please mail the author with a success report. All options must be specified, except the device name. -Command line options: +Command line options:: + arcrimi=<shmem>,<irq>,<node_ID>[,<name>] -If you load the chipset support as a module, the options are: +If you load the chipset support as a module, the options are:: + shmem=<shmem> irq=<irq> node=<node_ID> device=<name> Loadable Module Support ----------------------- -Configure and rebuild Linux. When asked, answer 'm' to "Generic ARCnet +Configure and rebuild Linux. When asked, answer 'm' to "Generic ARCnet support" and to support for your ARCnet chipset if you want to use the -loadable module. You can also say 'y' to "Generic ARCnet support" and 'm' +loadable module. You can also say 'y' to "Generic ARCnet support" and 'm' to the chipset support if you wish. 
+:: + make config - make clean + make clean make zImage make modules - + If you're using a loadable module, you need to use insmod to load it, and you can specify various characteristics of your card on the command line. (In recent versions of the driver, autoprobing is much more reliable and works as a module, so most of this is now unnecessary.) -For example: +For example:: + cd /usr/src/linux/modules insmod arcnet.o insmod com90xx.o insmod com20020.o io=0x2e0 device=eth1 - + Using the Driver ---------------- -If you build your kernel with ARCnet COM90xx support included, it should +If you build your kernel with ARCnet COM90xx support included, it should probe for your card automatically when you boot. If you use a different chipset driver compiled into the kernel, you must give the necessary options on the kernel command line, as detailed above. @@ -224,69 +241,78 @@ Multiple Cards in One Computer ------------------------------ Linux has pretty good support for this now, but since I've been busy, the -ARCnet driver has somewhat suffered in this respect. COM90xx support, if -compiled into the kernel, will (try to) autodetect all the installed cards. +ARCnet driver has somewhat suffered in this respect. COM90xx support, if +compiled into the kernel, will (try to) autodetect all the installed cards.
+ +If you have other cards, with support compiled into the kernel, then you can +just repeat the options on the kernel command line, e.g.:: + + LILO: linux com20020=0x2e0 com20020=0x380 com90io=0x260 -If you have other cards, with support compiled into the kernel, then you can -just repeat the options on the kernel command line, e.g.: -LILO: linux com20020=0x2e0 com20020=0x380 com90io=0x260 +If you have the chipset support built as a loadable module, then you need to +do something like this:: -If you have the chipset support built as a loadable module, then you need to -do something like this: insmod -o arc0 com90xx insmod -o arc1 com20020 io=0x2e0 insmod -o arc2 com90xx + The ARCnet drivers will now sort out their names automatically. How do I get it to work with...? -------------------------------- -NFS: Should be fine linux->linux, just pretend you're using Ethernet cards. - oak.oakland.edu:/simtel/msdos/nfs has some nice DOS clients. There - is also a DOS-based NFS server called SOSS. It doesn't multitask - quite the way Linux does (actually, it doesn't multitask AT ALL) but - you never know what you might need. - - With AmiTCP (and possibly others), you may need to set the following - options in your Amiga nfstab: MD 1024 MR 1024 MW 1024 - (Thanks to Christian Gottschling <ferksy@indigo.tng.oche.de> +NFS: + Should be fine linux->linux, just pretend you're using Ethernet cards. + oak.oakland.edu:/simtel/msdos/nfs has some nice DOS clients. There + is also a DOS-based NFS server called SOSS. It doesn't multitask + quite the way Linux does (actually, it doesn't multitask AT ALL) but + you never know what you might need. + + With AmiTCP (and possibly others), you may need to set the following + options in your Amiga nfstab: MD 1024 MR 1024 MW 1024 + (Thanks to Christian Gottschling <ferksy@indigo.tng.oche.de> for this.) - + Probably these refer to maximum NFS data/read/write block sizes. 
I don't know why the defaults on the Amiga didn't work; write to me if you know more. -DOS: If you're using the freeware arcether.com, you might want to install - the driver patch from my web page. It helps with PC/TCP, and also - can get arcether to load if it timed out too quickly during - initialization. In fact, if you use it on a 386+ you REALLY need - the patch, really. - -Windows: See DOS :) Trumpet Winsock works fine with either the Novell or +DOS: + If you're using the freeware arcether.com, you might want to install + the driver patch from my web page. It helps with PC/TCP, and also + can get arcether to load if it timed out too quickly during + initialization. In fact, if you use it on a 386+ you REALLY need + the patch, really. + +Windows: + See DOS :) Trumpet Winsock works fine with either the Novell or Arcether client, assuming you remember to load winpkt of course. -LAN Manager and Windows for Workgroups: These programs use protocols that - are incompatible with the Internet standard. They try to pretend - the cards are Ethernet, and confuse everyone else on the network. - - However, v2.00 and higher of the Linux ARCnet driver supports this - protocol via the 'arc0e' device. See the section on "Multiprotocol - Support" for more information. +LAN Manager and Windows for Workgroups: + These programs use protocols that + are incompatible with the Internet standard. They try to pretend + the cards are Ethernet, and confuse everyone else on the network. + + However, v2.00 and higher of the Linux ARCnet driver supports this + protocol via the 'arc0e' device. See the section on "Multiprotocol + Support" for more information. Using the freeware Samba server and clients for Linux, you can now interface quite nicely with TCP/IP-based WfWg or Lan Manager networks. 
- -Windows 95: Tools are included with Win95 that let you use either the LANMAN + +Windows 95: + Tools are included with Win95 that let you use either the LANMAN style network drivers (NDIS) or Novell drivers (ODI) to handle your ARCnet packets. If you use ODI, you'll need to use the 'arc0' - device with Linux. If you use NDIS, then try the 'arc0e' device. + device with Linux. If you use NDIS, then try the 'arc0e' device. See the "Multiprotocol Support" section below if you need arc0e, you're completely insane, and/or you need to build some kind of hybrid network that uses both encapsulation types. -OS/2: I've been told it works under Warp Connect with an ARCnet driver from +OS/2: + I've been told it works under Warp Connect with an ARCnet driver from SMC. You need to use the 'arc0e' interface for this. If you get the SMC driver to work with the TCP/IP stuff included in the "normal" Warp Bonus Pack, let me know. @@ -295,7 +321,8 @@ OS/2: I've been told it works under Warp Connect with an ARCnet driver from which should use the same protocol as WfWg does. I had no luck installing it under Warp, however. Please mail me with any results. -NetBSD/AmiTCP: These use an old version of the Internet standard ARCnet +NetBSD/AmiTCP: + These use an old version of the Internet standard ARCnet protocol (RFC1051) which is compatible with the Linux driver v2.10 ALPHA and above using the arc0s device. (See "Multiprotocol ARCnet" below.) ** Newer versions of NetBSD apparently support RFC1201. @@ -307,16 +334,17 @@ Using Multiprotocol ARCnet The ARCnet driver v2.10 ALPHA supports three protocols, each on its own "virtual network device": - arc0 - RFC1201 protocol, the official Internet standard which just - happens to be 100% compatible with Novell's TRXNET driver. + ====== =============================================================== + arc0 RFC1201 protocol, the official Internet standard which just + happens to be 100% compatible with Novell's TRXNET driver. 
Version 1.00 of the ARCnet driver supported _only_ this protocol. arc0 is the fastest of the three protocols (for whatever reason), and allows larger packets to be used - because it supports RFC1201 "packet splitting" operations. + because it supports RFC1201 "packet splitting" operations. Unless you have a specific need to use a different protocol, I strongly suggest that you stick with this one. - - arc0e - "Ethernet-Encapsulation" which sends packets over ARCnet + + arc0e "Ethernet-Encapsulation" which sends packets over ARCnet that are actually a lot like Ethernet packets, including the 6-byte hardware addresses. This protocol is compatible with Microsoft's NDIS ARCnet driver, like the one in WfWg and @@ -328,8 +356,8 @@ The ARCnet driver v2.10 ALPHA supports three protocols, each on its own fit. arc0e also works slightly more slowly than arc0, for reasons yet to be determined. (Probably it's the smaller MTU that does it.) - - arc0s - The "[s]imple" RFC1051 protocol is the "previous" Internet + + arc0s The "[s]imple" RFC1051 protocol is the "previous" Internet standard that is completely incompatible with the new standard. Some software today, however, continues to support the old standard (and only the old standard) @@ -338,9 +366,10 @@ The ARCnet driver v2.10 ALPHA supports three protocols, each on its own smaller than the Internet "requirement," so it's quite possible that you may run into problems. It's also slower than RFC1201 by about 25%, for the same reason as arc0e. - + The arc0s support was contributed by Tomasz Motylewski and modified somewhat by me. Bugs are probably my fault. + ====== =============================================================== You can choose not to compile arc0e and arc0s into the driver if you want - this will save you a bit of memory and avoid confusion when eg. trying to @@ -358,19 +387,21 @@ can set up your network then: two available protocols. 
As mentioned above, it's a good idea to use only arc0 unless you have a good reason (like some other software, ie. WfWg, that only works with arc0e). - - If you need only arc0, then the following commands should get you going: - ifconfig arc0 MY.IP.ADD.RESS - route add MY.IP.ADD.RESS arc0 - route add -net SUB.NET.ADD.RESS arc0 - [add other local routes here] - - If you need arc0e (and only arc0e), it's a little different: - ifconfig arc0 MY.IP.ADD.RESS - ifconfig arc0e MY.IP.ADD.RESS - route add MY.IP.ADD.RESS arc0e - route add -net SUB.NET.ADD.RESS arc0e - + + If you need only arc0, then the following commands should get you going:: + + ifconfig arc0 MY.IP.ADD.RESS + route add MY.IP.ADD.RESS arc0 + route add -net SUB.NET.ADD.RESS arc0 + [add other local routes here] + + If you need arc0e (and only arc0e), it's a little different:: + + ifconfig arc0 MY.IP.ADD.RESS + ifconfig arc0e MY.IP.ADD.RESS + route add MY.IP.ADD.RESS arc0e + route add -net SUB.NET.ADD.RESS arc0e + arc0s works much the same way as arc0e. @@ -391,29 +422,32 @@ can set up your network then: XT (patience), however, does not have its own Internet IP address and so I assigned it one on a "private subnet" (as defined by RFC1597). - To start with, take a simple network with just insight and freedom. + To start with, take a simple network with just insight and freedom. Insight needs to: - - talk to freedom via RFC1201 (arc0) protocol, because I like it + + - talk to freedom via RFC1201 (arc0) protocol, because I like it more and it's faster. - use freedom as its Internet gateway. - - That's pretty easy to do. Set up insight like this: - ifconfig arc0 insight - route add insight arc0 - route add freedom arc0 /* I would use the subnet here (like I said + + That's pretty easy to do. 
Set up insight like this:: + + ifconfig arc0 insight + route add insight arc0 + route add freedom arc0 /* I would use the subnet here (like I said to do in "single protocol" above), - but the rest of the subnet - unfortunately lies across the PPP - link on freedom, which confuses - things. */ - route add default gw freedom - - And freedom gets configured like so: - ifconfig arc0 freedom - route add freedom arc0 - route add insight arc0 - /* and default gateway is configured by pppd */ - + but the rest of the subnet + unfortunately lies across the PPP + link on freedom, which confuses + things. */ + route add default gw freedom + + And freedom gets configured like so:: + + ifconfig arc0 freedom + route add freedom arc0 + route add insight arc0 + /* and default gateway is configured by pppd */ + Great, now insight talks to freedom directly on arc0, and sends packets to the Internet through freedom. If you didn't know how to do the above, you should probably stop reading this section now because it only gets @@ -425,7 +459,7 @@ can set up your network then: Internet. (Recall that patience has a "private IP address" which won't work on the Internet; that's okay, I configured Linux IP masquerading on freedom for this subnet). - + So patience (necessarily; I don't have another IP number from my provider) has an IP address on a different subnet than freedom and insight, but needs to use freedom as an Internet gateway. Worse, most @@ -435,53 +469,54 @@ can set up your network then: insight, patience WILL send through its default gateway, regardless of the fact that both freedom and insight (courtesy of the arc0e device) could understand a direct transmission. - - I compensate by giving freedom an extra IP address - aliased 'gatekeeper' - - that is on my private subnet, the same subnet that patience is on. + + I compensate by giving freedom an extra IP address - aliased 'gatekeeper' - + that is on my private subnet, the same subnet that patience is on.
I then define gatekeeper to be the default gateway for patience. - - To configure freedom (in addition to the commands above): - ifconfig arc0e gatekeeper - route add gatekeeper arc0e - route add patience arc0e - + + To configure freedom (in addition to the commands above):: + + ifconfig arc0e gatekeeper + route add gatekeeper arc0e + route add patience arc0e + This way, freedom will send all packets for patience through arc0e, giving its IP address as gatekeeper (on the private subnet). When it talks to insight or the Internet, it will use its "freedom" Internet IP address. - - You will notice that we haven't configured the arc0e device on insight. + + You will notice that we haven't configured the arc0e device on insight. This would work, but is not really necessary, and would require me to assign insight another special IP number from my private subnet. Since both insight and patience are using freedom as their default gateway, the two can already talk to each other. - + It's quite fortunate that I set things up like this the first time (cough cough) because it's really handy when I boot insight into DOS. There, it - runs the Novell ODI protocol stack, which only works with RFC1201 ARCnet. + runs the Novell ODI protocol stack, which only works with RFC1201 ARCnet. In this mode it would be impossible for insight to communicate directly with patience, since the Novell stack is incompatible with Microsoft's Ethernet-Encap. Without changing any settings on freedom or patience, I simply set freedom as the default gateway for insight (now in DOS, remember) and all the forwarding happens "automagically" between the two hosts that would normally not be able to communicate at all. - + For those who like diagrams, I have created two "virtual subnets" on the - same physical ARCnet wire. You can picture it like this: - - - [RFC1201 NETWORK] [ETHER-ENCAP NETWORK] + same physical ARCnet wire. 
You can picture it like this:: + + + [RFC1201 NETWORK] [ETHER-ENCAP NETWORK] (registered Internet subnet) (RFC1597 private subnet) - - (IP Masquerade) - /---------------\ * /---------------\ - | | * | | - | +-Freedom-*-Gatekeeper-+ | - | | | * | | - \-------+-------/ | * \-------+-------/ - | | | - Insight | Patience - (Internet) + + (IP Masquerade) + /---------------\ * /---------------\ + | | * | | + | +-Freedom-*-Gatekeeper-+ | + | | | * | | + \-------+-------/ | * \-------+-------/ + | | | + Insight | Patience + (Internet) @@ -491,6 +526,7 @@ It works: what now? Send mail describing your setup, preferably including driver version, kernel version, ARCnet card model, CPU type, number of systems on your network, and list of software in use to me at the following address: + apenwarr@worldvisions.ca I do send (sometimes automated) replies to all messages I receive. My email @@ -525,7 +561,7 @@ this, you should grab the pertinent RFCs. (some are listed near the top of arcnet.c). arcdump assumes your card is at 0xD0000. If it isn't, edit the script. -Buffers 0 and 1 are used for receiving, and Buffers 2 and 3 are for sending. +Buffers 0 and 1 are used for receiving, and Buffers 2 and 3 are for sending. Ping-pong buffers are implemented both ways. If your debug level includes D_DURING and you did NOT define SLOW_XMIT_COPY, @@ -535,9 +571,11 @@ decides that the driver is broken). During a transmit, unused parts of the buffer will be cleared to 0x42 as well. This is to make it easier to figure out which bytes are being used by a packet. -You can change the debug level without recompiling the kernel by typing: +You can change the debug level without recompiling the kernel by typing:: + ifconfig arc0 down metric 1xxx /etc/rc.d/rc.inet1 + where "xxx" is the debug level you want. For example, "metric 1015" would put you at debug level 15. Debug level 7 is currently the default. 
@@ -546,7 +584,7 @@ combination of different debug flags; so debug level 7 is really 1+2+4 or D_NORMAL+D_EXTRA+D_INIT. To include D_DURING, you would add 16 to this, resulting in debug level 23. -If you don't understand that, you probably don't want to know anyway. +If you don't understand that, you probably don't want to know anyway. E-mail me about your problem. diff --git a/Documentation/networking/atm.txt b/Documentation/networking/atm.rst index 82921cee77fe..c1df8c038525 100644 --- a/Documentation/networking/atm.txt +++ b/Documentation/networking/atm.rst @@ -1,3 +1,9 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=== +ATM +=== + In order to use anything but the most primitive functions of ATM, several user-mode programs are required to assist the kernel. These programs and related material can be found via the ATM on Linux Web diff --git a/Documentation/networking/ax25.txt b/Documentation/networking/ax25.rst index 8257dbf9be57..824afd7002db 100644 --- a/Documentation/networking/ax25.txt +++ b/Documentation/networking/ax25.rst @@ -1,3 +1,9 @@ +.. SPDX-License-Identifier: GPL-2.0 + +===== +AX.25 +===== + To use the amateur radio protocols within Linux you will need to get a suitable copy of the AX.25 Utilities. More detailed information about AX.25, NET/ROM and ROSE, associated programs and utilities can be diff --git a/Documentation/networking/baycom.txt b/Documentation/networking/baycom.rst index 688f18fd4467..fe2d010f0e86 100644 --- a/Documentation/networking/baycom.txt +++ b/Documentation/networking/baycom.rst @@ -1,26 +1,31 @@ - LINUX DRIVERS FOR BAYCOM MODEMS +.. SPDX-License-Identifier: GPL-2.0 - Thomas M. Sailer, HB9JNX/AE4WA, <sailer@ife.ee.ethz.ch> +=============================== +Linux Drivers for Baycom Modems +=============================== -!!NEW!! (04/98) The drivers for the baycom modems have been split into +Thomas M.
Sailer, HB9JNX/AE4WA, <sailer@ife.ee.ethz.ch> + +The drivers for the baycom modems have been split into separate drivers as they did not share any code, and the driver and device names have changed. This document describes the Linux Kernel Drivers for simple Baycom style -amateur radio modems. +amateur radio modems. The following drivers are available: +==================================== baycom_ser_fdx: This driver supports the SER12 modems either full or half duplex. - Its baud rate may be changed via the `baud' module parameter, + Its baud rate may be changed via the ``baud`` module parameter, therefore it supports just about every bit bang modem on a serial port. Its devices are called bcsf0 through bcsf3. This is the recommended driver for SER12 type modems, however if you have a broken UART clone that does not have working - delta status bits, you may try baycom_ser_hdx. + delta status bits, you may try baycom_ser_hdx. -baycom_ser_hdx: +baycom_ser_hdx: This is an alternative driver for SER12 type modems. It only supports half duplex, and only 1200 baud. Its devices are called bcsh0 through bcsh3. Use this driver only if baycom_ser_fdx @@ -37,45 +42,48 @@ baycom_epp: The following modems are supported: -ser12: This is a very simple 1200 baud AFSK modem. The modem consists only - of a modulator/demodulator chip, usually a TI TCM3105. The computer - is responsible for regenerating the receiver bit clock, as well as - for handling the HDLC protocol. The modem connects to a serial port, - hence the name. Since the serial port is not used as an async serial - port, the kernel driver for serial ports cannot be used, and this - driver only supports standard serial hardware (8250, 16450, 16550) - -par96: This is a modem for 9600 baud FSK compatible to the G3RUH standard. - The modem does all the filtering and regenerates the receiver clock. - Data is transferred from and to the PC via a shift register. 
- The shift register is filled with 16 bits and an interrupt is signalled. - The PC then empties the shift register in a burst. This modem connects - to the parallel port, hence the name. The modem leaves the - implementation of the HDLC protocol and the scrambler polynomial to - the PC. - -picpar: This is a redesign of the par96 modem by Henning Rech, DF9IC. The modem - is protocol compatible to par96, but uses only three low power ICs - and can therefore be fed from the parallel port and does not require - an additional power supply. Furthermore, it incorporates a carrier - detect circuitry. - -EPP: This is a high-speed modem adaptor that connects to an enhanced parallel port. - Its target audience is users working over a high speed hub (76.8kbit/s). - -eppfpga: This is a redesign of the EPP adaptor. - - +======= ======================================================================== +ser12 This is a very simple 1200 baud AFSK modem. The modem consists only + of a modulator/demodulator chip, usually a TI TCM3105. The computer + is responsible for regenerating the receiver bit clock, as well as + for handling the HDLC protocol. The modem connects to a serial port, + hence the name. Since the serial port is not used as an async serial + port, the kernel driver for serial ports cannot be used, and this + driver only supports standard serial hardware (8250, 16450, 16550) + +par96 This is a modem for 9600 baud FSK compatible to the G3RUH standard. + The modem does all the filtering and regenerates the receiver clock. + Data is transferred from and to the PC via a shift register. + The shift register is filled with 16 bits and an interrupt is signalled. + The PC then empties the shift register in a burst. This modem connects + to the parallel port, hence the name. The modem leaves the + implementation of the HDLC protocol and the scrambler polynomial to + the PC. + +picpar This is a redesign of the par96 modem by Henning Rech, DF9IC. 
The modem + is protocol compatible to par96, but uses only three low power ICs + and can therefore be fed from the parallel port and does not require + an additional power supply. Furthermore, it incorporates a carrier + detect circuitry. + +EPP This is a high-speed modem adaptor that connects to an enhanced parallel + port. + + Its target audience is users working over a high speed hub (76.8kbit/s). + +eppfpga This is a redesign of the EPP adaptor. +======= ======================================================================== All of the above modems only support half duplex communications. However, the driver supports the KISS (see below) fullduplex command. It then simply starts to send as soon as there's a packet to transmit and does not care about DCD, i.e. it starts to send even if there's someone else on the channel. -This command is required by some implementations of the DAMA channel +This command is required by some implementations of the DAMA channel access protocol. The Interface of the drivers +============================ Unlike previous drivers, these drivers are no longer character devices, but they are now true kernel network interfaces. Installation is therefore @@ -88,20 +96,22 @@ me for WAMPES which allows attaching a kernel network interface directly. Configuring the driver +====================== Every time a driver is inserted into the kernel, it has to know which modems it should access at which ports. This can be done with the setbaycom utility. If you are only using one modem, you can also configure the driver from the insmod command line (or by means of an option line in -/etc/modprobe.d/*.conf). +``/etc/modprobe.d/*.conf``). + +Examples:: -Examples: modprobe baycom_ser_fdx mode="ser12*" iobase=0x3f8 irq=4 sethdlc -i bcsf0 -p mode "ser12*" io 0x3f8 irq 4 Both lines configure the first port to drive a ser12 modem at the first -serial port (COM1 under DOS). 
The * in the mode parameter instructs the driver to use -the software DCD algorithm (see below). +serial port (COM1 under DOS). The * in the mode parameter instructs the driver +to use the software DCD algorithm (see below):: insmod baycom_par mode="picpar" iobase=0x378 sethdlc -i bcp0 -p mode "picpar" io 0x378 @@ -115,29 +125,33 @@ Note that both utilities interpret the values slightly differently. Hardware DCD versus Software DCD +================================ To avoid collisions on the air, the driver must know when the channel is busy. This is the task of the DCD circuitry/software. The driver may either utilise a software DCD algorithm (options=1) or use a DCD signal from the hardware (options=0). -ser12: if software DCD is utilised, the radio's squelch should always be - open. It is highly recommended to use the software DCD algorithm, - as it is much faster than most hardware squelch circuitry. The - disadvantage is a slightly higher load on the system. +======= ================================================================= +ser12 if software DCD is utilised, the radio's squelch should always be + open. It is highly recommended to use the software DCD algorithm, + as it is much faster than most hardware squelch circuitry. The + disadvantage is a slightly higher load on the system. -par96: the software DCD algorithm for this type of modem is rather poor. - The modem simply does not provide enough information to implement - a reasonable DCD algorithm in software. Therefore, if your radio - feeds the DCD input of the PAR96 modem, the use of the hardware - DCD circuitry is recommended. +par96 the software DCD algorithm for this type of modem is rather poor. + The modem simply does not provide enough information to implement + a reasonable DCD algorithm in software. Therefore, if your radio + feeds the DCD input of the PAR96 modem, the use of the hardware + DCD circuitry is recommended. 
-picpar: the picpar modem features a builtin DCD hardware, which is highly - recommended. +picpar the picpar modem features a builtin DCD hardware, which is highly + recommended. +======= ================================================================= Compatibility with the rest of the Linux kernel +=============================================== The serial driver and the baycom serial drivers compete for the same hardware resources. Of course only one driver can access a given @@ -154,5 +168,7 @@ The parallel port drivers (baycom_par, baycom_epp) now use the parport subsystem to arbitrate the ports between different client drivers. vy 73s de + Tom Sailer, sailer@ife.ee.ethz.ch + hb9jnx @ hb9w.ampr.org diff --git a/Documentation/networking/bonding.txt b/Documentation/networking/bonding.rst index e3abfbd32f71..dd49f95d28d3 100644 --- a/Documentation/networking/bonding.txt +++ b/Documentation/networking/bonding.rst @@ -1,10 +1,15 @@ +.. SPDX-License-Identifier: GPL-2.0 - Linux Ethernet Bonding Driver HOWTO +=================================== +Linux Ethernet Bonding Driver HOWTO +=================================== - Latest update: 27 April 2011 +Latest update: 27 April 2011 + +Initial release: Thomas Davis <tadavis at lbl.gov> + +Corrections, HA extensions: 2000/10/03-15: -Initial release : Thomas Davis <tadavis at lbl.gov> -Corrections, HA extensions : 2000/10/03-15 : - Willy Tarreau <willy at meta-x.org> - Constantine Gavrilov <const-g at xpert.com> - Chad N. Tindel <ctindel at ieee dot org> @@ -13,98 +18,98 @@ Corrections, HA extensions : 2000/10/03-15 : Reorganized and updated Feb 2005 by Jay Vosburgh Added Sysfs information: 2006/04/24 + - Mitch Williams <mitch.a.williams at intel.com> Introduction ============ - The Linux bonding driver provides a method for aggregating +The Linux bonding driver provides a method for aggregating multiple network interfaces into a single logical "bonded" interface. 
The behavior of the bonded interfaces depends upon the mode; generally speaking, modes provide either hot standby or load balancing services. Additionally, link integrity monitoring may be performed. - - The bonding driver originally came from Donald Becker's + +The bonding driver originally came from Donald Becker's beowulf patches for kernel 2.0. It has changed quite a bit since, and the original tools from extreme-linux and beowulf sites will not work with this version of the driver. - For new versions of the driver, updated userspace tools, and +For new versions of the driver, updated userspace tools, and who to ask for help, please follow the links at the end of this file. -Table of Contents -================= +.. Table of Contents -1. Bonding Driver Installation + 1. Bonding Driver Installation -2. Bonding Driver Options + 2. Bonding Driver Options -3. Configuring Bonding Devices -3.1 Configuration with Sysconfig Support -3.1.1 Using DHCP with Sysconfig -3.1.2 Configuring Multiple Bonds with Sysconfig -3.2 Configuration with Initscripts Support -3.2.1 Using DHCP with Initscripts -3.2.2 Configuring Multiple Bonds with Initscripts -3.3 Configuring Bonding Manually with Ifenslave -3.3.1 Configuring Multiple Bonds Manually -3.4 Configuring Bonding Manually via Sysfs -3.5 Configuration with Interfaces Support -3.6 Overriding Configuration for Special Cases -3.7 Configuring LACP for 802.3ad mode in a more secure way + 3. 
Configuring Bonding Devices + 3.1 Configuration with Sysconfig Support + 3.1.1 Using DHCP with Sysconfig + 3.1.2 Configuring Multiple Bonds with Sysconfig + 3.2 Configuration with Initscripts Support + 3.2.1 Using DHCP with Initscripts + 3.2.2 Configuring Multiple Bonds with Initscripts + 3.3 Configuring Bonding Manually with Ifenslave + 3.3.1 Configuring Multiple Bonds Manually + 3.4 Configuring Bonding Manually via Sysfs + 3.5 Configuration with Interfaces Support + 3.6 Overriding Configuration for Special Cases + 3.7 Configuring LACP for 802.3ad mode in a more secure way -4. Querying Bonding Configuration -4.1 Bonding Configuration -4.2 Network Configuration + 4. Querying Bonding Configuration + 4.1 Bonding Configuration + 4.2 Network Configuration -5. Switch Configuration + 5. Switch Configuration -6. 802.1q VLAN Support + 6. 802.1q VLAN Support -7. Link Monitoring -7.1 ARP Monitor Operation -7.2 Configuring Multiple ARP Targets -7.3 MII Monitor Operation + 7. Link Monitoring + 7.1 ARP Monitor Operation + 7.2 Configuring Multiple ARP Targets + 7.3 MII Monitor Operation -8. Potential Trouble Sources -8.1 Adventures in Routing -8.2 Ethernet Device Renaming -8.3 Painfully Slow Or No Failed Link Detection By Miimon + 8. Potential Trouble Sources + 8.1 Adventures in Routing + 8.2 Ethernet Device Renaming + 8.3 Painfully Slow Or No Failed Link Detection By Miimon -9. SNMP agents + 9. SNMP agents -10. Promiscuous mode + 10. Promiscuous mode -11. Configuring Bonding for High Availability -11.1 High Availability in a Single Switch Topology -11.2 High Availability in a Multiple Switch Topology -11.2.1 HA Bonding Mode Selection for Multiple Switch Topology -11.2.2 HA Link Monitoring for Multiple Switch Topology + 11. 
Configuring Bonding for High Availability + 11.1 High Availability in a Single Switch Topology + 11.2 High Availability in a Multiple Switch Topology + 11.2.1 HA Bonding Mode Selection for Multiple Switch Topology + 11.2.2 HA Link Monitoring for Multiple Switch Topology -12. Configuring Bonding for Maximum Throughput -12.1 Maximum Throughput in a Single Switch Topology -12.1.1 MT Bonding Mode Selection for Single Switch Topology -12.1.2 MT Link Monitoring for Single Switch Topology -12.2 Maximum Throughput in a Multiple Switch Topology -12.2.1 MT Bonding Mode Selection for Multiple Switch Topology -12.2.2 MT Link Monitoring for Multiple Switch Topology + 12. Configuring Bonding for Maximum Throughput + 12.1 Maximum Throughput in a Single Switch Topology + 12.1.1 MT Bonding Mode Selection for Single Switch Topology + 12.1.2 MT Link Monitoring for Single Switch Topology + 12.2 Maximum Throughput in a Multiple Switch Topology + 12.2.1 MT Bonding Mode Selection for Multiple Switch Topology + 12.2.2 MT Link Monitoring for Multiple Switch Topology -13. Switch Behavior Issues -13.1 Link Establishment and Failover Delays -13.2 Duplicated Incoming Packets + 13. Switch Behavior Issues + 13.1 Link Establishment and Failover Delays + 13.2 Duplicated Incoming Packets -14. Hardware Specific Considerations -14.1 IBM BladeCenter + 14. Hardware Specific Considerations + 14.1 IBM BladeCenter -15. Frequently Asked Questions + 15. Frequently Asked Questions -16. Resources and Links + 16. Resources and Links 1. Bonding Driver Installation ============================== - Most popular distro kernels ship with the bonding driver +Most popular distro kernels ship with the bonding driver already available as a module. 
If your distro does not, or you have need to compile bonding from source (e.g., configuring and installing a mainline kernel from kernel.org), you'll need to perform @@ -113,54 +118,54 @@ the following steps: 1.1 Configure and build the kernel with bonding ----------------------------------------------- - The current version of the bonding driver is available in the +The current version of the bonding driver is available in the drivers/net/bonding subdirectory of the most recent kernel source (which is available on http://kernel.org). Most users "rolling their own" will want to use the most recent kernel from kernel.org. - Configure kernel with "make menuconfig" (or "make xconfig" or +Configure kernel with "make menuconfig" (or "make xconfig" or "make config"), then select "Bonding driver support" in the "Network device support" section. It is recommended that you configure the driver as module since it is currently the only way to pass parameters to the driver or configure more than one bonding device. - Build and install the new kernel and modules. +Build and install the new kernel and modules. 1.2 Bonding Control Utility -------------------------------------- +--------------------------- - It is recommended to configure bonding via iproute2 (netlink) +It is recommended to configure bonding via iproute2 (netlink) or sysfs, the old ifenslave control utility is obsolete. 2. Bonding Driver Options ========================= - Options for the bonding driver are supplied as parameters to the +Options for the bonding driver are supplied as parameters to the bonding module at load time, or are specified via sysfs. 
- Module options may be given as command line arguments to the +Module options may be given as command line arguments to the insmod or modprobe command, but are usually specified in either the -/etc/modprobe.d/*.conf configuration files, or in a distro-specific +``/etc/modprobe.d/*.conf`` configuration files, or in a distro-specific configuration file (some of which are detailed in the next section). - Details on bonding support for sysfs is provided in the +Details on bonding support for sysfs are provided in the "Configuring Bonding Manually via Sysfs" section, below. - The available bonding driver parameters are listed below. If a +The available bonding driver parameters are listed below. If a parameter is not specified the default value is used. When initially configuring a bond, it is recommended "tail -f /var/log/messages" be run in a separate window to watch for bonding driver error messages. - It is critical that either the miimon or arp_interval and +It is critical that either the miimon or arp_interval and arp_ip_target parameters be specified, otherwise serious network degradation will occur during link failures. Very few devices do not support at least miimon, so there is really no reason not to use it. - Options with textual values will accept either the text name +Options with textual values will accept either the text name or, for backwards compatibility, the option value. E.g., "mode=802.3ad" and "mode=4" set the same mode. - The parameters are as follows: +The parameters are as follows: active_slave @@ -246,10 +251,13 @@ ad_user_port_key In an AD system, the port-key has three parts as shown below - + ===== ============ Bits Use + ===== ============ 00 Duplex 01-05 Speed 06-15 User-defined + ===== ============ This defines the upper 10 bits of the port key. The values can be from 0 - 1023. If not given, the system defaults to 0. @@ -699,7 +707,7 @@ mode swapped with the new curr_active_slave that was chosen.
-num_grat_arp +num_grat_arp, num_unsol_na Specify the number of peer notifications (gratuitous ARPs and @@ -729,13 +737,13 @@ packets_per_slave peer_notif_delay - Specify the delay, in milliseconds, between each peer - notification (gratuitous ARP and unsolicited IPv6 Neighbor - Advertisement) when they are issued after a failover event. - This delay should be a multiple of the link monitor interval - (arp_interval or miimon, whichever is active). The default - value is 0 which means to match the value of the link monitor - interval. + Specify the delay, in milliseconds, between each peer + notification (gratuitous ARP and unsolicited IPv6 Neighbor + Advertisement) when they are issued after a failover event. + This delay should be a multiple of the link monitor interval + (arp_interval or miimon, whichever is active). The default + value is 0 which means to match the value of the link monitor + interval. primary @@ -977,88 +985,88 @@ lp_interval 3. Configuring Bonding Devices ============================== - You can configure bonding using either your distro's network +You can configure bonding using either your distro's network initialization scripts, or manually using either iproute2 or the sysfs interface. Distros generally use one of three packages for the network initialization scripts: initscripts, sysconfig or interfaces. Recent versions of these packages have support for bonding, while older versions do not. - We will first describe the options for configuring bonding for +We will first describe the options for configuring bonding for distros using versions of initscripts, sysconfig and interfaces with full or partial support for bonding, then provide information on enabling bonding without support from the network initialization scripts (i.e., older versions of initscripts or sysconfig). 
- If you're unsure whether your distro uses sysconfig, +If you're unsure whether your distro uses sysconfig, initscripts or interfaces, or don't know if it's new enough, have no fear. Determining this is fairly straightforward. - First, look for a file called interfaces in /etc/network directory. +First, look for a file called interfaces in /etc/network directory. If this file is present in your system, then your system use interfaces. See Configuration with Interfaces Support. - Else, issue the command: +Else, issue the command:: -$ rpm -qf /sbin/ifup + $ rpm -qf /sbin/ifup - It will respond with a line of text starting with either +It will respond with a line of text starting with either "initscripts" or "sysconfig," followed by some numbers. This is the package that provides your network initialization scripts. - Next, to determine if your installation supports bonding, -issue the command: +Next, to determine if your installation supports bonding, +issue the command:: -$ grep ifenslave /sbin/ifup + $ grep ifenslave /sbin/ifup - If this returns any matches, then your initscripts or +If this returns any matches, then your initscripts or sysconfig has support for bonding. 3.1 Configuration with Sysconfig Support ---------------------------------------- - This section applies to distros using a version of sysconfig +This section applies to distros using a version of sysconfig with bonding support, for example, SuSE Linux Enterprise Server 9. - SuSE SLES 9's networking configuration system does support +SuSE SLES 9's networking configuration system does support bonding, however, at this writing, the YaST system configuration front end does not provide any means to work with bonding devices. Bonding devices can be managed by hand, however, as follows. - First, if they have not already been configured, configure the +First, if they have not already been configured, configure the slave devices. 
On SLES 9, this is most easily done by running the yast2 sysconfig configuration utility. The goal is for to create an ifcfg-id file for each slave device. The simplest way to accomplish this is to configure the devices for DHCP (this is only to get the file ifcfg-id file created; see below for some issues with DHCP). The -name of the configuration file for each device will be of the form: +name of the configuration file for each device will be of the form:: -ifcfg-id-xx:xx:xx:xx:xx:xx + ifcfg-id-xx:xx:xx:xx:xx:xx - Where the "xx" portion will be replaced with the digits from +Where the "xx" portion will be replaced with the digits from the device's permanent MAC address. - Once the set of ifcfg-id-xx:xx:xx:xx:xx:xx files has been +Once the set of ifcfg-id-xx:xx:xx:xx:xx:xx files has been created, it is necessary to edit the configuration files for the slave devices (the MAC addresses correspond to those of the slave devices). Before editing, the file will contain multiple lines, and will look -something like this: +something like this:: -BOOTPROTO='dhcp' -STARTMODE='on' -USERCTL='no' -UNIQUE='XNzu.WeZGOGF+4wE' -_nm_name='bus-pci-0001:61:01.0' + BOOTPROTO='dhcp' + STARTMODE='on' + USERCTL='no' + UNIQUE='XNzu.WeZGOGF+4wE' + _nm_name='bus-pci-0001:61:01.0' - Change the BOOTPROTO and STARTMODE lines to the following: +Change the BOOTPROTO and STARTMODE lines to the following:: -BOOTPROTO='none' -STARTMODE='off' + BOOTPROTO='none' + STARTMODE='off' - Do not alter the UNIQUE or _nm_name lines. Remove any other +Do not alter the UNIQUE or _nm_name lines. Remove any other lines (USERCTL, etc). - Once the ifcfg-id-xx:xx:xx:xx:xx:xx files have been modified, +Once the ifcfg-id-xx:xx:xx:xx:xx:xx files have been modified, it's time to create the configuration file for the bonding device itself. This file is named ifcfg-bondX, where X is the number of the bonding device to create, starting at 0. 
The first such file is @@ -1066,49 +1074,52 @@ ifcfg-bond0, the second is ifcfg-bond1, and so on. The sysconfig network configuration system will correctly start multiple instances of bonding. - The contents of the ifcfg-bondX file is as follows: - -BOOTPROTO="static" -BROADCAST="10.0.2.255" -IPADDR="10.0.2.10" -NETMASK="255.255.0.0" -NETWORK="10.0.2.0" -REMOTE_IPADDR="" -STARTMODE="onboot" -BONDING_MASTER="yes" -BONDING_MODULE_OPTS="mode=active-backup miimon=100" -BONDING_SLAVE0="eth0" -BONDING_SLAVE1="bus-pci-0000:06:08.1" - - Replace the sample BROADCAST, IPADDR, NETMASK and NETWORK +The contents of the ifcfg-bondX file are as follows:: + + BOOTPROTO="static" + BROADCAST="10.0.2.255" + IPADDR="10.0.2.10" + NETMASK="255.255.0.0" + NETWORK="10.0.2.0" + REMOTE_IPADDR="" + STARTMODE="onboot" + BONDING_MASTER="yes" + BONDING_MODULE_OPTS="mode=active-backup miimon=100" + BONDING_SLAVE0="eth0" + BONDING_SLAVE1="bus-pci-0000:06:08.1" + +Replace the sample BROADCAST, IPADDR, NETMASK and NETWORK values with the appropriate values for your network. - The STARTMODE specifies when the device is brought online. +The STARTMODE specifies when the device is brought online. The possible values are: - onboot: The device is started at boot time. If you're not + ======== ====================================================== + onboot The device is started at boot time. If you're not sure, this is probably what you want. - manual: The device is started only when ifup is called + manual The device is started only when ifup is called manually. Bonding devices may be configured this way if you do not wish them to start automatically at boot for some reason. - hotplug: The device is started by a hotplug event. This is not + hotplug The device is started by a hotplug event. This is not a valid choice for a bonding device. - off or ignore: The device configuration is ignored. + off or The device configuration is ignored.
+ ignore + ======== ====================================================== - The line BONDING_MASTER='yes' indicates that the device is a +The line BONDING_MASTER='yes' indicates that the device is a bonding master device. The only useful value is "yes." - The contents of BONDING_MODULE_OPTS are supplied to the +The contents of BONDING_MODULE_OPTS are supplied to the instance of the bonding module for this device. Specify the options for the bonding mode, link monitoring, and so on here. Do not include the max_bonds bonding parameter; this will confuse the configuration system if you have multiple bonding devices. - Finally, supply one BONDING_SLAVEn="slave device" for each +Finally, supply one BONDING_SLAVEn="slave device" for each slave. where "n" is an increasing value, one for each slave. The "slave device" is either an interface name, e.g., "eth0", or a device specifier for the network device. The interface name is easier to @@ -1120,34 +1131,34 @@ changes (for example, it is moved from one PCI slot to another). The example above uses one of each type for demonstration purposes; most configurations will choose one or the other for all slave devices. - When all configuration files have been modified or created, +When all configuration files have been modified or created, networking must be restarted for the configuration changes to take -effect. This can be accomplished via the following: +effect. This can be accomplished via the following:: -# /etc/init.d/network restart + # /etc/init.d/network restart - Note that the network control script (/sbin/ifdown) will +Note that the network control script (/sbin/ifdown) will remove the bonding module as part of the network shutdown processing, so it is not necessary to remove the module by hand if, e.g., the module parameters have changed. 
- Also, at this writing, YaST/YaST2 will not manage bonding +Also, at this writing, YaST/YaST2 will not manage bonding devices (they do not show bonding interfaces on its list of network devices). It is necessary to edit the configuration file by hand to change the bonding configuration. - Additional general options and details of the ifcfg file -format can be found in an example ifcfg template file: +Additional general options and details of the ifcfg file +format can be found in an example ifcfg template file:: -/etc/sysconfig/network/ifcfg.template + /etc/sysconfig/network/ifcfg.template - Note that the template does not document the various BONDING_ +Note that the template does not document the various ``BONDING_*`` settings described above, but does describe many of the other options. 3.1.1 Using DHCP with Sysconfig ------------------------------- - Under sysconfig, configuring a device with BOOTPROTO='dhcp' +Under sysconfig, configuring a device with BOOTPROTO='dhcp' will cause it to query DHCP for its IP address information. At this writing, this does not function for bonding devices; the scripts attempt to obtain the device address from DHCP prior to adding any of @@ -1157,7 +1168,7 @@ sent to the network. 3.1.2 Configuring Multiple Bonds with Sysconfig ----------------------------------------------- - The sysconfig network initialization system is capable of +The sysconfig network initialization system is capable of handling multiple bonding devices. All that is necessary is for each bonding instance to have an appropriately configured ifcfg-bondX file (as described above). Do not specify the "max_bonds" parameter to any @@ -1165,14 +1176,14 @@ instance of bonding, as this will confuse sysconfig. If you require multiple bonding devices with identical parameters, create multiple ifcfg-bondX files. 
- Because the sysconfig scripts supply the bonding module +Because the sysconfig scripts supply the bonding module options in the ifcfg-bondX file, it is not necessary to add them to -the system /etc/modules.d/*.conf configuration files. +the system ``/etc/modules.d/*.conf`` configuration files. 3.2 Configuration with Initscripts Support ------------------------------------------ - This section applies to distros using a recent version of +This section applies to distros using a recent version of initscripts with bonding support, for example, Red Hat Enterprise Linux version 3 or later, Fedora, etc. On these systems, the network initialization scripts have knowledge of bonding, and can be configured to @@ -1180,7 +1191,7 @@ control bonding devices. Note that older versions of the initscripts package have lower levels of support for bonding; this will be noted where applicable. - These distros will not automatically load the network adapter +These distros will not automatically load the network adapter driver unless the ethX device is configured with an IP address. Because of this constraint, users must manually configure a network-script file for all physical adapters that will be members of @@ -1188,19 +1199,19 @@ a bondX link. Network script files are located in the directory: /etc/sysconfig/network-scripts - The file name must be prefixed with "ifcfg-eth" and suffixed +The file name must be prefixed with "ifcfg-eth" and suffixed with the adapter's physical adapter number. For example, the script for eth0 would be named /etc/sysconfig/network-scripts/ifcfg-eth0. 
-Place the following text in the file: +Place the following text in the file:: -DEVICE=eth0 -USERCTL=no -ONBOOT=yes -MASTER=bond0 -SLAVE=yes -BOOTPROTO=none + DEVICE=eth0 + USERCTL=no + ONBOOT=yes + MASTER=bond0 + SLAVE=yes + BOOTPROTO=none - The DEVICE= line will be different for every ethX device and +The DEVICE= line will be different for every ethX device and must correspond with the name of the file, i.e., ifcfg-eth1 must have a device line of DEVICE=eth1. The setting of the MASTER= line will also depend on the final bonding interface name chosen for your bond. @@ -1208,69 +1219,70 @@ As with other network devices, these typically start at 0, and go up one for each device, i.e., the first bonding instance is bond0, the second is bond1, and so on. - Next, create a bond network script. The file name for this +Next, create a bond network script. The file name for this script will be /etc/sysconfig/network-scripts/ifcfg-bondX where X is the number of the bond. For bond0 the file is named "ifcfg-bond0", for bond1 it is named "ifcfg-bond1", and so on. Within that file, -place the following text: - -DEVICE=bond0 -IPADDR=192.168.1.1 -NETMASK=255.255.255.0 -NETWORK=192.168.1.0 -BROADCAST=192.168.1.255 -ONBOOT=yes -BOOTPROTO=none -USERCTL=no - - Be sure to change the networking specific lines (IPADDR, +place the following text:: + + DEVICE=bond0 + IPADDR=192.168.1.1 + NETMASK=255.255.255.0 + NETWORK=192.168.1.0 + BROADCAST=192.168.1.255 + ONBOOT=yes + BOOTPROTO=none + USERCTL=no + +Be sure to change the networking specific lines (IPADDR, NETMASK, NETWORK and BROADCAST) to match your network configuration. - For later versions of initscripts, such as that found with Fedora +For later versions of initscripts, such as that found with Fedora 7 (or later) and Red Hat Enterprise Linux version 5 (or later), it is possible, and, indeed, preferable, to specify the bonding options in the ifcfg-bond0 -file, e.g. a line of the format: +file, e.g. 
a line of the format:: -BONDING_OPTS="mode=active-backup arp_interval=60 arp_ip_target=192.168.1.254" + BONDING_OPTS="mode=active-backup arp_interval=60 arp_ip_target=192.168.1.254" - will configure the bond with the specified options. The options +will configure the bond with the specified options. The options specified in BONDING_OPTS are identical to the bonding module parameters except for the arp_ip_target field when using versions of initscripts older than and 8.57 (Fedora 8) and 8.45.19 (Red Hat Enterprise Linux 5.2). When using older versions each target should be included as a separate option and should be preceded by a '+' to indicate it should be added to the list of -queried targets, e.g., +queried targets, e.g.,:: - arp_ip_target=+192.168.1.1 arp_ip_target=+192.168.1.2 + arp_ip_target=+192.168.1.1 arp_ip_target=+192.168.1.2 - is the proper syntax to specify multiple targets. When specifying -options via BONDING_OPTS, it is not necessary to edit /etc/modprobe.d/*.conf. +is the proper syntax to specify multiple targets. When specifying +options via BONDING_OPTS, it is not necessary to edit +``/etc/modprobe.d/*.conf``. - For even older versions of initscripts that do not support +For even older versions of initscripts that do not support BONDING_OPTS, it is necessary to edit /etc/modprobe.d/*.conf, depending upon your distro) to load the bonding module with your desired options when the bond0 interface is brought up. The following lines in /etc/modprobe.d/*.conf will load the bonding module, and select its options: -alias bond0 bonding -options bond0 mode=balance-alb miimon=100 + alias bond0 bonding + options bond0 mode=balance-alb miimon=100 - Replace the sample parameters with the appropriate set of +Replace the sample parameters with the appropriate set of options for your configuration. - Finally run "/etc/rc.d/init.d/network restart" as root. This +Finally run "/etc/rc.d/init.d/network restart" as root. 
This will restart the networking subsystem and your bond link should be now up and running. 3.2.1 Using DHCP with Initscripts --------------------------------- - Recent versions of initscripts (the versions supplied with Fedora +Recent versions of initscripts (the versions supplied with Fedora Core 3 and Red Hat Enterprise Linux 4, or later versions, are reported to work) have support for assigning IP information to bonding devices via DHCP. - To configure bonding for DHCP, configure it as described +To configure bonding for DHCP, configure it as described above, except replace the line "BOOTPROTO=none" with "BOOTPROTO=dhcp" and add a line consisting of "TYPE=Bonding". Note that the TYPE value is case sensitive. @@ -1278,7 +1290,7 @@ is case sensitive. 3.2.2 Configuring Multiple Bonds with Initscripts ------------------------------------------------- - Initscripts packages that are included with Fedora 7 and Red Hat +Initscripts packages that are included with Fedora 7 and Red Hat Enterprise Linux 5 support multiple bonding interfaces by simply specifying the appropriate BONDING_OPTS= in ifcfg-bondX where X is the number of the bond. This support requires sysfs support in the kernel, @@ -1290,77 +1302,77 @@ below. 3.3 Configuring Bonding Manually with iproute2 ----------------------------------------------- - This section applies to distros whose network initialization +This section applies to distros whose network initialization scripts (the sysconfig or initscripts package) do not have specific knowledge of bonding. One such distro is SuSE Linux Enterprise Server version 8. - The general method for these systems is to place the bonding +The general method for these systems is to place the bonding module parameters into a config file in /etc/modprobe.d/ (as appropriate for the installed distro), then add modprobe and/or `ip link` commands to the system's global init script. 
The name of the global init script differs; for sysconfig, it is /etc/init.d/boot.local and for initscripts it is /etc/rc.d/rc.local. - For example, if you wanted to make a simple bond of two e100 +For example, if you wanted to make a simple bond of two e100 devices (presumed to be eth0 and eth1), and have it persist across reboots, edit the appropriate file (/etc/init.d/boot.local or -/etc/rc.d/rc.local), and add the following: +/etc/rc.d/rc.local), and add the following:: -modprobe bonding mode=balance-alb miimon=100 -modprobe e100 -ifconfig bond0 192.168.1.1 netmask 255.255.255.0 up -ip link set eth0 master bond0 -ip link set eth1 master bond0 + modprobe bonding mode=balance-alb miimon=100 + modprobe e100 + ifconfig bond0 192.168.1.1 netmask 255.255.255.0 up + ip link set eth0 master bond0 + ip link set eth1 master bond0 - Replace the example bonding module parameters and bond0 +Replace the example bonding module parameters and bond0 network configuration (IP address, netmask, etc) with the appropriate values for your configuration. - Unfortunately, this method will not provide support for the +Unfortunately, this method will not provide support for the ifup and ifdown scripts on the bond devices. To reload the bonding -configuration, it is necessary to run the initialization script, e.g., +configuration, it is necessary to run the initialization script, e.g.,:: -# /etc/init.d/boot.local + # /etc/init.d/boot.local - or +or:: -# /etc/rc.d/rc.local + # /etc/rc.d/rc.local - It may be desirable in such a case to create a separate script +It may be desirable in such a case to create a separate script which only initializes the bonding configuration, then call that separate script from within boot.local. This allows for bonding to be enabled without re-running the entire global init script. 
- To shut down the bonding devices, it is necessary to first +To shut down the bonding devices, it is necessary to first mark the bonding device itself as being down, then remove the appropriate device driver modules. For our example above, you can do -the following: +the following:: -# ifconfig bond0 down -# rmmod bonding -# rmmod e100 + # ifconfig bond0 down + # rmmod bonding + # rmmod e100 - Again, for convenience, it may be desirable to create a script +Again, for convenience, it may be desirable to create a script with these commands. 3.3.1 Configuring Multiple Bonds Manually ----------------------------------------- - This section contains information on configuring multiple +This section contains information on configuring multiple bonding devices with differing options for those systems whose network initialization scripts lack support for configuring multiple bonds. - If you require multiple bonding devices, but all with the same +If you require multiple bonding devices, but all with the same options, you may wish to use the "max_bonds" module parameter, documented above. - To create multiple bonding devices with differing options, it is +To create multiple bonding devices with differing options, it is preferable to use bonding parameters exported by sysfs, documented in the section below. - For versions of bonding without sysfs support, the only means to +For versions of bonding without sysfs support, the only means to provide multiple instances of bonding with differing options is to load the bonding driver multiple times. Note that current versions of the sysconfig network initialization scripts handle this automatically; if @@ -1368,35 +1380,35 @@ your distro uses these scripts, no special action is needed. See the section Configuring Bonding Devices, above, if you're not sure about your network initialization scripts. 
- To load multiple instances of the module, it is necessary to +To load multiple instances of the module, it is necessary to specify a different name for each instance (the module loading system requires that every loaded module, even multiple instances of the same module, have a unique name). This is accomplished by supplying multiple -sets of bonding options in /etc/modprobe.d/*.conf, for example: +sets of bonding options in ``/etc/modprobe.d/*.conf``, for example:: -alias bond0 bonding -options bond0 -o bond0 mode=balance-rr miimon=100 + alias bond0 bonding + options bond0 -o bond0 mode=balance-rr miimon=100 -alias bond1 bonding -options bond1 -o bond1 mode=balance-alb miimon=50 + alias bond1 bonding + options bond1 -o bond1 mode=balance-alb miimon=50 - will load the bonding module two times. The first instance is +will load the bonding module two times. The first instance is named "bond0" and creates the bond0 device in balance-rr mode with an miimon of 100. The second instance is named "bond1" and creates the bond1 device in balance-alb mode with an miimon of 50. - In some circumstances (typically with older distributions), +In some circumstances (typically with older distributions), the above does not work, and the second bonding instance never sees its options. In that case, the second options line can be substituted -as follows: +as follows:: -install bond1 /sbin/modprobe --ignore-install bonding -o bond1 \ - mode=balance-alb miimon=50 + install bond1 /sbin/modprobe --ignore-install bonding -o bond1 \ + mode=balance-alb miimon=50 - This may be repeated any number of times, specifying a new and +This may be repeated any number of times, specifying a new and unique name in place of bond1 for each subsequent instance. - It has been observed that some Red Hat supplied kernels are unable +It has been observed that some Red Hat supplied kernels are unable to rename modules at load time (the "-o bond1" part). 
Attempts to pass that option to modprobe will produce an "Operation not permitted" error. This has been reported on some Fedora Core kernels, and has been seen on @@ -1407,18 +1419,18 @@ kernels, and also lack sysfs support). 3.4 Configuring Bonding Manually via Sysfs ------------------------------------------ - Starting with version 3.0.0, Channel Bonding may be configured +Starting with version 3.0.0, Channel Bonding may be configured via the sysfs interface. This interface allows dynamic configuration of all bonds in the system without unloading the module. It also allows for adding and removing bonds at runtime. Ifenslave is no longer required, though it is still supported. - Use of the sysfs interface allows you to use multiple bonds +Use of the sysfs interface allows you to use multiple bonds with different configurations without having to reload the module. It also allows you to use multiple, differently configured bonds when bonding is compiled into the kernel. - You must have the sysfs filesystem mounted to configure +You must have the sysfs filesystem mounted to configure bonding this way. The examples in this document assume that you are using the standard mount point for sysfs, e.g. /sys. If your sysfs filesystem is mounted elsewhere, you will need to adjust the @@ -1426,38 +1438,45 @@ example paths accordingly. Creating and Destroying Bonds ----------------------------- -To add a new bond foo: -# echo +foo > /sys/class/net/bonding_masters +To add a new bond foo:: + + # echo +foo > /sys/class/net/bonding_masters + +To remove an existing bond bar:: -To remove an existing bond bar: -# echo -bar > /sys/class/net/bonding_masters + # echo -bar > /sys/class/net/bonding_masters -To show all existing bonds: -# cat /sys/class/net/bonding_masters +To show all existing bonds:: -NOTE: due to 4K size limitation of sysfs files, this list may be -truncated if you have more than a few hundred bonds. This is unlikely -to occur under normal operating conditions. 
+ # cat /sys/class/net/bonding_masters + +.. note:: + + due to 4K size limitation of sysfs files, this list may be + truncated if you have more than a few hundred bonds. This is unlikely + to occur under normal operating conditions. Adding and Removing Slaves -------------------------- - Interfaces may be enslaved to a bond using the file +Interfaces may be enslaved to a bond using the file /sys/class/net/<bond>/bonding/slaves. The semantics for this file are the same as for the bonding_masters file. -To enslave interface eth0 to bond bond0: -# ifconfig bond0 up -# echo +eth0 > /sys/class/net/bond0/bonding/slaves +To enslave interface eth0 to bond bond0:: + + # ifconfig bond0 up + # echo +eth0 > /sys/class/net/bond0/bonding/slaves -To free slave eth0 from bond bond0: -# echo -eth0 > /sys/class/net/bond0/bonding/slaves +To free slave eth0 from bond bond0:: - When an interface is enslaved to a bond, symlinks between the + # echo -eth0 > /sys/class/net/bond0/bonding/slaves + +When an interface is enslaved to a bond, symlinks between the two are created in the sysfs filesystem. In this case, you would get /sys/class/net/bond0/slave_eth0 pointing to /sys/class/net/eth0, and /sys/class/net/eth0/master pointing to /sys/class/net/bond0. - This means that you can tell quickly whether or not an +This means that you can tell quickly whether or not an interface is enslaved by looking for the master symlink. Thus: # echo -eth0 > /sys/class/net/eth0/master/bonding/slaves will free eth0 from whatever bond it is enslaved to, regardless of @@ -1465,127 +1484,143 @@ the name of the bond interface. 
Changing a Bond's Configuration ------------------------------- - Each bond may be configured individually by manipulating the +Each bond may be configured individually by manipulating the files located in /sys/class/net/<bond name>/bonding - The names of these files correspond directly with the command- +The names of these files correspond directly with the command- line parameters described elsewhere in this file, and, with the exception of arp_ip_target, they accept the same values. To see the current setting, simply cat the appropriate file. - A few examples will be given here; for specific usage +A few examples will be given here; for specific usage guidelines for each parameter, see the appropriate section in this document. -To configure bond0 for balance-alb mode: -# ifconfig bond0 down -# echo 6 > /sys/class/net/bond0/bonding/mode - - or - -# echo balance-alb > /sys/class/net/bond0/bonding/mode - NOTE: The bond interface must be down before the mode can be -changed. - -To enable MII monitoring on bond0 with a 1 second interval: -# echo 1000 > /sys/class/net/bond0/bonding/miimon - NOTE: If ARP monitoring is enabled, it will disabled when MII -monitoring is enabled, and vice-versa. - -To add ARP targets: -# echo +192.168.0.100 > /sys/class/net/bond0/bonding/arp_ip_target -# echo +192.168.0.101 > /sys/class/net/bond0/bonding/arp_ip_target - NOTE: up to 16 target addresses may be specified. - -To remove an ARP target: -# echo -192.168.0.100 > /sys/class/net/bond0/bonding/arp_ip_target - -To configure the interval between learning packet transmits: -# echo 12 > /sys/class/net/bond0/bonding/lp_interval - NOTE: the lp_interval is the number of seconds between instances where -the bonding driver sends learning packets to each slaves peer switch. The -default interval is 1 second. +To configure bond0 for balance-alb mode:: + + # ifconfig bond0 down + # echo 6 > /sys/class/net/bond0/bonding/mode + - or - + # echo balance-alb > /sys/class/net/bond0/bonding/mode + +.. 
note:: + + The bond interface must be down before the mode can be changed. + +To enable MII monitoring on bond0 with a 1 second interval:: + + # echo 1000 > /sys/class/net/bond0/bonding/miimon + +.. note:: + + If ARP monitoring is enabled, it will be disabled when MII + monitoring is enabled, and vice-versa. + +To add ARP targets:: + + # echo +192.168.0.100 > /sys/class/net/bond0/bonding/arp_ip_target + # echo +192.168.0.101 > /sys/class/net/bond0/bonding/arp_ip_target + +.. note:: + + up to 16 target addresses may be specified. + +To remove an ARP target:: + + # echo -192.168.0.100 > /sys/class/net/bond0/bonding/arp_ip_target + +To configure the interval between learning packet transmits:: + + # echo 12 > /sys/class/net/bond0/bonding/lp_interval + +.. note:: + + the lp_interval is the number of seconds between instances where + the bonding driver sends learning packets to each slave's peer switch. The + default interval is 1 second. Example Configuration --------------------- - We begin with the same example that is shown in section 3.3, +We begin with the same example that is shown in section 3.3, executed with sysfs, and without using ifenslave. 
- To make a simple bond of two e100 devices (presumed to be eth0 +To make a simple bond of two e100 devices (presumed to be eth0 and eth1), and have it persist across reboots, edit the appropriate file (/etc/init.d/boot.local or /etc/rc.d/rc.local), and add the -following: +following:: -modprobe bonding -modprobe e100 -echo balance-alb > /sys/class/net/bond0/bonding/mode -ifconfig bond0 192.168.1.1 netmask 255.255.255.0 up -echo 100 > /sys/class/net/bond0/bonding/miimon -echo +eth0 > /sys/class/net/bond0/bonding/slaves -echo +eth1 > /sys/class/net/bond0/bonding/slaves + modprobe bonding + modprobe e100 + echo balance-alb > /sys/class/net/bond0/bonding/mode + ifconfig bond0 192.168.1.1 netmask 255.255.255.0 up + echo 100 > /sys/class/net/bond0/bonding/miimon + echo +eth0 > /sys/class/net/bond0/bonding/slaves + echo +eth1 > /sys/class/net/bond0/bonding/slaves - To add a second bond, with two e1000 interfaces in +To add a second bond, with two e1000 interfaces in active-backup mode, using ARP monitoring, add the following lines to -your init script: +your init script:: -modprobe e1000 -echo +bond1 > /sys/class/net/bonding_masters -echo active-backup > /sys/class/net/bond1/bonding/mode -ifconfig bond1 192.168.2.1 netmask 255.255.255.0 up -echo +192.168.2.100 /sys/class/net/bond1/bonding/arp_ip_target -echo 2000 > /sys/class/net/bond1/bonding/arp_interval -echo +eth2 > /sys/class/net/bond1/bonding/slaves -echo +eth3 > /sys/class/net/bond1/bonding/slaves + modprobe e1000 + echo +bond1 > /sys/class/net/bonding_masters + echo active-backup > /sys/class/net/bond1/bonding/mode + ifconfig bond1 192.168.2.1 netmask 255.255.255.0 up + echo +192.168.2.100 > /sys/class/net/bond1/bonding/arp_ip_target + echo 2000 > /sys/class/net/bond1/bonding/arp_interval + echo +eth2 > /sys/class/net/bond1/bonding/slaves + echo +eth3 > /sys/class/net/bond1/bonding/slaves 3.5 Configuration with Interfaces Support ----------------------------------------- - This section applies to distros which use 
/etc/network/interfaces file +This section applies to distros which use /etc/network/interfaces file to describe network interface configuration, most notably Debian and its derivatives. - The ifup and ifdown commands on Debian don't support bonding out of +The ifup and ifdown commands on Debian don't support bonding out of the box. The ifenslave-2.6 package should be installed to provide bonding -support. Once installed, this package will provide bond-* options to be used -into /etc/network/interfaces. +support. Once installed, this package will provide ``bond-*`` options +to be used in /etc/network/interfaces. - Note that ifenslave-2.6 package will load the bonding module and use +Note that ifenslave-2.6 package will load the bonding module and use the ifenslave command when appropriate. Example Configurations ---------------------- In /etc/network/interfaces, the following stanza will configure bond0, in -active-backup mode, with eth0 and eth1 as slaves. +active-backup mode, with eth0 and eth1 as slaves:: -auto bond0 -iface bond0 inet dhcp - bond-slaves eth0 eth1 - bond-mode active-backup - bond-miimon 100 - bond-primary eth0 eth1 + auto bond0 + iface bond0 inet dhcp + bond-slaves eth0 eth1 + bond-mode active-backup + bond-miimon 100 + bond-primary eth0 eth1 If the above configuration doesn't work, you might have a system using upstart for system startup. This is most notably true for recent Ubuntu versions. The following stanza in /etc/network/interfaces will -produce the same result on those systems. 
- -auto bond0 -iface bond0 inet dhcp - bond-slaves none - bond-mode active-backup - bond-miimon 100 - -auto eth0 -iface eth0 inet manual - bond-master bond0 - bond-primary eth0 eth1 - -auto eth1 -iface eth1 inet manual - bond-master bond0 - bond-primary eth0 eth1 - -For a full list of bond-* supported options in /etc/network/interfaces and some -more advanced examples tailored to you particular distros, see the files in +produce the same result on those systems:: + + auto bond0 + iface bond0 inet dhcp + bond-slaves none + bond-mode active-backup + bond-miimon 100 + + auto eth0 + iface eth0 inet manual + bond-master bond0 + bond-primary eth0 eth1 + + auto eth1 + iface eth1 inet manual + bond-master bond0 + bond-primary eth0 eth1 + +For a full list of ``bond-*`` supported options in /etc/network/interfaces and +some more advanced examples tailored to your particular distros, see the files in /usr/share/doc/ifenslave-2.6. 3.6 Overriding Configuration for Special Cases @@ -1610,31 +1645,31 @@ tx_queues can be used to change this value. There is no sysfs parameter available as the allocation is done at module init time. 
The output of the file /proc/net/bonding/bondX has changed so the output Queue -ID is now printed for each slave: +ID is now printed for each slave:: -Bonding Mode: fault-tolerance (active-backup) -Primary Slave: None -Currently Active Slave: eth0 -MII Status: up -MII Polling Interval (ms): 0 -Up Delay (ms): 0 -Down Delay (ms): 0 + Bonding Mode: fault-tolerance (active-backup) + Primary Slave: None + Currently Active Slave: eth0 + MII Status: up + MII Polling Interval (ms): 0 + Up Delay (ms): 0 + Down Delay (ms): 0 -Slave Interface: eth0 -MII Status: up -Link Failure Count: 0 -Permanent HW addr: 00:1a:a0:12:8f:cb -Slave queue ID: 0 + Slave Interface: eth0 + MII Status: up + Link Failure Count: 0 + Permanent HW addr: 00:1a:a0:12:8f:cb + Slave queue ID: 0 -Slave Interface: eth1 -MII Status: up -Link Failure Count: 0 -Permanent HW addr: 00:1a:a0:12:8f:cc -Slave queue ID: 2 + Slave Interface: eth1 + MII Status: up + Link Failure Count: 0 + Permanent HW addr: 00:1a:a0:12:8f:cc + Slave queue ID: 2 -The queue_id for a slave can be set using the command: +The queue_id for a slave can be set using the command:: -# echo "eth1:2" > /sys/class/net/bond0/bonding/queue_id + # echo "eth1:2" > /sys/class/net/bond0/bonding/queue_id Any interface that needs a queue_id set should set it with multiple calls like the one above until proper priorities are set for all interfaces. On @@ -1645,12 +1680,12 @@ These queue id's can be used in conjunction with the tc utility to configure a multiqueue qdisc and filters to bias certain traffic to transmit on certain slave devices. For instance, say we wanted, in the above configuration to force all traffic bound to 192.168.1.100 to use eth1 in the bond as its output -device. The following commands would accomplish this: +device. 
The following commands would accomplish this:: -# tc qdisc add dev bond0 handle 1 root multiq + # tc qdisc add dev bond0 handle 1 root multiq -# tc filter add dev bond0 protocol ip parent 1: prio 1 u32 match ip dst \ - 192.168.1.100 action skbedit queue_mapping 2 + # tc filter add dev bond0 protocol ip parent 1: prio 1 u32 match ip \ + dst 192.168.1.100 action skbedit queue_mapping 2 These commands tell the kernel to attach a multiqueue queue discipline to the bond0 interface and filter traffic enqueued to it, such that packets with a dst @@ -1663,7 +1698,7 @@ that normal output policy selection should take place. One benefit to simply leaving the qid for a slave to 0 is the multiqueue awareness in the bonding driver that is now present. This awareness allows tc filters to be placed on slave devices as well as bond devices and the bonding driver will simply act as -a pass-through for selecting output queues on the slave device rather than +a pass-through for selecting output queues on the slave device rather than output port selection. This feature first appeared in bonding driver version 3.7.0 and support for @@ -1689,31 +1724,31 @@ few bonding parameters: (a) ad_actor_system : You can set a random mac-address that can be used for these LACPDU exchanges. The value can not be either NULL or Multicast. Also it's preferable to set the local-admin bit. Following shell code - generates a random mac-address as described above. 
+ generates a random mac-address as described above:: - # sys_mac_addr=$(printf '%02x:%02x:%02x:%02x:%02x:%02x' \ - $(( (RANDOM & 0xFE) | 0x02 )) \ - $(( RANDOM & 0xFF )) \ - $(( RANDOM & 0xFF )) \ - $(( RANDOM & 0xFF )) \ - $(( RANDOM & 0xFF )) \ - $(( RANDOM & 0xFF ))) - # echo $sys_mac_addr > /sys/class/net/bond0/bonding/ad_actor_system + # sys_mac_addr=$(printf '%02x:%02x:%02x:%02x:%02x:%02x' \ + $(( (RANDOM & 0xFE) | 0x02 )) \ + $(( RANDOM & 0xFF )) \ + $(( RANDOM & 0xFF )) \ + $(( RANDOM & 0xFF )) \ + $(( RANDOM & 0xFF )) \ + $(( RANDOM & 0xFF ))) + # echo $sys_mac_addr > /sys/class/net/bond0/bonding/ad_actor_system (b) ad_actor_sys_prio : Randomize the system priority. The default value is 65535, but system can take the value from 1 - 65535. Following shell - code generates random priority and sets it. + code generates random priority and sets it:: - # sys_prio=$(( 1 + RANDOM + RANDOM )) - # echo $sys_prio > /sys/class/net/bond0/bonding/ad_actor_sys_prio + # sys_prio=$(( 1 + RANDOM + RANDOM )) + # echo $sys_prio > /sys/class/net/bond0/bonding/ad_actor_sys_prio (c) ad_user_port_key : Use the user portion of the port-key. The default keeps this empty. These are the upper 10 bits of the port-key and value ranges from 0 - 1023. Following shell code generates these 10 bits and - sets it. + sets it:: - # usr_port_key=$(( RANDOM & 0x3FF )) - # echo $usr_port_key > /sys/class/net/bond0/bonding/ad_user_port_key + # usr_port_key=$(( RANDOM & 0x3FF )) + # echo $usr_port_key > /sys/class/net/bond0/bonding/ad_user_port_key 4 Querying Bonding Configuration @@ -1722,81 +1757,81 @@ few bonding parameters: 4.1 Bonding Configuration ------------------------- - Each bonding device has a read-only file residing in the +Each bonding device has a read-only file residing in the /proc/net/bonding directory. The file contents include information about the bonding configuration, options and state of each slave. 
- For example, the contents of /proc/net/bonding/bond0 after the +For example, the contents of /proc/net/bonding/bond0 after the driver is loaded with parameters of mode=0 and miimon=1000 is -generally as follows: +generally as follows:: Ethernet Channel Bonding Driver: 2.6.1 (October 29, 2004) - Bonding Mode: load balancing (round-robin) - Currently Active Slave: eth0 - MII Status: up - MII Polling Interval (ms): 1000 - Up Delay (ms): 0 - Down Delay (ms): 0 - - Slave Interface: eth1 - MII Status: up - Link Failure Count: 1 - - Slave Interface: eth0 - MII Status: up - Link Failure Count: 1 - - The precise format and contents will change depending upon the + Bonding Mode: load balancing (round-robin) + Currently Active Slave: eth0 + MII Status: up + MII Polling Interval (ms): 1000 + Up Delay (ms): 0 + Down Delay (ms): 0 + + Slave Interface: eth1 + MII Status: up + Link Failure Count: 1 + + Slave Interface: eth0 + MII Status: up + Link Failure Count: 1 + +The precise format and contents will change depending upon the bonding configuration, state, and version of the bonding driver. 4.2 Network configuration ------------------------- - The network configuration can be inspected using the ifconfig +The network configuration can be inspected using the ifconfig command. Bonding devices will have the MASTER flag set; Bonding slave devices will have the SLAVE flag set. The ifconfig output does not contain information on which slaves are associated with which masters. - In the example below, the bond0 interface is the master +In the example below, the bond0 interface is the master (MASTER) while eth0 and eth1 are slaves (SLAVE). Notice all slaves of bond0 have the same MAC address (HWaddr) as bond0 for all modes except -TLB and ALB that require a unique MAC address for each slave. 
- -# /sbin/ifconfig -bond0 Link encap:Ethernet HWaddr 00:C0:F0:1F:37:B4 - inet addr:XXX.XXX.XXX.YYY Bcast:XXX.XXX.XXX.255 Mask:255.255.252.0 - UP BROADCAST RUNNING MASTER MULTICAST MTU:1500 Metric:1 - RX packets:7224794 errors:0 dropped:0 overruns:0 frame:0 - TX packets:3286647 errors:1 dropped:0 overruns:1 carrier:0 - collisions:0 txqueuelen:0 - -eth0 Link encap:Ethernet HWaddr 00:C0:F0:1F:37:B4 - UP BROADCAST RUNNING SLAVE MULTICAST MTU:1500 Metric:1 - RX packets:3573025 errors:0 dropped:0 overruns:0 frame:0 - TX packets:1643167 errors:1 dropped:0 overruns:1 carrier:0 - collisions:0 txqueuelen:100 - Interrupt:10 Base address:0x1080 - -eth1 Link encap:Ethernet HWaddr 00:C0:F0:1F:37:B4 - UP BROADCAST RUNNING SLAVE MULTICAST MTU:1500 Metric:1 - RX packets:3651769 errors:0 dropped:0 overruns:0 frame:0 - TX packets:1643480 errors:0 dropped:0 overruns:0 carrier:0 - collisions:0 txqueuelen:100 - Interrupt:9 Base address:0x1400 +TLB and ALB that require a unique MAC address for each slave:: + + # /sbin/ifconfig + bond0 Link encap:Ethernet HWaddr 00:C0:F0:1F:37:B4 + inet addr:XXX.XXX.XXX.YYY Bcast:XXX.XXX.XXX.255 Mask:255.255.252.0 + UP BROADCAST RUNNING MASTER MULTICAST MTU:1500 Metric:1 + RX packets:7224794 errors:0 dropped:0 overruns:0 frame:0 + TX packets:3286647 errors:1 dropped:0 overruns:1 carrier:0 + collisions:0 txqueuelen:0 + + eth0 Link encap:Ethernet HWaddr 00:C0:F0:1F:37:B4 + UP BROADCAST RUNNING SLAVE MULTICAST MTU:1500 Metric:1 + RX packets:3573025 errors:0 dropped:0 overruns:0 frame:0 + TX packets:1643167 errors:1 dropped:0 overruns:1 carrier:0 + collisions:0 txqueuelen:100 + Interrupt:10 Base address:0x1080 + + eth1 Link encap:Ethernet HWaddr 00:C0:F0:1F:37:B4 + UP BROADCAST RUNNING SLAVE MULTICAST MTU:1500 Metric:1 + RX packets:3651769 errors:0 dropped:0 overruns:0 frame:0 + TX packets:1643480 errors:0 dropped:0 overruns:0 carrier:0 + collisions:0 txqueuelen:100 + Interrupt:9 Base address:0x1400 5. 
Switch Configuration ======================= - For this section, "switch" refers to whatever system the +For this section, "switch" refers to whatever system the bonded devices are directly connected to (i.e., where the other end of the cable plugs into). This may be an actual dedicated switch device, or it may be another regular system (e.g., another computer running Linux), - The active-backup, balance-tlb and balance-alb modes do not +The active-backup, balance-tlb and balance-alb modes do not require any specific configuration of the switch. - The 802.3ad mode requires that the switch have the appropriate +The 802.3ad mode requires that the switch have the appropriate ports configured as an 802.3ad aggregation. The precise method used to configure this varies from switch to switch, but, for example, a Cisco 3550 series switch requires that the appropriate ports first be @@ -1804,7 +1839,7 @@ grouped together in a single etherchannel instance, then that etherchannel is set to mode "lacp" to enable 802.3ad (instead of standard EtherChannel). - The balance-rr, balance-xor and broadcast modes generally +The balance-rr, balance-xor and broadcast modes generally require that the switch have the appropriate ports grouped together. The nomenclature for such a group differs between switches, it may be called an "etherchannel" (as in the Cisco example, above), a "trunk @@ -1820,7 +1855,7 @@ with another EtherChannel group. 6. 802.1q VLAN Support ====================== - It is possible to configure VLAN devices over a bond interface +It is possible to configure VLAN devices over a bond interface using the 8021q driver. However, only packets coming from the 8021q driver and passing through bonding will be tagged by default. Self generated packets, for example, bonding's learning packets or ARP @@ -1829,7 +1864,7 @@ tagged internally by bonding itself. As a result, bonding must "learn" the VLAN IDs configured above it, and use those IDs to tag self generated packets. 
- For reasons of simplicity, and to support the use of adapters +For reasons of simplicity, and to support the use of adapters that can do VLAN hardware acceleration offloading, the bonding interface declares itself as fully hardware offloading capable, it gets the add_vid/kill_vid notifications to gather the necessary @@ -1839,7 +1874,7 @@ should go through an adapter that is not offloading capable are "un-accelerated" by the bonding driver so the VLAN tag sits in the regular location. - VLAN interfaces *must* be added on top of a bonding interface +VLAN interfaces *must* be added on top of a bonding interface only after enslaving at least one slave. The bonding interface has a hardware address of 00:00:00:00:00:00 until the first slave is added. If the VLAN interface is created prior to the first enslavement, it @@ -1847,23 +1882,23 @@ would pick up the all-zeroes hardware address. Once the first slave is attached to the bond, the bond device itself will pick up the slave's hardware address, which is then available for the VLAN device. - Also, be aware that a similar problem can occur if all slaves +Also, be aware that a similar problem can occur if all slaves are released from a bond that still has one or more VLAN interfaces on top of it. When a new slave is added, the bonding interface will obtain its hardware address from the first slave, which might not match the hardware address of the VLAN interfaces (which was ultimately copied from an earlier slave). - There are two methods to insure that the VLAN device operates +There are two methods to insure that the VLAN device operates with the correct hardware address if all slaves are removed from a bond interface: - 1. Remove all VLAN interfaces then recreate them +1. Remove all VLAN interfaces then recreate them - 2. Set the bonding interface's hardware address so that it +2. Set the bonding interface's hardware address so that it matches the hardware address of the VLAN interfaces. 
- Note that changing a VLAN interface's HW address would set the +Note that changing a VLAN interface's HW address would set the underlying device -- i.e. the bonding interface -- to promiscuous mode, which might not be what you want. @@ -1871,24 +1906,24 @@ mode, which might not be what you want. 7. Link Monitoring ================== - The bonding driver at present supports two schemes for +The bonding driver at present supports two schemes for monitoring a slave device's link state: the ARP monitor and the MII monitor. - At the present time, due to implementation restrictions in the +At the present time, due to implementation restrictions in the bonding driver itself, it is not possible to enable both ARP and MII monitoring simultaneously. 7.1 ARP Monitor Operation ------------------------- - The ARP monitor operates as its name suggests: it sends ARP +The ARP monitor operates as its name suggests: it sends ARP queries to one or more designated peer systems on the network, and uses the response as an indication that the link is operating. This gives some assurance that traffic is actually flowing to and from one or more peers on the local network. - The ARP monitor relies on the device driver itself to verify +The ARP monitor relies on the device driver itself to verify that traffic is flowing. In particular, the driver must keep up to date the last receive time, dev->last_rx. Drivers that use NETIF_F_LLTX flag must also update netdev_queue->trans_start. If they do not, then the @@ -1900,36 +1935,36 @@ your device driver is not updating last_rx and trans_start. 7.2 Configuring Multiple ARP Targets ------------------------------------ - While ARP monitoring can be done with just one target, it can +While ARP monitoring can be done with just one target, it can be useful in a High Availability setup to have several targets to monitor. In the case of just one target, the target itself may go down or have a problem making it unresponsive to ARP requests. 
Having an additional target (or several) increases the reliability of the ARP monitoring. - Multiple ARP targets must be separated by commas as follows: +Multiple ARP targets must be separated by commas as follows:: -# example options for ARP monitoring with three targets -alias bond0 bonding -options bond0 arp_interval=60 arp_ip_target=192.168.0.1,192.168.0.3,192.168.0.9 + # example options for ARP monitoring with three targets + alias bond0 bonding + options bond0 arp_interval=60 arp_ip_target=192.168.0.1,192.168.0.3,192.168.0.9 - For just a single target the options would resemble: +For just a single target the options would resemble:: -# example options for ARP monitoring with one target -alias bond0 bonding -options bond0 arp_interval=60 arp_ip_target=192.168.0.100 + # example options for ARP monitoring with one target + alias bond0 bonding + options bond0 arp_interval=60 arp_ip_target=192.168.0.100 7.3 MII Monitor Operation ------------------------- - The MII monitor monitors only the carrier state of the local +The MII monitor monitors only the carrier state of the local network interface. It accomplishes this in one of three ways: by depending upon the device driver to maintain its carrier state, by querying the device's MII registers, or by making an ethtool query to the device. - If the use_carrier module parameter is 1 (the default value), +If the use_carrier module parameter is 1 (the default value), then the MII monitor will rely on the driver for carrier state information (via the netif_carrier subsystem). As explained in the use_carrier parameter information, above, if the MII monitor fails to @@ -1937,7 +1972,7 @@ detect carrier loss on the device (e.g., when the cable is physically disconnected), it may be that the driver does not support netif_carrier. - If use_carrier is 0, then the MII monitor will first query the +If use_carrier is 0, then the MII monitor will first query the device's (via ioctl) MII registers and check the link state. 
If that request fails (not just that it returns carrier down), then the MII monitor will make an ethtool ETHTOOL_GLINK request to attempt to obtain @@ -1952,25 +1987,25 @@ up. 8.1 Adventures in Routing ------------------------- - When bonding is configured, it is important that the slave +When bonding is configured, it is important that the slave devices not have routes that supersede routes of the master (or, generally, not have routes at all). For example, suppose the bonding device bond0 has two slaves, eth0 and eth1, and the routing table is -as follows: +as follows:: -Kernel IP routing table -Destination Gateway Genmask Flags MSS Window irtt Iface -10.0.0.0 0.0.0.0 255.255.0.0 U 40 0 0 eth0 -10.0.0.0 0.0.0.0 255.255.0.0 U 40 0 0 eth1 -10.0.0.0 0.0.0.0 255.255.0.0 U 40 0 0 bond0 -127.0.0.0 0.0.0.0 255.0.0.0 U 40 0 0 lo + Kernel IP routing table + Destination Gateway Genmask Flags MSS Window irtt Iface + 10.0.0.0 0.0.0.0 255.255.0.0 U 40 0 0 eth0 + 10.0.0.0 0.0.0.0 255.255.0.0 U 40 0 0 eth1 + 10.0.0.0 0.0.0.0 255.255.0.0 U 40 0 0 bond0 + 127.0.0.0 0.0.0.0 255.0.0.0 U 40 0 0 lo - This routing configuration will likely still update the +This routing configuration will likely still update the receive/transmit times in the driver (needed by the ARP monitor), but may bypass the bonding driver (because outgoing traffic to, in this case, another host on network 10 would use eth0 or eth1 before bond0). - The ARP monitor (and ARP itself) may become confused by this +The ARP monitor (and ARP itself) may become confused by this configuration, because ARP requests (generated by the ARP monitor) will be sent on one interface (bond0), but the corresponding reply will arrive on a different interface (eth0). This reply looks to ARP @@ -1978,7 +2013,7 @@ as an unsolicited ARP reply (because ARP matches replies on an interface basis), and is discarded. The MII monitor is not affected by the state of the routing table. 
- The solution here is simply to insure that slaves do not have +The solution here is simply to insure that slaves do not have routes of their own, and if for some reason they must, those routes do not supersede routes of their master. This should generally be the case, but unusual configurations or errant manual or automatic static @@ -1987,22 +2022,22 @@ route additions may cause trouble. 8.2 Ethernet Device Renaming ---------------------------- - On systems with network configuration scripts that do not +On systems with network configuration scripts that do not associate physical devices directly with network interface names (so that the same physical device always has the same "ethX" name), it may be necessary to add some special logic to config files in /etc/modprobe.d/. - For example, given a modules.conf containing the following: +For example, given a modules.conf containing the following:: -alias bond0 bonding -options bond0 mode=some-mode miimon=50 -alias eth0 tg3 -alias eth1 tg3 -alias eth2 e1000 -alias eth3 e1000 + alias bond0 bonding + options bond0 mode=some-mode miimon=50 + alias eth0 tg3 + alias eth1 tg3 + alias eth2 e1000 + alias eth3 e1000 - If neither eth0 and eth1 are slaves to bond0, then when the +If neither eth0 and eth1 are slaves to bond0, then when the bond0 interface comes up, the devices may end up reordered. This happens because bonding is loaded first, then its slave device's drivers are loaded next. Since no other drivers have been loaded, @@ -2010,36 +2045,36 @@ when the e1000 driver loads, it will receive eth0 and eth1 for its devices, but the bonding configuration tries to enslave eth2 and eth3 (which may later be assigned to the tg3 devices). - Adding the following: +Adding the following:: -add above bonding e1000 tg3 + add above bonding e1000 tg3 - causes modprobe to load e1000 then tg3, in that order, when +causes modprobe to load e1000 then tg3, in that order, when bonding is loaded. 
This command is fully documented in the modules.conf manual page. - On systems utilizing modprobe an equivalent problem can occur. +On systems utilizing modprobe an equivalent problem can occur. In this case, the following can be added to config files in -/etc/modprobe.d/ as: +/etc/modprobe.d/ as:: -softdep bonding pre: tg3 e1000 + softdep bonding pre: tg3 e1000 - This will load tg3 and e1000 modules before loading the bonding one. +This will load tg3 and e1000 modules before loading the bonding one. Full documentation on this can be found in the modprobe.d and modprobe manual pages. 8.3. Painfully Slow Or No Failed Link Detection By Miimon --------------------------------------------------------- - By default, bonding enables the use_carrier option, which +By default, bonding enables the use_carrier option, which instructs bonding to trust the driver to maintain carrier state. - As discussed in the options section, above, some drivers do +As discussed in the options section, above, some drivers do not support the netif_carrier_on/_off link state tracking system. With use_carrier enabled, bonding will always see these links as up, regardless of their actual state. - Additionally, other drivers do support netif_carrier, but do +Additionally, other drivers do support netif_carrier, but do not maintain it in real time, e.g., only polling the link state at some fixed interval. In this case, miimon will detect failures, but only after some long period of time has expired. If it appears that @@ -2051,7 +2086,7 @@ use_carrier=0 method of querying the registers directly works). If use_carrier=0 does not improve the failover, then the driver may cache the registers, or the problem may be elsewhere. - Also, remember that miimon only checks for the device's +Also, remember that miimon only checks for the device's carrier state. 
It has no way to determine the state of devices on or beyond other ports of a switch, or if a switch is refusing to pass traffic while still maintaining carrier on. @@ -2059,7 +2094,7 @@ traffic while still maintaining carrier on. 9. SNMP agents =============== - If running SNMP agents, the bonding driver should be loaded +If running SNMP agents, the bonding driver should be loaded before any network drivers participating in a bond. This requirement is due to the interface index (ipAdEntIfIndex) being associated to the first interface found with a given IP address. That is, there is @@ -2070,6 +2105,8 @@ with the eth0 interface. This configuration is shown below, the IP address 192.168.1.1 has an interface index of 2 which indexes to eth0 in the ifDescr table (ifDescr.2). +:: + interfaces.ifTable.ifEntry.ifDescr.1 = lo interfaces.ifTable.ifEntry.ifDescr.2 = eth0 interfaces.ifTable.ifEntry.ifDescr.3 = eth1 @@ -2081,7 +2118,7 @@ in the ifDescr table (ifDescr.2). ip.ipAddrTable.ipAddrEntry.ipAdEntIfIndex.10.74.20.94 = 4 ip.ipAddrTable.ipAddrEntry.ipAdEntIfIndex.127.0.0.1 = 1 - This problem is avoided by loading the bonding driver before +This problem is avoided by loading the bonding driver before any network drivers participating in a bond. Below is an example of loading the bonding driver first, the IP address 192.168.1.1 is correctly associated with ifDescr.2. @@ -2097,7 +2134,7 @@ correctly associated with ifDescr.2. ip.ipAddrTable.ipAddrEntry.ipAdEntIfIndex.10.74.20.94 = 5 ip.ipAddrTable.ipAddrEntry.ipAdEntIfIndex.127.0.0.1 = 1 - While some distributions may not report the interface name in +While some distributions may not report the interface name in ifDescr, the association between the IP address and IfIndex remains and SNMP functions such as Interface_Scan_Next will report that association. @@ -2105,34 +2142,34 @@ association. 10. 
Promiscuous mode ==================== - When running network monitoring tools, e.g., tcpdump, it is +When running network monitoring tools, e.g., tcpdump, it is common to enable promiscuous mode on the device, so that all traffic is seen (instead of seeing only traffic destined for the local host). The bonding driver handles promiscuous mode changes to the bonding master device (e.g., bond0), and propagates the setting to the slave devices. - For the balance-rr, balance-xor, broadcast, and 802.3ad modes, +For the balance-rr, balance-xor, broadcast, and 802.3ad modes, the promiscuous mode setting is propagated to all slaves. - For the active-backup, balance-tlb and balance-alb modes, the +For the active-backup, balance-tlb and balance-alb modes, the promiscuous mode setting is propagated only to the active slave. - For balance-tlb mode, the active slave is the slave currently +For balance-tlb mode, the active slave is the slave currently receiving inbound traffic. - For balance-alb mode, the active slave is the slave used as a +For balance-alb mode, the active slave is the slave used as a "primary." This slave is used for mode-specific control traffic, for sending to peers that are unassigned or if the load is unbalanced. - For the active-backup, balance-tlb and balance-alb modes, when +For the active-backup, balance-tlb and balance-alb modes, when the active slave changes (e.g., due to a link failure), the promiscuous setting will be propagated to the new active slave. 11. Configuring Bonding for High Availability ============================================= - High Availability refers to configurations that provide +High Availability refers to configurations that provide maximum network availability by having redundant or backup devices, links or switches between the host and the rest of the world. The goal is to provide the maximum availability of network connectivity @@ -2142,7 +2179,7 @@ could provide higher throughput. 
11.1 High Availability in a Single Switch Topology -------------------------------------------------- - If two hosts (or a host and a single switch) are directly +If two hosts (or a host and a single switch) are directly connected via multiple physical links, then there is no availability penalty to optimizing for maximum bandwidth. In this case, there is only one switch (or peer), so if it fails, there is no alternative @@ -2150,32 +2187,32 @@ access to fail over to. Additionally, the bonding load balance modes support link monitoring of their members, so if individual links fail, the load will be rebalanced across the remaining devices. - See Section 12, "Configuring Bonding for Maximum Throughput" +See Section 12, "Configuring Bonding for Maximum Throughput" for information on configuring bonding with one peer device. 11.2 High Availability in a Multiple Switch Topology ---------------------------------------------------- - With multiple switches, the configuration of bonding and the +With multiple switches, the configuration of bonding and the network changes dramatically. In multiple switch topologies, there is a trade off between network availability and usable bandwidth. 
- Below is a sample network, configured to maximize the -availability of the network: - - | | - |port3 port3| - +-----+----+ +-----+----+ - | |port2 ISL port2| | - | switch A +--------------------------+ switch B | - | | | | - +-----+----+ +-----++---+ - |port1 port1| - | +-------+ | - +-------------+ host1 +---------------+ - eth0 +-------+ eth1 - - In this configuration, there is a link between the two +Below is a sample network, configured to maximize the +availability of the network:: + + | | + |port3 port3| + +-----+----+ +-----+----+ + | |port2 ISL port2| | + | switch A +--------------------------+ switch B | + | | | | + +-----+----+ +-----++---+ + |port1 port1| + | +-------+ | + +-------------+ host1 +---------------+ + eth0 +-------+ eth1 + +In this configuration, there is a link between the two switches (ISL, or inter switch link), and multiple ports connecting to the outside world ("port3" on each switch). There is no technical reason that this could not be extended to a third switch. @@ -2183,19 +2220,21 @@ reason that this could not be extended to a third switch. 11.2.1 HA Bonding Mode Selection for Multiple Switch Topology ------------------------------------------------------------- - In a topology such as the example above, the active-backup and +In a topology such as the example above, the active-backup and broadcast modes are the only useful bonding modes when optimizing for availability; the other modes require all links to terminate on the same peer for them to behave rationally. -active-backup: This is generally the preferred mode, particularly if +active-backup: + This is generally the preferred mode, particularly if the switches have an ISL and play together well. If the network configuration is such that one switch is specifically a backup switch (e.g., has lower capacity, higher cost, etc), then the primary option can be used to insure that the preferred link is always used when it is available. 
-broadcast: This mode is really a special purpose mode, and is suitable +broadcast: + This mode is really a special purpose mode, and is suitable only for very specific needs. For example, if the two switches are not connected (no ISL), and the networks beyond them are totally independent. In this case, if it is @@ -2205,7 +2244,7 @@ broadcast: This mode is really a special purpose mode, and is suitable 11.2.2 HA Link Monitoring Selection for Multiple Switch Topology ---------------------------------------------------------------- - The choice of link monitoring ultimately depends upon your +The choice of link monitoring ultimately depends upon your switch. If the switch can reliably fail ports in response to other failures, then either the MII or ARP monitors should work. For example, in the above example, if the "port3" link fails at the remote @@ -2213,7 +2252,7 @@ end, the MII monitor has no direct means to detect this. The ARP monitor could be configured with a target at the remote end of port3, thus detecting that failure without switch support. - In general, however, in a multiple switch topology, the ARP +In general, however, in a multiple switch topology, the ARP monitor can provide a higher level of reliability in detecting end to end connectivity failures (which may be caused by the failure of any individual component to pass traffic for any reason). Additionally, @@ -2222,7 +2261,7 @@ one for each switch in the network). This will insure that, regardless of which switch is active, the ARP monitor has a suitable target to query. - Note, also, that of late many switches now support a functionality +Note, also, that of late many switches now support a functionality generally referred to as "trunk failover." This is a feature of the switch that causes the link state of a particular switch port to be set down (or up) when the state of another switch port goes down (or up). @@ -2238,18 +2277,18 @@ suitable switches. 
12.1 Maximizing Throughput in a Single Switch Topology ------------------------------------------------------ - In a single switch configuration, the best method to maximize +In a single switch configuration, the best method to maximize throughput depends upon the application and network environment. The various load balancing modes each have strengths and weaknesses in different environments, as detailed below. - For this discussion, we will break down the topologies into +For this discussion, we will break down the topologies into two categories. Depending upon the destination of most traffic, we categorize them into either "gatewayed" or "local" configurations. - In a gatewayed configuration, the "switch" is acting primarily +In a gatewayed configuration, the "switch" is acting primarily as a router, and the majority of traffic passes through this router to -other networks. An example would be the following: +other networks. An example would be the following:: +----------+ +----------+ @@ -2259,25 +2298,25 @@ other networks. An example would be the following: | |eth1 port2| | here somewhere +----------+ +----------+ - The router may be a dedicated router device, or another host +The router may be a dedicated router device, or another host acting as a gateway. For our discussion, the important point is that the majority of traffic from Host A will pass through the router to some other network before reaching its final destination. - In a gatewayed network configuration, although Host A may +In a gatewayed network configuration, although Host A may communicate with many other systems, all of its traffic will be sent and received via one other peer on the local network, the router. - Note that the case of two systems connected directly via +Note that the case of two systems connected directly via multiple physical links is, for purposes of configuring bonding, the same as a gatewayed configuration. 
In that case, it happens that all traffic is destined for the "gateway" itself, not some other network beyond the gateway. - In a local configuration, the "switch" is acting primarily as +In a local configuration, the "switch" is acting primarily as a switch, and the majority of traffic passes through this switch to reach other stations on the same network. An example would be the -following: +following:: +----------+ +----------+ +--------+ | |eth0 port1| +-------+ Host B | @@ -2287,19 +2326,19 @@ following: +----------+ +----------+port4 +--------+ - Again, the switch may be a dedicated switch device, or another +Again, the switch may be a dedicated switch device, or another host acting as a gateway. For our discussion, the important point is that the majority of traffic from Host A is destined for other hosts on the same local network (Hosts B and C in the above example). - In summary, in a gatewayed configuration, traffic to and from +In summary, in a gatewayed configuration, traffic to and from the bonded device will be to the same MAC level peer on the network (the gateway itself, i.e., the router), regardless of its final destination. In a local configuration, traffic flows directly to and from the final destinations, thus, each destination (Host B, Host C) will be addressed directly by their individual MAC addresses. - This distinction between a gatewayed and a local network +This distinction between a gatewayed and a local network configuration is important because many of the load balancing modes available use the MAC addresses of the local network source and destination to make load balancing decisions. The behavior of each @@ -2309,11 +2348,12 @@ mode is described below. 
12.1.1 MT Bonding Mode Selection for Single Switch Topology ----------------------------------------------------------- - This configuration is the easiest to set up and to understand, +This configuration is the easiest to set up and to understand, although you will have to decide which bonding mode best suits your needs. The trade offs for each mode are detailed below: -balance-rr: This mode is the only mode that will permit a single +balance-rr: + This mode is the only mode that will permit a single TCP/IP connection to stripe traffic across multiple interfaces. It is therefore the only mode that will allow a single TCP/IP stream to utilize more than one interface's @@ -2351,7 +2391,8 @@ balance-rr: This mode is the only mode that will permit a single This mode requires the switch to have the appropriate ports configured for "etherchannel" or "trunking." -active-backup: There is not much advantage in this network topology to +active-backup: + There is not much advantage in this network topology to the active-backup mode, as the inactive backup devices are all connected to the same peer as the primary. In this case, a load balancing mode (with link monitoring) will provide the @@ -2361,7 +2402,8 @@ active-backup: There is not much advantage in this network topology to have value if the hardware available does not support any of the load balance modes. -balance-xor: This mode will limit traffic such that packets destined +balance-xor: + This mode will limit traffic such that packets destined for specific peers will always be sent over the same interface. Since the destination is determined by the MAC addresses involved, this mode works best in a "local" network @@ -2373,10 +2415,12 @@ balance-xor: This mode will limit traffic such that packets destined As with balance-rr, the switch ports need to be configured for "etherchannel" or "trunking." 
-broadcast: Like active-backup, there is not much advantage to this +broadcast: + Like active-backup, there is not much advantage to this mode in this type of network topology. -802.3ad: This mode can be a good choice for this type of network +802.3ad: + This mode can be a good choice for this type of network topology. The 802.3ad mode is an IEEE standard, so all peers that implement 802.3ad should interoperate well. The 802.3ad protocol includes automatic configuration of the aggregates, @@ -2390,7 +2434,7 @@ broadcast: Like active-backup, there is not much advantage to this the same speed and duplex. Also, as with all bonding load balance modes other than balance-rr, no single connection will be able to utilize more than a single interface's worth of - bandwidth. + bandwidth. Additionally, the linux bonding 802.3ad implementation distributes traffic by peer (using an XOR of MAC addresses @@ -2404,7 +2448,8 @@ broadcast: Like active-backup, there is not much advantage to this Finally, the 802.3ad mode mandates the use of the MII monitor, therefore, the ARP monitor is not available in this mode. -balance-tlb: The balance-tlb mode balances outgoing traffic by peer. +balance-tlb: + The balance-tlb mode balances outgoing traffic by peer. Since the balancing is done according to MAC address, in a "gatewayed" configuration (as described above), this mode will send all traffic across a single device. However, in a @@ -2422,7 +2467,8 @@ balance-tlb: The balance-tlb mode balances outgoing traffic by peer. network device driver of the slave interfaces, and the ARP monitor is not available. -balance-alb: This mode is everything that balance-tlb is, and more. +balance-alb: + This mode is everything that balance-tlb is, and more. 
It has all of the features (and restrictions) of balance-tlb, and will also balance incoming traffic from local network peers (as described in the Bonding Module Options section, @@ -2435,7 +2481,7 @@ balance-alb: This mode is everything that balance-tlb is, and more. 12.1.2 MT Link Monitoring for Single Switch Topology ---------------------------------------------------- - The choice of link monitoring may largely depend upon which +The choice of link monitoring may largely depend upon which mode you choose to use. The more advanced load balancing modes do not support the use of the ARP monitor, and are thus restricted to using the MII monitor (which does not provide as high a level of end to end @@ -2444,27 +2490,27 @@ assurance as the ARP monitor). 12.2 Maximum Throughput in a Multiple Switch Topology ----------------------------------------------------- - Multiple switches may be utilized to optimize for throughput +Multiple switches may be utilized to optimize for throughput when they are configured in parallel as part of an isolated network -between two or more systems, for example: - - +-----------+ - | Host A | - +-+---+---+-+ - | | | - +--------+ | +---------+ - | | | - +------+---+ +-----+----+ +-----+----+ - | Switch A | | Switch B | | Switch C | - +------+---+ +-----+----+ +-----+----+ - | | | - +--------+ | +---------+ - | | | - +-+---+---+-+ - | Host B | - +-----------+ - - In this configuration, the switches are isolated from one +between two or more systems, for example:: + + +-----------+ + | Host A | + +-+---+---+-+ + | | | + +--------+ | +---------+ + | | | + +------+---+ +-----+----+ +-----+----+ + | Switch A | | Switch B | | Switch C | + +------+---+ +-----+----+ +-----+----+ + | | | + +--------+ | +---------+ + | | | + +-+---+---+-+ + | Host B | + +-----------+ + +In this configuration, the switches are isolated from one another. 
One reason to employ a topology such as this is for an isolated network with many hosts (a cluster configured for high performance, for example), using multiple smaller switches can be more @@ -2472,14 +2518,14 @@ cost effective than a single larger switch, e.g., on a network with 24 hosts, three 24 port switches can be significantly less expensive than a single 72 port switch. - If access beyond the network is required, an individual host +If access beyond the network is required, an individual host can be equipped with an additional network device connected to an external network; this host then additionally acts as a gateway. 12.2.1 MT Bonding Mode Selection for Multiple Switch Topology ------------------------------------------------------------- - In actual practice, the bonding mode typically employed in +In actual practice, the bonding mode typically employed in configurations of this type is balance-rr. Historically, in this network configuration, the usual caveats about out of order packet delivery are mitigated by the use of network adapters that do not do @@ -2492,7 +2538,7 @@ utilize greater than one interface's bandwidth. 12.2.2 MT Link Monitoring for Multiple Switch Topology ------------------------------------------------------ - Again, in actual practice, the MII monitor is most often used +Again, in actual practice, the MII monitor is most often used in this configuration, as performance is given preference over availability. The ARP monitor will function in this topology, but its advantages over the MII monitor are mitigated by the volume of probes @@ -2505,10 +2551,10 @@ host in the network is configured with bonding). 13.1 Link Establishment and Failover Delays ------------------------------------------- - Some switches exhibit undesirable behavior with regard to the +Some switches exhibit undesirable behavior with regard to the timing of link up and down reporting by the switch. 
- First, when a link comes up, some switches may indicate that +First, when a link comes up, some switches may indicate that the link is up (carrier available), but not pass traffic over the interface for some period of time. This delay is typically due to some type of autonegotiation or routing protocol, but may also occur @@ -2517,12 +2563,12 @@ failure). If you find this to be a problem, specify an appropriate value to the updelay bonding module option to delay the use of the relevant interface(s). - Second, some switches may "bounce" the link state one or more +Second, some switches may "bounce" the link state one or more times while a link is changing state. This occurs most commonly while the switch is initializing. Again, an appropriate updelay value may help. - Note that when a bonding interface has no active links, the +Note that when a bonding interface has no active links, the driver will immediately reuse the first link that goes up, even if the updelay parameter has been specified (the updelay is ignored in this case). If there are slave interfaces waiting for the updelay timeout @@ -2532,7 +2578,7 @@ value of updelay has been overestimated, and since this occurs only in cases with no connectivity, there is no additional penalty for ignoring the updelay. - In addition to the concerns about switch timings, if your +In addition to the concerns about switch timings, if your switches take a long time to go into backup mode, it may be desirable to not activate a backup interface immediately after a link goes down. Failover may be delayed via the downdelay bonding module option. @@ -2540,31 +2586,31 @@ Failover may be delayed via the downdelay bonding module option. 13.2 Duplicated Incoming Packets -------------------------------- - NOTE: Starting with version 3.0.2, the bonding driver has logic to +NOTE: Starting with version 3.0.2, the bonding driver has logic to suppress duplicate packets, which should largely eliminate this problem. 
The following description is kept for reference. - It is not uncommon to observe a short burst of duplicated +It is not uncommon to observe a short burst of duplicated traffic when the bonding device is first used, or after it has been idle for some period of time. This is most easily observed by issuing a "ping" to some other host on the network, and noticing that the output from ping flags duplicates (typically one per slave). - For example, on a bond in active-backup mode with five slaves -all connected to one switch, the output may appear as follows: - -# ping -n 10.0.4.2 -PING 10.0.4.2 (10.0.4.2) from 10.0.3.10 : 56(84) bytes of data. -64 bytes from 10.0.4.2: icmp_seq=1 ttl=64 time=13.7 ms -64 bytes from 10.0.4.2: icmp_seq=1 ttl=64 time=13.8 ms (DUP!) -64 bytes from 10.0.4.2: icmp_seq=1 ttl=64 time=13.8 ms (DUP!) -64 bytes from 10.0.4.2: icmp_seq=1 ttl=64 time=13.8 ms (DUP!) -64 bytes from 10.0.4.2: icmp_seq=1 ttl=64 time=13.8 ms (DUP!) -64 bytes from 10.0.4.2: icmp_seq=2 ttl=64 time=0.216 ms -64 bytes from 10.0.4.2: icmp_seq=3 ttl=64 time=0.267 ms -64 bytes from 10.0.4.2: icmp_seq=4 ttl=64 time=0.222 ms - - This is not due to an error in the bonding driver, rather, it +For example, on a bond in active-backup mode with five slaves +all connected to one switch, the output may appear as follows:: + + # ping -n 10.0.4.2 + PING 10.0.4.2 (10.0.4.2) from 10.0.3.10 : 56(84) bytes of data. + 64 bytes from 10.0.4.2: icmp_seq=1 ttl=64 time=13.7 ms + 64 bytes from 10.0.4.2: icmp_seq=1 ttl=64 time=13.8 ms (DUP!) + 64 bytes from 10.0.4.2: icmp_seq=1 ttl=64 time=13.8 ms (DUP!) + 64 bytes from 10.0.4.2: icmp_seq=1 ttl=64 time=13.8 ms (DUP!) + 64 bytes from 10.0.4.2: icmp_seq=1 ttl=64 time=13.8 ms (DUP!) 
+ 64 bytes from 10.0.4.2: icmp_seq=2 ttl=64 time=0.216 ms + 64 bytes from 10.0.4.2: icmp_seq=3 ttl=64 time=0.267 ms + 64 bytes from 10.0.4.2: icmp_seq=4 ttl=64 time=0.222 ms + +This is not due to an error in the bonding driver, rather, it is a side effect of how many switches update their MAC forwarding tables. Initially, the switch does not associate the MAC address in the packet with a particular switch port, and so it may send the @@ -2574,7 +2620,7 @@ single switch, when the switch (temporarily) floods the traffic to all ports, the bond device receives multiple copies of the same packet (one per slave device). - The duplicated packet behavior is switch dependent, some +The duplicated packet behavior is switch dependent, some switches exhibit this, and some do not. On switches that display this behavior, it can be induced by clearing the MAC forwarding table (on most Cisco switches, the privileged command "clear mac address-table @@ -2583,16 +2629,16 @@ dynamic" will accomplish this). 14. Hardware Specific Considerations ==================================== - This section contains additional information for configuring +This section contains additional information for configuring bonding on specific hardware platforms, or for interfacing bonding with particular switches or other devices. 14.1 IBM BladeCenter -------------------- - This applies to the JS20 and similar systems. +This applies to the JS20 and similar systems. - On the JS20 blades, the bonding driver supports only +On the JS20 blades, the bonding driver supports only balance-rr, active-backup, balance-tlb and balance-alb modes. This is largely due to the network topology inside the BladeCenter, detailed below. @@ -2600,7 +2646,7 @@ below. JS20 network adapter information -------------------------------- - All JS20s come with two Broadcom Gigabit Ethernet ports +All JS20s come with two Broadcom Gigabit Ethernet ports integrated on the planar (that's "motherboard" in IBM-speak). 
In the BladeCenter chassis, the eth0 port of all JS20 blades is hard wired to I/O Module #1; similarly, all eth1 ports are wired to I/O Module #2. @@ -2608,36 +2654,36 @@ An add-on Broadcom daughter card can be installed on a JS20 to provide two more Gigabit Ethernet ports. These ports, eth2 and eth3, are wired to I/O Modules 3 and 4, respectively. - Each I/O Module may contain either a switch or a passthrough +Each I/O Module may contain either a switch or a passthrough module (which allows ports to be directly connected to an external switch). Some bonding modes require a specific BladeCenter internal network topology in order to function; these are detailed below. - Additional BladeCenter-specific networking information can be +Additional BladeCenter-specific networking information can be found in two IBM Redbooks (www.ibm.com/redbooks): -"IBM eServer BladeCenter Networking Options" -"IBM eServer BladeCenter Layer 2-7 Network Switching" +- "IBM eServer BladeCenter Networking Options" +- "IBM eServer BladeCenter Layer 2-7 Network Switching" BladeCenter networking configuration ------------------------------------ - Because a BladeCenter can be configured in a very large number +Because a BladeCenter can be configured in a very large number of ways, this discussion will be confined to describing basic configurations. - Normally, Ethernet Switch Modules (ESMs) are used in I/O +Normally, Ethernet Switch Modules (ESMs) are used in I/O modules 1 and 2. In this configuration, the eth0 and eth1 ports of a JS20 will be connected to different internal switches (in the respective I/O modules). - A passthrough module (OPM or CPM, optical or copper, +A passthrough module (OPM or CPM, optical or copper, passthrough module) connects the I/O module directly to an external switch. By using PMs in I/O module #1 and #2, the eth0 and eth1 interfaces of a JS20 can be redirected to the outside world and connected to a common external switch. 
- Depending upon the mix of ESMs and PMs, the network will +Depending upon the mix of ESMs and PMs, the network will appear to bonding as either a single switch topology (all PMs) or as a multiple switch topology (one or more ESMs, zero or more PMs). It is also possible to connect ESMs together, resulting in a configuration @@ -2647,24 +2693,24 @@ Topology," above. Requirements for specific modes ------------------------------- - The balance-rr mode requires the use of passthrough modules +The balance-rr mode requires the use of passthrough modules for devices in the bond, all connected to an common external switch. That switch must be configured for "etherchannel" or "trunking" on the appropriate ports, as is usual for balance-rr. - The balance-alb and balance-tlb modes will function with +The balance-alb and balance-tlb modes will function with either switch modules or passthrough modules (or a mix). The only specific requirement for these modes is that all network interfaces must be able to reach all destinations for traffic sent over the bonding device (i.e., the network must converge at some point outside the BladeCenter). - The active-backup mode has no additional requirements. +The active-backup mode has no additional requirements. Link monitoring issues ---------------------- - When an Ethernet Switch Module is in place, only the ARP +When an Ethernet Switch Module is in place, only the ARP monitor will reliably detect link loss to an external switch. This is nothing unusual, but examination of the BladeCenter cabinet would suggest that the "external" network ports are the ethernet ports for @@ -2672,166 +2718,173 @@ the system, when it fact there is a switch between these "external" ports and the devices on the JS20 system itself. The MII monitor is only able to detect link failures between the ESM and the JS20 system. 
- When a passthrough module is in place, the MII monitor does +When a passthrough module is in place, the MII monitor does detect failures to the "external" port, which is then directly connected to the JS20 system. Other concerns -------------- - The Serial Over LAN (SoL) link is established over the primary +The Serial Over LAN (SoL) link is established over the primary ethernet (eth0) only, therefore, any loss of link to eth0 will result in losing your SoL connection. It will not fail over with other network traffic, as the SoL system is beyond the control of the bonding driver. - It may be desirable to disable spanning tree on the switch +It may be desirable to disable spanning tree on the switch (either the internal Ethernet Switch Module, or an external switch) to avoid fail-over delay issues when using bonding. - + 15. Frequently Asked Questions ============================== 1. Is it SMP safe? +------------------- - Yes. The old 2.0.xx channel bonding patch was not SMP safe. +Yes. The old 2.0.xx channel bonding patch was not SMP safe. The new driver was designed to be SMP safe from the start. 2. What type of cards will work with it? +----------------------------------------- - Any Ethernet type cards (you can even mix cards - a Intel +Any Ethernet type cards (you can even mix cards - a Intel EtherExpress PRO/100 and a 3com 3c905b, for example). For most modes, devices need not be of the same speed. - Starting with version 3.2.1, bonding also supports Infiniband +Starting with version 3.2.1, bonding also supports Infiniband slaves in active-backup mode. 3. How many bonding devices can I have? +---------------------------------------- - There is no limit. +There is no limit. 4. How many slaves can a bonding device have? 
+---------------------------------------------- - This is limited only by the number of network interfaces Linux +This is limited only by the number of network interfaces Linux supports and/or the number of network cards you can place in your system. 5. What happens when a slave link dies? +---------------------------------------- - If link monitoring is enabled, then the failing device will be +If link monitoring is enabled, then the failing device will be disabled. The active-backup mode will fail over to a backup link, and other modes will ignore the failed link. The link will continue to be monitored, and should it recover, it will rejoin the bond (in whatever manner is appropriate for the mode). See the sections on High Availability and the documentation for each mode for additional information. - - Link monitoring can be enabled via either the miimon or + +Link monitoring can be enabled via either the miimon or arp_interval parameters (described in the module parameters section, above). In general, miimon monitors the carrier state as sensed by the underlying network device, and the arp monitor (arp_interval) monitors connectivity to another host on the local network. - If no link monitoring is configured, the bonding driver will +If no link monitoring is configured, the bonding driver will be unable to detect link failures, and will assume that all links are always available. This will likely result in lost packets, and a resulting degradation of performance. The precise performance loss depends upon the bonding mode and network configuration. 6. Can bonding be used for High Availability? +---------------------------------------------- - Yes. See the section on High Availability for details. +Yes. See the section on High Availability for details. 7. Which switches/systems does it work with? +--------------------------------------------- - The full answer to this depends upon the desired mode. +The full answer to this depends upon the desired mode. 
- In the basic balance modes (balance-rr and balance-xor), it +In the basic balance modes (balance-rr and balance-xor), it works with any system that supports etherchannel (also called trunking). Most managed switches currently available have such support, and many unmanaged switches as well. - The advanced balance modes (balance-tlb and balance-alb) do +The advanced balance modes (balance-tlb and balance-alb) do not have special switch requirements, but do need device drivers that support specific features (described in the appropriate section under module parameters, above). - In 802.3ad mode, it works with systems that support IEEE +In 802.3ad mode, it works with systems that support IEEE 802.3ad Dynamic Link Aggregation. Most managed and many unmanaged switches currently available support 802.3ad. - The active-backup mode should work with any Layer-II switch. +The active-backup mode should work with any Layer-II switch. 8. Where does a bonding device get its MAC address from? +--------------------------------------------------------- - When using slave devices that have fixed MAC addresses, or when +When using slave devices that have fixed MAC addresses, or when the fail_over_mac option is enabled, the bonding device's MAC address is the MAC address of the active slave. - For other configurations, if not explicitly configured (with +For other configurations, if not explicitly configured (with ifconfig or ip link), the MAC address of the bonding device is taken from its first slave device. This MAC address is then passed to all following slaves and remains persistent (even if the first slave is removed) until the bonding device is brought down or reconfigured. 
- If you wish to change the MAC address, you can set it with -ifconfig or ip link: +If you wish to change the MAC address, you can set it with +ifconfig or ip link:: -# ifconfig bond0 hw ether 00:11:22:33:44:55 + # ifconfig bond0 hw ether 00:11:22:33:44:55 -# ip link set bond0 address 66:77:88:99:aa:bb + # ip link set bond0 address 66:77:88:99:aa:bb - The MAC address can be also changed by bringing down/up the -device and then changing its slaves (or their order): +The MAC address can be also changed by bringing down/up the +device and then changing its slaves (or their order):: -# ifconfig bond0 down ; modprobe -r bonding -# ifconfig bond0 .... up -# ifenslave bond0 eth... + # ifconfig bond0 down ; modprobe -r bonding + # ifconfig bond0 .... up + # ifenslave bond0 eth... - This method will automatically take the address from the next +This method will automatically take the address from the next slave that is added. - To restore your slaves' MAC addresses, you need to detach them -from the bond (`ifenslave -d bond0 eth0'). The bonding driver will +To restore your slaves' MAC addresses, you need to detach them +from the bond (``ifenslave -d bond0 eth0``). The bonding driver will then restore the MAC addresses that the slaves had before they were enslaved. 16. Resources and Links ======================= - The latest version of the bonding driver can be found in the latest +The latest version of the bonding driver can be found in the latest version of the linux kernel, found on http://kernel.org - The latest version of this document can be found in the latest kernel -source (named Documentation/networking/bonding.txt). +The latest version of this document can be found in the latest kernel +source (named Documentation/networking/bonding.rst). - Discussions regarding the usage of the bonding driver take place on the +Discussions regarding the usage of the bonding driver take place on the bonding-devel mailing list, hosted at sourceforge.net. 
If you have questions or problems, post them to the list. The list address is: bonding-devel@lists.sourceforge.net - The administrative interface (to subscribe or unsubscribe) can +The administrative interface (to subscribe or unsubscribe) can be found at: https://lists.sourceforge.net/lists/listinfo/bonding-devel - Discussions regarding the development of the bonding driver take place +Discussions regarding the development of the bonding driver take place on the main Linux network mailing list, hosted at vger.kernel.org. The list address is: netdev@vger.kernel.org - The administrative interface (to subscribe or unsubscribe) can +The administrative interface (to subscribe or unsubscribe) can be found at: http://vger.kernel.org/vger-lists.html#netdev Donald Becker's Ethernet Drivers and diag programs may be found at : - - http://web.archive.org/web/*/http://www.scyld.com/network/ + + - http://web.archive.org/web/%2A/http://www.scyld.com/network/ You will also find a lot of information regarding Ethernet, NWay, MII, etc. at www.scyld.com. - --- END -- diff --git a/Documentation/networking/caif/caif.rst b/Documentation/networking/caif/caif.rst index 07afc8063d4d..a07213030ccf 100644 --- a/Documentation/networking/caif/caif.rst +++ b/Documentation/networking/caif/caif.rst @@ -1,5 +1,3 @@ -:orphan: - .. SPDX-License-Identifier: GPL-2.0 .. include:: <isonum.txt> diff --git a/Documentation/networking/caif/index.rst b/Documentation/networking/caif/index.rst new file mode 100644 index 000000000000..86e5b7832ec3 --- /dev/null +++ b/Documentation/networking/caif/index.rst @@ -0,0 +1,13 @@ +.. SPDX-License-Identifier: GPL-2.0 + +CAIF +==== + +Contents: + +..
toctree:: + :maxdepth: 2 + + linux_caif + caif + spi_porting diff --git a/Documentation/networking/caif/Linux-CAIF.txt b/Documentation/networking/caif/linux_caif.rst index 0aa4bd381bec..a0480862ab8c 100644 --- a/Documentation/networking/caif/Linux-CAIF.txt +++ b/Documentation/networking/caif/linux_caif.rst @@ -1,12 +1,19 @@ +.. SPDX-License-Identifier: GPL-2.0 +.. include:: <isonum.txt> + +========== Linux CAIF -=========== -copyright (C) ST-Ericsson AB 2010 -Author: Sjur Brendeland/ sjur.brandeland@stericsson.com -License terms: GNU General Public License (GPL) version 2 +========== + +Copyright |copy| ST-Ericsson AB 2010 + +:Author: Sjur Brendeland/ sjur.brandeland@stericsson.com +:License terms: GNU General Public License (GPL) version 2 Introduction ------------- +============ + CAIF is a MUX protocol used by ST-Ericsson cellular modems for communication between Modem and host. The host processes can open virtual AT channels, initiate GPRS Data connections, Video channels and Utility Channels. @@ -16,13 +23,16 @@ ST-Ericsson modems support a number of transports between modem and host. Currently, UART and Loopback are available for Linux. -Architecture: ------------- +Architecture +============ + The implementation of CAIF is divided into: + * CAIF Socket Layer and GPRS IP Interface. * CAIF Core Protocol Implementation * CAIF Link Layer, implemented as NET devices. +:: RTNL ! @@ -46,12 +56,12 @@ The implementation of CAIF is divided into: -I M P L E M E N T A T I O N -=========================== +Implementation +============== CAIF Core Protocol Layer -========================================= +------------------------ CAIF Core layer implements the CAIF protocol as defined by ST-Ericsson. It implements the CAIF protocol stack in a layered approach, where @@ -59,8 +69,11 @@ each layer described in the specification is implemented as a separate layer. The architecture is inspired by the design patterns "Protocol Layer" and "Protocol Packet". 
-== CAIF structure == +CAIF structure +^^^^^^^^^^^^^^ + The Core CAIF implementation contains: + - Simple implementation of CAIF. - Layered architecture (a la Streams), each layer in the CAIF specification is implemented in a separate c-file. @@ -73,7 +86,8 @@ The Core CAIF implementation contains: to the called function (except for framing layers' receive function) Layered Architecture --------------------- +==================== + The CAIF protocol can be divided into two parts: Support functions and Protocol Implementation. The support functions include: @@ -112,7 +126,7 @@ The CAIF Protocol implementation contains: - CFSERL CAIF Serial layer. Handles concatenation/split of frames into CAIF Frames with correct length. - +:: +---------+ | Config | @@ -143,18 +157,24 @@ The CAIF Protocol implementation contains: In this layered approach the following "rules" apply. + - All layers embed the same structure "struct cflayer" - A layer does not depend on any other layer's private data. - - Layers are stacked by setting the pointers + - Layers are stacked by setting the pointers:: + layer->up , layer->dn - - In order to send data upwards, each layer should do + + - In order to send data upwards, each layer should do:: + layer->up->receive(layer->up, packet); - - In order to send data downwards, each layer should do + + - In order to send data downwards, each layer should do:: + layer->dn->transmit(layer->dn, packet); CAIF Socket and IP interface -=========================== +============================ The IP interface and CAIF socket API are implemented on top of the CAIF Core protocol. The IP Interface and CAIF socket have an instance of diff --git a/Documentation/networking/caif/spi_porting.rst b/Documentation/networking/caif/spi_porting.rst new file mode 100644 index 000000000000..d49f874b20ac --- /dev/null +++ b/Documentation/networking/caif/spi_porting.rst @@ -0,0 +1,229 @@ +.. 
SPDX-License-Identifier: GPL-2.0 + +================ +CAIF SPI porting +================ + +CAIF SPI basics +=============== + +Running CAIF over SPI needs some extra setup, owing to the nature of SPI. +Two extra GPIOs have been added in order to negotiate the transfers +between the master and the slave. The minimum requirement for running +CAIF over SPI is a SPI slave chip and two GPIOs (more details below). +Please note that running as a slave implies that you need to keep up +with the master clock. An overrun or underrun event is fatal. + +CAIF SPI framework +================== + +To make porting as easy as possible, the CAIF SPI has been divided in +two parts. The first part (called the interface part) deals with all +generic functionality such as length framing, SPI frame negotiation +and SPI frame delivery and transmission. The other part is the CAIF +SPI slave device part, which is the module that you have to write if +you want to run SPI CAIF on a new hardware. This part takes care of +the physical hardware, both with regard to SPI and to GPIOs. + +- Implementing a CAIF SPI device: + + - Functionality provided by the CAIF SPI slave device: + + In order to implement a SPI device you will, as a minimum, + need to implement the following + functions: + + :: + + int (*init_xfer) (struct cfspi_xfer * xfer, struct cfspi_dev *dev): + + This function is called by the CAIF SPI interface to give + you a chance to set up your hardware to be ready to receive + a stream of data from the master. The xfer structure contains + both physical and logical addresses, as well as the total length + of the transfer in both directions. The dev parameter can be used + to map to different CAIF SPI slave devices. + + :: + + void (*sig_xfer) (bool xfer, struct cfspi_dev *dev): + + This function is called by the CAIF SPI interface when the output + (SPI_INT) GPIO needs to change state.
The boolean value of the xfer + variable indicates whether the GPIO should be asserted (HIGH) or + deasserted (LOW). The dev parameter can be used to map to different CAIF + SPI slave devices. + + - Functionality provided by the CAIF SPI interface: + + :: + + void (*ss_cb) (bool assert, struct cfspi_ifc *ifc); + + This function is called by the CAIF SPI slave device in order to + signal a change of state of the input GPIO (SS) to the interface. + Only active edges are mandatory to be reported. + This function can be called from IRQ context (recommended in order + not to introduce latency). The ifc parameter should be the pointer + returned from the platform probe function in the SPI device structure. + + :: + + void (*xfer_done_cb) (struct cfspi_ifc *ifc); + + This function is called by the CAIF SPI slave device in order to + report that a transfer is completed. This function should only be + called once both the transmission and the reception are completed. + This function can be called from IRQ context (recommended in order + not to introduce latency). The ifc parameter should be the pointer + returned from the platform probe function in the SPI device structure. + + - Connecting the bits and pieces: + + - Filling in the SPI slave device structure: + + Connect the necessary callback functions. + + Indicate clock speed (used to calculate toggle delays). + + Choose a suitable name (helps debugging if you use several CAIF + SPI slave devices). + + Assign your private data (can be used to map to your + structure). + + - Filling in the SPI slave platform device structure: + + Add name of driver to connect to ("cfspi_sspi"). + + Assign the SPI slave device structure as platform data. + +Padding +======= + +In order to optimize throughput, a number of SPI padding options are provided. +Padding can be enabled independently for uplink and downlink transfers. +Padding can be enabled for the head, the tail and for the total frame size.
+The padding needs to be correctly configured on both sides of the link. +The padding can be changed via module parameters in cfspi_sspi.c or via +the sysfs directory of the cfspi_sspi driver (before device registration). + +- CAIF SPI device template:: + + /* + * Copyright (C) ST-Ericsson AB 2010 + * Author: Daniel Martensson / Daniel.Martensson@stericsson.com + * License terms: GNU General Public License (GPL), version 2. + * + */ + + #include <linux/init.h> + #include <linux/module.h> + #include <linux/device.h> + #include <linux/wait.h> + #include <linux/interrupt.h> + #include <linux/dma-mapping.h> + #include <net/caif/caif_spi.h> + + MODULE_LICENSE("GPL"); + + struct sspi_struct { + struct cfspi_dev sdev; + struct cfspi_xfer *xfer; + }; + + static struct sspi_struct slave; + static struct platform_device slave_device; + + static irqreturn_t sspi_irq(int irq, void *arg) + { + /* You only need to trigger on an edge to the active state of the + * SS signal. Once a edge is detected, the ss_cb() function should be + * called with the parameter assert set to true. It is OK + * (and even advised) to call the ss_cb() function in IRQ context in + * order not to add any delay. */ + + return IRQ_HANDLED; + } + + static void sspi_complete(void *context) + { + /* Normally the DMA or the SPI framework will call you back + * in something similar to this. The only thing you need to + * do is to call the xfer_done_cb() function, providing the pointer + * to the CAIF SPI interface. It is OK to call this function + * from IRQ context. */ + } + + static int sspi_init_xfer(struct cfspi_xfer *xfer, struct cfspi_dev *dev) + { + /* Store transfer info. For a normal implementation you should + * set up your DMA here and make sure that you are ready to + * receive the data from the master SPI. 
*/ + + struct sspi_struct *sspi = (struct sspi_struct *)dev->priv; + + sspi->xfer = xfer; + + return 0; + } + + void sspi_sig_xfer(bool xfer, struct cfspi_dev *dev) + { + /* If xfer is true then you should assert the SPI_INT to indicate to + * the master that you are ready to receive the data from the master + * SPI. If xfer is false then you should de-assert SPI_INT to indicate + * that the transfer is done. + */ + + struct sspi_struct *sspi = (struct sspi_struct *)dev->priv; + } + + static void sspi_release(struct device *dev) + { + /* + * Here you should release your SPI device resources. + */ + } + + static int __init sspi_init(void) + { + /* Here you should initialize your SPI device by providing the + * necessary functions, clock speed, name and private data. Once + * done, you can register your device with the + * platform_device_register() function. This function will return + * with the CAIF SPI interface initialized. This is probably also + * the place where you should set up your GPIOs, interrupts and SPI + * resources. */ + + int res = 0; + + /* Initialize slave device. */ + slave.sdev.init_xfer = sspi_init_xfer; + slave.sdev.sig_xfer = sspi_sig_xfer; + slave.sdev.clk_mhz = 13; + slave.sdev.priv = &slave; + slave.sdev.name = "spi_sspi"; + slave_device.dev.release = sspi_release; + + /* Initialize platform device. */ + slave_device.name = "cfspi_sspi"; + slave_device.dev.platform_data = &slave.sdev; + + /* Register platform device. 
*/ + res = platform_device_register(&slave_device); + if (res) { + printk(KERN_WARNING "sspi_init: failed to register dev.\n"); + return -ENODEV; + } + + return res; + } + + static void __exit sspi_exit(void) + { + platform_device_del(&slave_device); + } + + module_init(sspi_init); + module_exit(sspi_exit); diff --git a/Documentation/networking/caif/spi_porting.txt b/Documentation/networking/caif/spi_porting.txt deleted file mode 100644 index 9efd0687dc4c..000000000000 --- a/Documentation/networking/caif/spi_porting.txt +++ /dev/null @@ -1,208 +0,0 @@ -- CAIF SPI porting - - -- CAIF SPI basics: - -Running CAIF over SPI needs some extra setup, owing to the nature of SPI. -Two extra GPIOs have been added in order to negotiate the transfers - between the master and the slave. The minimum requirement for running -CAIF over SPI is a SPI slave chip and two GPIOs (more details below). -Please note that running as a slave implies that you need to keep up -with the master clock. An overrun or underrun event is fatal. - -- CAIF SPI framework: - -To make porting as easy as possible, the CAIF SPI has been divided in -two parts. The first part (called the interface part) deals with all -generic functionality such as length framing, SPI frame negotiation -and SPI frame delivery and transmission. The other part is the CAIF -SPI slave device part, which is the module that you have to write if -you want to run SPI CAIF on a new hardware. This part takes care of -the physical hardware, both with regard to SPI and to GPIOs. - -- Implementing a CAIF SPI device: - - - Functionality provided by the CAIF SPI slave device: - - In order to implement a SPI device you will, as a minimum, - need to implement the following - functions: - - int (*init_xfer) (struct cfspi_xfer * xfer, struct cfspi_dev *dev): - - This function is called by the CAIF SPI interface to give - you a chance to set up your hardware to be ready to receive - a stream of data from the master. 
The xfer structure contains - both physical and logical addresses, as well as the total length - of the transfer in both directions.The dev parameter can be used - to map to different CAIF SPI slave devices. - - void (*sig_xfer) (bool xfer, struct cfspi_dev *dev): - - This function is called by the CAIF SPI interface when the output - (SPI_INT) GPIO needs to change state. The boolean value of the xfer - variable indicates whether the GPIO should be asserted (HIGH) or - deasserted (LOW). The dev parameter can be used to map to different CAIF - SPI slave devices. - - - Functionality provided by the CAIF SPI interface: - - void (*ss_cb) (bool assert, struct cfspi_ifc *ifc); - - This function is called by the CAIF SPI slave device in order to - signal a change of state of the input GPIO (SS) to the interface. - Only active edges are mandatory to be reported. - This function can be called from IRQ context (recommended in order - not to introduce latency). The ifc parameter should be the pointer - returned from the platform probe function in the SPI device structure. - - void (*xfer_done_cb) (struct cfspi_ifc *ifc); - - This function is called by the CAIF SPI slave device in order to - report that a transfer is completed. This function should only be - called once both the transmission and the reception are completed. - This function can be called from IRQ context (recommended in order - not to introduce latency). The ifc parameter should be the pointer - returned from the platform probe function in the SPI device structure. - - - Connecting the bits and pieces: - - - Filling in the SPI slave device structure: - - Connect the necessary callback functions. - Indicate clock speed (used to calculate toggle delays). - Chose a suitable name (helps debugging if you use several CAIF - SPI slave devices). - Assign your private data (can be used to map to your structure). - - - Filling in the SPI slave platform device structure: - Add name of driver to connect to ("cfspi_sspi"). 
- Assign the SPI slave device structure as platform data. - -- Padding: - -In order to optimize throughput, a number of SPI padding options are provided. -Padding can be enabled independently for uplink and downlink transfers. -Padding can be enabled for the head, the tail and for the total frame size. -The padding needs to be correctly configured on both sides of the link. -The padding can be changed via module parameters in cfspi_sspi.c or via -the sysfs directory of the cfspi_sspi driver (before device registration). - -- CAIF SPI device template: - -/* - * Copyright (C) ST-Ericsson AB 2010 - * Author: Daniel Martensson / Daniel.Martensson@stericsson.com - * License terms: GNU General Public License (GPL), version 2. - * - */ - -#include <linux/init.h> -#include <linux/module.h> -#include <linux/device.h> -#include <linux/wait.h> -#include <linux/interrupt.h> -#include <linux/dma-mapping.h> -#include <net/caif/caif_spi.h> - -MODULE_LICENSE("GPL"); - -struct sspi_struct { - struct cfspi_dev sdev; - struct cfspi_xfer *xfer; -}; - -static struct sspi_struct slave; -static struct platform_device slave_device; - -static irqreturn_t sspi_irq(int irq, void *arg) -{ - /* You only need to trigger on an edge to the active state of the - * SS signal. Once a edge is detected, the ss_cb() function should be - * called with the parameter assert set to true. It is OK - * (and even advised) to call the ss_cb() function in IRQ context in - * order not to add any delay. */ - - return IRQ_HANDLED; -} - -static void sspi_complete(void *context) -{ - /* Normally the DMA or the SPI framework will call you back - * in something similar to this. The only thing you need to - * do is to call the xfer_done_cb() function, providing the pointer - * to the CAIF SPI interface. It is OK to call this function - * from IRQ context. */ -} - -static int sspi_init_xfer(struct cfspi_xfer *xfer, struct cfspi_dev *dev) -{ - /* Store transfer info. 
For a normal implementation you should - * set up your DMA here and make sure that you are ready to - * receive the data from the master SPI. */ - - struct sspi_struct *sspi = (struct sspi_struct *)dev->priv; - - sspi->xfer = xfer; - - return 0; -} - -void sspi_sig_xfer(bool xfer, struct cfspi_dev *dev) -{ - /* If xfer is true then you should assert the SPI_INT to indicate to - * the master that you are ready to receive the data from the master - * SPI. If xfer is false then you should de-assert SPI_INT to indicate - * that the transfer is done. - */ - - struct sspi_struct *sspi = (struct sspi_struct *)dev->priv; -} - -static void sspi_release(struct device *dev) -{ - /* - * Here you should release your SPI device resources. - */ -} - -static int __init sspi_init(void) -{ - /* Here you should initialize your SPI device by providing the - * necessary functions, clock speed, name and private data. Once - * done, you can register your device with the - * platform_device_register() function. This function will return - * with the CAIF SPI interface initialized. This is probably also - * the place where you should set up your GPIOs, interrupts and SPI - * resources. */ - - int res = 0; - - /* Initialize slave device. */ - slave.sdev.init_xfer = sspi_init_xfer; - slave.sdev.sig_xfer = sspi_sig_xfer; - slave.sdev.clk_mhz = 13; - slave.sdev.priv = &slave; - slave.sdev.name = "spi_sspi"; - slave_device.dev.release = sspi_release; - - /* Initialize platform device. */ - slave_device.name = "cfspi_sspi"; - slave_device.dev.platform_data = &slave.sdev; - - /* Register platform device. 
*/ - res = platform_device_register(&slave_device); - if (res) { - printk(KERN_WARNING "sspi_init: failed to register dev.\n"); - return -ENODEV; - } - - return res; -} - -static void __exit sspi_exit(void) -{ - platform_device_del(&slave_device); -} - -module_init(sspi_init); -module_exit(sspi_exit); diff --git a/Documentation/networking/cdc_mbim.txt b/Documentation/networking/cdc_mbim.rst index 4e68f0bc5dba..0048409c06b4 100644 --- a/Documentation/networking/cdc_mbim.txt +++ b/Documentation/networking/cdc_mbim.rst @@ -1,5 +1,8 @@ - cdc_mbim - Driver for CDC MBIM Mobile Broadband modems - ======================================================== +.. SPDX-License-Identifier: GPL-2.0 + +====================================================== +cdc_mbim - Driver for CDC MBIM Mobile Broadband modems +====================================================== The cdc_mbim driver supports USB devices conforming to the "Universal Serial Bus Communications Class Subclass Specification for Mobile @@ -19,9 +22,9 @@ by a cdc_ncm driver parameter: prefer_mbim ----------- -Type: Boolean -Valid Range: N/Y (0-1) -Default Value: Y (MBIM is preferred) +:Type: Boolean +:Valid Range: N/Y (0-1) +:Default Value: Y (MBIM is preferred) This parameter sets the system policy for NCM/MBIM functions. Such functions will be handled by either the cdc_ncm driver or the cdc_mbim @@ -44,11 +47,13 @@ userspace MBIM management application always is required to enable a MBIM function. Such userspace applications includes, but are not limited to: + - mbimcli (included with the libmbim [3] library), and - ModemManager [4] Establishing a MBIM IP session reequires at least these actions by the management application: + - open the control channel - configure network connection settings - connect to network @@ -76,7 +81,7 @@ complies with all the control channel requirements in [1]. The cdc-wdmX device is created as a child of the MBIM control interface USB device. 
The character device associated with a specific -MBIM function can be looked up using sysfs. For example: +MBIM function can be looked up using sysfs. For example:: bjorn@nemi:~$ ls /sys/bus/usb/drivers/cdc_mbim/2-4:2.12/usbmisc cdc-wdm0 @@ -119,13 +124,15 @@ negotiated control message size. /dev/cdc-wdmX ioctl() --------------------- +--------------------- IOCTL_WDM_MAX_COMMAND: Get Maximum Command Size This ioctl returns the wMaxControlMessage field of the CDC MBIM functional descriptor for MBIM devices. This is intended as a convenience, eliminating the need to parse the USB descriptors from userspace. +:: + #include <stdio.h> #include <fcntl.h> #include <sys/ioctl.h> @@ -178,7 +185,7 @@ VLAN links prior to establishing MBIM IP sessions where the SessionId is greater than 0. These links can be added by using the normal VLAN kernel interfaces, either ioctl or netlink. -For example, adding a link for a MBIM IP session with SessionId 3: +For example, adding a link for a MBIM IP session with SessionId 3:: ip link add link wwan0 name wwan0.3 type vlan id 3 @@ -207,6 +214,7 @@ the stream to the end user in an appropriate way for the stream type. The network device ABI requires a dummy ethernet header for every DSS data frame being transported. The contents of this header is arbitrary, with the following exceptions: + - TX frames using an IP protocol (0x0800 or 0x86dd) will be dropped - RX frames will have the protocol field set to ETH_P_802_3 (but will not be properly formatted 802.3 frames) @@ -218,7 +226,7 @@ adding the dummy ethernet header on TX and stripping it on RX. This is a simple example using tools commonly available, exporting DssSessionId 5 as a pty character device pointed to by a /dev/nmea -symlink: +symlink:: ip link add link wwan0 name wwan0.dss5 type vlan id 261 ip link set dev wwan0.dss5 up @@ -236,7 +244,7 @@ map frames to the correct DSS session and adding 18 byte VLAN ethernet headers with the appropriate tag on TX. 
In this case using a socket filter is recommended, matching only the DSS VLAN subset. This avoid unnecessary copying of unrelated IP session data to userspace. For -example: +example:: static struct sock_filter dssfilter[] = { /* use special negative offsets to get VLAN tag */ @@ -249,11 +257,11 @@ example: BPF_JUMP(BPF_JMP|BPF_JGE|BPF_K, 512, 3, 0), /* 511 is last DSS VLAN */ /* verify ethertype */ - BPF_STMT(BPF_LD|BPF_H|BPF_ABS, 2 * ETH_ALEN), - BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, ETH_P_802_3, 0, 1), + BPF_STMT(BPF_LD|BPF_H|BPF_ABS, 2 * ETH_ALEN), + BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, ETH_P_802_3, 0, 1), - BPF_STMT(BPF_RET|BPF_K, (u_int)-1), /* accept */ - BPF_STMT(BPF_RET|BPF_K, 0), /* ignore */ + BPF_STMT(BPF_RET|BPF_K, (u_int)-1), /* accept */ + BPF_STMT(BPF_RET|BPF_K, 0), /* ignore */ }; @@ -266,6 +274,7 @@ network device. This mapping implies a few restrictions on multiplexed IPS and DSS sessions, which may not always be practical: + - no IPS or DSS session can use a frame size greater than the MTU on IP session 0 - no IPS or DSS session can be in the up state unless the network @@ -280,7 +289,7 @@ device. Tip: It might be less confusing to the end user to name this VLAN subdevice after the MBIM SessionID instead of the VLAN ID. For -example: +example:: ip link add link wwan0 name wwan0.0 type vlan id 4094 @@ -290,7 +299,7 @@ VLAN mapping Summarizing the cdc_mbim driver mapping described above, we have this relationship between VLAN tags on the wwanY network device and MBIM -sessions on the shared USB data channel: +sessions on the shared USB data channel:: VLAN ID MBIM type MBIM SessionID Notes --------------------------------------------------------- @@ -310,30 +319,37 @@ sessions on the shared USB data channel: References ========== -[1] USB Implementers Forum, Inc. - "Universal Serial Bus - Communications Class Subclass Specification for Mobile Broadband - Interface Model", Revision 1.0 (Errata 1), May 1, 2013 + 1) USB Implementers Forum, Inc. 
- "Universal Serial Bus + Communications Class Subclass Specification for Mobile Broadband + Interface Model", Revision 1.0 (Errata 1), May 1, 2013 + - http://www.usb.org/developers/docs/devclass_docs/ -[2] USB Implementers Forum, Inc. - "Universal Serial Bus - Communications Class Subclass Specifications for Network Control - Model Devices", Revision 1.0 (Errata 1), November 24, 2010 + 2) USB Implementers Forum, Inc. - "Universal Serial Bus + Communications Class Subclass Specifications for Network Control + Model Devices", Revision 1.0 (Errata 1), November 24, 2010 + - http://www.usb.org/developers/docs/devclass_docs/ -[3] libmbim - "a glib-based library for talking to WWAN modems and - devices which speak the Mobile Interface Broadband Model (MBIM) - protocol" + 3) libmbim - "a glib-based library for talking to WWAN modems and + devices which speak the Mobile Interface Broadband Model (MBIM) + protocol" + - http://www.freedesktop.org/wiki/Software/libmbim/ -[4] ModemManager - "a DBus-activated daemon which controls mobile - broadband (2G/3G/4G) devices and connections" + 4) ModemManager - "a DBus-activated daemon which controls mobile + broadband (2G/3G/4G) devices and connections" + - http://www.freedesktop.org/wiki/Software/ModemManager/ -[5] "MBIM (Mobile Broadband Interface Model) Registry" + 5) "MBIM (Mobile Broadband Interface Model) Registry" + - http://compliance.usb.org/mbim/ -[6] "/sys/kernel/debug/usb/devices output format" + 6) "/sys/kernel/debug/usb/devices output format" + - Documentation/driver-api/usb/usb.rst -[7] "/sys/bus/usb/devices/.../descriptors" + 7) "/sys/bus/usb/devices/.../descriptors" + - Documentation/ABI/stable/sysfs-bus-usb diff --git a/Documentation/networking/cops.rst b/Documentation/networking/cops.rst new file mode 100644 index 000000000000..964ba80599a9 --- /dev/null +++ b/Documentation/networking/cops.rst @@ -0,0 +1,80 @@ +.. 
SPDX-License-Identifier: GPL-2.0 + +======================================== +The COPS LocalTalk Linux driver (cops.c) +======================================== + +By Jay Schulist <jschlst@samba.org> + +This driver has two modes and they are: Dayna mode and Tangent mode. +Each mode corresponds with the type of card. It has been found +that there are 2 main types of cards and all other cards are +the same and just have different names or only have minor differences +such as more IO ports. As this driver is tested it will +become more clear exactly what cards are supported. + +Right now these cards are known to work with the COPS driver. The +LT-200 cards work in a somewhat more limited capacity than the +DL200 cards, which work very well and are in use by many people. + +TANGENT driver mode: + - Tangent ATB-II, Novell NL-1000, Daystar Digital LT-200 + +DAYNA driver mode: + - Dayna DL2000/DaynaTalk PC (Half Length), COPS LT-95, + - Farallon PhoneNET PC III, Farallon PhoneNET PC II + +Other cards possibly supported mode unknown though: + - Dayna DL2000 (Full length) + +The COPS driver defaults to using Dayna mode. To change the driver's +mode if you built a driver with dual support use board_type=1 or +board_type=2 for Dayna or Tangent with insmod. + +Operation/loading of the driver +=============================== + +Use modprobe like this: /sbin/modprobe cops.o (IO #) (IRQ #) +If you do not specify any options the driver will try and use the IO = 0x240, +IRQ = 5. As of right now I would only use IRQ 5 for the card, if autoprobing. + +To load multiple COPS driver Localtalk cards you can do one of the following:: + + insmod cops io=0x240 irq=5 + insmod -o cops2 cops io=0x260 irq=3 + +Or in lilo.conf put something like this:: + + append="ether=5,0x240,lt0 ether=3,0x260,lt1" + +Then bring up the interface with ifconfig. 
It will look something like this:: + + lt0 Link encap:UNSPEC HWaddr 00-00-00-00-00-00-00-F7-00-00-00-00-00-00-00-00 + inet addr:192.168.1.2 Bcast:192.168.1.255 Mask:255.255.255.0 + UP BROADCAST RUNNING NOARP MULTICAST MTU:600 Metric:1 + RX packets:0 errors:0 dropped:0 overruns:0 frame:0 + TX packets:0 errors:0 dropped:0 overruns:0 carrier:0 coll:0 + +Netatalk Configuration +====================== + +You will need to configure atalkd with something like the following to make +it work with the cops.c driver. + +* For single LTalk card use:: + + dummy -seed -phase 2 -net 2000 -addr 2000.10 -zone "1033" + lt0 -seed -phase 1 -net 1000 -addr 1000.50 -zone "1033" + +* For multiple cards, Ethernet and LocalTalk:: + + eth0 -seed -phase 2 -net 3000 -addr 3000.20 -zone "1033" + lt0 -seed -phase 1 -net 1000 -addr 1000.50 -zone "1033" + +* For multiple LocalTalk cards, and an Ethernet card. + +* Order seems to matter here, Ethernet last:: + + lt0 -seed -phase 1 -net 1000 -addr 1000.10 -zone "LocalTalk1" + lt1 -seed -phase 1 -net 2000 -addr 2000.20 -zone "LocalTalk2" + eth0 -seed -phase 2 -net 3000 -addr 3000.30 -zone "EtherTalk" diff --git a/Documentation/networking/cops.txt b/Documentation/networking/cops.txt deleted file mode 100644 index 3e344b448e07..000000000000 --- a/Documentation/networking/cops.txt +++ /dev/null @@ -1,63 +0,0 @@ -Text File for the COPS LocalTalk Linux driver (cops.c). - By Jay Schulist <jschlst@samba.org> - -This driver has two modes and they are: Dayna mode and Tangent mode. -Each mode corresponds with the type of card. It has been found -that there are 2 main types of cards and all other cards are -the same and just have different names or only have minor differences -such as more IO ports. As this driver is tested it will -become more clear exactly what cards are supported. - -Right now these cards are known to work with the COPS driver. 
The -LT-200 cards work in a somewhat more limited capacity than the -DL200 cards, which work very well and are in use by many people. - -TANGENT driver mode: - Tangent ATB-II, Novell NL-1000, Daystar Digital LT-200 -DAYNA driver mode: - Dayna DL2000/DaynaTalk PC (Half Length), COPS LT-95, - Farallon PhoneNET PC III, Farallon PhoneNET PC II -Other cards possibly supported mode unknown though: - Dayna DL2000 (Full length) - -The COPS driver defaults to using Dayna mode. To change the driver's -mode if you built a driver with dual support use board_type=1 or -board_type=2 for Dayna or Tangent with insmod. - -** Operation/loading of the driver. -Use modprobe like this: /sbin/modprobe cops.o (IO #) (IRQ #) -If you do not specify any options the driver will try and use the IO = 0x240, -IRQ = 5. As of right now I would only use IRQ 5 for the card, if autoprobing. - -To load multiple COPS driver Localtalk cards you can do one of the following. - -insmod cops io=0x240 irq=5 -insmod -o cops2 cops io=0x260 irq=3 - -Or in lilo.conf put something like this: - append="ether=5,0x240,lt0 ether=3,0x260,lt1" - -Then bring up the interface with ifconfig. It will look something like this: -lt0 Link encap:UNSPEC HWaddr 00-00-00-00-00-00-00-F7-00-00-00-00-00-00-00-00 - inet addr:192.168.1.2 Bcast:192.168.1.255 Mask:255.255.255.0 - UP BROADCAST RUNNING NOARP MULTICAST MTU:600 Metric:1 - RX packets:0 errors:0 dropped:0 overruns:0 frame:0 - TX packets:0 errors:0 dropped:0 overruns:0 carrier:0 coll:0 - -** Netatalk Configuration -You will need to configure atalkd with something like the following to make -it work with the cops.c driver. - -* For single LTalk card use. -dummy -seed -phase 2 -net 2000 -addr 2000.10 -zone "1033" -lt0 -seed -phase 1 -net 1000 -addr 1000.50 -zone "1033" - -* For multiple cards, Ethernet and LocalTalk. 
-eth0 -seed -phase 2 -net 3000 -addr 3000.20 -zone "1033" -lt0 -seed -phase 1 -net 1000 -addr 1000.50 -zone "1033" - -* For multiple LocalTalk cards, and an Ethernet card. -* Order seems to matter here, Ethernet last. -lt0 -seed -phase 1 -net 1000 -addr 1000.10 -zone "LocalTalk1" -lt1 -seed -phase 1 -net 2000 -addr 2000.20 -zone "LocalTalk2" -eth0 -seed -phase 2 -net 3000 -addr 3000.30 -zone "EtherTalk" diff --git a/Documentation/networking/cxacru.txt b/Documentation/networking/cxacru.rst index 2cce04457b4d..6088af2ffeda 100644 --- a/Documentation/networking/cxacru.txt +++ b/Documentation/networking/cxacru.rst @@ -1,3 +1,9 @@ +.. SPDX-License-Identifier: GPL-2.0 + +======================== +ATM cxacru device driver +======================== + Firmware is required for this device: http://accessrunner.sourceforge.net/ While it is capable of managing/maintaining the ADSL connection without the @@ -19,29 +25,35 @@ several sysfs attribute files for retrieving device statistics: * adsl_headend * adsl_headend_environment - Information about the remote headend. + + - Information about the remote headend. * adsl_config - Configuration writing interface. - Write parameters in hexadecimal format <index>=<value>, - separated by whitespace, e.g.: + + - Configuration writing interface. + - Write parameters in hexadecimal format <index>=<value>, + separated by whitespace, e.g.: + "1=0 a=5" - Up to 7 parameters at a time will be sent and the modem will restart - the ADSL connection when any value is set. These are logged for future - reference. + + - Up to 7 parameters at a time will be sent and the modem will restart + the ADSL connection when any value is set. These are logged for future + reference. * downstream_attenuation (dB) * downstream_bits_per_frame * downstream_rate (kbps) * downstream_snr_margin (dB) - Downstream stats. + + - Downstream stats. 
* upstream_attenuation (dB) * upstream_bits_per_frame * upstream_rate (kbps) * upstream_snr_margin (dB) * transmitter_power (dBm/Hz) - Upstream stats. + + - Upstream stats. * downstream_crc_errors * downstream_fec_errors @@ -49,48 +61,56 @@ several sysfs attribute files for retrieving device statistics: * upstream_crc_errors * upstream_fec_errors * upstream_hec_errors - Error counts. + + - Error counts. * line_startable - Indicates that ADSL support on the device - is/can be enabled, see adsl_start. + + - Indicates that ADSL support on the device + is/can be enabled, see adsl_start. * line_status - "initialising" - "down" - "attempting to activate" - "training" - "channel analysis" - "exchange" - "waiting" - "up" + + - "initialising" + - "down" + - "attempting to activate" + - "training" + - "channel analysis" + - "exchange" + - "waiting" + - "up" Changes between "down" and "attempting to activate" if there is no signal. * link_status - "not connected" - "connected" - "lost" + + - "not connected" + - "connected" + - "lost" * mac_address * modulation - "" (when not connected) - "ANSI T1.413" - "ITU-T G.992.1 (G.DMT)" - "ITU-T G.992.2 (G.LITE)" + + - "" (when not connected) + - "ANSI T1.413" + - "ITU-T G.992.1 (G.DMT)" + - "ITU-T G.992.2 (G.LITE)" * startup_attempts - Count of total attempts to initialise ADSL. + + - Count of total attempts to initialise ADSL. 
To enable/disable ADSL, the following can be written to the adsl_state file: - "start" - "stop - "restart" (stops, waits 1.5s, then starts) - "poll" (used to resume status polling if it was disabled due to failure) -Changes in adsl/line state are reported via kernel log messages: + - "start" + - "stop" + - "restart" (stops, waits 1.5s, then starts) + - "poll" (used to resume status polling if it was disabled due to failure) + +Changes in adsl/line state are reported via kernel log messages:: + [4942145.150704] ATM dev 0: ADSL state: running [4942243.663766] ATM dev 0: ADSL line: down [4942249.665075] ATM dev 0: ADSL line: attempting to activate diff --git a/Documentation/networking/dccp.txt b/Documentation/networking/dccp.rst index 55c575fcaf17..dde16be04456 100644 --- a/Documentation/networking/dccp.txt +++ b/Documentation/networking/dccp.rst @@ -1,16 +1,18 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============= DCCP protocol ============= -Contents -======== -- Introduction -- Missing features -- Socket options -- Sysctl variables -- IOCTLs -- Other tunables -- Notes +.. Contents + - Introduction + - Missing features + - Socket options + - Sysctl variables + - IOCTLs + - Other tunables + - Notes Introduction @@ -38,6 +40,7 @@ The Linux DCCP implementation does not currently support all the features that a specified in RFCs 4340...42. The known bugs are at: + http://www.linuxfoundation.org/collaborate/workgroups/networking/todo#DCCP For more up-to-date versions of the DCCP implementation, please consider using @@ -54,7 +57,8 @@ defined: the "simple" policy (DCCPQ_POLICY_SIMPLE), which does nothing special, and a priority-based variant (DCCPQ_POLICY_PRIO). The latter allows to pass an u32 priority value as ancillary data to sendmsg(), where higher numbers indicate a higher packet priority (similar to SO_PRIORITY). 
This ancillary data needs to -be formatted using a cmsg(3) message header filled in as follows: +be formatted using a cmsg(3) message header filled in as follows:: + cmsg->cmsg_level = SOL_DCCP; cmsg->cmsg_type = DCCP_SCM_PRIORITY; cmsg->cmsg_len = CMSG_LEN(sizeof(uint32_t)); /* or CMSG_LEN(4) */ @@ -94,7 +98,7 @@ must be registered on the socket before calling connect() or listen(). DCCP_SOCKOPT_TX_CCID is read/write. It returns the current CCID (if set) or sets the preference list for the TX CCID, using the same format as DCCP_SOCKOPT_CCID. -Please note that the getsockopt argument type here is `int', not uint8_t. +Please note that the getsockopt argument type here is ``int``, not uint8_t. DCCP_SOCKOPT_RX_CCID is analogous to DCCP_SOCKOPT_TX_CCID, but for the RX CCID. @@ -113,6 +117,7 @@ be enabled at the receiver, too with suitable choice of CsCov. DCCP_SOCKOPT_SEND_CSCOV sets the sender checksum coverage. Values in the range 0..15 are acceptable. The default setting is 0 (full coverage), values between 1..15 indicate partial coverage. + DCCP_SOCKOPT_RECV_CSCOV is for the receiver and has a different meaning: it sets a threshold, where again values 0..15 are acceptable. The default of 0 means that all packets with a partial coverage will be discarded. @@ -123,11 +128,13 @@ DCCP_SOCKOPT_RECV_CSCOV is for the receiver and has a different meaning: it The following two options apply to CCID 3 exclusively and are getsockopt()-only. In either case, a TFRC info struct (defined in <linux/tfrc.h>) is returned. + DCCP_SOCKOPT_CCID_RX_INFO - Returns a `struct tfrc_rx_info' in optval; the buffer for optval and + Returns a ``struct tfrc_rx_info`` in optval; the buffer for optval and optlen must be set to at least sizeof(struct tfrc_rx_info). + DCCP_SOCKOPT_CCID_TX_INFO - Returns a `struct tfrc_tx_info' in optval; the buffer for optval and + Returns a ``struct tfrc_tx_info`` in optval; the buffer for optval and optlen must be set to at least sizeof(struct tfrc_tx_info). 
On unidirectional connections it is useful to close the unused half-connection @@ -182,7 +189,7 @@ sync_ratelimit = 125 ms IOCTLS ====== FIONREAD - Works as in udp(7): returns in the `int' argument pointer the size of + Works as in udp(7): returns in the ``int`` argument pointer the size of the next pending datagram in bytes, or 0 when no datagram is pending. @@ -191,10 +198,12 @@ Other tunables Per-route rto_min support CCID-2 supports the RTAX_RTO_MIN per-route setting for the minimum value of the RTO timer. This setting can be modified via the 'rto_min' option - of iproute2; for example: + of iproute2; for example:: + > ip route change 10.0.0.0/24 rto_min 250j dev wlan0 > ip route add 10.0.0.254/32 rto_min 800j dev wlan0 > ip route show dev wlan0 + CCID-3 also supports the rto_min setting: it is used to define the lower bound for the expiry of the nofeedback timer. This can be useful on LANs with very low RTTs (e.g., loopback, Gbit ethernet). diff --git a/Documentation/networking/dctcp.txt b/Documentation/networking/dctcp.rst index 13a857753208..4cc8bb2dad50 100644 --- a/Documentation/networking/dctcp.txt +++ b/Documentation/networking/dctcp.rst @@ -1,11 +1,14 @@ +.. SPDX-License-Identifier: GPL-2.0 + +====================== DCTCP (DataCenter TCP) ----------------------- +====================== DCTCP is an enhancement to the TCP congestion control algorithm for data center networks and leverages Explicit Congestion Notification (ECN) in the data center network to provide multi-bit feedback to the end hosts. -To enable it on end hosts: +To enable it on end hosts:: sysctl -w net.ipv4.tcp_congestion_control=dctcp sysctl -w net.ipv4.tcp_ecn_fallback=0 (optional) @@ -25,14 +28,19 @@ SIGCOMM/SIGMETRICS papers: i) Mohammad Alizadeh, Albert Greenberg, David A. 
Maltz, Jitendra Padhye, Parveen Patel, Balaji Prabhakar, Sudipta Sengupta, and Murari Sridharan: - "Data Center TCP (DCTCP)", Data Center Networks session + + "Data Center TCP (DCTCP)", Data Center Networks session + + Proc. ACM SIGCOMM, New Delhi, 2010. + http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp-final.pdf http://www.sigcomm.org/ccr/papers/2010/October/1851275.1851192 ii) Mohammad Alizadeh, Adel Javanmard, and Balaji Prabhakar: + "Analysis of DCTCP: Stability, Convergence, and Fairness" Proc. ACM SIGMETRICS, San Jose, 2011. + http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp_analysis-full.pdf IETF informational draft: diff --git a/Documentation/networking/decnet.txt b/Documentation/networking/decnet.rst index d192f8b9948b..b8bc11ff8370 100644 --- a/Documentation/networking/decnet.txt +++ b/Documentation/networking/decnet.rst @@ -1,26 +1,31 @@ - Linux DECnet Networking Layer Information - =========================================== +.. SPDX-License-Identifier: GPL-2.0 -1) Other documentation.... +========================================= +Linux DECnet Networking Layer Information +========================================= - o Project Home Pages - http://www.chygwyn.com/ - Kernel info - http://linux-decnet.sourceforge.net/ - Userland tools - http://www.sourceforge.net/projects/linux-decnet/ - Status page +1. Other documentation.... +========================== - Project Home Pages + - http://www.chygwyn.com/ - Kernel info + - http://linux-decnet.sourceforge.net/ - Userland tools + - http://www.sourceforge.net/projects/linux-decnet/ - Status page +2. 
Configuring the kernel +========================= Be sure to turn on the following options: - CONFIG_DECNET (obviously) - CONFIG_PROC_FS (to see what's going on) - CONFIG_SYSCTL (for easy configuration) + - CONFIG_DECNET (obviously) + - CONFIG_PROC_FS (to see what's going on) + - CONFIG_SYSCTL (for easy configuration) if you want to try out router support (not properly debugged yet) you'll need the following options as well... - CONFIG_DECNET_ROUTER (to be able to add/delete routes) - CONFIG_NETFILTER (will be required for the DECnet routing daemon) + - CONFIG_DECNET_ROUTER (to be able to add/delete routes) + - CONFIG_NETFILTER (will be required for the DECnet routing daemon) Don't turn on SIOCGIFCONF support for DECnet unless you are really sure that you need it, in general you won't and it can cause ifconfig to @@ -29,7 +34,7 @@ malfunction. Run time configuration has changed slightly from the 2.4 system. If you want to configure an endnode, then the simplified procedure is as follows: - o Set the MAC address on your ethernet card before starting _any_ other + - Set the MAC address on your ethernet card before starting _any_ other network protocols. As soon as your network card is brought into the UP state, DECnet should @@ -37,7 +42,8 @@ start working. If you need something more complicated or are unsure how to set the MAC address, see the next section. Also all configurations which worked with 2.4 will work under 2.5 with no change. -3) Command line options +3. Command line options +======================= You can set a DECnet address on the kernel command line for compatibility with the 2.4 configuration procedure, but in general it's not needed any more. @@ -56,7 +62,7 @@ interface then you won't see any entries in /proc/net/neigh for the local host until such time as you start a connection. This doesn't affect the operation of the local communications in any other way though. 
-The kernel command line takes options looking like the following: +The kernel command line takes options looking like the following:: decnet.addr=1,2 @@ -82,7 +88,7 @@ address of the node in order for it to be autoconfigured (and then appear in FTP sites called dn2ethaddr which can compute the correct ethernet address to use. The address can be set by ifconfig either before or at the time the device is brought up. If you are using RedHat you can -add the line: +add the line:: MACADDR=AA:00:04:00:03:04 @@ -95,7 +101,7 @@ verify with iproute2). The default device for routing can be set through the /proc filesystem by setting /proc/sys/net/decnet/default_device to the device you want DECnet to route packets out of when no specific route -is available. Usually this will be eth0, for example: +is available. Usually this will be eth0, for example:: echo -n "eth0" >/proc/sys/net/decnet/default_device @@ -106,7 +112,9 @@ confirm that by looking in the default_device file of course. There is a list of what the other files under /proc/sys/net/decnet/ do on the kernel patch web site (shown above). -4) Run time kernel configuration +4. Run time kernel configuration +================================ + This is either done through the sysctl/proc interface (see the kernel web pages for details on what the various options do) or through the iproute2 @@ -122,20 +130,21 @@ since its the _only_ way to add and delete routes currently. Eventually there will be a routing daemon to send and receive routing messages for each interface and update the kernel routing tables accordingly. The routing daemon will use netfilter to listen to routing packets, and -rtnetlink to update the kernels routing tables. +rtnetlink to update the kernels routing tables. The DECnet raw socket layer has been removed since it was there purely for use by the routing daemon which will now use netfilter (a much cleaner and more generic solution) instead. -5) How can I tell if its working ? +5. 
How can I tell if its working? +================================= Here is a quick guide of what to look for in order to know if your DECnet kernel subsystem is working. - Is the node address set (see /proc/sys/net/decnet/node_address) - - Is the node of the correct type - (see /proc/sys/net/decnet/conf/<dev>/forwarding) + - Is the node of the correct type + (see /proc/sys/net/decnet/conf/<dev>/forwarding) - Is the Ethernet MAC address of each Ethernet card set to match the DECnet address. If in doubt use the dn2ethaddr utility available at the ftp archive. @@ -160,7 +169,8 @@ kernel subsystem is working. network, and see if you can obtain the same results. - At this point you are on your own... :-) -6) How to send a bug report +6. How to send a bug report +=========================== If you've found a bug and want to report it, then there are several things you can do to help me work out exactly what it is that is wrong. Useful @@ -175,18 +185,19 @@ information (_most_ of which _is_ _essential_) includes: - How much data was being transferred ? - Was the network congested ? - How can the problem be reproduced ? - - Can you use tcpdump to get a trace ? (N.B. Most (all?) versions of + - Can you use tcpdump to get a trace ? (N.B. Most (all?) versions of tcpdump don't understand how to dump DECnet properly, so including the hex listing of the packet contents is _essential_, usually the -x flag. You may also need to increase the length grabbed with the -s flag. The -e flag also provides very useful information (ethernet MAC addresses)) -7) MAC FAQ +7. MAC FAQ +========== A quick FAQ on ethernet MAC addresses to explain how Linux and DECnet -interact and how to get the best performance from your hardware. +interact and how to get the best performance from your hardware. 
-Ethernet cards are designed to normally only pass received network frames +Ethernet cards are designed to normally only pass received network frames to a host computer when they are addressed to it, or to the broadcast address. Linux has an interface which allows the setting of extra addresses for @@ -197,8 +208,8 @@ significant processor time and bus bandwidth can be used up on a busy network (see the NAPI documentation for a longer explanation of these effects). -DECnet makes use of this interface to allow running DECnet on an ethernet -card which has already been configured using TCP/IP (presumably using the +DECnet makes use of this interface to allow running DECnet on an ethernet +card which has already been configured using TCP/IP (presumably using the built in MAC address of the card, as usual) and/or to allow multiple DECnet addresses on each physical interface. If you do this, be aware that if your ethernet card doesn't support perfect hashing in its MAC address filter @@ -210,7 +221,8 @@ to gain the best efficiency. Better still is to use a card which supports NAPI as well. -8) Mailing list +8. Mailing list +=============== If you are keen to get involved in development, or want to ask questions about configuration, or even just report bugs, then there is a mailing @@ -218,7 +230,8 @@ list that you can join, details are at: http://sourceforge.net/mail/?group_id=4993 -9) Legal Info +9. Legal Info +============= The Linux DECnet project team have placed their code under the GPL. The software is provided "as is" and without warranty express or implied. diff --git a/Documentation/networking/defza.txt b/Documentation/networking/defza.rst index 663e4a906751..73c2f793ea26 100644 --- a/Documentation/networking/defza.txt +++ b/Documentation/networking/defza.rst @@ -1,4 +1,10 @@ -Notes on the DEC FDDIcontroller 700 (DEFZA-xx) driver v.1.1.4. +.. 
SPDX-License-Identifier: GPL-2.0 + +===================================================== +Notes on the DEC FDDIcontroller 700 (DEFZA-xx) driver +===================================================== + +:Version: v.1.1.4 DEC FDDIcontroller 700 is DEC's first-generation TURBOchannel FDDI diff --git a/Documentation/networking/device_drivers/intel/e100.rst b/Documentation/networking/device_drivers/intel/e100.rst index caf023cc88de..3ac21e7119a7 100644 --- a/Documentation/networking/device_drivers/intel/e100.rst +++ b/Documentation/networking/device_drivers/intel/e100.rst @@ -33,7 +33,7 @@ The following features are now available in supported kernels: - SNMP Channel Bonding documentation can be found in the Linux kernel source: -/Documentation/networking/bonding.txt +/Documentation/networking/bonding.rst Identifying Your Adapter diff --git a/Documentation/networking/device_drivers/intel/ixgb.rst b/Documentation/networking/device_drivers/intel/ixgb.rst index 945018207a92..ab624f1a44a8 100644 --- a/Documentation/networking/device_drivers/intel/ixgb.rst +++ b/Documentation/networking/device_drivers/intel/ixgb.rst @@ -37,7 +37,7 @@ The following features are available in this kernel: - SNMP Channel Bonding documentation can be found in the Linux kernel source: -/Documentation/networking/bonding.txt +/Documentation/networking/bonding.rst The driver information previously displayed in the /proc filesystem is not supported in this release. Alternatively, you can use ethtool (version 1.6 diff --git a/Documentation/networking/dns_resolver.txt b/Documentation/networking/dns_resolver.rst index eaa8f9a6fd5d..add4d59a99a5 100644 --- a/Documentation/networking/dns_resolver.txt +++ b/Documentation/networking/dns_resolver.rst @@ -1,8 +1,10 @@ - =================== - DNS Resolver Module - =================== +.. SPDX-License-Identifier: GPL-2.0 -Contents: +=================== +DNS Resolver Module +=================== + +.. Contents: - Overview. - Compilation. 
@@ -12,8 +14,7 @@ Contents: - Debugging. -======== -OVERVIEW +Overview ======== The DNS resolver module provides a way for kernel services to make DNS queries @@ -33,50 +34,50 @@ It does not yet support the following AFS features: This code is extracted from the CIFS filesystem. -=========== -COMPILATION +Compilation =========== -The module should be enabled by turning on the kernel configuration options: +The module should be enabled by turning on the kernel configuration options:: CONFIG_DNS_RESOLVER - tristate "DNS Resolver support" -========== -SETTING UP +Setting up ========== To set up this facility, the /etc/request-key.conf file must be altered so that /sbin/request-key can appropriately direct the upcalls. For example, to handle basic dname to IPv4/IPv6 address resolution, the following line should be -added: +added:: + #OP TYPE DESC CO-INFO PROGRAM ARG1 ARG2 ARG3 ... #====== ============ ======= ======= ========================== create dns_resolver * * /usr/sbin/cifs.upcall %k To direct a query for query type 'foo', a line of the following should be added -before the more general line given above as the first match is the one taken. +before the more general line given above as the first match is the one taken:: create dns_resolver foo:* * /usr/sbin/dns.foo %k -===== -USAGE +Usage ===== To make use of this facility, one of the following functions that are -implemented in the module can be called after doing: +implemented in the module can be called after doing:: #include <linux/dns_resolver.h> - (1) int dns_query(const char *type, const char *name, size_t namelen, - const char *options, char **_result, time_t *_expiry); + :: + + int dns_query(const char *type, const char *name, size_t namelen, + const char *options, char **_result, time_t *_expiry); This is the basic access function. It looks for a cached DNS query and if it doesn't find it, it upcalls to userspace to make a new DNS query, which may then be cached. 
The key description is constructed as a string of the - form: + form:: [<type>:]<name> @@ -107,16 +108,14 @@ This can be cleared by any process that has the CAP_SYS_ADMIN capability by the use of KEYCTL_KEYRING_CLEAR on the keyring ID. -=============================== -READING DNS KEYS FROM USERSPACE +Reading DNS Keys from Userspace =============================== Keys of dns_resolver type can be read from userspace using keyctl_read() or "keyctl read/print/pipe". -========= -MECHANISM +Mechanism ========= The dnsresolver module registers a key type called "dns_resolver". Keys of @@ -147,11 +146,10 @@ See <file:Documentation/security/keys/request-key.rst> for further information about request-key function. -========= -DEBUGGING +Debugging ========= Debugging messages can be turned on dynamically by writing a 1 into the -following file: +following file:: - /sys/module/dnsresolver/parameters/debug + /sys/module/dnsresolver/parameters/debug diff --git a/Documentation/networking/driver.txt b/Documentation/networking/driver.rst index da59e2884130..c8f59dbda46f 100644 --- a/Documentation/networking/driver.txt +++ b/Documentation/networking/driver.rst @@ -1,4 +1,8 @@ -Document about softnet driver issues +.. SPDX-License-Identifier: GPL-2.0 + +===================== +Softnet Driver Issues +===================== Transmit path guidelines: @@ -8,7 +12,7 @@ Transmit path guidelines: transmit function will become busy. Instead it must maintain the queue properly. 
For example, - for a driver implementing scatter-gather this means: + for a driver implementing scatter-gather this means:: static netdev_tx_t drv_hard_start_xmit(struct sk_buff *skb, struct net_device *dev) @@ -38,25 +42,25 @@ Transmit path guidelines: return NETDEV_TX_OK; } - And then at the end of your TX reclamation event handling: + And then at the end of your TX reclamation event handling:: if (netif_queue_stopped(dp->dev) && - TX_BUFFS_AVAIL(dp) > (MAX_SKB_FRAGS + 1)) + TX_BUFFS_AVAIL(dp) > (MAX_SKB_FRAGS + 1)) netif_wake_queue(dp->dev); - For a non-scatter-gather supporting card, the three tests simply become: + For a non-scatter-gather supporting card, the three tests simply become:: /* This is a hard error log it. */ if (TX_BUFFS_AVAIL(dp) <= 0) - and: + and:: if (TX_BUFFS_AVAIL(dp) == 0) - and: + and:: if (netif_queue_stopped(dp->dev) && - TX_BUFFS_AVAIL(dp) > 0) + TX_BUFFS_AVAIL(dp) > 0) netif_wake_queue(dp->dev); 2) An ndo_start_xmit method must not modify the shared parts of a @@ -86,7 +90,7 @@ Close/stop guidelines: 1) After the ndo_stop routine has been called, the hardware must not receive or transmit any data. All in flight packets must - be aborted. If necessary, poll or wait for completion of + be aborted. If necessary, poll or wait for completion of any reset commands. 2) The ndo_stop routine will be called by unregister_netdevice diff --git a/Documentation/networking/eql.txt b/Documentation/networking/eql.rst index 0f1550150f05..a628c4c81166 100644 --- a/Documentation/networking/eql.txt +++ b/Documentation/networking/eql.rst @@ -1,5 +1,11 @@ - EQL Driver: Serial IP Load Balancing HOWTO +.. SPDX-License-Identifier: GPL-2.0 + +========================================== +EQL Driver: Serial IP Load Balancing HOWTO +========================================== + Simon "Guru Aleph-Null" Janes, simon@ncm.com + v1.1, February 27, 1995 This is the manual for the EQL device driver. 
EQL is a software device @@ -12,7 +18,8 @@ which was only created to patch cleanly in the very latest kernel source trees. (Yes, it worked fine.) - 1. Introduction +1. Introduction +=============== Which is worse? A huge fee for a 56K leased line or two phone lines? It's probably the former. If you find yourself craving more bandwidth, @@ -41,47 +48,40 @@ Hey, we can all dream you know... - 2. Kernel Configuration +2. Kernel Configuration +======================= Here I describe the general steps of getting a kernel up and working with the eql driver. From patching, building, to installing. - 2.1. Patching The Kernel +2.1. Patching The Kernel +------------------------ If you do not have or cannot get a copy of the kernel with the eql driver folded into it, get your copy of the driver from ftp://slaughter.ncm.com/pub/Linux/LOAD_BALANCING/eql-1.1.tar.gz. Unpack this archive someplace obvious like /usr/local/src/. It will - create the following files: - - + create the following files:: - ______________________________________________________________________ -rw-r--r-- guru/ncm 198 Jan 19 18:53 1995 eql-1.1/NO-WARRANTY -rw-r--r-- guru/ncm 30620 Feb 27 21:40 1995 eql-1.1/eql-1.1.patch -rwxr-xr-x guru/ncm 16111 Jan 12 22:29 1995 eql-1.1/eql_enslave -rw-r--r-- guru/ncm 2195 Jan 10 21:48 1995 eql-1.1/eql_enslave.c - ______________________________________________________________________ Unpack a recent kernel (something after 1.1.92) someplace convenient like say /usr/src/linux-1.1.92.eql. Use symbolic links to point /usr/src/linux to this development directory. - Apply the patch by running the commands: + Apply the patch by running the commands:: - - ______________________________________________________________________ cd /usr/src patch </usr/local/src/eql-1.1/eql-1.1.patch - ______________________________________________________________________ - - - - 2.2. Building The Kernel +2.2. 
Building The Kernel +------------------------ After patching the kernel, run make config and configure the kernel for your hardware. @@ -90,7 +90,8 @@ After configuration, make and install according to your habit. - 3. Network Configuration +3. Network Configuration +======================== So far, I have only used the eql device with the DSLIP SLIP connection manager by Matt Dillon (-- "The man who sold his soul to code so much @@ -100,37 +101,27 @@ connection. - 3.1. /etc/rc.d/rc.inet1 +3.1. /etc/rc.d/rc.inet1 +----------------------- In rc.inet1, ifconfig the eql device to the IP address you usually use for your machine, and the MTU you prefer for your SLIP lines. One could argue that MTU should be roughly half the usual size for two modems, one-third for three, one-fourth for four, etc... But going too far below 296 is probably overkill. Here is an example ifconfig - command that sets up the eql device: - + command that sets up the eql device:: - - ______________________________________________________________________ ifconfig eql 198.67.33.239 mtu 1006 - ______________________________________________________________________ - - - - Once the eql device is up and running, add a static default route to it in the routing table using the cool new route syntax that makes - life so much easier: + life so much easier:: - - - ______________________________________________________________________ route add default eql - ______________________________________________________________________ - 3.2. Enslaving Devices By Hand +3.2. Enslaving Devices By Hand +------------------------------ Enslaving devices by hand requires two utility programs: eql_enslave and eql_emancipate (-- eql_emancipate hasn't been written because when @@ -140,87 +131,56 @@ The syntax for enslaving a device is "eql_enslave <master-name> - <slave-name> <estimated-bps>". Here are some example enslavings: - + <slave-name> <estimated-bps>". 
Here are some example enslavings:: - - ______________________________________________________________________ eql_enslave eql sl0 28800 eql_enslave eql ppp0 14400 eql_enslave eql sl1 57600 - ______________________________________________________________________ - - - - When you want to free a device from its life of slavery, you can either down the device with ifconfig (eql will automatically bury the dead slave and remove it from its queue) or use eql_emancipate to free it. (-- Or just ifconfig it down, and the eql driver will take it out - for you.--) - - + for you.--):: - ______________________________________________________________________ eql_emancipate eql sl0 eql_emancipate eql ppp0 eql_emancipate eql sl1 - ______________________________________________________________________ - - - - 3.3. DSLIP Configuration for the eql Device +3.3. DSLIP Configuration for the eql Device +------------------------------------------- The general idea is to bring up and keep up as many SLIP connections as you need, automatically. - 3.3.1. /etc/slip/runslip.conf - - Here is an example runslip.conf: - - - - - - - - - - - +3.3.1. 
/etc/slip/runslip.conf +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + Here is an example runslip.conf:: + name sl-line-1 + enabled + baud 38400 + mtu 576 + ducmd -e /etc/slip/dialout/cua2-288.xp -t 9 + command eql_enslave eql $interface 28800 + address 198.67.33.239 + line /dev/cua2 + name sl-line-2 + enabled + baud 38400 + mtu 576 + ducmd -e /etc/slip/dialout/cua3-288.xp -t 9 + command eql_enslave eql $interface 28800 + address 198.67.33.239 + line /dev/cua3 - ______________________________________________________________________ - name sl-line-1 - enabled - baud 38400 - mtu 576 - ducmd -e /etc/slip/dialout/cua2-288.xp -t 9 - command eql_enslave eql $interface 28800 - address 198.67.33.239 - line /dev/cua2 - name sl-line-2 - enabled - baud 38400 - mtu 576 - ducmd -e /etc/slip/dialout/cua3-288.xp -t 9 - command eql_enslave eql $interface 28800 - address 198.67.33.239 - line /dev/cua3 - ______________________________________________________________________ - - - - - - 3.4. Using PPP and the eql Device +3.4. Using PPP and the eql Device +--------------------------------- I have not yet done any load-balancing testing for PPP devices, mainly because I don't have a PPP-connection manager like SLIP has with @@ -235,7 +195,8 @@ year. - 4. About the Slave Scheduler Algorithm +4. About the Slave Scheduler Algorithm +====================================== The slave scheduler probably could be replaced with a dozen other things and push traffic much faster. The formula in the current set @@ -254,7 +215,8 @@ traffic and the "slower" modem starved. - 5. Testers' Reports +5. Testers' Reports +=================== Some people have experimented with the eql device with newer kernels (than 1.1.75). I have since updated the driver to patch @@ -262,87 +224,29 @@ balancing" driver config option. - o icee from LinuxNET patched 1.1.86 without any rejects and was able + - icee from LinuxNET patched 1.1.86 without any rejects and was able to boot the kernel and enslave a couple of ISDN PPP links. 
- 5.1. Randolph Bentson's Test Report - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +5.1. Randolph Bentson's Test Report +----------------------------------- + :: + From bentson@grieg.seaslug.org Wed Feb 8 19:08:09 1995 + Date: Tue, 7 Feb 95 22:57 PST + From: Randolph Bentson <bentson@grieg.seaslug.org> + To: guru@ncm.com + Subject: EQL driver tests + I have been checking out your eql driver. (Nice work, that!) + Although you may already done this performance testing, here + are some data I've discovered. + Randolph Bentson + bentson@grieg.seaslug.org - - - - - - - - - - - - - - - - - - - - - - From bentson@grieg.seaslug.org Wed Feb 8 19:08:09 1995 - Date: Tue, 7 Feb 95 22:57 PST - From: Randolph Bentson <bentson@grieg.seaslug.org> - To: guru@ncm.com - Subject: EQL driver tests - - - I have been checking out your eql driver. (Nice work, that!) - Although you may already done this performance testing, here - are some data I've discovered. - - Randolph Bentson - bentson@grieg.seaslug.org - - --------------------------------------------------------- +------------------------------------------------------------------ A pseudo-device driver, EQL, written by Simon Janes, can be used @@ -363,7 +267,7 @@ Once a link was established, I timed a binary ftp transfer of 289284 bytes of data. If there were no overhead (packet headers, inter-character and inter-packet delays, etc.) the transfers - would take the following times: + would take the following times:: bits/sec seconds 345600 8.3 @@ -388,141 +292,82 @@ that the connection establishment seemed fragile for the higher speeds. Once established, the connection seemed robust enough.) 
- #lines speed mtu seconds theory actual %of - kbit/sec duration speed speed max - 3 115200 900 _ 345600 - 3 115200 400 18.1 345600 159825 46 - 2 115200 900 _ 230400 - 2 115200 600 18.1 230400 159825 69 - 2 115200 400 19.3 230400 149888 65 - 4 57600 900 _ 234600 - 4 57600 600 _ 234600 - 4 57600 400 _ 234600 - 3 57600 600 20.9 172800 138413 80 - 3 57600 900 21.2 172800 136455 78 - 3 115200 600 21.7 345600 133311 38 - 3 57600 400 22.5 172800 128571 74 - 4 38400 900 25.2 153600 114795 74 - 4 38400 600 26.4 153600 109577 71 - 4 38400 400 27.3 153600 105965 68 - 2 57600 900 29.1 115200 99410.3 86 - 1 115200 900 30.7 115200 94229.3 81 - 2 57600 600 30.2 115200 95789.4 83 - 3 38400 900 30.3 115200 95473.3 82 - 3 38400 600 31.2 115200 92719.2 80 - 1 115200 600 31.3 115200 92423 80 - 2 57600 400 32.3 115200 89561.6 77 - 1 115200 400 32.8 115200 88196.3 76 - 3 38400 400 33.5 115200 86353.4 74 - 2 38400 900 43.7 76800 66197.7 86 - 2 38400 600 44 76800 65746.4 85 - 2 38400 400 47.2 76800 61289 79 - 4 19200 900 50.8 76800 56945.7 74 - 4 19200 400 53.2 76800 54376.7 70 - 4 19200 600 53.7 76800 53870.4 70 - 1 57600 900 54.6 57600 52982.4 91 - 1 57600 600 56.2 57600 51474 89 - 3 19200 900 60.5 57600 47815.5 83 - 1 57600 400 60.2 57600 48053.8 83 - 3 19200 600 62 57600 46658.7 81 - 3 19200 400 64.7 57600 44711.6 77 - 1 38400 900 79.4 38400 36433.8 94 - 1 38400 600 82.4 38400 35107.3 91 - 2 19200 900 84.4 38400 34275.4 89 - 1 38400 400 86.8 38400 33327.6 86 - 2 19200 600 87.6 38400 33023.3 85 - 2 19200 400 91.2 38400 31719.7 82 - 4 9600 900 94.7 38400 30547.4 79 - 4 9600 400 106 38400 27290.9 71 - 4 9600 600 110 38400 26298.5 68 - 3 9600 900 118 28800 24515.6 85 - 3 9600 600 120 28800 24107 83 - 3 9600 400 131 28800 22082.7 76 - 1 19200 900 155 19200 18663.5 97 - 1 19200 600 161 19200 17968 93 - 1 19200 400 170 19200 17016.7 88 - 2 9600 600 176 19200 16436.6 85 - 2 9600 900 180 19200 16071.3 83 - 2 9600 400 181 19200 15982.5 83 - 1 9600 900 305 9600 9484.72 98 - 1 9600 600 314 9600 
9212.87 95 - 1 9600 400 332 9600 8713.37 90 - - - - - - 5.2. Anthony Healy's Report - - - - - - - - Date: Mon, 13 Feb 1995 16:17:29 +1100 (EST) - From: Antony Healey <ahealey@st.nepean.uws.edu.au> - To: Simon Janes <guru@ncm.com> - Subject: Re: Load Balancing - - Hi Simon, + ====== ======== === ======== ======= ======= === + #lines speed mtu seconds theory actual %of + kbit/sec duration speed speed max + ====== ======== === ======== ======= ======= === + 3 115200 900 _ 345600 + 3 115200 400 18.1 345600 159825 46 + 2 115200 900 _ 230400 + 2 115200 600 18.1 230400 159825 69 + 2 115200 400 19.3 230400 149888 65 + 4 57600 900 _ 234600 + 4 57600 600 _ 234600 + 4 57600 400 _ 234600 + 3 57600 600 20.9 172800 138413 80 + 3 57600 900 21.2 172800 136455 78 + 3 115200 600 21.7 345600 133311 38 + 3 57600 400 22.5 172800 128571 74 + 4 38400 900 25.2 153600 114795 74 + 4 38400 600 26.4 153600 109577 71 + 4 38400 400 27.3 153600 105965 68 + 2 57600 900 29.1 115200 99410.3 86 + 1 115200 900 30.7 115200 94229.3 81 + 2 57600 600 30.2 115200 95789.4 83 + 3 38400 900 30.3 115200 95473.3 82 + 3 38400 600 31.2 115200 92719.2 80 + 1 115200 600 31.3 115200 92423 80 + 2 57600 400 32.3 115200 89561.6 77 + 1 115200 400 32.8 115200 88196.3 76 + 3 38400 400 33.5 115200 86353.4 74 + 2 38400 900 43.7 76800 66197.7 86 + 2 38400 600 44 76800 65746.4 85 + 2 38400 400 47.2 76800 61289 79 + 4 19200 900 50.8 76800 56945.7 74 + 4 19200 400 53.2 76800 54376.7 70 + 4 19200 600 53.7 76800 53870.4 70 + 1 57600 900 54.6 57600 52982.4 91 + 1 57600 600 56.2 57600 51474 89 + 3 19200 900 60.5 57600 47815.5 83 + 1 57600 400 60.2 57600 48053.8 83 + 3 19200 600 62 57600 46658.7 81 + 3 19200 400 64.7 57600 44711.6 77 + 1 38400 900 79.4 38400 36433.8 94 + 1 38400 600 82.4 38400 35107.3 91 + 2 19200 900 84.4 38400 34275.4 89 + 1 38400 400 86.8 38400 33327.6 86 + 2 19200 600 87.6 38400 33023.3 85 + 2 19200 400 91.2 38400 31719.7 82 + 4 9600 900 94.7 38400 30547.4 79 + 4 9600 400 106 38400 27290.9 71 + 4 9600 600 110 
38400 26298.5 68 + 3 9600 900 118 28800 24515.6 85 + 3 9600 600 120 28800 24107 83 + 3 9600 400 131 28800 22082.7 76 + 1 19200 900 155 19200 18663.5 97 + 1 19200 600 161 19200 17968 93 + 1 19200 400 170 19200 17016.7 88 + 2 9600 600 176 19200 16436.6 85 + 2 9600 900 180 19200 16071.3 83 + 2 9600 400 181 19200 15982.5 83 + 1 9600 900 305 9600 9484.72 98 + 1 9600 600 314 9600 9212.87 95 + 1 9600 400 332 9600 8713.37 90 + ====== ======== === ======== ======= ======= === + +5.2. Anthony Healy's Report +--------------------------- + + :: + + Date: Mon, 13 Feb 1995 16:17:29 +1100 (EST) + From: Antony Healey <ahealey@st.nepean.uws.edu.au> + To: Simon Janes <guru@ncm.com> + Subject: Re: Load Balancing + + Hi Simon, I've installed your patch and it works great. I have trialed it over twin SL/IP lines, just over null modems, but I was able to data at over 48Kb/s [ISDN link -Simon]. I managed a transfer of up to 7.5 Kbyte/s on one go, but averaged around 6.4 Kbyte/s, which I think is pretty cool. :) - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/Documentation/networking/fib_trie.txt b/Documentation/networking/fib_trie.rst index fe719388518b..f1435b7fcdb7 100644 --- a/Documentation/networking/fib_trie.txt +++ b/Documentation/networking/fib_trie.rst @@ -1,8 +1,12 @@ - LC-trie implementation notes. +.. SPDX-License-Identifier: GPL-2.0 + +============================ +LC-trie implementation notes +============================ Node types ---------- -leaf +leaf An end node with data. This has a copy of the relevant key, along with 'hlist' with routing table entries sorted by prefix length. See struct leaf and struct leaf_info. @@ -13,7 +17,7 @@ trie node or tnode A few concepts explained ------------------------ -Bits (tnode) +Bits (tnode) The number of bits in the key segment used for indexing into the child array - the "child index". See Level Compression. 
@@ -23,7 +27,7 @@ Pos (tnode) Path Compression / skipped bits Any given tnode is linked to from the child array of its parent, using - a segment of the key specified by the parent's "pos" and "bits" + a segment of the key specified by the parent's "pos" and "bits". In certain cases, this tnode's own "pos" will not be immediately adjacent to the parent (pos+bits), but there will be some bits in the key skipped over because they represent a single path with no @@ -56,8 +60,8 @@ full_children Comments --------- -We have tried to keep the structure of the code as close to fib_hash as -possible to allow verification and help up reviewing. +We have tried to keep the structure of the code as close to fib_hash as +possible to allow verification and help with reviewing. fib_find_node() A good start for understanding this code. This function implements a diff --git a/Documentation/networking/filter.txt b/Documentation/networking/filter.rst index 2f0f8b17dade..a1d3e192b9fa 100644 --- a/Documentation/networking/filter.txt +++ b/Documentation/networking/filter.rst @@ -1,3 +1,6 @@ +.. SPDX-License-Identifier: GPL-2.0 + +======================================================= Linux Socket Filtering aka Berkeley Packet Filter (BPF) ======================================================= @@ -42,10 +45,10 @@ displays what is being placed into this structure. Although we were only speaking about sockets here, BPF in Linux is used in many more places. There's xt_bpf for netfilter, cls_bpf in the kernel -qdisc layer, SECCOMP-BPF (SECure COMPuting [1]), and lots of other places +qdisc layer, SECCOMP-BPF (SECure COMPuting [1]_), and lots of other places such as team driver, PTP code, etc where BPF is being used. - [1] Documentation/userspace-api/seccomp_filter.rst +.. 
[1] Documentation/userspace-api/seccomp_filter.rst Original BPF paper: @@ -59,23 +62,23 @@ Structure --------- User space applications include <linux/filter.h> which contains the -following relevant structures: +following relevant structures:: -struct sock_filter { /* Filter block */ - __u16 code; /* Actual filter code */ - __u8 jt; /* Jump true */ - __u8 jf; /* Jump false */ - __u32 k; /* Generic multiuse field */ -}; + struct sock_filter { /* Filter block */ + __u16 code; /* Actual filter code */ + __u8 jt; /* Jump true */ + __u8 jf; /* Jump false */ + __u32 k; /* Generic multiuse field */ + }; Such a structure is assembled as an array of 4-tuples, that contains a code, jt, jf and k value. jt and jf are jump offsets and k a generic -value to be used for a provided code. +value to be used for a provided code:: -struct sock_fprog { /* Required for SO_ATTACH_FILTER. */ - unsigned short len; /* Number of filter blocks */ - struct sock_filter __user *filter; -}; + struct sock_fprog { /* Required for SO_ATTACH_FILTER. */ + unsigned short len; /* Number of filter blocks */ + struct sock_filter __user *filter; + }; For socket filtering, a pointer to this structure (as shown in follow-up example) is being passed to the kernel through setsockopt(2). @@ -83,55 +86,57 @@ follow-up example) is being passed to the kernel through setsockopt(2). Example ------- -#include <sys/socket.h> -#include <sys/types.h> -#include <arpa/inet.h> -#include <linux/if_ether.h> -/* ... 
*/ - -/* From the example above: tcpdump -i em1 port 22 -dd */ -struct sock_filter code[] = { - { 0x28, 0, 0, 0x0000000c }, - { 0x15, 0, 8, 0x000086dd }, - { 0x30, 0, 0, 0x00000014 }, - { 0x15, 2, 0, 0x00000084 }, - { 0x15, 1, 0, 0x00000006 }, - { 0x15, 0, 17, 0x00000011 }, - { 0x28, 0, 0, 0x00000036 }, - { 0x15, 14, 0, 0x00000016 }, - { 0x28, 0, 0, 0x00000038 }, - { 0x15, 12, 13, 0x00000016 }, - { 0x15, 0, 12, 0x00000800 }, - { 0x30, 0, 0, 0x00000017 }, - { 0x15, 2, 0, 0x00000084 }, - { 0x15, 1, 0, 0x00000006 }, - { 0x15, 0, 8, 0x00000011 }, - { 0x28, 0, 0, 0x00000014 }, - { 0x45, 6, 0, 0x00001fff }, - { 0xb1, 0, 0, 0x0000000e }, - { 0x48, 0, 0, 0x0000000e }, - { 0x15, 2, 0, 0x00000016 }, - { 0x48, 0, 0, 0x00000010 }, - { 0x15, 0, 1, 0x00000016 }, - { 0x06, 0, 0, 0x0000ffff }, - { 0x06, 0, 0, 0x00000000 }, -}; - -struct sock_fprog bpf = { - .len = ARRAY_SIZE(code), - .filter = code, -}; - -sock = socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL)); -if (sock < 0) - /* ... bail out ... */ - -ret = setsockopt(sock, SOL_SOCKET, SO_ATTACH_FILTER, &bpf, sizeof(bpf)); -if (ret < 0) - /* ... bail out ... */ - -/* ... */ -close(sock); +:: + + #include <sys/socket.h> + #include <sys/types.h> + #include <arpa/inet.h> + #include <linux/if_ether.h> + /* ... 
*/ + + /* From the example above: tcpdump -i em1 port 22 -dd */ + struct sock_filter code[] = { + { 0x28, 0, 0, 0x0000000c }, + { 0x15, 0, 8, 0x000086dd }, + { 0x30, 0, 0, 0x00000014 }, + { 0x15, 2, 0, 0x00000084 }, + { 0x15, 1, 0, 0x00000006 }, + { 0x15, 0, 17, 0x00000011 }, + { 0x28, 0, 0, 0x00000036 }, + { 0x15, 14, 0, 0x00000016 }, + { 0x28, 0, 0, 0x00000038 }, + { 0x15, 12, 13, 0x00000016 }, + { 0x15, 0, 12, 0x00000800 }, + { 0x30, 0, 0, 0x00000017 }, + { 0x15, 2, 0, 0x00000084 }, + { 0x15, 1, 0, 0x00000006 }, + { 0x15, 0, 8, 0x00000011 }, + { 0x28, 0, 0, 0x00000014 }, + { 0x45, 6, 0, 0x00001fff }, + { 0xb1, 0, 0, 0x0000000e }, + { 0x48, 0, 0, 0x0000000e }, + { 0x15, 2, 0, 0x00000016 }, + { 0x48, 0, 0, 0x00000010 }, + { 0x15, 0, 1, 0x00000016 }, + { 0x06, 0, 0, 0x0000ffff }, + { 0x06, 0, 0, 0x00000000 }, + }; + + struct sock_fprog bpf = { + .len = ARRAY_SIZE(code), + .filter = code, + }; + + sock = socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL)); + if (sock < 0) + /* ... bail out ... */ + + ret = setsockopt(sock, SOL_SOCKET, SO_ATTACH_FILTER, &bpf, sizeof(bpf)); + if (ret < 0) + /* ... bail out ... */ + + /* ... */ + close(sock); The above example code attaches a socket filter for a PF_PACKET socket in order to let all IPv4/IPv6 packets with port 22 pass. The rest will @@ -178,15 +183,17 @@ closely modelled after Steven McCanne's and Van Jacobson's BPF paper. 
The BPF architecture consists of the following basic elements: + ======= ==================================================== Element Description - + ======= ==================================================== A 32 bit wide accumulator X 32 bit wide X register M[] 16 x 32 bit wide misc registers aka "scratch memory - store", addressable from 0 to 15 + store", addressable from 0 to 15 + ======= ==================================================== A program, that is translated by bpf_asm into "opcodes" is an array that -consists of the following elements (as already mentioned): +consists of the following elements (as already mentioned):: op:16, jt:8, jf:8, k:32 @@ -201,8 +208,9 @@ and return instructions that are also represented in bpf_asm syntax. This table lists all bpf_asm instructions available resp. what their underlying opcodes as defined in linux/filter.h stand for: + =========== =================== ===================== Instruction Addressing mode Description - + =========== =================== ===================== ld 1, 2, 3, 4, 12 Load word into A ldi 4 Load word into A ldh 1, 2 Load half-word into A @@ -241,11 +249,13 @@ opcodes as defined in linux/filter.h stand for: txa Copy X into A ret 4, 11 Return + =========== =================== ===================== The next table shows addressing formats from the 2nd column: + =============== =================== =============================================== Addressing mode Syntax Description - + =============== =================== =============================================== 0 x/%x Register X 1 [k] BHW at byte offset k in the packet 2 [x + k] BHW at the offset X + k in the packet @@ -259,6 +269,7 @@ The next table shows addressing formats from the 2nd column: 10 x/%x,Lt Jump to Lt if predicate is true 11 a/%a Accumulator A 12 extension BPF extension + =============== =================== =============================================== The Linux kernel also has a couple of BPF extensions that are used along 
with the class of load instructions by "overloading" the k argument with @@ -267,8 +278,9 @@ extensions are loaded into A. Possible BPF extensions are shown in the following table: + =================================== ================================================= Extension Description - + =================================== ================================================= len skb->len proto skb->protocol type skb->pkt_type @@ -285,18 +297,19 @@ Possible BPF extensions are shown in the following table: vlan_avail skb_vlan_tag_present(skb) vlan_tpid skb->vlan_proto rand prandom_u32() + =================================== ================================================= These extensions can also be prefixed with '#'. Examples for low-level BPF: -** ARP packets: +**ARP packets**:: ldh [12] jne #0x806, drop ret #-1 drop: ret #0 -** IPv4 TCP packets: +**IPv4 TCP packets**:: ldh [12] jne #0x800, drop @@ -305,14 +318,15 @@ Examples for low-level BPF: ret #-1 drop: ret #0 -** (Accelerated) VLAN w/ id 10: +**(Accelerated) VLAN w/ id 10**:: ld vlan_tci jneq #10, drop ret #-1 drop: ret #0 -** icmp random packet sampling, 1 in 4 +**icmp random packet sampling, 1 in 4**:: + ldh [12] jne #0x800, drop ldb [23] @@ -324,7 +338,7 @@ Examples for low-level BPF: ret #-1 drop: ret #0 -** SECCOMP filter example: +**SECCOMP filter example**:: ld [4] /* offsetof(struct seccomp_data, arch) */ jne #0xc000003e, bad /* AUDIT_ARCH_X86_64 */ @@ -345,18 +359,18 @@ Examples for low-level BPF: The above example code can be placed into a file (here called "foo"), and then be passed to the bpf_asm tool for generating opcodes, output that xt_bpf and cls_bpf understands and can directly be loaded with. 
Example with above -ARP code: +ARP code:: -$ ./bpf_asm foo -4,40 0 0 12,21 0 1 2054,6 0 0 4294967295,6 0 0 0, + $ ./bpf_asm foo + 4,40 0 0 12,21 0 1 2054,6 0 0 4294967295,6 0 0 0, -In copy and paste C-like output: +In copy and paste C-like output:: -$ ./bpf_asm -c foo -{ 0x28, 0, 0, 0x0000000c }, -{ 0x15, 0, 1, 0x00000806 }, -{ 0x06, 0, 0, 0xffffffff }, -{ 0x06, 0, 0, 0000000000 }, + $ ./bpf_asm -c foo + { 0x28, 0, 0, 0x0000000c }, + { 0x15, 0, 1, 0x00000806 }, + { 0x06, 0, 0, 0xffffffff }, + { 0x06, 0, 0, 0000000000 }, In particular, as usage with xt_bpf or cls_bpf can result in more complex BPF filters that might not be obvious at first, it's good to test filters before @@ -365,9 +379,9 @@ bpf_dbg under tools/bpf/ in the kernel source directory. This debugger allows for testing BPF filters against given pcap files, single stepping through the BPF code on the pcap's packets and to do BPF machine register dumps. -Starting bpf_dbg is trivial and just requires issuing: +Starting bpf_dbg is trivial and just requires issuing:: -# ./bpf_dbg + # ./bpf_dbg In case input and output do not equal stdin/stdout, bpf_dbg takes an alternative stdin source as a first argument, and an alternative stdout @@ -381,84 +395,100 @@ Interaction in bpf_dbg happens through a shell that also has auto-completion support (follow-up example commands starting with '>' denote bpf_dbg shell). The usual workflow would be to ... -> load bpf 6,40 0 0 12,21 0 3 2048,48 0 0 23,21 0 1 1,6 0 0 65535,6 0 0 0 +* load bpf 6,40 0 0 12,21 0 3 2048,48 0 0 23,21 0 1 1,6 0 0 65535,6 0 0 0 Loads a BPF filter from standard output of bpf_asm, or transformed via - e.g. `tcpdump -iem1 -ddd port 22 | tr '\n' ','`. Note that for JIT + e.g. ``tcpdump -iem1 -ddd port 22 | tr '\n' ','``. Note that for JIT debugging (next section), this command creates a temporary socket and loads the BPF code into the kernel. Thus, this will also be useful for JIT developers. 
-> load pcap foo.pcap +* load pcap foo.pcap + Loads standard tcpdump pcap file. -> run [<n>] +* run [<n>] + bpf passes:1 fails:9 Runs through all packets from a pcap to account how many passes and fails the filter will generate. A limit of packets to traverse can be given. -> disassemble -l0: ldh [12] -l1: jeq #0x800, l2, l5 -l2: ldb [23] -l3: jeq #0x1, l4, l5 -l4: ret #0xffff -l5: ret #0 +* disassemble:: + + l0: ldh [12] + l1: jeq #0x800, l2, l5 + l2: ldb [23] + l3: jeq #0x1, l4, l5 + l4: ret #0xffff + l5: ret #0 + Prints out BPF code disassembly. -> dump -/* { op, jt, jf, k }, */ -{ 0x28, 0, 0, 0x0000000c }, -{ 0x15, 0, 3, 0x00000800 }, -{ 0x30, 0, 0, 0x00000017 }, -{ 0x15, 0, 1, 0x00000001 }, -{ 0x06, 0, 0, 0x0000ffff }, -{ 0x06, 0, 0, 0000000000 }, +* dump:: + + /* { op, jt, jf, k }, */ + { 0x28, 0, 0, 0x0000000c }, + { 0x15, 0, 3, 0x00000800 }, + { 0x30, 0, 0, 0x00000017 }, + { 0x15, 0, 1, 0x00000001 }, + { 0x06, 0, 0, 0x0000ffff }, + { 0x06, 0, 0, 0000000000 }, + Prints out C-style BPF code dump. -> breakpoint 0 -breakpoint at: l0: ldh [12] -> breakpoint 1 -breakpoint at: l1: jeq #0x800, l2, l5 +* breakpoint 0:: + + breakpoint at: l0: ldh [12] + +* breakpoint 1:: + + breakpoint at: l1: jeq #0x800, l2, l5 + ... + Sets breakpoints at particular BPF instructions. 
Issuing a `run` command will walk through the pcap file continuing from the current packet and break when a breakpoint is being hit (another `run` will continue from the currently active breakpoint executing next instructions): - > run - -- register dump -- - pc: [0] <-- program counter - code: [40] jt[0] jf[0] k[12] <-- plain BPF code of current instruction - curr: l0: ldh [12] <-- disassembly of current instruction - A: [00000000][0] <-- content of A (hex, decimal) - X: [00000000][0] <-- content of X (hex, decimal) - M[0,15]: [00000000][0] <-- folded content of M (hex, decimal) - -- packet dump -- <-- Current packet from pcap (hex) - len: 42 - 0: 00 19 cb 55 55 a4 00 14 a4 43 78 69 08 06 00 01 - 16: 08 00 06 04 00 01 00 14 a4 43 78 69 0a 3b 01 26 - 32: 00 00 00 00 00 00 0a 3b 01 01 - (breakpoint) - > - -> breakpoint -breakpoints: 0 1 - Prints currently set breakpoints. - -> step [-<n>, +<n>] + * run:: + + -- register dump -- + pc: [0] <-- program counter + code: [40] jt[0] jf[0] k[12] <-- plain BPF code of current instruction + curr: l0: ldh [12] <-- disassembly of current instruction + A: [00000000][0] <-- content of A (hex, decimal) + X: [00000000][0] <-- content of X (hex, decimal) + M[0,15]: [00000000][0] <-- folded content of M (hex, decimal) + -- packet dump -- <-- Current packet from pcap (hex) + len: 42 + 0: 00 19 cb 55 55 a4 00 14 a4 43 78 69 08 06 00 01 + 16: 08 00 06 04 00 01 00 14 a4 43 78 69 0a 3b 01 26 + 32: 00 00 00 00 00 00 0a 3b 01 01 + (breakpoint) + > + + * breakpoint:: + + breakpoints: 0 1 + + Prints currently set breakpoints. + +* step [-<n>, +<n>] + Performs single stepping through the BPF program from the current pc offset. Thus, on each step invocation, above register dump is issued. This can go forwards and backwards in time, a plain `step` will break on the next BPF instruction, thus +1. (No `run` needs to be issued here.) -> select <n> +* select <n> + Selects a given packet from the pcap file to continue from. 
Thus, on the next `run` or `step`, the BPF program is being evaluated against the user pre-selected packet. Numbering starts just as in Wireshark with index 1. -> quit -# +* quit + Exits bpf_dbg. JIT compiler @@ -468,23 +498,23 @@ The Linux kernel has a built-in BPF JIT compiler for x86_64, SPARC, PowerPC, ARM, ARM64, MIPS, RISC-V and s390 and can be enabled through CONFIG_BPF_JIT. The JIT compiler is transparently invoked for each attached filter from user space or for internal kernel users if it has -been previously enabled by root: +been previously enabled by root:: echo 1 > /proc/sys/net/core/bpf_jit_enable For JIT developers, doing audits etc, each compile run can output the generated -opcode image into the kernel log via: +opcode image into the kernel log via:: echo 2 > /proc/sys/net/core/bpf_jit_enable -Example output from dmesg: +Example output from dmesg:: -[ 3389.935842] flen=6 proglen=70 pass=3 image=ffffffffa0069c8f -[ 3389.935847] JIT code: 00000000: 55 48 89 e5 48 83 ec 60 48 89 5d f8 44 8b 4f 68 -[ 3389.935849] JIT code: 00000010: 44 2b 4f 6c 4c 8b 87 d8 00 00 00 be 0c 00 00 00 -[ 3389.935850] JIT code: 00000020: e8 1d 94 ff e0 3d 00 08 00 00 75 16 be 17 00 00 -[ 3389.935851] JIT code: 00000030: 00 e8 28 94 ff e0 83 f8 01 75 07 b8 ff ff 00 00 -[ 3389.935852] JIT code: 00000040: eb 02 31 c0 c9 c3 + [ 3389.935842] flen=6 proglen=70 pass=3 image=ffffffffa0069c8f + [ 3389.935847] JIT code: 00000000: 55 48 89 e5 48 83 ec 60 48 89 5d f8 44 8b 4f 68 + [ 3389.935849] JIT code: 00000010: 44 2b 4f 6c 4c 8b 87 d8 00 00 00 be 0c 00 00 00 + [ 3389.935850] JIT code: 00000020: e8 1d 94 ff e0 3d 00 08 00 00 75 16 be 17 00 00 + [ 3389.935851] JIT code: 00000030: 00 e8 28 94 ff e0 83 f8 01 75 07 b8 ff ff 00 00 + [ 3389.935852] JIT code: 00000040: eb 02 31 c0 c9 c3 When CONFIG_BPF_JIT_ALWAYS_ON is enabled, bpf_jit_enable is permanently set to 1 and setting any other value than that will return in failure. 
This is even the case for @@ -493,78 +523,78 @@ is discouraged and introspection through bpftool (under tools/bpf/bpftool/) is t generally recommended approach instead. In the kernel source tree under tools/bpf/, there's bpf_jit_disasm for -generating disassembly out of the kernel log's hexdump: - -# ./bpf_jit_disasm -70 bytes emitted from JIT compiler (pass:3, flen:6) -ffffffffa0069c8f + <x>: - 0: push %rbp - 1: mov %rsp,%rbp - 4: sub $0x60,%rsp - 8: mov %rbx,-0x8(%rbp) - c: mov 0x68(%rdi),%r9d - 10: sub 0x6c(%rdi),%r9d - 14: mov 0xd8(%rdi),%r8 - 1b: mov $0xc,%esi - 20: callq 0xffffffffe0ff9442 - 25: cmp $0x800,%eax - 2a: jne 0x0000000000000042 - 2c: mov $0x17,%esi - 31: callq 0xffffffffe0ff945e - 36: cmp $0x1,%eax - 39: jne 0x0000000000000042 - 3b: mov $0xffff,%eax - 40: jmp 0x0000000000000044 - 42: xor %eax,%eax - 44: leaveq - 45: retq - -Issuing option `-o` will "annotate" opcodes to resulting assembler -instructions, which can be very useful for JIT developers: - -# ./bpf_jit_disasm -o -70 bytes emitted from JIT compiler (pass:3, flen:6) -ffffffffa0069c8f + <x>: - 0: push %rbp - 55 - 1: mov %rsp,%rbp - 48 89 e5 - 4: sub $0x60,%rsp - 48 83 ec 60 - 8: mov %rbx,-0x8(%rbp) - 48 89 5d f8 - c: mov 0x68(%rdi),%r9d - 44 8b 4f 68 - 10: sub 0x6c(%rdi),%r9d - 44 2b 4f 6c - 14: mov 0xd8(%rdi),%r8 - 4c 8b 87 d8 00 00 00 - 1b: mov $0xc,%esi - be 0c 00 00 00 - 20: callq 0xffffffffe0ff9442 - e8 1d 94 ff e0 - 25: cmp $0x800,%eax - 3d 00 08 00 00 - 2a: jne 0x0000000000000042 - 75 16 - 2c: mov $0x17,%esi - be 17 00 00 00 - 31: callq 0xffffffffe0ff945e - e8 28 94 ff e0 - 36: cmp $0x1,%eax - 83 f8 01 - 39: jne 0x0000000000000042 - 75 07 - 3b: mov $0xffff,%eax - b8 ff ff 00 00 - 40: jmp 0x0000000000000044 - eb 02 - 42: xor %eax,%eax - 31 c0 - 44: leaveq - c9 - 45: retq - c3 +generating disassembly out of the kernel log's hexdump:: + + # ./bpf_jit_disasm + 70 bytes emitted from JIT compiler (pass:3, flen:6) + ffffffffa0069c8f + <x>: + 0: push %rbp + 1: mov %rsp,%rbp + 4: sub 
$0x60,%rsp + 8: mov %rbx,-0x8(%rbp) + c: mov 0x68(%rdi),%r9d + 10: sub 0x6c(%rdi),%r9d + 14: mov 0xd8(%rdi),%r8 + 1b: mov $0xc,%esi + 20: callq 0xffffffffe0ff9442 + 25: cmp $0x800,%eax + 2a: jne 0x0000000000000042 + 2c: mov $0x17,%esi + 31: callq 0xffffffffe0ff945e + 36: cmp $0x1,%eax + 39: jne 0x0000000000000042 + 3b: mov $0xffff,%eax + 40: jmp 0x0000000000000044 + 42: xor %eax,%eax + 44: leaveq + 45: retq + +Issuing option `-o` will "annotate" opcodes to resulting assembler +instructions, which can be very useful for JIT developers:: + + # ./bpf_jit_disasm -o + 70 bytes emitted from JIT compiler (pass:3, flen:6) + ffffffffa0069c8f + <x>: + 0: push %rbp + 55 + 1: mov %rsp,%rbp + 48 89 e5 + 4: sub $0x60,%rsp + 48 83 ec 60 + 8: mov %rbx,-0x8(%rbp) + 48 89 5d f8 + c: mov 0x68(%rdi),%r9d + 44 8b 4f 68 + 10: sub 0x6c(%rdi),%r9d + 44 2b 4f 6c + 14: mov 0xd8(%rdi),%r8 + 4c 8b 87 d8 00 00 00 + 1b: mov $0xc,%esi + be 0c 00 00 00 + 20: callq 0xffffffffe0ff9442 + e8 1d 94 ff e0 + 25: cmp $0x800,%eax + 3d 00 08 00 00 + 2a: jne 0x0000000000000042 + 75 16 + 2c: mov $0x17,%esi + be 17 00 00 00 + 31: callq 0xffffffffe0ff945e + e8 28 94 ff e0 + 36: cmp $0x1,%eax + 83 f8 01 + 39: jne 0x0000000000000042 + 75 07 + 3b: mov $0xffff,%eax + b8 ff ff 00 00 + 40: jmp 0x0000000000000044 + eb 02 + 42: xor %eax,%eax + 31 c0 + 44: leaveq + c9 + 45: retq + c3 For BPF JIT developers, bpf_jit_disasm, bpf_asm and bpf_dbg provides a useful toolchain for developing and testing the kernel's JIT compiler. @@ -663,9 +693,9 @@ Some core changes of the new internal format: - Conditional jt/jf targets replaced with jt/fall-through: - While the original design has constructs such as "if (cond) jump_true; - else jump_false;", they are being replaced into alternative constructs like - "if (cond) jump_true; /* else fall-through */". 
+ While the original design has constructs such as ``if (cond) jump_true; + else jump_false;``, they are being replaced with alternative constructs like + ``if (cond) jump_true; /* else fall-through */``. - Introduces bpf_call insn and register passing convention for zero overhead calls from/to other kernel functions: @@ -684,32 +714,32 @@ Some core changes of the new internal format: a return value of the function. Since R6 - R9 are callee saved, their state is preserved across the call. - For example, consider three C functions: + For example, consider three C functions:: - u64 f1() { return (*_f2)(1); } - u64 f2(u64 a) { return f3(a + 1, a); } - u64 f3(u64 a, u64 b) { return a - b; } + u64 f1() { return (*_f2)(1); } + u64 f2(u64 a) { return f3(a + 1, a); } + u64 f3(u64 a, u64 b) { return a - b; } - GCC can compile f1, f3 into x86_64: + GCC can compile f1, f3 into x86_64:: - f1: - movl $1, %edi - movq _f2(%rip), %rax - jmp *%rax - f3: - movq %rdi, %rax - subq %rsi, %rax - ret + f1: + movl $1, %edi + movq _f2(%rip), %rax + jmp *%rax + f3: + movq %rdi, %rax + subq %rsi, %rax + ret - Function f2 in eBPF may look like: + Function f2 in eBPF may look like:: - f2: - bpf_mov R2, R1 - bpf_add R1, 1 - bpf_call f3 - bpf_exit + f2: + bpf_mov R2, R1 + bpf_add R1, 1 + bpf_call f3 + bpf_exit - If f2 is JITed and the pointer stored to '_f2'. The calls f1 -> f2 -> f3 and + If f2 is JITed and the pointer is stored to ``_f2``, the calls f1 -> f2 -> f3 and returns will be seamless. Without JIT, __bpf_prog_run() interpreter needs to be used to call into f2. @@ -722,6 +752,8 @@ Some core changes of the new internal format: On 64-bit architectures all register map to HW registers one to one. For example, x86_64 JIT compiler can map them as ... + :: + R0 - rax R1 - rdi R2 - rsi @@ -737,7 +769,7 @@ Some core changes of the new internal format: ... since x86_64 ABI mandates rdi, rsi, rdx, rcx, r8, r9 for argument passing and rbx, r12 - r15 are callee saved. 
- Then the following internal BPF pseudo-program: + Then the following internal BPF pseudo-program:: bpf_mov R6, R1 /* save ctx */ bpf_mov R2, 2 @@ -755,7 +787,7 @@ Some core changes of the new internal format: bpf_add R0, R7 bpf_exit - After JIT to x86_64 may look like: + After JIT to x86_64 may look like:: push %rbp mov %rsp,%rbp @@ -781,21 +813,21 @@ Some core changes of the new internal format: leaveq retq - Which is in this example equivalent in C to: + Which is in this example equivalent in C to:: u64 bpf_filter(u64 ctx) { - return foo(ctx, 2, 3, 4, 5) + bar(ctx, 6, 7, 8, 9); + return foo(ctx, 2, 3, 4, 5) + bar(ctx, 6, 7, 8, 9); } In-kernel functions foo() and bar() with prototype: u64 (*)(u64 arg1, u64 arg2, u64 arg3, u64 arg4, u64 arg5); will receive arguments in proper - registers and place their return value into '%rax' which is R0 in eBPF. + registers and place their return value into ``%rax`` which is R0 in eBPF. Prologue and epilogue are emitted by JIT and are implicit in the interpreter. R0-R5 are scratch registers, so eBPF program needs to preserve them across the calls as defined by calling convention. - For example the following program is invalid: + For example the following program is invalid:: bpf_mov R1, 1 bpf_call foo @@ -814,7 +846,7 @@ The input context pointer for invoking the interpreter function is generic, its content is defined by a specific use case. For seccomp register R1 points to seccomp_data, for converted BPF filters R1 points to a skb. -A program, that is translated internally consists of the following elements: +A program that is translated internally consists of the following elements:: op:16, jt:8, jf:8, k:32 ==> op:8, dst_reg:4, src_reg:4, off:16, imm:32 @@ -824,7 +856,7 @@ instructions must be multiple of 8 bytes to preserve backward compatibility. Internal BPF is a general purpose RISC instruction set. Not every register and every instruction are used during translation from original BPF to new format. 
-For example, socket filters are not using 'exclusive add' instruction, but +For example, socket filters are not using ``exclusive add`` instruction, but tracing filters may do to maintain counters of events, for example. Register R9 is not used by socket filters either, but more complex filters may be running out of registers and would have to resort to spill/fill to stack. @@ -849,7 +881,7 @@ eBPF opcode encoding eBPF is reusing most of the opcode encoding from classic to simplify conversion of classic BPF to eBPF. For arithmetic and jump instructions the 8-bit 'code' -field is divided into three parts: +field is divided into three parts:: +----------------+--------+--------------------+ | 4 bits | 1 bit | 3 bits | @@ -859,8 +891,9 @@ field is divided into three parts: Three LSB bits store instruction class which is one of: - Classic BPF classes: eBPF classes: - + =================== =============== + Classic BPF classes eBPF classes + =================== =============== BPF_LD 0x00 BPF_LD 0x00 BPF_LDX 0x01 BPF_LDX 0x01 BPF_ST 0x02 BPF_ST 0x02 @@ -869,25 +902,28 @@ Three LSB bits store instruction class which is one of: BPF_JMP 0x05 BPF_JMP 0x05 BPF_RET 0x06 BPF_JMP32 0x06 BPF_MISC 0x07 BPF_ALU64 0x07 + =================== =============== When BPF_CLASS(code) == BPF_ALU or BPF_JMP, 4th bit encodes source operand ... 
- BPF_K 0x00 - BPF_X 0x08 + :: + + BPF_K 0x00 + BPF_X 0x08 - * in classic BPF, this means: + * in classic BPF, this means:: - BPF_SRC(code) == BPF_X - use register X as source operand - BPF_SRC(code) == BPF_K - use 32-bit immediate as source operand + BPF_SRC(code) == BPF_X - use register X as source operand + BPF_SRC(code) == BPF_K - use 32-bit immediate as source operand - * in eBPF, this means: + * in eBPF, this means:: - BPF_SRC(code) == BPF_X - use 'src_reg' register as source operand - BPF_SRC(code) == BPF_K - use 32-bit immediate as source operand + BPF_SRC(code) == BPF_X - use 'src_reg' register as source operand + BPF_SRC(code) == BPF_K - use 32-bit immediate as source operand ... and four MSB bits store operation code. -If BPF_CLASS(code) == BPF_ALU or BPF_ALU64 [ in eBPF ], BPF_OP(code) is one of: +If BPF_CLASS(code) == BPF_ALU or BPF_ALU64 [ in eBPF ], BPF_OP(code) is one of:: BPF_ADD 0x00 BPF_SUB 0x10 @@ -904,7 +940,7 @@ If BPF_CLASS(code) == BPF_ALU or BPF_ALU64 [ in eBPF ], BPF_OP(code) is one of: BPF_ARSH 0xc0 /* eBPF only: sign extending shift right */ BPF_END 0xd0 /* eBPF only: endianness conversion */ -If BPF_CLASS(code) == BPF_JMP or BPF_JMP32 [ in eBPF ], BPF_OP(code) is one of: +If BPF_CLASS(code) == BPF_JMP or BPF_JMP32 [ in eBPF ], BPF_OP(code) is one of:: BPF_JA 0x00 /* BPF_JMP only */ BPF_JEQ 0x10 @@ -934,7 +970,7 @@ exactly the same operations as BPF_ALU, but with 64-bit wide operands instead. So BPF_ADD | BPF_X | BPF_ALU64 means 64-bit addition, i.e.: dst_reg = dst_reg + src_reg -Classic BPF wastes the whole BPF_RET class to represent a single 'ret' +Classic BPF wastes the whole BPF_RET class to represent a single ``ret`` operation. Classic BPF_RET | BPF_K means copy imm32 into return register and perform function exit. eBPF is modeled to match CPU, so BPF_JMP | BPF_EXIT in eBPF means function exit only. The eBPF program needs to store return @@ -942,7 +978,7 @@ value into register R0 before doing a BPF_EXIT. 
Class 6 in eBPF is used as BPF_JMP32 to mean exactly the same operations as BPF_JMP, but with 32-bit wide operands for the comparisons instead. -For load and store instructions the 8-bit 'code' field is divided as: +For load and store instructions the 8-bit 'code' field is divided as:: +--------+--------+-------------------+ | 3 bits | 2 bits | 3 bits | @@ -952,19 +988,21 @@ For load and store instructions the 8-bit 'code' field is divided as: Size modifier is one of ... +:: + BPF_W 0x00 /* word */ BPF_H 0x08 /* half word */ BPF_B 0x10 /* byte */ BPF_DW 0x18 /* eBPF only, double word */ -... which encodes size of load/store operation: +... which encodes size of load/store operation:: B - 1 byte H - 2 byte W - 4 byte DW - 8 byte (eBPF only) -Mode modifier is one of: +Mode modifier is one of:: BPF_IMM 0x00 /* used for 32-bit mov in classic BPF and 64-bit in eBPF */ BPF_ABS 0x20 @@ -979,7 +1017,7 @@ eBPF has two non-generic instructions: (BPF_ABS | <size> | BPF_LD) and They had to be carried over from classic to have strong performance of socket filters running in eBPF interpreter. These instructions can only -be used when interpreter context is a pointer to 'struct sk_buff' and +be used when interpreter context is a pointer to ``struct sk_buff`` and have seven implicit operands. Register R6 is an implicit input that must contain pointer to sk_buff. Register R0 is an implicit output which contains the data fetched from the packet. Registers R1-R5 are scratch registers @@ -992,26 +1030,26 @@ the interpreter will abort the execution of the program. JIT compilers therefore must preserve this property. src_reg and imm32 fields are explicit inputs to these instructions. -For example: +For example:: BPF_IND | BPF_W | BPF_LD means: R0 = ntohl(*(u32 *) (((struct sk_buff *) R6)->data + src_reg + imm32)) and R1 - R5 were scratched. 
-Unlike classic BPF instruction set, eBPF has generic load/store operations: +Unlike classic BPF instruction set, eBPF has generic load/store operations:: -BPF_MEM | <size> | BPF_STX: *(size *) (dst_reg + off) = src_reg -BPF_MEM | <size> | BPF_ST: *(size *) (dst_reg + off) = imm32 -BPF_MEM | <size> | BPF_LDX: dst_reg = *(size *) (src_reg + off) -BPF_XADD | BPF_W | BPF_STX: lock xadd *(u32 *)(dst_reg + off16) += src_reg -BPF_XADD | BPF_DW | BPF_STX: lock xadd *(u64 *)(dst_reg + off16) += src_reg + BPF_MEM | <size> | BPF_STX: *(size *) (dst_reg + off) = src_reg + BPF_MEM | <size> | BPF_ST: *(size *) (dst_reg + off) = imm32 + BPF_MEM | <size> | BPF_LDX: dst_reg = *(size *) (src_reg + off) + BPF_XADD | BPF_W | BPF_STX: lock xadd *(u32 *)(dst_reg + off16) += src_reg + BPF_XADD | BPF_DW | BPF_STX: lock xadd *(u64 *)(dst_reg + off16) += src_reg Where size is one of: BPF_B or BPF_H or BPF_W or BPF_DW. Note that 1 and 2 byte atomic increments are not supported. eBPF has one 16-byte instruction: BPF_LD | BPF_DW | BPF_IMM which consists -of two consecutive 'struct bpf_insn' 8-byte blocks and interpreted as single +of two consecutive ``struct bpf_insn`` 8-byte blocks and interpreted as single instruction that loads 64-bit immediate value into a dst_reg. Classic BPF has similar instruction: BPF_LD | BPF_W | BPF_IMM which loads 32-bit immediate value into a register. @@ -1037,38 +1075,48 @@ since addition of two valid pointers makes invalid pointer. (In 'secure' mode verifier will reject any type of pointer arithmetic to make sure that kernel addresses don't leak to unprivileged users) -If register was never written to, it's not readable: +If register was never written to, it's not readable:: + bpf_mov R0 = R2 bpf_exit + will be rejected, since R2 is unreadable at the start of the program. After kernel function call, R1-R5 are reset to unreadable and R0 has a return type of the function. Since R6-R9 are callee saved, their state is preserved across the call. 
+ +:: + bpf_mov R6 = 1 bpf_call foo bpf_mov R0 = R6 bpf_exit + is a correct program. If there was R1 instead of R6, it would have been rejected. load/store instructions are allowed only with registers of valid types, which are PTR_TO_CTX, PTR_TO_MAP, PTR_TO_STACK. They are bounds and alignment checked. -For example: +For example:: + bpf_mov R1 = 1 bpf_mov R2 = 2 bpf_xadd *(u32 *)(R1 + 3) += R2 bpf_exit + will be rejected, since R1 doesn't have a valid pointer type at the time of execution of instruction bpf_xadd. -At the start R1 type is PTR_TO_CTX (a pointer to generic 'struct bpf_context') +At the start R1 type is PTR_TO_CTX (a pointer to generic ``struct bpf_context``) A callback is used to customize verifier to restrict eBPF program access to only certain fields within ctx structure with specified size and alignment. -For example, the following insn: +For example, the following insn:: + bpf_ld R0 = *(u32 *)(R6 + 8) + intends to load a word from address R6 + 8 and store it into R0 If R6=PTR_TO_CTX, via is_valid_access() callback the verifier will know that offset 8 of size 4 bytes can be accessed for reading, otherwise @@ -1079,10 +1127,13 @@ so it will fail verification, since it's out of bounds. The verifier will allow eBPF program to read data from stack only after it wrote into it. + Classic BPF verifier does similar check with M[0-15] memory slots. -For example: +For example:: + bpf_ld R0 = *(u32 *)(R10 - 4) bpf_exit + is invalid program. Though R10 is correct read-only register and has type PTR_TO_STACK and R10 - 4 is within stack bounds, there were no stores into that location. @@ -1113,48 +1164,61 @@ Register value tracking ----------------------- In order to determine the safety of an eBPF program, the verifier must track the range of possible values in each register and also in each stack slot. 
-This is done with 'struct bpf_reg_state', defined in include/linux/ +This is done with ``struct bpf_reg_state``, defined in include/linux/ bpf_verifier.h, which unifies tracking of scalar and pointer values. Each register state has a type, which is either NOT_INIT (the register has not been written to), SCALAR_VALUE (some value which is not usable as a pointer), or a pointer type. The types of pointers describe their base, as follows: - PTR_TO_CTX Pointer to bpf_context. - CONST_PTR_TO_MAP Pointer to struct bpf_map. "Const" because arithmetic - on these pointers is forbidden. - PTR_TO_MAP_VALUE Pointer to the value stored in a map element. + + + PTR_TO_CTX + Pointer to bpf_context. + CONST_PTR_TO_MAP + Pointer to struct bpf_map. "Const" because arithmetic + on these pointers is forbidden. + PTR_TO_MAP_VALUE + Pointer to the value stored in a map element. PTR_TO_MAP_VALUE_OR_NULL - Either a pointer to a map value, or NULL; map accesses - (see section 'eBPF maps', below) return this type, - which becomes a PTR_TO_MAP_VALUE when checked != NULL. - Arithmetic on these pointers is forbidden. - PTR_TO_STACK Frame pointer. - PTR_TO_PACKET skb->data. - PTR_TO_PACKET_END skb->data + headlen; arithmetic forbidden. - PTR_TO_SOCKET Pointer to struct bpf_sock_ops, implicitly refcounted. + Either a pointer to a map value, or NULL; map accesses + (see section 'eBPF maps', below) return this type, + which becomes a PTR_TO_MAP_VALUE when checked != NULL. + Arithmetic on these pointers is forbidden. + PTR_TO_STACK + Frame pointer. + PTR_TO_PACKET + skb->data. + PTR_TO_PACKET_END + skb->data + headlen; arithmetic forbidden. + PTR_TO_SOCKET + Pointer to struct bpf_sock_ops, implicitly refcounted. PTR_TO_SOCKET_OR_NULL - Either a pointer to a socket, or NULL; socket lookup - returns this type, which becomes a PTR_TO_SOCKET when - checked != NULL. 
PTR_TO_SOCKET is reference-counted, - so programs must release the reference through the - socket release function before the end of the program. - Arithmetic on these pointers is forbidden. + Either a pointer to a socket, or NULL; socket lookup + returns this type, which becomes a PTR_TO_SOCKET when + checked != NULL. PTR_TO_SOCKET is reference-counted, + so programs must release the reference through the + socket release function before the end of the program. + Arithmetic on these pointers is forbidden. + However, a pointer may be offset from this base (as a result of pointer arithmetic), and this is tracked in two parts: the 'fixed offset' and 'variable offset'. The former is used when an exactly-known value (e.g. an immediate operand) is added to a pointer, while the latter is used for values which are not exactly known. The variable offset is also used in SCALAR_VALUEs, to track the range of possible values in the register. + The verifier's knowledge about the variable offset consists of: + * minimum and maximum values as unsigned * minimum and maximum values as signed + * knowledge of the values of individual bits, in the form of a 'tnum': a u64 -'mask' and a u64 'value'. 1s in the mask represent bits whose value is unknown; -1s in the value represent bits known to be 1. Bits known to be 0 have 0 in both -mask and value; no bit should ever be 1 in both. For example, if a byte is read -into a register from memory, the register's top 56 bits are known zero, while -the low 8 are unknown - which is represented as the tnum (0x0; 0xff). If we -then OR this with 0x40, we get (0x40; 0xbf), then if we add 1 we get (0x0; -0x1ff), because of potential carries. + 'mask' and a u64 'value'. 1s in the mask represent bits whose value is unknown; + 1s in the value represent bits known to be 1. Bits known to be 0 have 0 in both + mask and value; no bit should ever be 1 in both. 
For example, if a byte is read + into a register from memory, the register's top 56 bits are known zero, while + the low 8 are unknown - which is represented as the tnum (0x0; 0xff). If we + then OR this with 0x40, we get (0x40; 0xbf), then if we add 1 we get (0x0; + 0x1ff), because of potential carries. Besides arithmetic, the register state can also be updated by conditional branches. For instance, if a SCALAR_VALUE is compared > 8, in the 'true' branch @@ -1188,7 +1252,7 @@ The 'id' field is also used on PTR_TO_SOCKET and PTR_TO_SOCKET_OR_NULL, common to all copies of the pointer returned from a socket lookup. This has similar behaviour to the handling for PTR_TO_MAP_VALUE_OR_NULL->PTR_TO_MAP_VALUE, but it also handles reference tracking for the pointer. PTR_TO_SOCKET implicitly -represents a reference to the corresponding 'struct sock'. To ensure that the +represents a reference to the corresponding ``struct sock``. To ensure that the reference is not leaked, it is imperative to NULL-check the reference and, in the non-NULL case, pass the valid reference to the socket release function. @@ -1196,17 +1260,18 @@ Direct packet access -------------------- In cls_bpf and act_bpf programs the verifier allows direct access to the packet data via skb->data and skb->data_end pointers. 
-Ex: -1: r4 = *(u32 *)(r1 +80) /* load skb->data_end */ -2: r3 = *(u32 *)(r1 +76) /* load skb->data */ -3: r5 = r3 -4: r5 += 14 -5: if r5 > r4 goto pc+16 -R1=ctx R3=pkt(id=0,off=0,r=14) R4=pkt_end R5=pkt(id=0,off=14,r=14) R10=fp -6: r0 = *(u16 *)(r3 +12) /* access 12 and 13 bytes of the packet */ +Ex:: + + 1: r4 = *(u32 *)(r1 +80) /* load skb->data_end */ + 2: r3 = *(u32 *)(r1 +76) /* load skb->data */ + 3: r5 = r3 + 4: r5 += 14 + 5: if r5 > r4 goto pc+16 + R1=ctx R3=pkt(id=0,off=0,r=14) R4=pkt_end R5=pkt(id=0,off=14,r=14) R10=fp + 6: r0 = *(u16 *)(r3 +12) /* access 12 and 13 bytes of the packet */ this 2byte load from the packet is safe to do, since the program author -did check 'if (skb->data + 14 > skb->data_end) goto err' at insn #5 which +did check ``if (skb->data + 14 > skb->data_end) goto err`` at insn #5 which means that in the fall-through case the register R3 (which points to skb->data) has at least 14 directly accessible bytes. The verifier marks it as R3=pkt(id=0,off=0,r=14). @@ -1215,52 +1280,58 @@ off=0 means that no additional constants were added. r=14 is the range of safe access which means that bytes [R3, R3 + 14) are ok. Note that R5 is marked as R5=pkt(id=0,off=14,r=14). It also points to the packet data, but constant 14 was added to the register, so -it now points to 'skb->data + 14' and accessible range is [R5, R5 + 14 - 14) +it now points to ``skb->data + 14`` and accessible range is [R5, R5 + 14 - 14) which is zero bytes. 
-More complex packet access may look like: - R0=inv1 R1=ctx R3=pkt(id=0,off=0,r=14) R4=pkt_end R5=pkt(id=0,off=14,r=14) R10=fp - 6: r0 = *(u8 *)(r3 +7) /* load 7th byte from the packet */ - 7: r4 = *(u8 *)(r3 +12) - 8: r4 *= 14 - 9: r3 = *(u32 *)(r1 +76) /* load skb->data */ -10: r3 += r4 -11: r2 = r1 -12: r2 <<= 48 -13: r2 >>= 48 -14: r3 += r2 -15: r2 = r3 -16: r2 += 8 -17: r1 = *(u32 *)(r1 +80) /* load skb->data_end */ -18: if r2 > r1 goto pc+2 - R0=inv(id=0,umax_value=255,var_off=(0x0; 0xff)) R1=pkt_end R2=pkt(id=2,off=8,r=8) R3=pkt(id=2,off=0,r=8) R4=inv(id=0,umax_value=3570,var_off=(0x0; 0xfffe)) R5=pkt(id=0,off=14,r=14) R10=fp -19: r1 = *(u8 *)(r3 +4) +More complex packet access may look like:: + + + R0=inv1 R1=ctx R3=pkt(id=0,off=0,r=14) R4=pkt_end R5=pkt(id=0,off=14,r=14) R10=fp + 6: r0 = *(u8 *)(r3 +7) /* load 7th byte from the packet */ + 7: r4 = *(u8 *)(r3 +12) + 8: r4 *= 14 + 9: r3 = *(u32 *)(r1 +76) /* load skb->data */ + 10: r3 += r4 + 11: r2 = r1 + 12: r2 <<= 48 + 13: r2 >>= 48 + 14: r3 += r2 + 15: r2 = r3 + 16: r2 += 8 + 17: r1 = *(u32 *)(r1 +80) /* load skb->data_end */ + 18: if r2 > r1 goto pc+2 + R0=inv(id=0,umax_value=255,var_off=(0x0; 0xff)) R1=pkt_end R2=pkt(id=2,off=8,r=8) R3=pkt(id=2,off=0,r=8) R4=inv(id=0,umax_value=3570,var_off=(0x0; 0xfffe)) R5=pkt(id=0,off=14,r=14) R10=fp + 19: r1 = *(u8 *)(r3 +4) + The state of the register R3 is R3=pkt(id=2,off=0,r=8) -id=2 means that two 'r3 += rX' instructions were seen, so r3 points to some +id=2 means that two ``r3 += rX`` instructions were seen, so r3 points to some offset within a packet and since the program author did -'if (r3 + 8 > r1) goto err' at insn #18, the safe range is [R3, R3 + 8). +``if (r3 + 8 > r1) goto err`` at insn #18, the safe range is [R3, R3 + 8). The verifier only allows 'add'/'sub' operations on packet registers. Any other operation will set the register state to 'SCALAR_VALUE' and it won't be available for direct packet access. 
-Operation 'r3 += rX' may overflow and become less than original skb->data, -therefore the verifier has to prevent that. So when it sees 'r3 += rX' + +Operation ``r3 += rX`` may overflow and become less than original skb->data, +therefore the verifier has to prevent that. So when it sees ``r3 += rX`` instruction and rX is more than 16-bit value, any subsequent bounds-check of r3 against skb->data_end will not give us 'range' information, so attempts to read through the pointer will give "invalid access to packet" error. -Ex. after insn 'r4 = *(u8 *)(r3 +12)' (insn #7 above) the state of r4 is + +Ex. after insn ``r4 = *(u8 *)(r3 +12)`` (insn #7 above) the state of r4 is R4=inv(id=0,umax_value=255,var_off=(0x0; 0xff)) which means that upper 56 bits of the register are guaranteed to be zero, and nothing is known about the lower -8 bits. After insn 'r4 *= 14' the state becomes +8 bits. After insn ``r4 *= 14`` the state becomes R4=inv(id=0,umax_value=3570,var_off=(0x0; 0xfffe)), since multiplying an 8-bit value by constant 14 will keep upper 52 bits as zero, also the least significant -bit will be zero as 14 is even. Similarly 'r2 >>= 48' will make +bit will be zero as 14 is even. Similarly ``r2 >>= 48`` will make R2=inv(id=0,umax_value=65535,var_off=(0x0; 0xffff)), since the shift is not sign extending. This logic is implemented in adjust_reg_min_max_vals() function, which calls adjust_ptr_min_max_vals() for adding pointer to scalar (or vice versa) and adjust_scalar_min_max_vals() for operations on two scalars. 
The end result is that bpf program author can access packet directly -using normal C code as: +using normal C code as:: + void *data = (void *)(long)skb->data; void *data_end = (void *)(long)skb->data_end; struct eth_hdr *eth = data; @@ -1268,13 +1339,14 @@ using normal C code as: struct udphdr *udp = data + sizeof(*eth) + sizeof(*iph); if (data + sizeof(*eth) + sizeof(*iph) + sizeof(*udp) > data_end) - return 0; + return 0; if (eth->h_proto != htons(ETH_P_IP)) - return 0; + return 0; if (iph->protocol != IPPROTO_UDP || iph->ihl != 5) - return 0; + return 0; if (udp->dest == 53 || udp->source == 9) - ...; + ...; + which makes such programs easier to write comparing to LD_ABS insn and significantly faster. @@ -1284,23 +1356,24 @@ eBPF maps and userspace. The maps are accessed from user space via BPF syscall, which has commands: + - create a map with given type and attributes - map_fd = bpf(BPF_MAP_CREATE, union bpf_attr *attr, u32 size) + ``map_fd = bpf(BPF_MAP_CREATE, union bpf_attr *attr, u32 size)`` using attr->map_type, attr->key_size, attr->value_size, attr->max_entries returns process-local file descriptor or negative error - lookup key in a given map - err = bpf(BPF_MAP_LOOKUP_ELEM, union bpf_attr *attr, u32 size) + ``err = bpf(BPF_MAP_LOOKUP_ELEM, union bpf_attr *attr, u32 size)`` using attr->map_fd, attr->key, attr->value returns zero and stores found elem into value or negative error - create or update key/value pair in a given map - err = bpf(BPF_MAP_UPDATE_ELEM, union bpf_attr *attr, u32 size) + ``err = bpf(BPF_MAP_UPDATE_ELEM, union bpf_attr *attr, u32 size)`` using attr->map_fd, attr->key, attr->value returns zero or negative error - find and delete element by key in a given map - err = bpf(BPF_MAP_DELETE_ELEM, union bpf_attr *attr, u32 size) + ``err = bpf(BPF_MAP_DELETE_ELEM, union bpf_attr *attr, u32 size)`` using attr->map_fd, attr->key - to delete map: close(fd) @@ -1312,10 +1385,11 @@ are concurrently updating. 
maps can have different types: hash, array, bloom filter, radix-tree, etc. The map is defined by: - . type - . max number of elements - . key size in bytes - . value size in bytes + + - type + - max number of elements + - key size in bytes + - value size in bytes Pruning ------- @@ -1339,57 +1413,75 @@ Understanding eBPF verifier messages The following are few examples of invalid eBPF programs and verifier error messages as seen in the log: -Program with unreachable instructions: -static struct bpf_insn prog[] = { +Program with unreachable instructions:: + + static struct bpf_insn prog[] = { BPF_EXIT_INSN(), BPF_EXIT_INSN(), -}; + }; + Error: + unreachable insn 1 -Program that reads uninitialized register: +Program that reads uninitialized register:: + BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), BPF_EXIT_INSN(), -Error: + +Error:: + 0: (bf) r0 = r2 R2 !read_ok -Program that doesn't initialize R0 before exiting: +Program that doesn't initialize R0 before exiting:: + BPF_MOV64_REG(BPF_REG_2, BPF_REG_1), BPF_EXIT_INSN(), -Error: + +Error:: + 0: (bf) r2 = r1 1: (95) exit R0 !read_ok -Program that accesses stack out of bounds: - BPF_ST_MEM(BPF_DW, BPF_REG_10, 8, 0), - BPF_EXIT_INSN(), -Error: - 0: (7a) *(u64 *)(r10 +8) = 0 - invalid stack off=8 size=8 +Program that accesses stack out of bounds:: + + BPF_ST_MEM(BPF_DW, BPF_REG_10, 8, 0), + BPF_EXIT_INSN(), + +Error:: + + 0: (7a) *(u64 *)(r10 +8) = 0 + invalid stack off=8 size=8 + +Program that doesn't initialize stack before passing its address into function:: -Program that doesn't initialize stack before passing its address into function: BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), BPF_LD_MAP_FD(BPF_REG_1, 0), BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), BPF_EXIT_INSN(), -Error: + +Error:: + 0: (bf) r2 = r10 1: (07) r2 += -8 2: (b7) r1 = 0x0 3: (85) call 1 invalid indirect read from stack off -8+0 size 8 -Program that uses invalid map_fd=0 while calling to 
map_lookup_elem() function: +Program that uses invalid map_fd=0 while calling to map_lookup_elem() function:: + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), BPF_LD_MAP_FD(BPF_REG_1, 0), BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), BPF_EXIT_INSN(), -Error: + +Error:: + 0: (7a) *(u64 *)(r10 -8) = 0 1: (bf) r2 = r10 2: (07) r2 += -8 @@ -1398,7 +1490,8 @@ Error: fd 0 is not pointing to valid bpf_map Program that doesn't check return value of map_lookup_elem() before accessing -map element: +map element:: + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), @@ -1406,7 +1499,9 @@ map element: BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0), BPF_EXIT_INSN(), -Error: + +Error:: + 0: (7a) *(u64 *)(r10 -8) = 0 1: (bf) r2 = r10 2: (07) r2 += -8 @@ -1416,7 +1511,8 @@ Error: R0 invalid mem access 'map_value_or_null' Program that correctly checks map_lookup_elem() returned value for NULL, but -accesses the memory with incorrect alignment: +accesses the memory with incorrect alignment:: + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), @@ -1425,7 +1521,9 @@ accesses the memory with incorrect alignment: BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1), BPF_ST_MEM(BPF_DW, BPF_REG_0, 4, 0), BPF_EXIT_INSN(), -Error: + +Error:: + 0: (7a) *(u64 *)(r10 -8) = 0 1: (bf) r2 = r10 2: (07) r2 += -8 @@ -1438,7 +1536,8 @@ Error: Program that correctly checks map_lookup_elem() returned value for NULL and accesses memory with correct alignment in one side of 'if' branch, but fails -to do so in the other side of 'if' branch: +to do so in the other side of 'if' branch:: + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), @@ -1449,7 +1548,9 @@ to do so in the other 
side of 'if' branch: BPF_EXIT_INSN(), BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 1), BPF_EXIT_INSN(), -Error: + +Error:: + 0: (7a) *(u64 *)(r10 -8) = 0 1: (bf) r2 = r10 2: (07) r2 += -8 @@ -1465,8 +1566,8 @@ Error: R0 invalid mem access 'imm' Program that performs a socket lookup then sets the pointer to NULL without -checking it: -value: +checking it:: + BPF_MOV64_IMM(BPF_REG_2, 0), BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_2, -8), BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), @@ -1477,7 +1578,9 @@ value: BPF_EMIT_CALL(BPF_FUNC_sk_lookup_tcp), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), -Error: + +Error:: + 0: (b7) r2 = 0 1: (63) *(u32 *)(r10 -8) = r2 2: (bf) r2 = r10 @@ -1491,7 +1594,8 @@ Error: Unreleased reference id=1, alloc_insn=7 Program that performs a socket lookup but does not NULL-check the returned -value: +value:: + BPF_MOV64_IMM(BPF_REG_2, 0), BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_2, -8), BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), @@ -1501,7 +1605,9 @@ value: BPF_MOV64_IMM(BPF_REG_5, 0), BPF_EMIT_CALL(BPF_FUNC_sk_lookup_tcp), BPF_EXIT_INSN(), -Error: + +Error:: + 0: (b7) r2 = 0 1: (63) *(u32 *)(r10 -8) = r2 2: (bf) r2 = r10 @@ -1519,7 +1625,7 @@ Testing Next to the BPF toolchain, the kernel also ships a test module that contains various test cases for classic and internal BPF that can be executed against the BPF interpreter and JIT compiler. It can be found in lib/test_bpf.c and -enabled via Kconfig: +enabled via Kconfig:: CONFIG_TEST_BPF=m @@ -1540,6 +1646,6 @@ The document was written in the hope that it is found useful and in order to give potential BPF hackers or security auditors a better overview of the underlying architecture. 
-Jay Schulist <jschlst@samba.org> -Daniel Borkmann <daniel@iogearbox.net> -Alexei Starovoitov <ast@kernel.org> +- Jay Schulist <jschlst@samba.org> +- Daniel Borkmann <daniel@iogearbox.net> +- Alexei Starovoitov <ast@kernel.org> diff --git a/Documentation/networking/fore200e.txt b/Documentation/networking/fore200e.rst index 1f98f62b4370..55df9ec09ac8 100644 --- a/Documentation/networking/fore200e.txt +++ b/Documentation/networking/fore200e.rst @@ -1,6 +1,8 @@ +.. SPDX-License-Identifier: GPL-2.0 +============================================= FORE Systems PCA-200E/SBA-200E ATM NIC driver ---------------------------------------------- +============================================= This driver adds support for the FORE Systems 200E-series ATM adapters to the Linux operating system. It is based on the earlier PCA-200E driver @@ -27,8 +29,8 @@ in the linux/drivers/atm directory for details and restrictions. Firmware Updates ---------------- -The FORE Systems 200E-series driver is shipped with firmware data being -uploaded to the ATM adapters at system boot time or at module loading time. +The FORE Systems 200E-series driver is shipped with firmware data being +uploaded to the ATM adapters at system boot time or at module loading time. The supplied firmware images should work with all adapters. However, if you encounter problems (the firmware doesn't start or the driver diff --git a/Documentation/networking/framerelay.txt b/Documentation/networking/framerelay.rst index 1a0b720440dd..6d904399ec6d 100644 --- a/Documentation/networking/framerelay.txt +++ b/Documentation/networking/framerelay.rst @@ -1,4 +1,10 @@ -Frame Relay (FR) support for linux is built into a two tiered system of device +.. SPDX-License-Identifier: GPL-2.0 + +================ +Frame Relay (FR) +================ + +Frame Relay (FR) support for linux is built into a two tiered system of device drivers. 
The upper layer implements RFC1490 FR specification, and uses the Data Link Connection Identifier (DLCI) as its hardware address. Usually these are assigned by your network supplier, they give you the number/numbers of @@ -7,18 +13,18 @@ the Virtual Connections (VC) assigned to you. Each DLCI is a point-to-point link between your machine and a remote one. As such, a separate device is needed to accommodate the routing. Within the net-tools archives is 'dlcicfg'. This program will communicate with the -base "DLCI" device, and create new net devices named 'dlci00', 'dlci01'... +base "DLCI" device, and create new net devices named 'dlci00', 'dlci01'... The configuration script will ask you how many DLCIs you need, as well as how many DLCIs you want to assign to each Frame Relay Access Device (FRAD). The DLCI uses a number of function calls to communicate with the FRAD, all -of which are stored in the FRAD's private data area. assoc/deassoc, +of which are stored in the FRAD's private data area. assoc/deassoc, activate/deactivate and dlci_config. The DLCI supplies a receive function to the FRAD to accept incoming packets. With this initial offering, only 1 FRAD driver is available. With many thanks -to Sangoma Technologies, David Mandelstam & Gene Kozin, the S502A, S502E & -S508 are supported. This driver is currently set up for only FR, but as +to Sangoma Technologies, David Mandelstam & Gene Kozin, the S502A, S502E & +S508 are supported. This driver is currently set up for only FR, but as Sangoma makes more firmware modules available, it can be updated to provide them as well. @@ -32,8 +38,7 @@ an initial configuration. Additional FRAD device drivers can be added as hardware is available. At this time, the dlcicfg and fradcfg programs have not been incorporated into -the net-tools distribution. They can be found at ftp.invlogic.com, in +the net-tools distribution. They can be found at ftp.invlogic.com, in /pub/linux. 
Note that with OS/2 FTPD, you end up in /pub by default, so just -use 'cd linux'. v0.10 is for use on pre-2.0.3 and earlier, v0.15 is for +use 'cd linux'. v0.10 is for use on pre-2.0.3 and earlier, v0.15 is for pre-2.0.4 and later. - diff --git a/Documentation/networking/gen_stats.txt b/Documentation/networking/gen_stats.rst index 179b18ce45ff..595a83b9a61b 100644 --- a/Documentation/networking/gen_stats.txt +++ b/Documentation/networking/gen_stats.rst @@ -1,67 +1,76 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=============================================== Generic networking statistics for netlink users -====================================================================== +=============================================== Statistic counters are grouped into structs: +==================== ===================== ===================== Struct TLV type Description ----------------------------------------------------------------------- +==================== ===================== ===================== gnet_stats_basic TCA_STATS_BASIC Basic statistics gnet_stats_rate_est TCA_STATS_RATE_EST Rate estimator gnet_stats_queue TCA_STATS_QUEUE Queue statistics none TCA_STATS_APP Application specific +==================== ===================== ===================== Collecting: ----------- -Declare the statistic structs you need: -struct mystruct { - struct gnet_stats_basic bstats; - struct gnet_stats_queue qstats; - ... -}; +Declare the statistic structs you need:: + + struct mystruct { + struct gnet_stats_basic bstats; + struct gnet_stats_queue qstats; + ... + }; + +Update statistics, in dequeue() methods only, (while owning qdisc->running):: -Update statistics, in dequeue() methods only, (while owning qdisc->running) -mystruct->tstats.packet++; -mystruct->qstats.backlog += skb->pkt_len; + mystruct->tstats.packet++; + mystruct->qstats.backlog += skb->pkt_len; Export to userspace (Dump): --------------------------- -my_dumping_routine(struct sk_buff *skb, ...) 
-{ - struct gnet_dump dump; +:: - if (gnet_stats_start_copy(skb, TCA_STATS2, &mystruct->lock, &dump, - TCA_PAD) < 0) - goto rtattr_failure; + my_dumping_routine(struct sk_buff *skb, ...) + { + struct gnet_dump dump; - if (gnet_stats_copy_basic(&dump, &mystruct->bstats) < 0 || - gnet_stats_copy_queue(&dump, &mystruct->qstats) < 0 || - gnet_stats_copy_app(&dump, &xstats, sizeof(xstats)) < 0) - goto rtattr_failure; + if (gnet_stats_start_copy(skb, TCA_STATS2, &mystruct->lock, &dump, + TCA_PAD) < 0) + goto rtattr_failure; - if (gnet_stats_finish_copy(&dump) < 0) - goto rtattr_failure; - ... -} + if (gnet_stats_copy_basic(&dump, &mystruct->bstats) < 0 || + gnet_stats_copy_queue(&dump, &mystruct->qstats) < 0 || + gnet_stats_copy_app(&dump, &xstats, sizeof(xstats)) < 0) + goto rtattr_failure; + + if (gnet_stats_finish_copy(&dump) < 0) + goto rtattr_failure; + ... + } TCA_STATS/TCA_XSTATS backward compatibility: -------------------------------------------- Prior users of struct tc_stats and xstats can maintain backward compatibility by calling the compat wrappers to keep providing the -existing TLV types. +existing TLV types:: -my_dumping_routine(struct sk_buff *skb, ...) -{ - if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, - TCA_XSTATS, &mystruct->lock, &dump, - TCA_PAD) < 0) - goto rtattr_failure; - ... -} + my_dumping_routine(struct sk_buff *skb, ...) + { + if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, + TCA_XSTATS, &mystruct->lock, &dump, + TCA_PAD) < 0) + goto rtattr_failure; + ... + } A struct tc_stats will be filled out during gnet_stats_copy_* calls and appended to the skb. TCA_XSTATS is provided if gnet_stats_copy_app @@ -77,7 +86,7 @@ are responsible for making sure that the lock is initialized. Rate Estimator: --------------- +--------------- 0) Prepare an estimator attribute. Most likely this would be in user space. The value of this TLV should contain a tc_estimator structure. 
@@ -92,18 +101,19 @@ Rate Estimator: TCA_RATE to your code in the kernel. In the kernel when setting up: + 1) make sure you have basic stats and rate stats setup first. 2) make sure you have initialized stats lock that is used to setup such stats. -3) Now initialize a new estimator: +3) Now initialize a new estimator:: - int ret = gen_new_estimator(my_basicstats,my_rate_est_stats, - mystats_lock, attr_with_tcestimator_struct); + int ret = gen_new_estimator(my_basicstats,my_rate_est_stats, + mystats_lock, attr_with_tcestimator_struct); - if ret == 0 - success - else - failed + if ret == 0 + success + else + failed From now on, every time you dump my_rate_est_stats it will contain up-to-date info. @@ -115,5 +125,5 @@ are still valid (i.e still exist) at the time of making this call. Authors: -------- -Thomas Graf <tgraf@suug.ch> -Jamal Hadi Salim <hadi@cyberus.ca> +- Thomas Graf <tgraf@suug.ch> +- Jamal Hadi Salim <hadi@cyberus.ca> diff --git a/Documentation/networking/generic-hdlc.txt b/Documentation/networking/generic-hdlc.rst index 4eb3cc40b702..1c3bb5cb98d4 100644 --- a/Documentation/networking/generic-hdlc.txt +++ b/Documentation/networking/generic-hdlc.rst @@ -1,14 +1,22 @@ +.. SPDX-License-Identifier: GPL-2.0 + +================== Generic HDLC layer +================== + Krzysztof Halasa <khc@pm.waw.pl> Generic HDLC layer currently supports: + 1. Frame Relay (ANSI, CCITT, Cisco and no LMI) + - Normal (routed) and Ethernet-bridged (Ethernet device emulation) interfaces can share a single PVC. - ARP support (no InARP support in the kernel - there is an experimental InARP user-space daemon available on: http://www.kernel.org/pub/linux/utils/net/hdlc/). + 2. raw HDLC - either IP (IPv4) interface or Ethernet device emulation 3. Cisco HDLC 4. PPP @@ -24,19 +32,24 @@ with IEEE 802.1Q (VLANs) and 802.1D (Ethernet bridging). Make sure the hdlc.o and the hardware driver are loaded. It should create a number of "hdlc" (hdlc0 etc) network devices, one for each WAN port. 
You'll need the "sethdlc" utility, get it from: + http://www.kernel.org/pub/linux/utils/net/hdlc/ -Compile sethdlc.c utility: +Compile sethdlc.c utility:: + gcc -O2 -Wall -o sethdlc sethdlc.c + Make sure you're using a correct version of sethdlc for your kernel. Use sethdlc to set physical interface, clock rate, HDLC mode used, and add any required PVCs if using Frame Relay. -Usually you want something like: +Usually you want something like:: sethdlc hdlc0 clock int rate 128000 sethdlc hdlc0 cisco interval 10 timeout 25 -or + +or:: + sethdlc hdlc0 rs232 clock ext sethdlc hdlc0 fr lmi ansi sethdlc hdlc0 create 99 @@ -49,46 +62,63 @@ any IP address to it) before using pvc devices. Setting interface: -* v35 | rs232 | x21 | t1 | e1 - sets physical interface for a given port - if the card has software-selectable interfaces - loopback - activate hardware loopback (for testing only) -* clock ext - both RX clock and TX clock external -* clock int - both RX clock and TX clock internal -* clock txint - RX clock external, TX clock internal -* clock txfromrx - RX clock external, TX clock derived from RX clock -* rate - sets clock rate in bps (for "int" or "txint" clock only) +* v35 | rs232 | x21 | t1 | e1 + - sets physical interface for a given port + if the card has software-selectable interfaces + loopback + - activate hardware loopback (for testing only) +* clock ext + - both RX clock and TX clock external +* clock int + - both RX clock and TX clock internal +* clock txint + - RX clock external, TX clock internal +* clock txfromrx + - RX clock external, TX clock derived from RX clock +* rate + - sets clock rate in bps (for "int" or "txint" clock only) Setting protocol: * hdlc - sets raw HDLC (IP-only) mode + nrz / nrzi / fm-mark / fm-space / manchester - sets transmission code + no-parity / crc16 / crc16-pr0 (CRC16 with preset zeros) / crc32-itu + crc16-itu (CRC16 with ITU-T polynomial) / crc16-itu-pr0 - sets parity * hdlc-eth - Ethernet device emulation using HDLC. 
Parity and encoding as above. * cisco - sets Cisco HDLC mode (IP, IPv6 and IPX supported) + interval - time in seconds between keepalive packets + timeout - time in seconds after last received keepalive packet before - we assume the link is down + we assume the link is down * ppp - sets synchronous PPP mode * x25 - sets X.25 mode * fr - Frame Relay mode + lmi ansi / ccitt / cisco / none - LMI (link management) type + dce - Frame Relay DCE (network) side LMI instead of default DTE (user). + It has nothing to do with clocks! - t391 - link integrity verification polling timer (in seconds) - user - t392 - polling verification timer (in seconds) - network - n391 - full status polling counter - user - n392 - error threshold - both user and network - n393 - monitored events count - both user and network + + - t391 - link integrity verification polling timer (in seconds) - user + - t392 - polling verification timer (in seconds) - network + - n391 - full status polling counter - user + - n392 - error threshold - both user and network + - n393 - monitored events count - both user and network Frame-Relay only: + * create n | delete n - adds / deletes PVC interface with DLCI #n. Newly created interface will be named pvc0, pvc1 etc. @@ -101,26 +131,34 @@ Frame-Relay only: Board-specific issues --------------------- -n2.o and c101.o need parameters to work: +n2.o and c101.o need parameters to work:: insmod n2 hw=io,irq,ram,ports[:io,irq,...] -example: + +example:: + insmod n2 hw=0x300,10,0xD0000,01 -or +or:: + insmod c101 hw=irq,ram[:irq,...] -example: + +example:: + insmod c101 hw=9,0xdc000 -If built into the kernel, these drivers need kernel (command line) parameters: +If built into the kernel, these drivers need kernel (command line) parameters:: + n2.hw=io,irq,ram,ports:... -or + +or:: + c101.hw=irq,ram:... 
If you have a problem with N2, C101 or PLX200SYN card, you can issue the -"private" command to see port's packet descriptor rings (in kernel logs): +"private" command to see port's packet descriptor rings (in kernel logs):: sethdlc hdlc0 private diff --git a/Documentation/networking/generic_netlink.txt b/Documentation/networking/generic_netlink.rst index 3e071115ca90..59e04ccf80c1 100644 --- a/Documentation/networking/generic_netlink.txt +++ b/Documentation/networking/generic_netlink.rst @@ -1,3 +1,9 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=============== +Generic Netlink +=============== + A wiki document on how to use Generic Netlink can be found here: * http://www.linuxfoundation.org/collaborate/workgroups/networking/generic_netlink_howto diff --git a/Documentation/networking/gtp.txt b/Documentation/networking/gtp.rst index 6966bbec1ecb..1563fb94b289 100644 --- a/Documentation/networking/gtp.txt +++ b/Documentation/networking/gtp.rst @@ -1,12 +1,18 @@ +.. SPDX-License-Identifier: GPL-2.0 + +===================================== The Linux kernel GTP tunneling module -====================================================================== -Documentation by Harald Welte <laforge@gnumonks.org> and - Andreas Schultz <aschultz@tpip.net> +===================================== + +Documentation by + Harald Welte <laforge@gnumonks.org> and + Andreas Schultz <aschultz@tpip.net> In 'drivers/net/gtp.c' you are finding a kernel-level implementation of a GTP tunnel endpoint. 
-== What is GTP == +What is GTP +=========== GTP is the Generic Tunnel Protocol, which is a 3GPP protocol used for tunneling User-IP payload between a mobile station (phone, modem) @@ -41,7 +47,8 @@ publicly via the 3GPP website at http://www.3gpp.org/DynaReport/29060.htm A direct PDF link to v13.6.0 is provided for convenience below: http://www.etsi.org/deliver/etsi_ts/129000_129099/129060/13.06.00_60/ts_129060v130600p.pdf -== The Linux GTP tunnelling module == +The Linux GTP tunnelling module +=============================== The module implements the function of a tunnel endpoint, i.e. it is able to decapsulate tunneled IP packets in the uplink originated by @@ -70,7 +77,8 @@ Userspace :) The official homepage of the module is at https://osmocom.org/projects/linux-kernel-gtp-u/wiki -== Userspace Programs with Linux Kernel GTP-U support == +Userspace Programs with Linux Kernel GTP-U support +================================================== At the time of this writing, there are at least two Free Software implementations that implement GTP-C and can use the netlink interface @@ -82,7 +90,8 @@ to make use of the Linux kernel GTP-U support: * ergw (GGSN + P-GW in Erlang): https://github.com/travelping/ergw -== Userspace Library / Command Line Utilities == +Userspace Library / Command Line Utilities +========================================== There is a userspace library called 'libgtpnl' which is based on libmnl and which implements a C-language API towards the netlink @@ -90,7 +99,8 @@ interface provided by the Kernel GTP module: http://git.osmocom.org/libgtpnl/ -== Protocol Versions == +Protocol Versions +================= There are two different versions of GTP-U: v0 [GSM TS 09.60] and v1 [3GPP TS 29.281]. Both are implemented in the Kernel GTP module. @@ -105,7 +115,8 @@ doesn't implement GTP-C, we don't have to worry about this. It's the responsibility of the control plane implementation in userspace to implement that. 
-== IPv6 == +IPv6 +==== The 3GPP specifications indicate either IPv4 or IPv6 can be used both on the inner (user) IP layer, or on the outer (transport) layer. @@ -114,22 +125,25 @@ Unfortunately, the Kernel module currently supports IPv6 neither for the User IP payload, nor for the outer IP layer. Patches or other Contributions to fix this are most welcome! -== Mailing List == +Mailing List +============ -If yo have questions regarding how to use the Kernel GTP module from +If you have questions regarding how to use the Kernel GTP module from your own software, or want to contribute to the code, please use the osmocom-net-gprs mailing list for related discussion. The list can be reached at osmocom-net-gprs@lists.osmocom.org and the mailman interface for managing your subscription is at https://lists.osmocom.org/mailman/listinfo/osmocom-net-gprs -== Issue Tracker == +Issue Tracker +============= The Osmocom project maintains an issue tracker for the Kernel GTP-U module at https://osmocom.org/projects/linux-kernel-gtp-u/issues -== History / Acknowledgements == +History / Acknowledgements +========================== The Module was originally created in 2012 by Harald Welte, but never completed. Pablo came in to finish the mess Harald left behind. But @@ -139,9 +153,11 @@ In 2015, Andreas Schultz came to the rescue and fixed lots more bugs, extended it with new features and finally pushed all of us to get it mainline, where it was merged in 4.7.0. -== Architectural Details == +Architectural Details +===================== -=== Local GTP-U entity and tunnel identification === +Local GTP-U entity and tunnel identification +-------------------------------------------- GTP-U uses UDP for transporting PDU's. The receiving UDP port is 2152 for GTPv1-U and 3386 for GTPv0-U. @@ -164,15 +180,15 @@ Therefore: destination IP and the tunnel endpoint id. The source IP and port have no meaning and can change at any time.
-[3GPP TS 29.281] Section 4.3.0 defines this so: +[3GPP TS 29.281] Section 4.3.0 defines this so:: -> The TEID in the GTP-U header is used to de-multiplex traffic -> incoming from remote tunnel endpoints so that it is delivered to the -> User plane entities in a way that allows multiplexing of different -> users, different packet protocols and different QoS levels. -> Therefore no two remote GTP-U endpoints shall send traffic to a -> GTP-U protocol entity using the same TEID value except -> for data forwarding as part of mobility procedures. + The TEID in the GTP-U header is used to de-multiplex traffic + incoming from remote tunnel endpoints so that it is delivered to the + User plane entities in a way that allows multiplexing of different + users, different packet protocols and different QoS levels. + Therefore no two remote GTP-U endpoints shall send traffic to a + GTP-U protocol entity using the same TEID value except + for data forwarding as part of mobility procedures. The definition above only defines that two remote GTP-U endpoints *should not* send to the same TEID, it *does not* forbid or exclude @@ -183,7 +199,8 @@ multiple or unknown peers. Therefore, the receiving side identifies tunnels exclusively based on TEIDs, not based on the source IP! -== APN vs. Network Device == +APN vs. Network Device +====================== The GTP-U driver creates a Linux network device for each Gi/SGi interface. @@ -201,29 +218,33 @@ number of Gi/SGi interfaces implemented by a GGSN/P-GW. [3GPP TS 29.061] Section 11.3 makes it clear that the selection of a specific Gi/SGi interfaces is made through the Access Point Name -(APN): - -> 2. each private network manages its own addressing. In general this -> will result in different private networks having overlapping -> address ranges. A logically separate connection (e.g. an IP in IP -> tunnel or layer 2 virtual circuit) is used between the GGSN/P-GW -> and each private network. 
-> -> In this case the IP address alone is not necessarily unique. The -> pair of values, Access Point Name (APN) and IPv4 address and/or -> IPv6 prefixes, is unique. +(APN):: + + 2. each private network manages its own addressing. In general this + will result in different private networks having overlapping + address ranges. A logically separate connection (e.g. an IP in IP + tunnel or layer 2 virtual circuit) is used between the GGSN/P-GW + and each private network. + + In this case the IP address alone is not necessarily unique. The + pair of values, Access Point Name (APN) and IPv4 address and/or + IPv6 prefixes, is unique. In order to support the overlapping address range use case, each APN is mapped to a separate Gi/SGi interface (network device). -NOTE: The Access Point Name is purely a control plane (GTP-C) concept. -At the GTP-U level, only Tunnel Endpoint Identifiers are present in -GTP-U packets and network devices are known +.. note:: + + The Access Point Name is purely a control plane (GTP-C) concept. + At the GTP-U level, only Tunnel Endpoint Identifiers are present in + GTP-U packets and network devices are known Therefore for a given UE the mapping in IP to PDN network is: + * network device + MS IP -> Peer IP + Peer TEID, and from PDN to IP network: + * local GTP-U IP + TEID -> network device Furthermore, before a received T-PDU is injected into the network diff --git a/Documentation/networking/hinic.txt b/Documentation/networking/hinic.rst index 989366a4039c..867ac8f4e04a 100644 --- a/Documentation/networking/hinic.txt +++ b/Documentation/networking/hinic.rst @@ -1,3 +1,6 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============================================================ Linux Kernel Driver for Huawei Intelligent NIC(HiNIC) family ============================================================ @@ -110,7 +113,7 @@ hinic_dev - de/constructs the Logical Tx and Rx Queues. 
(hinic_main.c, hinic_dev.h) -Miscellaneous: +Miscellaneous ============= Common functions that are used by HW and Logical Device. diff --git a/Documentation/networking/ila.txt b/Documentation/networking/ila.rst index a17dac9dc915..5ac0a6270b17 100644 --- a/Documentation/networking/ila.txt +++ b/Documentation/networking/ila.rst @@ -1,4 +1,8 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=================================== Identifier Locator Addressing (ILA) +=================================== Introduction @@ -26,11 +30,13 @@ The ILA protocol is described in Internet-Draft draft-herbert-intarea-ila. ILA terminology =============== - - Identifier A number that identifies an addressable node in the network + - Identifier + A number that identifies an addressable node in the network independent of its location. ILA identifiers are sixty-four bit values. - - Locator A network prefix that routes to a physical host. Locators + - Locator + A network prefix that routes to a physical host. Locators provide the topological location of an addressed node. ILA locators are sixty-four bit prefixes. @@ -51,17 +57,20 @@ ILA terminology bits) and an identifier (low order sixty-four bits). ILA addresses are never visible to an application. - - ILA host An end host that is capable of performing ILA translations + - ILA host + An end host that is capable of performing ILA translations on transmit or receive. - - ILA router A network node that performs ILA translation and forwarding + - ILA router + A network node that performs ILA translation and forwarding of translated packets. - ILA forwarding cache A type of ILA router that only maintains a working set cache of mappings. - - ILA node A network node capable of performing ILA translations. This + - ILA node + A network node capable of performing ILA translations. This can be an ILA router, ILA forwarding cache, or ILA host. @@ -82,18 +91,18 @@ Configuration and datapath for these two points of deployment is somewhat different. 
The diagram below illustrates the flow of packets through ILA as well -as showing ILA hosts and routers. +as showing ILA hosts and routers:: +--------+ +--------+ | Host A +-+ +--->| Host B | | | | (2) ILA (') | | +--------+ | ...addressed.... ( ) +--------+ - V +---+--+ . packet . +---+--+ (_) + V +---+--+ . packet . +---+--+ (_) (1) SIR | | ILA |----->-------->---->| ILA | | (3) SIR addressed +->|router| . . |router|->-+ addressed packet +---+--+ . IPv6 . +---+--+ packet - / . Network . - / . . +--+-++--------+ + / . Network . + / . . +--+-++--------+ +--------+ / . . |ILA || Host | | Host +--+ . .- -|host|| | | | . . +--+-++--------+ @@ -173,7 +182,7 @@ ILA address, never a SIR address. In the simplest format the identifier types, C-bit, and checksum adjustment value are not present so an identifier is considered an -unstructured sixty-four bit value. +unstructured sixty-four bit value:: +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | Identifier | @@ -184,7 +193,7 @@ unstructured sixty-four bit value. The checksum neutral adjustment may be configured to always be present using neutral-map-auto. In this case there is no C-bit, but the checksum adjustment is in the low order 16 bits. The identifier is -still sixty-four bits. +still sixty-four bits:: +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | Identifier | @@ -193,7 +202,7 @@ still sixty-four bits. +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ The C-bit may be used to explicitly indicate that checksum neutral -mapping has been applied to an ILA address. The format is: +mapping has been applied to an ILA address. The format is:: +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | |C| Identifier | @@ -204,7 +213,7 @@ mapping has been applied to an ILA address. The format is: The identifier type field may be present to indicate the identifier type. If it is not present then the type is inferred based on mapping configuration.
The checksum neutral adjustment may automatically be -used with the identifier type as illustrated below. +used with the identifier type as illustrated below:: +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | Type| Identifier | @@ -213,7 +222,7 @@ used with the identifier type as illustrated below. +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ The identifier type and the C-bit can be present simultaneously so -the identifier format would be: +the identifier format would be:: +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | Type|C| Identifier | @@ -258,28 +267,30 @@ same meanings as described above. Some examples ============= -# Configure an ILA route that uses checksum neutral mapping as well -# as type field. Note that the type field is set in the SIR address -# (the 2000 implies type is 1 which is LUID). -ip route add 3333:0:0:1:2000:0:1:87/128 encap ila 2001:0:87:0 \ - csum-mode neutral-map ident-type use-format - -# Configure an ILA LWT route that uses auto checksum neutral mapping -# (no C-bit) and configure identifier type to be LUID so that the -# identifier type field will not be present. -ip route add 3333:0:0:1:2000:0:2:87/128 encap ila 2001:0:87:1 \ - csum-mode neutral-map-auto ident-type luid - -ila_xlat configuration - -# Configure an ILA to SIR mapping that matches a locator and overwrites -# it with a SIR address (3333:0:0:1 in this example). The C-bit and -# identifier field are used. -ip ila add loc_match 2001:0:119:0 loc 3333:0:0:1 \ - csum-mode neutral-map-auto ident-type use-format - -# Configure an ILA to SIR mapping where checksum neutral is automatically -# set without the C-bit and the identifier type is configured to be LUID -# so that the identifier type field is not present. -ip ila add loc_match 2001:0:119:0 loc 3333:0:0:1 \ - csum-mode neutral-map-auto ident-type use-format +:: + + # Configure an ILA route that uses checksum neutral mapping as well + # as type field.
Note that the type field is set in the SIR address + # (the 2000 implies type is 1 which is LUID). + ip route add 3333:0:0:1:2000:0:1:87/128 encap ila 2001:0:87:0 \ + csum-mode neutral-map ident-type use-format + + # Configure an ILA LWT route that uses auto checksum neutral mapping + # (no C-bit) and configure identifier type to be LUID so that the + # identifier type field will not be present. + ip route add 3333:0:0:1:2000:0:2:87/128 encap ila 2001:0:87:1 \ + csum-mode neutral-map-auto ident-type luid + + ila_xlat configuration + + # Configure an ILA to SIR mapping that matches a locator and overwrites + # it with a SIR address (3333:0:0:1 in this example). The C-bit and + # identifier field are used. + ip ila add loc_match 2001:0:119:0 loc 3333:0:0:1 \ + csum-mode neutral-map-auto ident-type use-format + + # Configure an ILA to SIR mapping where checksum neutral is automatically + # set without the C-bit and the identifier type is configured to be LUID + # so that the identifier type field is not present. + ip ila add loc_match 2001:0:119:0 loc 3333:0:0:1 \ + csum-mode neutral-map-auto ident-type use-format diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 6538ede29661..e1ff08b94d90 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -15,6 +15,7 @@ Contents: device_drivers/index dsa/index devlink/index + caif/index ethtool-netlink ieee802154 j1939 @@ -36,6 +37,43 @@ Contents: tls-offload nfc 6lowpan + 6pack + altera_tse + arcnet-hardware + arcnet + atm + ax25 + baycom + bonding + cdc_mbim + cops + cxacru + dccp + dctcp + decnet + defza + dns_resolver + driver + eql + fib_trie + filter + fore200e + framerelay + generic-hdlc + generic_netlink + gen_stats + gtp + hinic + ila + ipddp + ip_dynaddr + iphase + ipsec + ip-sysctl + ipv6 + ipvlan + ipvs-sysctl + kcm .. 
only:: subproject and html diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.rst index 9375324aa8e1..38f811d4b2f0 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.rst @@ -1,8 +1,15 @@ -/proc/sys/net/ipv4/* Variables: +.. SPDX-License-Identifier: GPL-2.0 + +========= +IP Sysctl +========= + +/proc/sys/net/ipv4/* Variables +============================== ip_forward - BOOLEAN - 0 - disabled (default) - not 0 - enabled + - 0 - disabled (default) + - not 0 - enabled Forward Packets between interfaces. @@ -38,6 +45,7 @@ ip_no_pmtu_disc - INTEGER could break other protocols. Possible values: 0-3 + Default: FALSE min_pmtu - INTEGER @@ -51,16 +59,20 @@ ip_forward_use_pmtu - BOOLEAN which tries to discover path mtus by itself and depends on the kernel honoring this information. This is normally not the case. + Default: 0 (disabled) + Possible values: - 0 - disabled - 1 - enabled + + - 0 - disabled + - 1 - enabled fwmark_reflect - BOOLEAN Controls the fwmark of kernel-generated IPv4 reply packets that are not associated with a socket (for example, TCP RSTs or ICMP echo replies). If unset, these packets have a fwmark of zero. If set, they have the fwmark of the packet they are replying to. + Default: 0 fib_multipath_use_neigh - BOOLEAN @@ -68,63 +80,80 @@ fib_multipath_use_neigh - BOOLEAN multipath routes. If disabled, neighbor information is not used and packets could be directed to a failed nexthop. Only valid for kernels built with CONFIG_IP_ROUTE_MULTIPATH enabled.
+ Default: 0 (Layer 3) + Possible values: - 0 - Layer 3 - 1 - Layer 4 - 2 - Layer 3 or inner Layer 3 if present + + - 0 - Layer 3 + - 1 - Layer 4 + - 2 - Layer 3 or inner Layer 3 if present fib_sync_mem - UNSIGNED INTEGER Amount of dirty memory from fib entries that can be backlogged before synchronize_rcu is forced. - Default: 512kB Minimum: 64kB Maximum: 64MB + + Default: 512kB Minimum: 64kB Maximum: 64MB ip_forward_update_priority - INTEGER Whether to update SKB priority from "TOS" field in IPv4 header after it is forwarded. The new SKB priority is mapped from TOS field value according to an rt_tos2priority table (see e.g. man tc-prio). + Default: 1 (Update priority.) + Possible values: - 0 - Do not update priority. - 1 - Update priority. + + - 0 - Do not update priority. + - 1 - Update priority. route/max_size - INTEGER Maximum number of routes allowed in the kernel. Increase this when using large numbers of interfaces and/or routes. + From linux kernel 3.6 onwards, this is deprecated for ipv4 as route cache is no longer used. neigh/default/gc_thresh1 - INTEGER Minimum number of entries to keep. Garbage collector will not purge entries if there are fewer than this number. + Default: 128 neigh/default/gc_thresh2 - INTEGER Threshold when garbage collector becomes more aggressive about purging entries. Entries older than 5 seconds will be cleared when over this number. + Default: 512 neigh/default/gc_thresh3 - INTEGER Maximum number of non-PERMANENT neighbor entries allowed. Increase this when using large numbers of interfaces and when communicating with large numbers of directly-connected peers. + Default: 1024 neigh/default/unres_qlen_bytes - INTEGER The maximum number of bytes which may be used by packets queued for each unresolved address by other network layers. (added in linux 3.3) + Setting negative value is meaningless and will return error. + Default: SK_WMEM_MAX, (same as net.core.wmem_default). 
+ Exact value depends on architecture and kernel options, but should be enough to allow queuing 256 packets of medium size. @@ -132,11 +161,14 @@ neigh/default/unres_qlen_bytes - INTEGER neigh/default/unres_qlen - INTEGER The maximum number of packets which may be queued for each unresolved address by other network layers. + (deprecated in linux 3.3) : use unres_qlen_bytes instead. + Prior to linux 3.3, the default value is 3 which may cause unexpected packet loss. The current default value is calculated according to default value of unres_qlen_bytes and true size of packet. + Default: 101 mtu_expires - INTEGER @@ -183,7 +215,8 @@ ipfrag_max_dist - INTEGER from different IP datagrams, which could result in data corruption. Default: 64 -INET peer storage: +INET peer storage +================= inet_peer_threshold - INTEGER The approximate size of the storage. Starting from this threshold @@ -203,7 +236,8 @@ inet_peer_maxttl - INTEGER when the number of entries in the pool is very small). Measured in seconds. -TCP variables: +TCP variables +============= somaxconn - INTEGER Limit of socket listen() backlog, known in userspace as SOMAXCONN. @@ -222,18 +256,22 @@ tcp_adv_win_scale - INTEGER Count buffering overhead as bytes/2^tcp_adv_win_scale (if tcp_adv_win_scale > 0) or bytes-bytes/2^(-tcp_adv_win_scale), if it is <= 0. + Possible values are [-31, 31], inclusive. + Default: 1 tcp_allowed_congestion_control - STRING Show/set the congestion control choices available to non-privileged processes. The list is a subset of those listed in tcp_available_congestion_control. + Default is "reno" and the default setting (tcp_congestion_control). tcp_app_win - INTEGER Reserve max(window/2^tcp_app_win, mss) of window for application buffer. Value 0 is special, it means that nothing is reserved. + Default: 31 tcp_autocorking - BOOLEAN @@ -244,6 +282,7 @@ tcp_autocorking - BOOLEAN packet for the flow is waiting in Qdisc queues or device transmit queue. 
Applications can still use TCP_CORK for optimal behavior when they know how/when to uncork their sockets. + Default : 1 tcp_available_congestion_control - STRING @@ -265,6 +304,7 @@ tcp_mtu_probe_floor - INTEGER tcp_min_snd_mss - INTEGER TCP SYN and SYNACK messages usually advertise an ADVMSS option, as described in RFC 1122 and RFC 6691. + If this ADVMSS option is smaller than tcp_min_snd_mss, it is silently capped to tcp_min_snd_mss. @@ -277,6 +317,7 @@ tcp_congestion_control - STRING Default is set as part of kernel configuration. For passive connections, the listener congestion control choice is inherited. + [see setsockopt(listenfd, SOL_TCP, TCP_CONGESTION, "name" ...) ] tcp_dsack - BOOLEAN @@ -286,9 +327,12 @@ tcp_early_retrans - INTEGER Tail loss probe (TLP) converts RTOs occurring due to tail losses into fast recovery (draft-ietf-tcpm-rack). Note that TLP requires RACK to function properly (see tcp_recovery below) + Possible values: - 0 disables TLP - 3 or 4 enables TLP + + - 0 disables TLP + - 3 or 4 enables TLP + Default: 3 tcp_ecn - INTEGER @@ -297,12 +341,17 @@ tcp_ecn - INTEGER support for it. This feature is useful in avoiding losses due to congestion by allowing supporting routers to signal congestion before having to drop packets. + Possible values are: - 0 Disable ECN. Neither initiate nor accept ECN. - 1 Enable ECN when requested by incoming connections and - also request ECN on outgoing connection attempts. - 2 Enable ECN when requested by incoming connections - but do not request ECN on outgoing connections. + + = ===================================================== + 0 Disable ECN. Neither initiate nor accept ECN. + 1 Enable ECN when requested by incoming connections and + also request ECN on outgoing connection attempts. + 2 Enable ECN when requested by incoming connections + but do not request ECN on outgoing connections. 
+ = ===================================================== + Default: 2 tcp_ecn_fallback - BOOLEAN @@ -312,6 +361,7 @@ tcp_ecn_fallback - BOOLEAN additional detection mechanisms could be implemented under this knob. The value is not used, if tcp_ecn or per route (or congestion control) ECN settings are disabled. + Default: 1 (fallback enabled) tcp_fack - BOOLEAN @@ -324,7 +374,9 @@ tcp_fin_timeout - INTEGER valid "receive only" state for an un-orphaned connection, an orphaned connection in FIN_WAIT_2 state could otherwise wait forever for the remote to close its end of the connection. + Cf. tcp_max_orphans + Default: 60 seconds tcp_frto - INTEGER @@ -390,7 +442,8 @@ tcp_l3mdev_accept - BOOLEAN derived from the listen socket to be bound to the L3 domain in which the packets originated. Only valid when the kernel was compiled with CONFIG_NET_L3_MASTER_DEV. - Default: 0 (disabled) + + Default: 0 (disabled) tcp_low_latency - BOOLEAN This is a legacy option, it has no effect anymore. @@ -410,10 +463,14 @@ tcp_max_orphans - INTEGER tcp_max_syn_backlog - INTEGER Maximal number of remembered connection requests (SYN_RECV), which have not received an acknowledgment from connecting client. + This is a per-listener limit. + The minimal value is 128 for low memory machines, and it will increase in proportion to the memory of machine. + If server suffers from overload, try increasing this number. + Remember to also check /proc/sys/net/core/somaxconn A SYN_RECV request socket consumes about 304 bytes of memory. @@ -445,7 +502,9 @@ tcp_min_rtt_wlen - INTEGER minimum RTT when it is moved to a longer path (e.g., due to traffic engineering). A longer window makes the filter more resistant to RTT inflations such as transient congestion. The unit is seconds. + Possible values: 0 - 86400 (1 day) + Default: 300 tcp_moderate_rcvbuf - BOOLEAN @@ -457,9 +516,10 @@ tcp_moderate_rcvbuf - BOOLEAN tcp_mtu_probing - INTEGER Controls TCP Packetization-Layer Path MTU Discovery. 
Takes three values: - 0 - Disabled - 1 - Disabled by default, enabled when an ICMP black hole detected - 2 - Always enabled, use initial MSS of tcp_base_mss. + + - 0 - Disabled + - 1 - Disabled by default, enabled when an ICMP black hole detected + - 2 - Always enabled, use initial MSS of tcp_base_mss. tcp_probe_interval - UNSIGNED INTEGER Controls how often to start TCP Packetization-Layer Path MTU @@ -481,6 +541,7 @@ tcp_no_metrics_save - BOOLEAN tcp_no_ssthresh_metrics_save - BOOLEAN Controls whether TCP saves ssthresh metrics in the route cache. + Default is 1, which disables ssthresh metrics. tcp_orphan_retries - INTEGER @@ -489,6 +550,7 @@ tcp_orphan_retries - INTEGER See tcp_retries2 for more details. The default value is 8. + If your machine is a loaded WEB server, you should think about lowering this value, such sockets may consume significant resources. Cf. tcp_max_orphans. @@ -497,11 +559,15 @@ tcp_recovery - INTEGER This value is a bitmap to enable various experimental loss recovery features. - RACK: 0x1 enables the RACK loss detection for fast detection of lost - retransmissions and tail drops. It also subsumes and disables - RFC6675 recovery for SACK connections. - RACK: 0x2 makes RACK's reordering window static (min_rtt/4). - RACK: 0x4 disables RACK's DUPACK threshold heuristic + ========= ============================================================= + RACK: 0x1 enables the RACK loss detection for fast detection of lost + retransmissions and tail drops. It also subsumes and disables + RFC6675 recovery for SACK connections. + + RACK: 0x2 makes RACK's reordering window static (min_rtt/4). + + RACK: 0x4 disables RACK's DUPACK threshold heuristic + ========= ============================================================= Default: 0x1 @@ -509,12 +575,14 @@ tcp_reordering - INTEGER Initial reordering level of packets in a TCP stream. 
TCP stack can then dynamically adjust flow reordering level between this initial value and tcp_max_reordering + Default: 3 tcp_max_reordering - INTEGER Maximal reordering level of packets in a TCP stream. 300 is a fairly conservative value, but you might increase it if paths are using per packet load balancing (like bonding rr mode) + Default: 300 tcp_retrans_collapse - BOOLEAN @@ -550,12 +618,14 @@ tcp_rfc1337 - BOOLEAN If set, the TCP stack behaves conforming to RFC1337. If unset, we are not conforming to RFC, but prevent TCP TIME_WAIT assassination. + Default: 0 tcp_rmem - vector of 3 INTEGERs: min, default, max min: Minimal size of receive buffer used by TCP sockets. It is guaranteed to each TCP socket, even under moderate memory pressure. + Default: 4K default: initial size of receive buffer used by TCP sockets. @@ -592,12 +662,14 @@ tcp_slow_start_after_idle - BOOLEAN window after an idle period. An idle period is defined at the current RTO. If unset, the congestion window will not be timed out after an idle period. + Default: 1 tcp_stdurg - BOOLEAN Use the Host requirements interpretation of the TCP urgent pointer field. Most hosts use the older BSD interpretation, so if you turn this on Linux might not communicate correctly with them. + Default: FALSE tcp_synack_retries - INTEGER @@ -646,15 +718,18 @@ tcp_fastopen - INTEGER the option value being the length of the syn-data backlog. The values (bitmap) are - 0x1: (client) enables sending data in the opening SYN on the client. - 0x2: (server) enables the server support, i.e., allowing data in + + ===== ======== ====================================================== + 0x1 (client) enables sending data in the opening SYN on the client. + 0x2 (server) enables the server support, i.e., allowing data in a SYN packet to be accepted and passed to the application before 3-way handshake finishes. 
- 0x4: (client) send data in the opening SYN regardless of cookie + 0x4 (client) send data in the opening SYN regardless of cookie availability and without a cookie option. - 0x200: (server) accept data-in-SYN w/o any cookie option present. - 0x400: (server) enable all listeners to support Fast Open by + 0x200 (server) accept data-in-SYN w/o any cookie option present. + 0x400 (server) enable all listeners to support Fast Open by default without explicit TCP_FASTOPEN socket option. + ===== ======== ====================================================== Default: 0x1 @@ -668,6 +743,7 @@ tcp_fastopen_blackhole_timeout_sec - INTEGER get detected right after Fastopen is re-enabled and will reset to initial value when the blackhole issue goes away. 0 to disable the blackhole detection. + By default, it is set to 1hr. tcp_fastopen_key - list of comma separated 32-digit hexadecimal INTEGERs @@ -698,20 +774,24 @@ tcp_syn_retries - INTEGER for an active TCP connection attempt will happen after 127seconds. tcp_timestamps - INTEGER -Enable timestamps as defined in RFC1323. - 0: Disabled. - 1: Enable timestamps as defined in RFC1323 and use random offset for - each connection rather than only using the current time. - 2: Like 1, but without random offsets. + Enable timestamps as defined in RFC1323. + + - 0: Disabled. + - 1: Enable timestamps as defined in RFC1323 and use random offset for + each connection rather than only using the current time. + - 2: Like 1, but without random offsets. + Default: 1 tcp_min_tso_segs - INTEGER Minimal number of segments per TSO frame. + Since linux-3.12, TCP does an automatic sizing of TSO frames, depending on flow rate, instead of filling 64Kbytes packets. For specific usages, it's possible to force TCP to build big TSO frames. Note that TCP stack might split too big TSO packets if available window is too small. 
+ Default: 2 tcp_pacing_ss_ratio - INTEGER @@ -720,6 +800,7 @@ tcp_pacing_ss_ratio - INTEGER If TCP is in slow start, tcp_pacing_ss_ratio is applied to let TCP probe for bigger speeds, assuming cwnd can be doubled every other RTT. + Default: 200 tcp_pacing_ca_ratio - INTEGER @@ -727,6 +808,7 @@ tcp_pacing_ca_ratio - INTEGER to current rate. (current_rate = cwnd * mss / srtt) If TCP is in congestion avoidance phase, tcp_pacing_ca_ratio is applied to conservatively probe for bigger throughput. + Default: 120 tcp_tso_win_divisor - INTEGER @@ -734,16 +816,20 @@ tcp_tso_win_divisor - INTEGER can be consumed by a single TSO frame. The setting of this parameter is a choice between burstiness and building larger TSO frames. + Default: 3 tcp_tw_reuse - INTEGER Enable reuse of TIME-WAIT sockets for new connections when it is safe from protocol viewpoint. - 0 - disable - 1 - global enable - 2 - enable for loopback traffic only + + - 0 - disable + - 1 - global enable + - 2 - enable for loopback traffic only + It should not be changed without advice/request of technical experts. + Default: 2 tcp_window_scaling - BOOLEAN @@ -752,11 +838,14 @@ tcp_window_scaling - BOOLEAN tcp_wmem - vector of 3 INTEGERs: min, default, max min: Amount of memory reserved for send buffers for TCP sockets. Each TCP socket has rights to use it due to fact of its birth. + Default: 4K default: initial size of send buffer used by TCP sockets. This value overrides net.core.wmem_default used by other protocols. + It is usually lower than net.core.wmem_default. + Default: 16K max: Maximal amount of memory allowed for automatically tuned @@ -764,6 +853,7 @@ tcp_wmem - vector of 3 INTEGERs: min, default, max net.core.wmem_max. Calling setsockopt() with SO_SNDBUF disables automatic tuning of that socket's send buffer size, in which case this value is ignored. + Default: between 64K and 4MB, depending on RAM size. 
tcp_notsent_lowat - UNSIGNED INTEGER @@ -784,6 +874,7 @@ tcp_workaround_signed_windows - BOOLEAN remote TCP is broken and treats the window as a signed quantity. If unset, assume the remote TCP is not broken even if we do not receive a window scaling option from them. + Default: 0 tcp_thin_linear_timeouts - BOOLEAN @@ -796,6 +887,7 @@ tcp_thin_linear_timeouts - BOOLEAN non-aggressive thin streams, often found to be time-dependent. For more information on thin streams, see Documentation/networking/tcp-thin.txt + Default: 0 tcp_limit_output_bytes - INTEGER @@ -807,6 +899,7 @@ tcp_limit_output_bytes - INTEGER flows, for typical pfifo_fast qdiscs. tcp_limit_output_bytes limits the number of bytes on qdisc or device to reduce artificial RTT/cwnd and reduce bufferbloat. + Default: 1048576 (16 * 65536) tcp_challenge_ack_limit - INTEGER @@ -822,7 +915,8 @@ tcp_rx_skb_cache - BOOLEAN Default: 0 (disabled) -UDP variables: +UDP variables +============= udp_l3mdev_accept - BOOLEAN Enabling this option allows a "global" bound socket to work @@ -830,7 +924,8 @@ udp_l3mdev_accept - BOOLEAN being received regardless of the L3 domain in which they originated. Only valid when the kernel was compiled with CONFIG_NET_L3_MASTER_DEV. - Default: 0 (disabled) + + Default: 0 (disabled) udp_mem - vector of 3 INTEGERs: min, pressure, max Number of pages allowed for queueing by all UDP sockets. @@ -849,15 +944,18 @@ udp_rmem_min - INTEGER Minimal size of receive buffer used by UDP sockets in moderation. Each UDP socket is able to use the size for receiving data, even if total pages of UDP sockets exceed udp_mem pressure. The unit is byte. + Default: 4K udp_wmem_min - INTEGER Minimal size of send buffer used by UDP sockets in moderation. Each UDP socket is able to use the size for sending data, even if total pages of UDP sockets exceed udp_mem pressure. The unit is byte. 
+ Default: 4K -RAW variables: +RAW variables +============= raw_l3mdev_accept - BOOLEAN Enabling this option allows a "global" bound socket to work @@ -865,9 +963,11 @@ raw_l3mdev_accept - BOOLEAN being received regardless of the L3 domain in which they originated. Only valid when the kernel was compiled with CONFIG_NET_L3_MASTER_DEV. + Default: 1 (enabled) -CIPSOv4 Variables: +CIPSOv4 Variables +================= cipso_cache_enable - BOOLEAN If set, enable additions to and lookups from the CIPSO label mapping @@ -875,6 +975,7 @@ cipso_cache_enable - BOOLEAN miss. However, regardless of the setting the cache is still invalidated when required when means you can safely toggle this on and off and the cache will always be "safe". + Default: 1 cipso_cache_bucket_size - INTEGER @@ -884,6 +985,7 @@ cipso_cache_bucket_size - INTEGER more CIPSO label mappings that can be cached. When the number of entries in a given hash bucket reaches this limit adding new entries causes the oldest entry in the bucket to be removed to make room. + Default: 10 cipso_rbm_optfmt - BOOLEAN @@ -891,6 +993,7 @@ cipso_rbm_optfmt - BOOLEAN the CIPSO draft specification (see Documentation/netlabel for details). This means that when set the CIPSO tag will be padded with empty categories in order to make the packet data 32-bit aligned. + Default: 0 cipso_rbm_structvalid - BOOLEAN @@ -900,9 +1003,11 @@ cipso_rbm_structvalid - BOOLEAN where in the CIPSO processing code but setting this to 0 (False) should result in less work (i.e. it should be faster) but could cause problems with other implementations that require strict checking. + Default: 0 -IP Variables: +IP Variables +============ ip_local_port_range - 2 INTEGERS Defines the local port range that is used by TCP and UDP to @@ -931,12 +1036,12 @@ ip_local_reserved_ports - list of comma separated ranges assignments. 
You can reserve ports which are not in the current - ip_local_port_range, e.g.: + ip_local_port_range, e.g.:: - $ cat /proc/sys/net/ipv4/ip_local_port_range - 32000 60999 - $ cat /proc/sys/net/ipv4/ip_local_reserved_ports - 8080,9148 + $ cat /proc/sys/net/ipv4/ip_local_port_range + 32000 60999 + $ cat /proc/sys/net/ipv4/ip_local_reserved_ports + 8080,9148 although this is redundant. However such a setting is useful if later the port range is changed to a value that will @@ -956,6 +1061,7 @@ ip_unprivileged_port_start - INTEGER ip_nonlocal_bind - BOOLEAN If set, allows processes to bind() to non-local IP addresses, which can be quite useful - but may break some applications. + Default: 0 ip_autobind_reuse - BOOLEAN @@ -972,6 +1078,7 @@ ip_dynaddr - BOOLEAN If set to a non-zero value larger than 1, a kernel log message will be printed when dynamic address rewriting occurs. + Default: 0 ip_early_demux - BOOLEAN @@ -981,6 +1088,7 @@ ip_early_demux - BOOLEAN It may add an additional cost for pure routing workloads that reduces overall throughput, in such case you should disable it. + Default: 1 ping_group_range - 2 INTEGERS @@ -992,21 +1100,25 @@ ping_group_range - 2 INTEGERS tcp_early_demux - BOOLEAN Enable early demux for established TCP sockets. + Default: 1 udp_early_demux - BOOLEAN Enable early demux for connected UDP sockets. Disable this if your system could experience more unconnected load. + Default: 1 icmp_echo_ignore_all - BOOLEAN If set non-zero, then the kernel will ignore all ICMP ECHO requests sent to it. + Default: 0 icmp_echo_ignore_broadcasts - BOOLEAN If set non-zero, then the kernel will ignore all ICMP ECHO and TIMESTAMP requests sent to it via broadcast/multicast. + Default: 1 icmp_ratelimit - INTEGER @@ -1016,46 +1128,55 @@ icmp_ratelimit - INTEGER otherwise the minimal space between responses in milliseconds. Note that another sysctl, icmp_msgs_per_sec limits the number of ICMP packets sent on all targets. 
+ Default: 1000 icmp_msgs_per_sec - INTEGER Limit maximal number of ICMP packets sent per second from this host. Only messages whose type matches icmp_ratemask (see below) are controlled by this limit. + Default: 1000 icmp_msgs_burst - INTEGER icmp_msgs_per_sec controls number of ICMP packets sent per second, while icmp_msgs_burst controls the burst size of these packets. + Default: 50 icmp_ratemask - INTEGER Mask made of ICMP types for which rates are being limited. + Significant bits: IHGFEDCBA9876543210 + Default mask: 0000001100000011000 (6168) Bit definitions (see include/linux/icmp.h): + + = ========================= 0 Echo Reply - 3 Destination Unreachable * - 4 Source Quench * + 3 Destination Unreachable [1]_ + 4 Source Quench [1]_ 5 Redirect 8 Echo Request - B Time Exceeded * - C Parameter Problem * + B Time Exceeded [1]_ + C Parameter Problem [1]_ D Timestamp Request E Timestamp Reply F Info Request G Info Reply H Address Mask Request I Address Mask Reply + = ========================= - * These are rate limited by default (see default mask above) + .. [1] These are rate limited by default (see default mask above) icmp_ignore_bogus_error_responses - BOOLEAN Some routers violate RFC1122 by sending bogus responses to broadcast frames. Such violations are normally logged via a kernel warning. If this is set to TRUE, the kernel will not give such warnings, which will avoid log file clutter. + Default: 1 icmp_errors_use_inbound_ifaddr - BOOLEAN @@ -1100,32 +1221,39 @@ igmp_max_memberships - INTEGER igmp_max_msf - INTEGER Maximum number of addresses allowed in the source filter list for a multicast group. + Default: 10 igmp_qrv - INTEGER Controls the IGMP query robustness variable (see RFC2236 8.1). + Default: 2 (as specified by RFC2236 8.1) + Minimum: 1 (as specified by RFC6636 4.5) force_igmp_version - INTEGER - 0 - (default) No enforcement of a IGMP version, IGMPv1/v2 fallback - allowed. 
Will back to IGMPv3 mode again if all IGMPv1/v2 Querier - Present timer expires. - 1 - Enforce to use IGMP version 1. Will also reply IGMPv1 report if - receive IGMPv2/v3 query. - 2 - Enforce to use IGMP version 2. Will fallback to IGMPv1 if receive - IGMPv1 query message. Will reply report if receive IGMPv3 query. - 3 - Enforce to use IGMP version 3. The same react with default 0. + - 0 - (default) No enforcement of a IGMP version, IGMPv1/v2 fallback + allowed. Will back to IGMPv3 mode again if all IGMPv1/v2 Querier + Present timer expires. + - 1 - Enforce to use IGMP version 1. Will also reply IGMPv1 report if + receive IGMPv2/v3 query. + - 2 - Enforce to use IGMP version 2. Will fallback to IGMPv1 if receive + IGMPv1 query message. Will reply report if receive IGMPv3 query. + - 3 - Enforce to use IGMP version 3. The same react with default 0. + + .. note:: - Note: this is not the same with force_mld_version because IGMPv3 RFC3376 - Security Considerations does not have clear description that we could - ignore other version messages completely as MLDv2 RFC3810. So make - this value as default 0 is recommended. + this is not the same with force_mld_version because IGMPv3 RFC3376 + Security Considerations does not have clear description that we could + ignore other version messages completely as MLDv2 RFC3810. So make + this value as default 0 is recommended. -conf/interface/* changes special settings per interface (where -"interface" is the name of your network interface) +``conf/interface/*`` + changes special settings per interface (where + "interface" is the name of your network interface) -conf/all/* is special, changes the settings for all interfaces +``conf/all/*`` + is special, changes the settings for all interfaces log_martians - BOOLEAN Log packets with impossible addresses to kernel log. @@ -1136,14 +1264,21 @@ log_martians - BOOLEAN accept_redirects - BOOLEAN Accept ICMP redirect messages.
accept_redirects for the interface will be enabled if: + - both conf/{all,interface}/accept_redirects are TRUE in the case forwarding for the interface is enabled + or + - at least one of conf/{all,interface}/accept_redirects is TRUE in the case forwarding for the interface is disabled + accept_redirects for the interface will be disabled otherwise - default TRUE (host) - FALSE (router) + + default: + + - TRUE (host) + - FALSE (router) forwarding - BOOLEAN Enable IP forwarding on this interface. This controls whether packets @@ -1168,12 +1303,14 @@ medium_id - INTEGER proxy_arp - BOOLEAN Do proxy arp. + proxy_arp for the interface will be enabled if at least one of conf/{all,interface}/proxy_arp is set to TRUE, it will be disabled otherwise proxy_arp_pvlan - BOOLEAN Private VLAN proxy arp. + Basically allow proxy arp replies back to the same interface (from which the ARP request/solicitation was received). @@ -1186,6 +1323,7 @@ proxy_arp_pvlan - BOOLEAN proxy_arp. This technology is known by different names: + In RFC 3069 it is called VLAN Aggregation. Cisco and Allied Telesyn call it Private VLAN. Hewlett-Packard call it Source-Port filtering or port-isolation. @@ -1194,26 +1332,33 @@ proxy_arp_pvlan - BOOLEAN shared_media - BOOLEAN Send(router) or accept(host) RFC1620 shared media redirects. Overrides secure_redirects. + shared_media for the interface will be enabled if at least one of conf/{all,interface}/shared_media is set to TRUE, it will be disabled otherwise + default TRUE secure_redirects - BOOLEAN Accept ICMP redirect messages only to gateways listed in the interface's current gateway list. Even if disabled, RFC1122 redirect rules still apply. + Overridden by shared_media. + secure_redirects for the interface will be enabled if at least one of conf/{all,interface}/secure_redirects is set to TRUE, it will be disabled otherwise + default TRUE send_redirects - BOOLEAN Send redirects, if router. 
+ send_redirects for the interface will be enabled if at least one of conf/{all,interface}/send_redirects is set to TRUE, it will be disabled otherwise + Default: TRUE bootp_relay - BOOLEAN @@ -1222,15 +1367,20 @@ bootp_relay - BOOLEAN BOOTP relay daemon will catch and forward such packets. conf/all/bootp_relay must also be set to TRUE to enable BOOTP relay for the interface + default FALSE + Not Implemented Yet. accept_source_route - BOOLEAN Accept packets with SRR option. conf/all/accept_source_route must also be set to TRUE to accept packets with SRR option on the interface - default TRUE (router) - FALSE (host) + + default + + - TRUE (router) + - FALSE (host) accept_local - BOOLEAN Accept packets with local source addresses. In combination with @@ -1241,18 +1391,19 @@ accept_local - BOOLEAN route_localnet - BOOLEAN Do not consider loopback addresses as martian source or destination while routing. This enables the use of 127/8 for local routing purposes. + default FALSE rp_filter - INTEGER - 0 - No source validation. - 1 - Strict mode as defined in RFC3704 Strict Reverse Path - Each incoming packet is tested against the FIB and if the interface - is not the best reverse path the packet check will fail. - By default failed packets are discarded. - 2 - Loose mode as defined in RFC3704 Loose Reverse Path - Each incoming packet's source address is also tested against the FIB - and if the source address is not reachable via any interface - the packet check will fail. + - 0 - No source validation. + - 1 - Strict mode as defined in RFC3704 Strict Reverse Path + Each incoming packet is tested against the FIB and if the interface + is not the best reverse path the packet check will fail. + By default failed packets are discarded. + - 2 - Loose mode as defined in RFC3704 Loose Reverse Path + Each incoming packet's source address is also tested against the FIB + and if the source address is not reachable via any interface + the packet check will fail. 
Current recommended practice in RFC3704 is to enable strict mode to prevent IP spoofing from DDos attacks. If using asymmetric routing @@ -1265,19 +1416,19 @@ rp_filter - INTEGER in startup scripts. arp_filter - BOOLEAN - 1 - Allows you to have multiple network interfaces on the same - subnet, and have the ARPs for each interface be answered - based on whether or not the kernel would route a packet from - the ARP'd IP out that interface (therefore you must use source - based routing for this to work). In other words it allows control - of which cards (usually 1) will respond to an arp request. - - 0 - (default) The kernel can respond to arp requests with addresses - from other interfaces. This may seem wrong but it usually makes - sense, because it increases the chance of successful communication. - IP addresses are owned by the complete host on Linux, not by - particular interfaces. Only for more complex setups like load- - balancing, does this behaviour cause problems. + - 1 - Allows you to have multiple network interfaces on the same + subnet, and have the ARPs for each interface be answered + based on whether or not the kernel would route a packet from + the ARP'd IP out that interface (therefore you must use source + based routing for this to work). In other words it allows control + of which cards (usually 1) will respond to an arp request. + + - 0 - (default) The kernel can respond to arp requests with addresses + from other interfaces. This may seem wrong but it usually makes + sense, because it increases the chance of successful communication. + IP addresses are owned by the complete host on Linux, not by + particular interfaces. Only for more complex setups like load- + balancing, does this behaviour cause problems. 
arp_filter for the interface will be enabled if at least one of conf/{all,interface}/arp_filter is set to TRUE, @@ -1287,26 +1438,27 @@ arp_announce - INTEGER Define different restriction levels for announcing the local source IP address from IP packets in ARP requests sent on interface: - 0 - (default) Use any local address, configured on any interface - 1 - Try to avoid local addresses that are not in the target's - subnet for this interface. This mode is useful when target - hosts reachable via this interface require the source IP - address in ARP requests to be part of their logical network - configured on the receiving interface. When we generate the - request we will check all our subnets that include the - target IP and will preserve the source address if it is from - such subnet. If there is no such subnet we select source - address according to the rules for level 2. - 2 - Always use the best local address for this target. - In this mode we ignore the source address in the IP packet - and try to select local address that we prefer for talks with - the target host. Such local address is selected by looking - for primary IP addresses on all our subnets on the outgoing - interface that include the target IP address. If no suitable - local address is found we select the first local address - we have on the outgoing interface or on all other interfaces, - with the hope we will receive reply for our request and - even sometimes no matter the source IP address we announce. + + - 0 - (default) Use any local address, configured on any interface + - 1 - Try to avoid local addresses that are not in the target's + subnet for this interface. This mode is useful when target + hosts reachable via this interface require the source IP + address in ARP requests to be part of their logical network + configured on the receiving interface. 
When we generate the + request we will check all our subnets that include the + target IP and will preserve the source address if it is from + such subnet. If there is no such subnet we select source + address according to the rules for level 2. + - 2 - Always use the best local address for this target. + In this mode we ignore the source address in the IP packet + and try to select local address that we prefer for talks with + the target host. Such local address is selected by looking + for primary IP addresses on all our subnets on the outgoing + interface that include the target IP address. If no suitable + local address is found we select the first local address + we have on the outgoing interface or on all other interfaces, + with the hope we will receive reply for our request and + even sometimes no matter the source IP address we announce. The max value from conf/{all,interface}/arp_announce is used. @@ -1317,32 +1469,37 @@ arp_announce - INTEGER arp_ignore - INTEGER Define different modes for sending replies in response to received ARP requests that resolve local target IP addresses: - 0 - (default): reply for any local target IP address, configured - on any interface - 1 - reply only if the target IP address is local address - configured on the incoming interface - 2 - reply only if the target IP address is local address - configured on the incoming interface and both with the - sender's IP address are part from same subnet on this interface - 3 - do not reply for local addresses configured with scope host, - only resolutions for global and link addresses are replied - 4-7 - reserved - 8 - do not reply for all local addresses + + - 0 - (default): reply for any local target IP address, configured + on any interface + - 1 - reply only if the target IP address is local address + configured on the incoming interface + - 2 - reply only if the target IP address is local address + configured on the incoming interface and both with the + sender's IP address are 
part from same subnet on this interface + - 3 - do not reply for local addresses configured with scope host, + only resolutions for global and link addresses are replied + - 4-7 - reserved + - 8 - do not reply for all local addresses The max value from conf/{all,interface}/arp_ignore is used when ARP request is received on the {interface} arp_notify - BOOLEAN Define mode for notification of address and device changes. - 0 - (default): do nothing - 1 - Generate gratuitous arp requests when device is brought up - or hardware address changes. + + == ========================================================== + 0 (default): do nothing + 1 Generate gratuitous arp requests when device is brought up + or hardware address changes. + == ========================================================== arp_accept - BOOLEAN Define behavior for gratuitous ARP frames who's IP is not already present in the ARP table: - 0 - don't create new entries in the ARP table - 1 - create new entries in the ARP table + + - 0 - don't create new entries in the ARP table + - 1 - create new entries in the ARP table Both replies and requests type gratuitous arp will trigger the ARP table to be updated, if this setting is on. @@ -1378,11 +1535,13 @@ disable_xfrm - BOOLEAN igmpv2_unsolicited_report_interval - INTEGER The interval in milliseconds in which the next unsolicited IGMPv1 or IGMPv2 report retransmit will take place. + Default: 10000 (10 seconds) igmpv3_unsolicited_report_interval - INTEGER The interval in milliseconds in which the next unsolicited IGMPv3 report retransmit will take place. + Default: 1000 (1 seconds) promote_secondaries - BOOLEAN @@ -1393,19 +1552,23 @@ promote_secondaries - BOOLEAN drop_unicast_in_l2_multicast - BOOLEAN Drop any unicast IP packets that are received in link-layer multicast (or broadcast) frames. + This behavior (for multicast) is actually a SHOULD in RFC 1122, but is disabled by default for compatibility reasons. 
+ Default: off (0) drop_gratuitous_arp - BOOLEAN Drop all gratuitous ARP frames, for example if there's a known good ARP proxy on the network and such frames need not be used (or in the case of 802.11, must not be used to prevent attacks.) + Default: off (0) tag - INTEGER Allows you to write a number, which can be used as required. + Default value is 0. xfrm4_gc_thresh - INTEGER @@ -1417,21 +1580,24 @@ xfrm4_gc_thresh - INTEGER igmp_link_local_mcast_reports - BOOLEAN Enable IGMP reports for link local multicast groups in the 224.0.0.X range. + Default TRUE Alexey Kuznetsov. kuznet@ms2.inr.ac.ru Updated by: -Andi Kleen -ak@muc.de -Nicolas Delon -delon.nicolas@wanadoo.fr +- Andi Kleen + ak@muc.de +- Nicolas Delon + delon.nicolas@wanadoo.fr -/proc/sys/net/ipv6/* Variables: + +/proc/sys/net/ipv6/* Variables +============================== IPv6 has no global variables such as tcp_*. tcp_* settings under ipv4/ also apply to IPv6 [XXX?]. @@ -1440,8 +1606,9 @@ bindv6only - BOOLEAN Default value for IPV6_V6ONLY socket option, which restricts use of the IPv6 socket to IPv6 communication only. - TRUE: disable IPv4-mapped address feature - FALSE: enable IPv4-mapped address feature + + - TRUE: disable IPv4-mapped address feature + - FALSE: enable IPv4-mapped address feature Default: FALSE (as specified in RFC3493) @@ -1449,8 +1616,10 @@ flowlabel_consistency - BOOLEAN Protect the consistency (and unicity) of flow label. You have to disable it to use IPV6_FL_F_REFLECT flag on the flow label manager. - TRUE: enabled - FALSE: disabled + + - TRUE: enabled + - FALSE: disabled + Default: TRUE auto_flowlabels - INTEGER @@ -1458,22 +1627,28 @@ auto_flowlabels - INTEGER packet. This allows intermediate devices, such as routers, to identify packet flows for mechanisms like Equal Cost Multipath Routing (see RFC 6438). 
- 0: automatic flow labels are completely disabled - 1: automatic flow labels are enabled by default, they can be + + = =========================================================== + 0 automatic flow labels are completely disabled + 1 automatic flow labels are enabled by default, they can be disabled on a per socket basis using the IPV6_AUTOFLOWLABEL socket option - 2: automatic flow labels are allowed, they may be enabled on a + 2 automatic flow labels are allowed, they may be enabled on a per socket basis using the IPV6_AUTOFLOWLABEL socket option - 3: automatic flow labels are enabled and enforced, they cannot + 3 automatic flow labels are enabled and enforced, they cannot be disabled by the socket option + = =========================================================== + Default: 1 flowlabel_state_ranges - BOOLEAN Split the flow label number space into two ranges. 0-0x7FFFF is reserved for the IPv6 flow manager facility, 0x80000-0xFFFFF is reserved for stateless flow labels as described in RFC6437. - TRUE: enabled - FALSE: disabled + + - TRUE: enabled + - FALSE: disabled + Default: true flowlabel_reflect - INTEGER @@ -1483,49 +1658,59 @@ flowlabel_reflect - INTEGER https://tools.ietf.org/html/draft-wang-6man-flow-label-reflection-01 This is a bitmask. - 1: enabled for established flows - Note that this prevents automatic flowlabel changes, as done - in "tcp: change IPv6 flow-label upon receiving spurious retransmission" - and "tcp: Change txhash on every SYN and RTO retransmit" + - 1: enabled for established flows + + Note that this prevents automatic flowlabel changes, as done + in "tcp: change IPv6 flow-label upon receiving spurious retransmission" + and "tcp: Change txhash on every SYN and RTO retransmit" - 2: enabled for TCP RESET packets (no active listener) - If set, a RST packet sent in response to a SYN packet on a closed - port will reflect the incoming flow label. 
+ - 2: enabled for TCP RESET packets (no active listener) + If set, a RST packet sent in response to a SYN packet on a closed + port will reflect the incoming flow label. - 4: enabled for ICMPv6 echo reply messages. + - 4: enabled for ICMPv6 echo reply messages. Default: 0 fib_multipath_hash_policy - INTEGER Controls which hash policy to use for multipath routes. + Default: 0 (Layer 3) + Possible values: - 0 - Layer 3 (source and destination addresses plus flow label) - 1 - Layer 4 (standard 5-tuple) - 2 - Layer 3 or inner Layer 3 if present + + - 0 - Layer 3 (source and destination addresses plus flow label) + - 1 - Layer 4 (standard 5-tuple) + - 2 - Layer 3 or inner Layer 3 if present anycast_src_echo_reply - BOOLEAN Controls the use of anycast addresses as source addresses for ICMPv6 echo reply - TRUE: enabled - FALSE: disabled + + - TRUE: enabled + - FALSE: disabled + Default: FALSE idgen_delay - INTEGER Controls the delay in seconds after which time to retry privacy stable address generation if a DAD conflict is detected. + Default: 1 (as specified in RFC7217) idgen_retries - INTEGER Controls the number of retries to generate a stable privacy address if a DAD conflict is detected. + Default: 3 (as specified in RFC7217) mld_qrv - INTEGER Controls the MLD query robustness variable (see RFC3810 9.1). + Default: 2 (as specified by RFC3810 9.1) + Minimum: 1 (as specified by RFC6636 4.5) max_dst_opts_number - INTEGER @@ -1533,6 +1718,7 @@ max_dst_opts_number - INTEGER options extension header. If this value is less than zero then unknown options are disallowed and the number of known TLVs allowed is the absolute value of this number. + Default: 8 max_hbh_opts_number - INTEGER @@ -1540,16 +1726,19 @@ max_hbh_opts_number - INTEGER options extension header. If this value is less than zero then unknown options are disallowed and the number of known TLVs allowed is the absolute value of this number. 
+ Default: 8 max_dst_opts_length - INTEGER Maximum length allowed for a Destination options extension header. + Default: INT_MAX (unlimited) max_hbh_length - INTEGER Maximum length allowed for a Hop-by-Hop options extension header. + Default: INT_MAX (unlimited) skip_notify_on_dev_down - BOOLEAN @@ -1558,8 +1747,21 @@ skip_notify_on_dev_down - BOOLEAN generate this message; IPv6 does by default. Setting this sysctl to true skips the message, making IPv4 and IPv6 on par in relying on userspace caches to track link events and evict routes. + Default: false (generate message) +nexthop_compat_mode - BOOLEAN + New nexthop API provides a means for managing nexthops independent of + prefixes. Backwards compatibility with old route format is enabled by + default which means route dumps and notifications contain the new + nexthop attribute but also the full, expanded nexthop definition. + Further, updates or deletes of a nexthop configuration generate route + notifications for each fib entry using the nexthop. Once a system + understands the new API, this sysctl can be disabled to achieve full + performance benefits of the new API by disabling the nexthop expansion + and extraneous notifications. + Default: true (backward compat mode) + IPv6 Fragmentation: ip6frag_high_thresh - INTEGER @@ -1580,18 +1782,20 @@ seg6_flowlabel - INTEGER Controls the behaviour of computing the flowlabel of outer IPv6 header in case of SR T.encaps - -1 set flowlabel to zero. - 0 copy flowlabel from Inner packet in case of Inner IPv6 - (Set flowlabel to 0 in case IPv4/L2) - 1 Compute the flowlabel using seg6_make_flowlabel() + == ======================================================= + -1 set flowlabel to zero. + 0 copy flowlabel from Inner packet in case of Inner IPv6 + (Set flowlabel to 0 in case IPv4/L2) + 1 Compute the flowlabel using seg6_make_flowlabel() + == ======================================================= Default is 0.
-conf/default/*: +``conf/default/*``: Change the interface-specific default settings. -conf/all/*: +``conf/all/*``: Change all the interface-specific settings. [XXX: Other special features than forwarding?] @@ -1615,9 +1819,10 @@ fwmark_reflect - BOOLEAN associated with a socket for example, TCP RSTs or ICMPv6 echo replies). If unset, these packets have a fwmark of zero. If set, they have the fwmark of the packet they are replying to. + Default: 0 -conf/interface/*: +``conf/interface/*``: Change special settings per interface. The functional behaviour for certain settings is different @@ -1632,31 +1837,40 @@ accept_ra - INTEGER transmitted. Possible values are: - 0 Do not accept Router Advertisements. - 1 Accept Router Advertisements if forwarding is disabled. - 2 Overrule forwarding behaviour. Accept Router Advertisements - even if forwarding is enabled. - Functional default: enabled if local forwarding is disabled. - disabled if local forwarding is enabled. + == =========================================================== + 0 Do not accept Router Advertisements. + 1 Accept Router Advertisements if forwarding is disabled. + 2 Overrule forwarding behaviour. Accept Router Advertisements + even if forwarding is enabled. + == =========================================================== + + Functional default: + + - enabled if local forwarding is disabled. + - disabled if local forwarding is enabled. accept_ra_defrtr - BOOLEAN Learn default router in Router Advertisement. - Functional default: enabled if accept_ra is enabled. - disabled if accept_ra is disabled. + Functional default: + + - enabled if accept_ra is enabled. + - disabled if accept_ra is disabled. accept_ra_from_local - BOOLEAN Accept RA with source-address that is found on local machine - if the RA is otherwise proper and able to be accepted. - Default is to NOT accept these as it may be an un-intended - network loop. + if the RA is otherwise proper and able to be accepted. 
+ + Default is to NOT accept these as it may be an un-intended + network loop. Functional default: - enabled if accept_ra_from_local is enabled - on a specific interface. - disabled if accept_ra_from_local is disabled - on a specific interface. + + - enabled if accept_ra_from_local is enabled + on a specific interface. + - disabled if accept_ra_from_local is disabled + on a specific interface. accept_ra_min_hop_limit - INTEGER Minimum hop limit Information in Router Advertisement. @@ -1669,8 +1883,10 @@ accept_ra_min_hop_limit - INTEGER accept_ra_pinfo - BOOLEAN Learn Prefix Information in Router Advertisement. - Functional default: enabled if accept_ra is enabled. - disabled if accept_ra is disabled. + Functional default: + + - enabled if accept_ra is enabled. + - disabled if accept_ra is disabled. accept_ra_rt_info_min_plen - INTEGER Minimum prefix length of Route Information in RA. @@ -1678,8 +1894,10 @@ accept_ra_rt_info_min_plen - INTEGER Route Information w/ prefix smaller than this variable shall be ignored. - Functional default: 0 if accept_ra_rtr_pref is enabled. - -1 if accept_ra_rtr_pref is disabled. + Functional default: + + * 0 if accept_ra_rtr_pref is enabled. + * -1 if accept_ra_rtr_pref is disabled. accept_ra_rt_info_max_plen - INTEGER Maximum prefix length of Route Information in RA. @@ -1687,33 +1905,41 @@ accept_ra_rt_info_max_plen - INTEGER Route Information w/ prefix larger than this variable shall be ignored. - Functional default: 0 if accept_ra_rtr_pref is enabled. - -1 if accept_ra_rtr_pref is disabled. + Functional default: + + * 0 if accept_ra_rtr_pref is enabled. + * -1 if accept_ra_rtr_pref is disabled. accept_ra_rtr_pref - BOOLEAN Accept Router Preference in RA. - Functional default: enabled if accept_ra is enabled. - disabled if accept_ra is disabled. + Functional default: + + - enabled if accept_ra is enabled. + - disabled if accept_ra is disabled. accept_ra_mtu - BOOLEAN Apply the MTU value specified in RA option 5 (RFC4861). 
If disabled, the MTU specified in the RA will be ignored. - Functional default: enabled if accept_ra is enabled. - disabled if accept_ra is disabled. + Functional default: + + - enabled if accept_ra is enabled. + - disabled if accept_ra is disabled. accept_redirects - BOOLEAN Accept Redirects. - Functional default: enabled if local forwarding is disabled. - disabled if local forwarding is enabled. + Functional default: + + - enabled if local forwarding is disabled. + - disabled if local forwarding is enabled. accept_source_route - INTEGER Accept source routing (routing extension header). - >= 0: Accept only routing header type 2. - < 0: Do not accept routing header. + - >= 0: Accept only routing header type 2. + - < 0: Do not accept routing header. Default: 0 @@ -1721,24 +1947,30 @@ autoconf - BOOLEAN Autoconfigure addresses using Prefix Information in Router Advertisements. - Functional default: enabled if accept_ra_pinfo is enabled. - disabled if accept_ra_pinfo is disabled. + Functional default: + + - enabled if accept_ra_pinfo is enabled. + - disabled if accept_ra_pinfo is disabled. dad_transmits - INTEGER The amount of Duplicate Address Detection probes to send. + Default: 1 forwarding - INTEGER Configure interface-specific Host/Router behaviour. - Note: It is recommended to have the same setting on all - interfaces; mixed router/host scenarios are rather uncommon. + .. note:: + + It is recommended to have the same setting on all + interfaces; mixed router/host scenarios are rather uncommon. Possible values are: - 0 Forwarding disabled - 1 Forwarding enabled - FALSE (0): + - 0 Forwarding disabled + - 1 Forwarding enabled + + **FALSE (0)**: By default, Host behaviour is assumed. This means: @@ -1749,7 +1981,7 @@ forwarding - INTEGER Advertisements (and do autoconfiguration). 4. If accept_redirects is TRUE (default), accept Redirects. - TRUE (1): + **TRUE (1)**: If local forwarding is enabled, Router behaviour is assumed. 
This means exactly the reverse from the above: @@ -1760,19 +1992,22 @@ forwarding - INTEGER 4. Redirects are ignored. Default: 0 (disabled) if global forwarding is disabled (default), - otherwise 1 (enabled). + otherwise 1 (enabled). hop_limit - INTEGER Default Hop Limit to set. + Default: 64 mtu - INTEGER Default Maximum Transfer Unit + Default: 1280 (IPv6 required minimum) ip_nonlocal_bind - BOOLEAN If set, allows processes to bind() to non-local IPv6 addresses, which can be quite useful - but may break some applications. + Default: 0 router_probe_interval - INTEGER @@ -1784,15 +2019,18 @@ router_probe_interval - INTEGER router_solicitation_delay - INTEGER Number of seconds to wait after interface is brought up before sending Router Solicitations. + Default: 1 router_solicitation_interval - INTEGER Number of seconds to wait between Router Solicitations. + Default: 4 router_solicitations - INTEGER Number of Router Solicitations to send until assuming no routers are present. + Default: 3 use_oif_addrs_only - BOOLEAN @@ -1804,28 +2042,35 @@ use_oif_addrs_only - BOOLEAN use_tempaddr - INTEGER Preference for Privacy Extensions (RFC3041). - <= 0 : disable Privacy Extensions - == 1 : enable Privacy Extensions, but prefer public - addresses over temporary addresses. - > 1 : enable Privacy Extensions and prefer temporary - addresses over public addresses. - Default: 0 (for most devices) - -1 (for point-to-point devices and loopback devices) + + * <= 0 : disable Privacy Extensions + * == 1 : enable Privacy Extensions, but prefer public + addresses over temporary addresses. + * > 1 : enable Privacy Extensions and prefer temporary + addresses over public addresses. + + Default: + + * 0 (for most devices) + * -1 (for point-to-point devices and loopback devices) temp_valid_lft - INTEGER valid lifetime (in seconds) for temporary addresses. + Default: 604800 (7 days) temp_prefered_lft - INTEGER Preferred lifetime (in seconds) for temporary addresses. 
+ Default: 86400 (1 day) keep_addr_on_down - INTEGER Keep all IPv6 addresses on an interface down event. If set static global addresses with no expiration time are not flushed. - >0 : enabled - 0 : system default - <0 : disabled + + * >0 : enabled + * 0 : system default + * <0 : disabled Default: 0 (addresses are removed) @@ -1834,11 +2079,13 @@ max_desync_factor - INTEGER that ensures that clients don't synchronize with each other and generate new addresses at exactly the same time. value is in seconds. + Default: 600 regen_max_retry - INTEGER Number of attempts before give up attempting to generate valid temporary addresses. + Default: 5 max_addresses - INTEGER @@ -1846,12 +2093,14 @@ max_addresses - INTEGER to zero disables the limitation. It is not recommended to set this value too large (or to zero) because it would be an easy way to crash the kernel by allowing too many addresses to be created. + Default: 16 disable_ipv6 - BOOLEAN Disable IPv6 operation. If accept_dad is set to 2, this value will be dynamically set to TRUE if DAD fails for the link-local address. + Default: FALSE (enable IPv6 operation) When this value is changed from 1 to 0 (IPv6 is being enabled), @@ -1865,10 +2114,13 @@ disable_ipv6 - BOOLEAN accept_dad - INTEGER Whether to accept DAD (Duplicate Address Detection). - 0: Disable DAD - 1: Enable DAD (default) - 2: Enable DAD, and disable IPv6 operation if MAC-based duplicate - link-local address has been found. + + == ============================================================== + 0 Disable DAD + 1 Enable DAD (default) + 2 Enable DAD, and disable IPv6 operation if MAC-based duplicate + link-local address has been found. + == ============================================================== DAD operation and mode on a given interface will be selected according to the maximum value of conf/{all,interface}/accept_dad. 
@@ -1876,6 +2128,7 @@ accept_dad - INTEGER force_tllao - BOOLEAN Enable sending the target link-layer address option even when responding to a unicast neighbor solicitation. + Default: FALSE Quoting from RFC 2461, section 4.4, Target link-layer address: @@ -1893,9 +2146,10 @@ force_tllao - BOOLEAN ndisc_notify - BOOLEAN Define mode for notification of address and device changes. - 0 - (default): do nothing - 1 - Generate unsolicited neighbour advertisements when device is brought - up or hardware address changes. + + * 0 - (default): do nothing + * 1 - Generate unsolicited neighbour advertisements when device is brought + up or hardware address changes. ndisc_tclass - INTEGER The IPv6 Traffic Class to use by default when sending IPv6 Neighbor @@ -1904,33 +2158,38 @@ ndisc_tclass - INTEGER These 8 bits can be interpreted as 6 high order bits holding the DSCP value and 2 low order bits representing ECN (which you probably want to leave cleared). - 0 - (default) + + * 0 - (default) mldv1_unsolicited_report_interval - INTEGER The interval in milliseconds in which the next unsolicited MLDv1 report retransmit will take place. + Default: 10000 (10 seconds) mldv2_unsolicited_report_interval - INTEGER The interval in milliseconds in which the next unsolicited MLDv2 report retransmit will take place. 
+ Default: 1000 (1 second) force_mld_version - INTEGER - 0 - (default) No enforcement of a MLD version, MLDv1 fallback allowed - 1 - Enforce to use MLD version 1 - 2 - Enforce to use MLD version 2 + * 0 - (default) No enforcement of a MLD version, MLDv1 fallback allowed + * 1 - Enforce to use MLD version 1 + * 2 - Enforce to use MLD version 2 suppress_frag_ndisc - INTEGER Control RFC 6980 (Security Implications of IPv6 Fragmentation with IPv6 Neighbor Discovery) behavior: - 1 - (default) discard fragmented neighbor discovery packets - 0 - allow fragmented neighbor discovery packets + + * 1 - (default) discard fragmented neighbor discovery packets + * 0 - allow fragmented neighbor discovery packets optimistic_dad - BOOLEAN Whether to perform Optimistic Duplicate Address Detection (RFC 4429). - 0: disabled (default) - 1: enabled + + * 0: disabled (default) + * 1: enabled Optimistic Duplicate Address Detection for the interface will be enabled if at least one of conf/{all,interface}/optimistic_dad is set to 1, @@ -1941,8 +2200,9 @@ use_optimistic - BOOLEAN source address selection. Preferred addresses will still be chosen before optimistic addresses, subject to other ranking in the source address selection algorithm. - 0: disabled (default) - 1: enabled + + * 0: disabled (default) + * 1: enabled This will be enabled if at least one of conf/{all,interface}/use_optimistic is set to 1, disabled otherwise. @@ -1964,12 +2224,14 @@ stable_secret - IPv6 address addr_gen_mode - INTEGER Defines how link-local and autoconf addresses are generated. 
- 0: generate address based on EUI64 (default) - 1: do no generate a link-local address, use EUI64 for addresses generated - from autoconf - 2: generate stable privacy addresses, using the secret from + = ================================================================= + 0 generate address based on EUI64 (default) + 1 do not generate a link-local address, use EUI64 for addresses + generated from autoconf + 2 generate stable privacy addresses, using the secret from stable_secret (RFC7217) - 3: generate stable privacy addresses, using a random secret if unset + 3 generate stable privacy addresses, using a random secret if unset + = ================================================================= drop_unicast_in_l2_multicast - BOOLEAN Drop any unicast IPv6 packets that are received in link-layer @@ -1991,13 +2253,18 @@ enhanced_dad - BOOLEAN detection of duplicates due to loopback of the NS messages that we send. The nonce option will be sent on an interface unless both of conf/{all,interface}/enhanced_dad are set to FALSE. + Default: TRUE -icmp/*: +``icmp/*``: +=========== + ratelimit - INTEGER Limit the maximal rates for sending ICMPv6 messages. + 0 to disable any limiting, otherwise the minimal space between responses in milliseconds. + Default: 1000 ratemask - list of comma separated ranges @@ -2018,16 +2285,19 @@ ratemask - list of comma separated ranges echo_ignore_all - BOOLEAN If set non-zero, then the kernel will ignore all ICMP ECHO requests sent to it over the IPv6 protocol. + Default: 0 echo_ignore_multicast - BOOLEAN If set non-zero, then the kernel will ignore all ICMP ECHO requests sent to it over the IPv6 protocol via multicast. + Default: 0 echo_ignore_anycast - BOOLEAN If set non-zero, then the kernel will ignore all ICMP ECHO requests sent to it over the IPv6 protocol destined to anycast address.
+ Default: 0 xfrm6_gc_thresh - INTEGER @@ -2043,43 +2313,52 @@ YOSHIFUJI Hideaki / USAGI Project <yoshfuji@linux-ipv6.org> /proc/sys/net/bridge/* Variables: +================================= bridge-nf-call-arptables - BOOLEAN - 1 : pass bridged ARP traffic to arptables' FORWARD chain. - 0 : disable this. + - 1 : pass bridged ARP traffic to arptables' FORWARD chain. + - 0 : disable this. + Default: 1 bridge-nf-call-iptables - BOOLEAN - 1 : pass bridged IPv4 traffic to iptables' chains. - 0 : disable this. + - 1 : pass bridged IPv4 traffic to iptables' chains. + - 0 : disable this. + Default: 1 bridge-nf-call-ip6tables - BOOLEAN - 1 : pass bridged IPv6 traffic to ip6tables' chains. - 0 : disable this. + - 1 : pass bridged IPv6 traffic to ip6tables' chains. + - 0 : disable this. + Default: 1 bridge-nf-filter-vlan-tagged - BOOLEAN - 1 : pass bridged vlan-tagged ARP/IP/IPv6 traffic to {arp,ip,ip6}tables. - 0 : disable this. + - 1 : pass bridged vlan-tagged ARP/IP/IPv6 traffic to {arp,ip,ip6}tables. + - 0 : disable this. + Default: 0 bridge-nf-filter-pppoe-tagged - BOOLEAN - 1 : pass bridged pppoe-tagged IP/IPv6 traffic to {ip,ip6}tables. - 0 : disable this. + - 1 : pass bridged pppoe-tagged IP/IPv6 traffic to {ip,ip6}tables. + - 0 : disable this. + Default: 0 bridge-nf-pass-vlan-input-dev - BOOLEAN - 1: if bridge-nf-filter-vlan-tagged is enabled, try to find a vlan - interface on the bridge and set the netfilter input device to the vlan. - This allows use of e.g. "iptables -i br0.1" and makes the REDIRECT - target work with vlan-on-top-of-bridge interfaces. When no matching - vlan interface is found, or this switch is off, the input device is - set to the bridge interface. - 0: disable bridge netfilter vlan interface lookup. + - 1: if bridge-nf-filter-vlan-tagged is enabled, try to find a vlan + interface on the bridge and set the netfilter input device to the + vlan. This allows use of e.g. 
"iptables -i br0.1" and makes the + REDIRECT target work with vlan-on-top-of-bridge interfaces. When no + matching vlan interface is found, or this switch is off, the input + device is set to the bridge interface. + + - 0: disable bridge netfilter vlan interface lookup. + Default: 0 -proc/sys/net/sctp/* Variables: +``proc/sys/net/sctp/*`` Variables: +================================== addip_enable - BOOLEAN Enable or disable extension of Dynamic Address Reconfiguration @@ -2144,11 +2423,13 @@ addip_noauth_enable - BOOLEAN we provide this variable to control the enforcement of the authentication requirement. - 1: Allow ADD-IP extension to be used without authentication. This + == =============================================================== + 1 Allow ADD-IP extension to be used without authentication. This should only be set in a closed environment for interoperability with older implementations. - 0: Enforce the authentication requirement + 0 Enforce the authentication requirement + == =============================================================== Default: 0 @@ -2158,8 +2439,8 @@ auth_enable - BOOLEAN required for secure operation of Dynamic Address Reconfiguration (ADD-IP) extension. - 1: Enable this extension. - 0: Disable this extension. + - 1: Enable this extension. + - 0: Disable this extension. Default: 0 @@ -2167,8 +2448,8 @@ prsctp_enable - BOOLEAN Enable or disable the Partial Reliability extension (RFC3758) which is used to notify peers that a given DATA should no longer be expected. - 1: Enable extension - 0: Disable + - 1: Enable extension + - 0: Disable Default: 1 @@ -2270,8 +2551,8 @@ cookie_preserve_enable - BOOLEAN Enable or disable the ability to extend the lifetime of the SCTP cookie that is used during the establishment phase of SCTP association - 1: Enable cookie lifetime extension. - 0: Disable + - 1: Enable cookie lifetime extension. 
+ - 0: Disable Default: 1 @@ -2279,9 +2560,11 @@ cookie_hmac_alg - STRING Select the hmac algorithm used when generating the cookie value sent by a listening sctp socket to a connecting client in the INIT-ACK chunk. Valid values are: + * md5 * sha1 * none + Ability to assign md5 or sha1 as the selected alg is predicated on the configuration of those algorithms at build time (CONFIG_CRYPTO_MD5 and CONFIG_CRYPTO_SHA1). @@ -2300,16 +2583,16 @@ rcvbuf_policy - INTEGER to each association instead of the socket. This prevents the described blocking. - 1: rcvbuf space is per association - 0: rcvbuf space is per socket + - 1: rcvbuf space is per association + - 0: rcvbuf space is per socket Default: 0 sndbuf_policy - INTEGER Similar to rcvbuf_policy above, this applies to send buffer space. - 1: Send buffer is tracked per association - 0: Send buffer is tracked per socket. + - 1: Send buffer is tracked per association + - 0: Send buffer is tracked per socket. Default: 0 @@ -2342,19 +2625,23 @@ sctp_wmem - vector of 3 INTEGERs: min, default, max addr_scope_policy - INTEGER Control IPv4 address scoping - draft-stewart-tsvwg-sctp-ipv4-00 - 0 - Disable IPv4 address scoping - 1 - Enable IPv4 address scoping - 2 - Follow draft but allow IPv4 private addresses - 3 - Follow draft but allow IPv4 link local addresses + - 0 - Disable IPv4 address scoping + - 1 - Enable IPv4 address scoping + - 2 - Follow draft but allow IPv4 private addresses + - 3 - Follow draft but allow IPv4 link local addresses Default: 1 -/proc/sys/net/core/* +``/proc/sys/net/core/*`` +======================== + Please see: Documentation/admin-guide/sysctl/net.rst for descriptions of these entries. 
-/proc/sys/net/unix/* +``/proc/sys/net/unix/*`` +======================== + max_dgram_qlen - INTEGER The maximum length of dgram socket receive queue diff --git a/Documentation/networking/ip_dynaddr.txt b/Documentation/networking/ip_dynaddr.rst index 45f3c1268e86..eacc0c780c7f 100644 --- a/Documentation/networking/ip_dynaddr.txt +++ b/Documentation/networking/ip_dynaddr.rst @@ -1,10 +1,15 @@ +.. SPDX-License-Identifier: GPL-2.0 + +================================== IP dynamic address hack-port v0.03 -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +================================== + This stuff allows diald ONESHOT connections to get established by dynamically changing packet source address (and socket's if local procs). It is implemented for TCP diald-box connections(1) and IP_MASQuerading(2). -If enabled[*] and forwarding interface has changed: +If enabled\ [#]_ and forwarding interface has changed: + 1) Socket (and packet) source address is rewritten ON RETRANSMISSIONS while in SYN_SENT state (diald-box processes). 2) Out-bounded MASQueraded source address changes ON OUTPUT (when @@ -12,18 +17,24 @@ If enabled[*] and forwarding interface has changed: received by the tunnel. This is specially helpful for auto dialup links (diald), where the -``actual'' outgoing address is unknown at the moment the link is +``actual`` outgoing address is unknown at the moment the link is going up. So, the *same* (local AND masqueraded) connections requests that bring the link up will be able to get established. -[*] At boot, by default no address rewriting is attempted. - To enable: +.. [#] At boot, by default no address rewriting is attempted. + + To enable:: + # echo 1 > /proc/sys/net/ipv4/ip_dynaddr - To enable verbose mode: - # echo 2 > /proc/sys/net/ipv4/ip_dynaddr - To disable (default) + + To enable verbose mode:: + + # echo 2 > /proc/sys/net/ipv4/ip_dynaddr + + To disable (default):: + # echo 0 > /proc/sys/net/ipv4/ip_dynaddr Enjoy! 
--- Juanjo <jjciarla@raiz.uncu.edu.ar> +Juanjo <jjciarla@raiz.uncu.edu.ar> diff --git a/Documentation/networking/ipddp.txt b/Documentation/networking/ipddp.rst index ba5c217fffe0..be7091b77927 100644 --- a/Documentation/networking/ipddp.txt +++ b/Documentation/networking/ipddp.rst @@ -1,7 +1,12 @@ -Text file for ipddp.c: - AppleTalk-IP Decapsulation and AppleTalk-IP Encapsulation +.. SPDX-License-Identifier: GPL-2.0 -This text file is written by Jay Schulist <jschlst@samba.org> +========================================================= +AppleTalk-IP Decapsulation and AppleTalk-IP Encapsulation +========================================================= + +Documentation for ipddp.c + +This file is written by Jay Schulist <jschlst@samba.org> Introduction ------------ @@ -21,7 +26,7 @@ kernel AppleTalk layer and drivers are available. Each mode requires its own user space software. Compiling AppleTalk-IP Decapsulation/Encapsulation -================================================= +================================================== AppleTalk-IP decapsulation needs to be compiled into your kernel. You will need to turn on AppleTalk-IP driver support. Then you will need to diff --git a/Documentation/networking/iphase.txt b/Documentation/networking/iphase.rst index 670b72f16585..92d9b757d75a 100644 --- a/Documentation/networking/iphase.txt +++ b/Documentation/networking/iphase.rst @@ -1,27 +1,35 @@ +.. SPDX-License-Identifier: GPL-2.0 + +================================== +ATM (i)Chip IA Linux Driver Source +================================== + + READ ME FIRST - READ ME FISRT - ATM (i)Chip IA Linux Driver Source -------------------------------------------------------------------------------- - Read This Before You Begin!
+ -------------------------------------------------------------------------------- Description ------------ +=========== -This is the README file for the Interphase PCI ATM (i)Chip IA Linux driver +This is the README file for the Interphase PCI ATM (i)Chip IA Linux driver source release. The features and limitations of this driver are as follows: + - A single VPI (VPI value of 0) is supported. - - Supports 4K VCs for the server board (with 512K control memory) and 1K + - Supports 4K VCs for the server board (with 512K control memory) and 1K VCs for the client board (with 128K control memory). - UBR, ABR and CBR service categories are supported. - - Only AAL5 is supported. - - Supports setting of PCR on the VCs. + - Only AAL5 is supported. + - Supports setting of PCR on the VCs. - Multiple adapters in a system are supported. - - All variants of Interphase ATM PCI (i)Chip adapter cards are supported, - including x575 (OC3, control memory 128K , 512K and packet memory 128K, - 512K and 1M), x525 (UTP25) and x531 (DS3 and E3). See + - All variants of Interphase ATM PCI (i)Chip adapter cards are supported, + including x575 (OC3, control memory 128K , 512K and packet memory 128K, + 512K and 1M), x525 (UTP25) and x531 (DS3 and E3). See http://www.iphase.com/ for details. - Only x86 platforms are supported. @@ -29,128 +37,155 @@ The features and limitations of this driver are as follows: Before You Start ----------------- +================ Installation ------------ 1. Installing the adapters in the system + To install the ATM adapters in the system, follow the steps below. + a. Login as root. b. Shut down the system and power off the system. c. Install one or more ATM adapters in the system. - d. Connect each adapter to a port on an ATM switch. The green 'Link' - LED on the front panel of the adapter will be on if the adapter is - connected to the switch properly when the system is powered up. + d. Connect each adapter to a port on an ATM switch. 
The green 'Link' + LED on the front panel of the adapter will be on if the adapter is + connected to the switch properly when the system is powered up. e. Power on and boot the system. 2. [ Removed ] 3. Rebuild kernel with ABR support + [ a. and b. removed ] - c. Reconfigure the kernel, choose the Interphase ia driver through "make + + c. Reconfigure the kernel, choose the Interphase ia driver through "make menuconfig" or "make xconfig". - d. Rebuild the kernel, loadable modules and the atm tools. + d. Rebuild the kernel, loadable modules and the atm tools. e. Install the new built kernel and modules and reboot. 4. Load the adapter hardware driver (ia driver) if it is built as a module + a. Login as root. b. Change directory to /lib/modules/<kernel-version>/atm. c. Run "insmod suni.o;insmod iphase.o" - The yellow 'status' LED on the front panel of the adapter will blink - while the driver is loaded in the system. - d. To verify that the 'ia' driver is loaded successfully, run the - following command: + The yellow 'status' LED on the front panel of the adapter will blink + while the driver is loaded in the system. + d. To verify that the 'ia' driver is loaded successfully, run the + following command:: - cat /proc/atm/devices + cat /proc/atm/devices - If the driver is loaded successfully, the output of the command will - be similar to the following lines: + If the driver is loaded successfully, the output of the command will + be similar to the following lines:: - Itf Type ESI/"MAC"addr AAL(TX,err,RX,err,drop) ... - 0 ia xxxxxxxxx 0 ( 0 0 0 0 0 ) 5 ( 0 0 0 0 0 ) + Itf Type ESI/"MAC"addr AAL(TX,err,RX,err,drop) ... + 0 ia xxxxxxxxx 0 ( 0 0 0 0 0 ) 5 ( 0 0 0 0 0 ) - You can also check the system log file /var/log/messages for messages - related to the ATM driver. + You can also check the system log file /var/log/messages for messages + related to the ATM driver. -5. Ia Driver Configuration +5. 
Ia Driver Configuration 5.1 Configuration of adapter buffers The (i)Chip boards have 3 different packet RAM size variants: 128K, 512K and - 1M. The RAM size decides the number of buffers and buffer size. The default - size and number of buffers are set as following: - - Total Rx RAM Tx RAM Rx Buf Tx Buf Rx buf Tx buf - RAM size size size size size cnt cnt - -------- ------ ------ ------ ------ ------ ------ - 128K 64K 64K 10K 10K 6 6 - 512K 256K 256K 10K 10K 25 25 - 1M 512K 512K 10K 10K 51 51 + 1M. The RAM size decides the number of buffers and buffer size. The default + size and number of buffers are set as following: + + ========= ======= ====== ====== ====== ====== ====== + Total Rx RAM Tx RAM Rx Buf Tx Buf Rx buf Tx buf + RAM size size size size size cnt cnt + ========= ======= ====== ====== ====== ====== ====== + 128K 64K 64K 10K 10K 6 6 + 512K 256K 256K 10K 10K 25 25 + 1M 512K 512K 10K 10K 51 51 + ========= ======= ====== ====== ====== ====== ====== These setting should work well in most environments, but can be - changed by typing the following command: - - insmod <IA_DIR>/ia.o IA_RX_BUF=<RX_CNT> IA_RX_BUF_SZ=<RX_SIZE> \ - IA_TX_BUF=<TX_CNT> IA_TX_BUF_SZ=<TX_SIZE> + changed by typing the following command:: + + insmod <IA_DIR>/ia.o IA_RX_BUF=<RX_CNT> IA_RX_BUF_SZ=<RX_SIZE> \ + IA_TX_BUF=<TX_CNT> IA_TX_BUF_SZ=<TX_SIZE> + Where: - RX_CNT = number of receive buffers in the range (1-128) - RX_SIZE = size of receive buffers in the range (48-64K) - TX_CNT = number of transmit buffers in the range (1-128) - TX_SIZE = size of transmit buffers in the range (48-64K) - 1. Transmit and receive buffer size must be a multiple of 4. - 2. Care should be taken so that the memory required for the - transmit and receive buffers is less than or equal to the - total adapter packet memory. 
+ - RX_CNT = number of receive buffers in the range (1-128) + - RX_SIZE = size of receive buffers in the range (48-64K) + - TX_CNT = number of transmit buffers in the range (1-128) + - TX_SIZE = size of transmit buffers in the range (48-64K) + + 1. Transmit and receive buffer size must be a multiple of 4. + 2. Care should be taken so that the memory required for the + transmit and receive buffers is less than or equal to the + total adapter packet memory. 5.2 Turn on ia debug trace - When the ia driver is built with the CONFIG_ATM_IA_DEBUG flag, the driver - can provide more debug trace if needed. There is a bit mask variable, - IADebugFlag, which controls the output of the traces. You can find the bit - map of the IADebugFlag in iphase.h. - The debug trace can be turn on through the insmod command line option, for - example, "insmod iphase.o IADebugFlag=0xffffffff" can turn on all the debug + When the ia driver is built with the CONFIG_ATM_IA_DEBUG flag, the driver + can provide more debug trace if needed. There is a bit mask variable, + IADebugFlag, which controls the output of the traces. You can find the bit + map of the IADebugFlag in iphase.h. + The debug trace can be turn on through the insmod command line option, for + example, "insmod iphase.o IADebugFlag=0xffffffff" can turn on all the debug traces together with loading the driver. 6. Ia Driver Test Using ttcp_atm and PVC - For the PVC setup, the test machines can either be connected back-to-back or - through a switch. If connected through the switch, the switch must be + For the PVC setup, the test machines can either be connected back-to-back or + through a switch. If connected through the switch, the switch must be configured for the PVC(s). a. 
For UBR test: - At the test machine intended to receive data, type: - ttcp_atm -r -a -s 0.100 - At the other test machine, type: - ttcp_atm -t -a -s 0.100 -n 10000 + + At the test machine intended to receive data, type:: + + ttcp_atm -r -a -s 0.100 + + At the other test machine, type:: + + ttcp_atm -t -a -s 0.100 -n 10000 + Run "ttcp_atm -h" to display more options of the ttcp_atm tool. b. For ABR test: - It is the same as the UBR testing, but with an extra command option: - -Pabr:max_pcr=<xxx> - where: - xxx = the maximum peak cell rate, from 170 - 353207. - This option must be set on both the machines. + + It is the same as the UBR testing, but with an extra command option:: + + -Pabr:max_pcr=<xxx> + + where: + + xxx = the maximum peak cell rate, from 170 - 353207. + + This option must be set on both the machines. + c. For CBR test: - It is the same as the UBR testing, but with an extra command option: - -Pcbr:max_pcr=<xxx> - where: - xxx = the maximum peak cell rate, from 170 - 353207. - This option may only be set on the transmit machine. + It is the same as the UBR testing, but with an extra command option:: + + -Pcbr:max_pcr=<xxx> + + where: + + xxx = the maximum peak cell rate, from 170 - 353207. -OUTSTANDING ISSUES ------------------- + This option may only be set on the transmit machine. + + +Outstanding Issues +================== Contact Information ------------------- +:: + Customer Support: - United States: Telephone: (214) 654-5555 - Fax: (214) 654-5500 + United States: Telephone: (214) 654-5555 + Fax: (214) 654-5500 E-Mail: intouch@iphase.com Europe: Telephone: 33 (0)1 41 15 44 00 Fax: 33 (0)1 41 15 12 13 diff --git a/Documentation/networking/ipsec.txt b/Documentation/networking/ipsec.rst index ba794b7e51be..afe9d7b48be3 100644 --- a/Documentation/networking/ipsec.txt +++ b/Documentation/networking/ipsec.rst @@ -1,12 +1,20 @@ +.. 
SPDX-License-Identifier: GPL-2.0 + +===== +IPsec +===== + Here documents known IPsec corner cases which need to be keep in mind when deploy various IPsec configuration in real world production environment. -1. IPcomp: Small IP packet won't get compressed at sender, and failed on +1. IPcomp: + Small IP packet won't get compressed at sender, and failed on policy check on receiver. -Quote from RFC3173: -2.2. Non-Expansion Policy +Quote from RFC3173:: + + 2.2. Non-Expansion Policy If the total size of a compressed payload and the IPComp header, as defined in section 3, is not smaller than the size of the original diff --git a/Documentation/networking/ipv6.txt b/Documentation/networking/ipv6.rst index 6cd74fa55358..ba09c2f2dcc7 100644 --- a/Documentation/networking/ipv6.txt +++ b/Documentation/networking/ipv6.rst @@ -1,9 +1,15 @@ +.. SPDX-License-Identifier: GPL-2.0 + +==== +IPv6 +==== + Options for the ipv6 module are supplied as parameters at load time. Module options may be given as command line arguments to the insmod or modprobe command, but are usually specified in either -/etc/modules.d/*.conf configuration files, or in a distro-specific +``/etc/modules.d/*.conf`` configuration files, or in a distro-specific configuration file. The available ipv6 module parameters are listed below. If a parameter diff --git a/Documentation/networking/ipvlan.txt b/Documentation/networking/ipvlan.rst index 27a38e50c287..694adcba36b0 100644 --- a/Documentation/networking/ipvlan.txt +++ b/Documentation/networking/ipvlan.rst @@ -1,11 +1,15 @@ +.. SPDX-License-Identifier: GPL-2.0 - IPVLAN Driver HOWTO +=================== +IPVLAN Driver HOWTO +=================== Initial Release: Mahesh Bandewar <maheshb AT google.com> 1. Introduction: - This is conceptually very similar to the macvlan driver with one major +================ +This is conceptually very similar to the macvlan driver with one major exception of using L3 for mux-ing /demux-ing among slaves. 
This property makes the master device share the L2 with it's slave devices. I have developed this driver in conjunction with network namespaces and not sure if there is use case @@ -13,34 +17,48 @@ outside of it. 2. Building and Installation: - In order to build the driver, please select the config item CONFIG_IPVLAN. +============================= + +In order to build the driver, please select the config item CONFIG_IPVLAN. The driver can be built into the kernel (CONFIG_IPVLAN=y) or as a module (CONFIG_IPVLAN=m). 3. Configuration: - There are no module parameters for this driver and it can be configured +================= + +There are no module parameters for this driver and it can be configured using IProute2/ip utility. +:: ip link add link <master> name <slave> type ipvlan [ mode MODE ] [ FLAGS ] where - MODE: l3 (default) | l3s | l2 - FLAGS: bridge (default) | private | vepa + MODE: l3 (default) | l3s | l2 + FLAGS: bridge (default) | private | vepa + +e.g. - e.g. (a) Following will create IPvlan link with eth0 as master in - L3 bridge mode - bash# ip link add link eth0 name ipvl0 type ipvlan - (b) This command will create IPvlan link in L2 bridge mode. - bash# ip link add link eth0 name ipvl0 type ipvlan mode l2 bridge - (c) This command will create an IPvlan device in L2 private mode. - bash# ip link add link eth0 name ipvlan type ipvlan mode l2 private - (d) This command will create an IPvlan device in L2 vepa mode. 
- bash# ip link add link eth0 name ipvlan type ipvlan mode l2 vepa + L3 bridge mode:: + + bash# ip link add link eth0 name ipvl0 type ipvlan + (b) This command will create IPvlan link in L2 bridge mode:: + + bash# ip link add link eth0 name ipvl0 type ipvlan mode l2 bridge + + (c) This command will create an IPvlan device in L2 private mode:: + + bash# ip link add link eth0 name ipvlan type ipvlan mode l2 private + + (d) This command will create an IPvlan device in L2 vepa mode:: + + bash# ip link add link eth0 name ipvlan type ipvlan mode l2 vepa 4. Operating modes: - IPvlan has two modes of operation - L2 and L3. For a given master device, +=================== + +IPvlan has two modes of operation - L2 and L3. For a given master device, you can select one of these two modes and all slaves on that master will operate in the same (selected) mode. The RX mode is almost identical except that in L3 mode the slaves wont receive any multicast / broadcast traffic. @@ -48,39 +66,50 @@ L3 mode is more restrictive since routing is controlled from the other (mostly) default namespace. 4.1 L2 mode: - In this mode TX processing happens on the stack instance attached to the +------------ + +In this mode TX processing happens on the stack instance attached to the slave device and packets are switched and queued to the master device to send out. In this mode the slaves will RX/TX multicast and broadcast (if applicable) as well. 4.2 L3 mode: - In this mode TX processing up to L3 happens on the stack instance attached +------------ + +In this mode TX processing up to L3 happens on the stack instance attached to the slave device and packets are switched to the stack instance of the master device for the L2 processing and routing from that instance will be used before packets are queued on the outbound device. In this mode the slaves will not receive nor can send multicast / broadcast traffic. 
4.3 L3S mode: - This is very similar to the L3 mode except that iptables (conn-tracking) +------------- + +This is very similar to the L3 mode except that iptables (conn-tracking) works in this mode and hence it is L3-symmetric (L3s). This will have slightly less performance but that shouldn't matter since you are choosing this mode over plain-L3 mode to make conn-tracking work. 5. Mode flags: - At this time following mode flags are available +============== + +At this time following mode flags are available 5.1 bridge: - This is the default option. To configure the IPvlan port in this mode, +----------- +This is the default option. To configure the IPvlan port in this mode, user can choose to either add this option on the command-line or don't specify anything. This is the traditional mode where slaves can cross-talk among themselves apart from talking through the master device. 5.2 private: - If this option is added to the command-line, the port is set in private +------------ +If this option is added to the command-line, the port is set in private mode. i.e. port won't allow cross communication between slaves. 5.3 vepa: - If this is added to the command-line, the port is set in VEPA mode. +--------- +If this is added to the command-line, the port is set in VEPA mode. i.e. port will offload switching functionality to the external entity as described in 802.1Qbg Note: VEPA mode in IPvlan has limitations. IPvlan uses the mac-address of the @@ -89,18 +118,25 @@ neighbor will have source and destination mac same. This will make the switch / router send the redirect message. 6. What to choose (macvlan vs. ipvlan)? - These two devices are very similar in many regards and the specific use +======================================= + +These two devices are very similar in many regards and the specific use case could very well define which device to choose. 
if one of the following -situations defines your use case then you can choose to use ipvlan - - (a) The Linux host that is connected to the external switch / router has -policy configured that allows only one mac per port. - (b) No of virtual devices created on a master exceed the mac capacity and -puts the NIC in promiscuous mode and degraded performance is a concern. - (c) If the slave device is to be put into the hostile / untrusted network -namespace where L2 on the slave could be changed / misused. +situations defines your use case then you can choose to use ipvlan: + + +(a) The Linux host that is connected to the external switch / router has + policy configured that allows only one mac per port. +(b) No of virtual devices created on a master exceed the mac capacity and + puts the NIC in promiscuous mode and degraded performance is a concern. +(c) If the slave device is to be put into the hostile / untrusted network + namespace where L2 on the slave could be changed / misused. 6. Example configuration: +========================= + +:: +=============================================================+ | Host: host1 | @@ -117,30 +153,37 @@ namespace where L2 on the slave could be changed / misused. 
+==============================#==============================+ - (a) Create two network namespaces - ns0, ns1 - ip netns add ns0 - ip netns add ns1 - - (b) Create two ipvlan slaves on eth0 (master device) - ip link add link eth0 ipvl0 type ipvlan mode l2 - ip link add link eth0 ipvl1 type ipvlan mode l2 - - (c) Assign slaves to the respective network namespaces - ip link set dev ipvl0 netns ns0 - ip link set dev ipvl1 netns ns1 - - (d) Now switch to the namespace (ns0 or ns1) to configure the slave devices - - For ns0 - (1) ip netns exec ns0 bash - (2) ip link set dev ipvl0 up - (3) ip link set dev lo up - (4) ip -4 addr add 127.0.0.1 dev lo - (5) ip -4 addr add $IPADDR dev ipvl0 - (6) ip -4 route add default via $ROUTER dev ipvl0 - - For ns1 - (1) ip netns exec ns1 bash - (2) ip link set dev ipvl1 up - (3) ip link set dev lo up - (4) ip -4 addr add 127.0.0.1 dev lo - (5) ip -4 addr add $IPADDR dev ipvl1 - (6) ip -4 route add default via $ROUTER dev ipvl1 +(a) Create two network namespaces - ns0, ns1:: + + ip netns add ns0 + ip netns add ns1 + +(b) Create two ipvlan slaves on eth0 (master device):: + + ip link add link eth0 ipvl0 type ipvlan mode l2 + ip link add link eth0 ipvl1 type ipvlan mode l2 + +(c) Assign slaves to the respective network namespaces:: + + ip link set dev ipvl0 netns ns0 + ip link set dev ipvl1 netns ns1 + +(d) Now switch to the namespace (ns0 or ns1) to configure the slave devices + + - For ns0:: + + (1) ip netns exec ns0 bash + (2) ip link set dev ipvl0 up + (3) ip link set dev lo up + (4) ip -4 addr add 127.0.0.1 dev lo + (5) ip -4 addr add $IPADDR dev ipvl0 + (6) ip -4 route add default via $ROUTER dev ipvl0 + + - For ns1:: + + (1) ip netns exec ns1 bash + (2) ip link set dev ipvl1 up + (3) ip link set dev lo up + (4) ip -4 addr add 127.0.0.1 dev lo + (5) ip -4 addr add $IPADDR dev ipvl1 + (6) ip -4 route add default via $ROUTER dev ipvl1 diff --git a/Documentation/networking/ipvs-sysctl.txt b/Documentation/networking/ipvs-sysctl.rst 
index 056898685d40..be36c4600e8f 100644 --- a/Documentation/networking/ipvs-sysctl.txt +++ b/Documentation/networking/ipvs-sysctl.rst @@ -1,23 +1,30 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=========== +IPvs-sysctl +=========== + /proc/sys/net/ipv4/vs/* Variables: +================================== am_droprate - INTEGER - default 10 + default 10 - It sets the always mode drop rate, which is used in the mode 3 - of the drop_rate defense. + It sets the always mode drop rate, which is used in the mode 3 + of the drop_rate defense. amemthresh - INTEGER - default 1024 + default 1024 - It sets the available memory threshold (in pages), which is - used in the automatic modes of defense. When there is no - enough available memory, the respective strategy will be - enabled and the variable is automatically set to 2, otherwise - the strategy is disabled and the variable is set to 1. + It sets the available memory threshold (in pages), which is + used in the automatic modes of defense. When there is not + enough available memory, the respective strategy will be + enabled and the variable is automatically set to 2, otherwise + the strategy is disabled and the variable is set to 1. backup_only - BOOLEAN - 0 - disabled (default) - not 0 - enabled + - 0 - disabled (default) + - not 0 - enabled If set, disable the director function while the server is in backup mode to avoid packet loops for DR/TUN methods. @@ -44,8 +51,8 @@ conn_reuse_mode - INTEGER real servers to a very busy cluster. conntrack - BOOLEAN - 0 - disabled (default) - not 0 - enabled + - 0 - disabled (default) + - not 0 - enabled If set, maintain connection tracking entries for connections handled by IPVS. @@ -61,28 +68,28 @@ conntrack - BOOLEAN Only available when IPVS is compiled with CONFIG_IP_VS_NFCT enabled. 
cache_bypass - BOOLEAN - 0 - disabled (default) - not 0 - enabled + - 0 - disabled (default) + - not 0 - enabled - If it is enabled, forward packets to the original destination - directly when no cache server is available and destination - address is not local (iph->daddr is RTN_UNICAST). It is mostly - used in transparent web cache cluster. + If it is enabled, forward packets to the original destination + directly when no cache server is available and destination + address is not local (iph->daddr is RTN_UNICAST). It is mostly + used in transparent web cache cluster. debug_level - INTEGER - 0 - transmission error messages (default) - 1 - non-fatal error messages - 2 - configuration - 3 - destination trash - 4 - drop entry - 5 - service lookup - 6 - scheduling - 7 - connection new/expire, lookup and synchronization - 8 - state transition - 9 - binding destination, template checks and applications - 10 - IPVS packet transmission - 11 - IPVS packet handling (ip_vs_in/ip_vs_out) - 12 or more - packet traversal + - 0 - transmission error messages (default) + - 1 - non-fatal error messages + - 2 - configuration + - 3 - destination trash + - 4 - drop entry + - 5 - service lookup + - 6 - scheduling + - 7 - connection new/expire, lookup and synchronization + - 8 - state transition + - 9 - binding destination, template checks and applications + - 10 - IPVS packet transmission + - 11 - IPVS packet handling (ip_vs_in/ip_vs_out) + - 12 or more - packet traversal Only available when IPVS is compiled with CONFIG_IP_VS_DEBUG enabled. @@ -92,58 +99,58 @@ debug_level - INTEGER the level. drop_entry - INTEGER - 0 - disabled (default) - - The drop_entry defense is to randomly drop entries in the - connection hash table, just in order to collect back some - memory for new connections. 
In the current code, the - drop_entry procedure can be activated every second, then it - randomly scans 1/32 of the whole and drops entries that are in - the SYN-RECV/SYNACK state, which should be effective against - syn-flooding attack. - - The valid values of drop_entry are from 0 to 3, where 0 means - that this strategy is always disabled, 1 and 2 mean automatic - modes (when there is no enough available memory, the strategy - is enabled and the variable is automatically set to 2, - otherwise the strategy is disabled and the variable is set to - 1), and 3 means that that the strategy is always enabled. + - 0 - disabled (default) + + The drop_entry defense is to randomly drop entries in the + connection hash table, just in order to collect back some + memory for new connections. In the current code, the + drop_entry procedure can be activated every second, then it + randomly scans 1/32 of the whole and drops entries that are in + the SYN-RECV/SYNACK state, which should be effective against + syn-flooding attack. + + The valid values of drop_entry are from 0 to 3, where 0 means + that this strategy is always disabled, 1 and 2 mean automatic + modes (when there is not enough available memory, the strategy + is enabled and the variable is automatically set to 2, + otherwise the strategy is disabled and the variable is set to + 1), and 3 means that the strategy is always enabled. drop_packet - INTEGER - 0 - disabled (default) + - 0 - disabled (default) - The drop_packet defense is designed to drop 1/rate packets - before forwarding them to real servers. If the rate is 1, then - drop all the incoming packets. + The drop_packet defense is designed to drop 1/rate packets + before forwarding them to real servers. If the rate is 1, then + drop all the incoming packets. - The value definition is the same as that of the drop_entry. 
In - the automatic mode, the rate is determined by the follow - formula: rate = amemthresh / (amemthresh - available_memory) - when available memory is less than the available memory - threshold. When the mode 3 is set, the always mode drop rate - is controlled by the /proc/sys/net/ipv4/vs/am_droprate. + The value definition is the same as that of the drop_entry. In + the automatic mode, the rate is determined by the following + formula: rate = amemthresh / (amemthresh - available_memory) + when available memory is less than the available memory + threshold. When the mode 3 is set, the always mode drop rate + is controlled by the /proc/sys/net/ipv4/vs/am_droprate. expire_nodest_conn - BOOLEAN - 0 - disabled (default) - not 0 - enabled - - The default value is 0, the load balancer will silently drop - packets when its destination server is not available. It may - be useful, when user-space monitoring program deletes the - destination server (because of server overload or wrong - detection) and add back the server later, and the connections - to the server can continue. - - If this feature is enabled, the load balancer will expire the - connection immediately when a packet arrives and its - destination server is not available, then the client program - will be notified that the connection is closed. This is - equivalent to the feature some people requires to flush - connections when its destination is not available. + - 0 - disabled (default) + - not 0 - enabled + + The default value is 0, the load balancer will silently drop + packets when its destination server is not available. It may + be useful, when user-space monitoring program deletes the + destination server (because of server overload or wrong + detection) and adds back the server later, and the connections + to the server can continue. 
+ + If this feature is enabled, the load balancer will expire the + connection immediately when a packet arrives and its + destination server is not available, then the client program + will be notified that the connection is closed. This is + equivalent to the feature some people requires to flush + connections when its destination is not available. expire_quiescent_template - BOOLEAN - 0 - disabled (default) - not 0 - enabled + - 0 - disabled (default) + - not 0 - enabled When set to a non-zero value, the load balancer will expire persistent templates when the destination server is quiescent. @@ -158,8 +165,8 @@ expire_quiescent_template - BOOLEAN connection and the destination server is quiescent. ignore_tunneled - BOOLEAN - 0 - disabled (default) - not 0 - enabled + - 0 - disabled (default) + - not 0 - enabled If set, ipvs will set the ipvs_property on all packets which are of unrecognized protocols. This prevents us from routing tunneled @@ -168,30 +175,30 @@ ignore_tunneled - BOOLEAN ipvs routing loops when ipvs is also acting as a real server). nat_icmp_send - BOOLEAN - 0 - disabled (default) - not 0 - enabled + - 0 - disabled (default) + - not 0 - enabled - It controls sending icmp error messages (ICMP_DEST_UNREACH) - for VS/NAT when the load balancer receives packets from real - servers but the connection entries don't exist. + It controls sending icmp error messages (ICMP_DEST_UNREACH) + for VS/NAT when the load balancer receives packets from real + servers but the connection entries don't exist. pmtu_disc - BOOLEAN - 0 - disabled - not 0 - enabled (default) + - 0 - disabled + - not 0 - enabled (default) By default, reject with FRAG_NEEDED all DF packets that exceed the PMTU, irrespective of the forwarding method. For TUN method the flag can be disabled to fragment such packets. secure_tcp - INTEGER - 0 - disabled (default) + - 0 - disabled (default) The secure_tcp defense is to use a more complicated TCP state transition table. 
For VS/NAT, it also delays entering the TCP ESTABLISHED state until the three way handshake is completed. - The value definition is the same as that of drop_entry and - drop_packet. + The value definition is the same as that of drop_entry and + drop_packet. sync_threshold - vector of 2 INTEGERs: sync_threshold, sync_period default 3 50 @@ -248,8 +255,8 @@ sync_ports - INTEGER 8848+sync_ports-1. snat_reroute - BOOLEAN - 0 - disabled - not 0 - enabled (default) + - 0 - disabled + - not 0 - enabled (default) If enabled, recalculate the route of SNATed packets from realservers so that they are routed as if they originate from the @@ -270,6 +277,7 @@ sync_persist_mode - INTEGER Controls the synchronisation of connections when using persistence 0: All types of connections are synchronised + 1: Attempt to reduce the synchronisation traffic depending on the connection type. For persistent services avoid synchronisation for normal connections, do it only for persistence templates. diff --git a/Documentation/networking/kcm.txt b/Documentation/networking/kcm.rst index b773a5278ac4..db0f5560ac1c 100644 --- a/Documentation/networking/kcm.txt +++ b/Documentation/networking/kcm.rst @@ -1,35 +1,38 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============================= Kernel Connection Multiplexor ------------------------------ +============================= Kernel Connection Multiplexor (KCM) is a mechanism that provides a message based interface over TCP for generic application protocols. With KCM an application can efficiently send and receive application protocol messages over TCP using datagram sockets. 
-KCM implements an NxM multiplexor in the kernel as diagrammed below: - -+------------+ +------------+ +------------+ +------------+ -| KCM socket | | KCM socket | | KCM socket | | KCM socket | -+------------+ +------------+ +------------+ +------------+ - | | | | - +-----------+ | | +----------+ - | | | | - +----------------------------------+ - | Multiplexor | - +----------------------------------+ - | | | | | - +---------+ | | | ------------+ - | | | | | -+----------+ +----------+ +----------+ +----------+ +----------+ -| Psock | | Psock | | Psock | | Psock | | Psock | -+----------+ +----------+ +----------+ +----------+ +----------+ - | | | | | -+----------+ +----------+ +----------+ +----------+ +----------+ -| TCP sock | | TCP sock | | TCP sock | | TCP sock | | TCP sock | -+----------+ +----------+ +----------+ +----------+ +----------+ +KCM implements an NxM multiplexor in the kernel as diagrammed below:: + + +------------+ +------------+ +------------+ +------------+ + | KCM socket | | KCM socket | | KCM socket | | KCM socket | + +------------+ +------------+ +------------+ +------------+ + | | | | + +-----------+ | | +----------+ + | | | | + +----------------------------------+ + | Multiplexor | + +----------------------------------+ + | | | | | + +---------+ | | | ------------+ + | | | | | + +----------+ +----------+ +----------+ +----------+ +----------+ + | Psock | | Psock | | Psock | | Psock | | Psock | + +----------+ +----------+ +----------+ +----------+ +----------+ + | | | | | + +----------+ +----------+ +----------+ +----------+ +----------+ + | TCP sock | | TCP sock | | TCP sock | | TCP sock | | TCP sock | + +----------+ +----------+ +----------+ +----------+ +----------+ KCM sockets ------------ +=========== The KCM sockets provide the user interface to the multiplexor. 
All the KCM sockets bound to a multiplexor are considered to have equivalent function, and I/O @@ -37,7 +40,7 @@ operations in different sockets may be done in parallel without the need for synchronization between threads in userspace. Multiplexor ------------ +=========== The multiplexor provides the message steering. In the transmit path, messages written on a KCM socket are sent atomically on an appropriate TCP socket. @@ -45,14 +48,14 @@ Similarly, in the receive path, messages are constructed on each TCP socket (Psock) and complete messages are steered to a KCM socket. TCP sockets & Psocks --------------------- +==================== TCP sockets may be bound to a KCM multiplexor. A Psock structure is allocated for each bound TCP socket, this structure holds the state for constructing messages on receive as well as other connection specific information for KCM. Connected mode semantics ------------------------- +======================== Each multiplexor assumes that all attached TCP connections are to the same destination and can use the different connections for load balancing when @@ -60,7 +63,7 @@ transmitting. The normal send and recv calls (include sendmmsg and recvmmsg) can be used to send and receive messages from the KCM socket. Socket types ------------- +============ KCM supports SOCK_DGRAM and SOCK_SEQPACKET socket types. @@ -110,23 +113,23 @@ User interface Creating a multiplexor ---------------------- -A new multiplexor and initial KCM socket is created by a socket call: +A new multiplexor and initial KCM socket is created by a socket call:: socket(AF_KCM, type, protocol) - - type is either SOCK_DGRAM or SOCK_SEQPACKET - - protocol is KCMPROTO_CONNECTED +- type is either SOCK_DGRAM or SOCK_SEQPACKET +- protocol is KCMPROTO_CONNECTED Cloning KCM sockets ------------------- After the first KCM socket is created using the socket call as described above, additional sockets for the multiplexor can be created by cloning -a KCM socket. 
This is accomplished by an ioctl on a KCM socket: +a KCM socket. This is accomplished by an ioctl on a KCM socket:: /* From linux/kcm.h */ struct kcm_clone { - int fd; + int fd; }; struct kcm_clone info; @@ -142,11 +145,11 @@ Attach transport sockets ------------------------ Attaching of transport sockets to a multiplexor is performed by calling an -ioctl on a KCM socket for the multiplexor. e.g.: +ioctl on a KCM socket for the multiplexor. e.g.:: /* From linux/kcm.h */ struct kcm_attach { - int fd; + int fd; int bpf_fd; }; @@ -160,18 +163,19 @@ ioctl on a KCM socket for the multiplexor. e.g.: ioctl(kcmfd, SIOCKCMATTACH, &info); The kcm_attach structure contains: - fd: file descriptor for TCP socket being attached - bpf_prog_fd: file descriptor for compiled BPF program downloaded + + - fd: file descriptor for TCP socket being attached + - bpf_fd: file descriptor for compiled BPF program downloaded Unattach transport sockets -------------------------- Unattaching a transport socket from a multiplexor is straightforward. An -"unattach" ioctl is done with the kcm_unattach structure as the argument: +"unattach" ioctl is done with the kcm_unattach structure as the argument:: /* From linux/kcm.h */ struct kcm_unattach { - int fd; + int fd; }; struct kcm_unattach info; @@ -190,7 +194,7 @@ When receive is disabled, any pending messages in the socket's receive buffer are moved to other sockets. This feature is useful if an application thread knows that it will be doing a lot of work on a request and won't be able to service new messages for a -while. Example use: +while. Example use:: int val = 1; @@ -200,7 +204,7 @@ BFP programs for message delineation ------------------------------------ BPF programs can be compiled using the BPF LLVM backend.
For example, -the BPF program for parsing Thrift is: +the BPF program for parsing Thrift is:: #include "bpf.h" /* for __sk_buff */ #include "bpf_helpers.h" /* for load_word intrinsic */ @@ -250,6 +254,7 @@ based on groups, or batches of messages, can be beneficial for performance. On transmit, there are three ways an application can batch (pipeline) messages on a KCM socket. + 1) Send multiple messages in a single sendmmsg. 2) Send a group of messages each with a sendmsg call, where all messages except the last have MSG_BATCH in the flags of sendmsg call. diff --git a/Documentation/networking/ltpc.txt b/Documentation/networking/ltpc.txt index 0bf3220c715b..a005a73b76d0 100644 --- a/Documentation/networking/ltpc.txt +++ b/Documentation/networking/ltpc.txt @@ -99,7 +99,7 @@ treat the LocalTalk device like an ordinary Ethernet device, even if that's what it looks like to Netatalk. Instead, you follow the same procedure as for doing IP in EtherTalk. -See Documentation/networking/ipddp.txt for more information about the +See Documentation/networking/ipddp.rst for more information about the kernel driver and userspace tools needed. -------------------------------------- diff --git a/Documentation/networking/packet_mmap.txt b/Documentation/networking/packet_mmap.txt index 999eb41da81d..494614573c67 100644 --- a/Documentation/networking/packet_mmap.txt +++ b/Documentation/networking/packet_mmap.txt @@ -1051,7 +1051,7 @@ for more information on hardware timestamps. 
------------------------------------------------------------------------------- - Packet sockets work well together with Linux socket filters, thus you also - might want to have a look at Documentation/networking/filter.txt + might want to have a look at Documentation/networking/filter.rst -------------------------------------------------------------------------------- + THANKS diff --git a/Documentation/networking/snmp_counter.rst b/Documentation/networking/snmp_counter.rst index 10e11099e74a..4edd0d38779e 100644 --- a/Documentation/networking/snmp_counter.rst +++ b/Documentation/networking/snmp_counter.rst @@ -792,7 +792,7 @@ counters to indicate the ACK is skipped in which scenario. The ACK would only be skipped if the received packet is either a SYN packet or it has no data. -.. _sysctl document: https://www.kernel.org/doc/Documentation/networking/ip-sysctl.txt +.. _sysctl document: https://www.kernel.org/doc/Documentation/networking/ip-sysctl.rst * TcpExtTCPACKSkippedSynRecv diff --git a/MAINTAINERS b/MAINTAINERS index 453fe0713e68..3a5f52a3c055 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3192,7 +3192,7 @@ Q: https://patchwork.ozlabs.org/project/netdev/list/?delegate=77147 T: git git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf.git T: git git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git F: Documentation/bpf/ -F: Documentation/networking/filter.txt +F: Documentation/networking/filter.rst F: arch/*/net/* F: include/linux/bpf* F: include/linux/filter.h @@ -4728,7 +4728,7 @@ DECnet NETWORK LAYER L: linux-decnet-user@lists.sourceforge.net S: Orphan W: http://linux-decnet.sourceforge.net -F: Documentation/networking/decnet.txt +F: Documentation/networking/decnet.rst F: net/decnet/ DECSTATION PLATFORM SUPPORT @@ -7815,7 +7815,7 @@ HUAWEI ETHERNET DRIVER M: Aviad Krawczyk <aviad.krawczyk@huawei.com> L: netdev@vger.kernel.org S: Supported -F: Documentation/networking/hinic.txt +F: Documentation/networking/hinic.rst F: 
drivers/net/ethernet/huawei/hinic/ HUGETLB FILESYSTEM @@ -8934,7 +8934,7 @@ L: lvs-devel@vger.kernel.org S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/horms/ipvs-next.git T: git git://git.kernel.org/pub/scm/linux/kernel/git/horms/ipvs.git -F: Documentation/networking/ipvs-sysctl.txt +F: Documentation/networking/ipvs-sysctl.rst F: include/net/ip_vs.h F: include/uapi/linux/ip_vs.h F: net/netfilter/ipvs/ diff --git a/drivers/atm/Kconfig b/drivers/atm/Kconfig index 8c37294f1d1e..cfb0d16b60ad 100644 --- a/drivers/atm/Kconfig +++ b/drivers/atm/Kconfig @@ -306,7 +306,7 @@ config ATM_IA for more info about the cards. Say Y (or M to compile as a module named iphase) here if you have one of these cards. - See the file <file:Documentation/networking/iphase.txt> for further + See the file <file:Documentation/networking/iphase.rst> for further details. config ATM_IA_DEBUG @@ -336,7 +336,7 @@ config ATM_FORE200E on PCI and SBUS hosts. Say Y (or M to compile as a module named fore_200e) here if you have one of these ATM adapters. - See the file <file:Documentation/networking/fore200e.txt> for + See the file <file:Documentation/networking/fore200e.rst> for further details. config ATM_FORE200E_USE_TASKLET diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig index b103fbdd0f68..c822f4a6d166 100644 --- a/drivers/net/Kconfig +++ b/drivers/net/Kconfig @@ -50,7 +50,7 @@ config BONDING The driver supports multiple bonding modes to allow for both high performance and high availability operation. - Refer to <file:Documentation/networking/bonding.txt> for more + Refer to <file:Documentation/networking/bonding.rst> for more information. To compile this driver as a module, choose M here: the module @@ -126,7 +126,7 @@ config EQUALIZER Linux driver or with a Livingston Portmaster 2e. Say Y if you want this and read - <file:Documentation/networking/eql.txt>. You may also want to read + <file:Documentation/networking/eql.rst>. 
You may also want to read section 6.2 of the NET-3-HOWTO, available from <http://www.tldp.org/docs.html#howto>. diff --git a/drivers/net/appletalk/Kconfig b/drivers/net/appletalk/Kconfig index af509b05ac5c..ccde6479050c 100644 --- a/drivers/net/appletalk/Kconfig +++ b/drivers/net/appletalk/Kconfig @@ -59,7 +59,7 @@ config COPS package. This driver is experimental, which means that it may not work. This driver will only work if you choose "AppleTalk DDP" networking support, above. - Please read the file <file:Documentation/networking/cops.txt>. + Please read the file <file:Documentation/networking/cops.rst>. config COPS_DAYNA bool "Dayna firmware support" @@ -86,7 +86,7 @@ config IPDDP box is stuck on an AppleTalk only network) or decapsulate (e.g. if you want your Linux box to act as an Internet gateway for a zoo of AppleTalk connected Macs). Please see the file - <file:Documentation/networking/ipddp.txt> for more information. + <file:Documentation/networking/ipddp.rst> for more information. If you say Y here, the AppleTalk-IP support will be compiled into the kernel. In this case, you can either use encapsulation or @@ -107,4 +107,4 @@ config IPDDP_ENCAP IP packets inside AppleTalk frames; this is useful if your Linux box is stuck on an AppleTalk network (which hopefully contains a decapsulator somewhere). Please see - <file:Documentation/networking/ipddp.txt> for more information. + <file:Documentation/networking/ipddp.rst> for more information. diff --git a/drivers/net/arcnet/Kconfig b/drivers/net/arcnet/Kconfig index 27551bf3d7e4..43eef60653b2 100644 --- a/drivers/net/arcnet/Kconfig +++ b/drivers/net/arcnet/Kconfig @@ -9,7 +9,7 @@ menuconfig ARCNET ---help--- If you have a network card of this type, say Y and check out the (arguably) beautiful poetry in - <file:Documentation/networking/arcnet.txt>. + <file:Documentation/networking/arcnet.rst>. You need both this driver, and the driver for the particular ARCnet chipset of your card. 
If you don't know, then it's probably a @@ -28,7 +28,7 @@ config ARCNET_1201 arc0 device. You need to say Y here to communicate with industry-standard RFC1201 implementations, like the arcether.com packet driver or most DOS/Windows ODI drivers. Please read the - ARCnet documentation in <file:Documentation/networking/arcnet.txt> + ARCnet documentation in <file:Documentation/networking/arcnet.rst> for more information about using arc0. config ARCNET_1051 @@ -42,7 +42,7 @@ config ARCNET_1051 industry-standard RFC1201 implementations, like the arcether.com packet driver or most DOS/Windows ODI drivers. RFC1201 is included automatically as the arc0 device. Please read the ARCnet - documentation in <file:Documentation/networking/arcnet.txt> for more + documentation in <file:Documentation/networking/arcnet.rst> for more information about using arc0e and arc0s. config ARCNET_RAW diff --git a/drivers/net/caif/Kconfig b/drivers/net/caif/Kconfig index 661c25eb1c46..1538ad194cf4 100644 --- a/drivers/net/caif/Kconfig +++ b/drivers/net/caif/Kconfig @@ -28,7 +28,7 @@ config CAIF_SPI_SLAVE The CAIF Link layer SPI Protocol driver for Slave SPI interface. This driver implements a platform driver to accommodate for a platform specific SPI device. A sample CAIF SPI Platform device is - provided in <file:Documentation/networking/caif/spi_porting.txt>. + provided in <file:Documentation/networking/caif/spi_porting.rst>. 
config CAIF_SPI_SYNC bool "Next command and length in start of frame" diff --git a/drivers/net/ethernet/atheros/ag71xx.c b/drivers/net/ethernet/atheros/ag71xx.c index 02b7705393ca..112edbd30823 100644 --- a/drivers/net/ethernet/atheros/ag71xx.c +++ b/drivers/net/ethernet/atheros/ag71xx.c @@ -871,13 +871,40 @@ static void ag71xx_mac_validate(struct phylink_config *config, unsigned long *supported, struct phylink_link_state *state) { + struct ag71xx *ag = netdev_priv(to_net_dev(config->dev)); __ETHTOOL_DECLARE_LINK_MODE_MASK(mask) = { 0, }; - if (state->interface != PHY_INTERFACE_MODE_NA && - state->interface != PHY_INTERFACE_MODE_GMII && - state->interface != PHY_INTERFACE_MODE_MII) { - bitmap_zero(supported, __ETHTOOL_LINK_MODE_MASK_NBITS); - return; + switch (state->interface) { + case PHY_INTERFACE_MODE_NA: + break; + case PHY_INTERFACE_MODE_MII: + if ((ag71xx_is(ag, AR9330) && ag->mac_idx == 0) || + ag71xx_is(ag, AR9340) || + ag71xx_is(ag, QCA9530) || + (ag71xx_is(ag, QCA9550) && ag->mac_idx == 1)) + break; + goto unsupported; + case PHY_INTERFACE_MODE_GMII: + if ((ag71xx_is(ag, AR9330) && ag->mac_idx == 1) || + (ag71xx_is(ag, AR9340) && ag->mac_idx == 1) || + (ag71xx_is(ag, QCA9530) && ag->mac_idx == 1)) + break; + goto unsupported; + case PHY_INTERFACE_MODE_SGMII: + if (ag71xx_is(ag, QCA9550) && ag->mac_idx == 0) + break; + goto unsupported; + case PHY_INTERFACE_MODE_RMII: + if (ag71xx_is(ag, AR9340) && ag->mac_idx == 0) + break; + goto unsupported; + case PHY_INTERFACE_MODE_RGMII: + if ((ag71xx_is(ag, AR9340) && ag->mac_idx == 0) || + (ag71xx_is(ag, QCA9550) && ag->mac_idx == 1)) + break; + goto unsupported; + default: + goto unsupported; } phylink_set(mask, MII); @@ -889,6 +916,8 @@ static void ag71xx_mac_validate(struct phylink_config *config, phylink_set(mask, 100baseT_Full); if (state->interface == PHY_INTERFACE_MODE_NA || + state->interface == PHY_INTERFACE_MODE_SGMII || + state->interface == PHY_INTERFACE_MODE_RGMII || state->interface == 
PHY_INTERFACE_MODE_GMII) { phylink_set(mask, 1000baseT_Full); phylink_set(mask, 1000baseX_Full); @@ -898,6 +927,10 @@ static void ag71xx_mac_validate(struct phylink_config *config, __ETHTOOL_LINK_MODE_MASK_NBITS); bitmap_and(state->advertising, state->advertising, mask, __ETHTOOL_LINK_MODE_MASK_NBITS); + + return; +unsupported: + bitmap_zero(supported, __ETHTOOL_LINK_MODE_MASK_NBITS); } static void ag71xx_mac_pcs_get_state(struct phylink_config *config, diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth-debugfs.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth-debugfs.c index 80291afff3ea..0a31e4268dfb 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth-debugfs.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth-debugfs.c @@ -139,7 +139,7 @@ static int dpaa2_dbg_ch_show(struct seq_file *file, void *offset) ch->stats.dequeue_portal_busy, ch->stats.frames, ch->stats.cdan, - ch->stats.frames / ch->stats.cdan, + div64_u64(ch->stats.frames, ch->stats.cdan), ch->buf_count); } diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.h b/drivers/net/ethernet/hisilicon/hns3/hnae3.h index 6291aa9f06b0..5602bf226687 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hnae3.h +++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.h @@ -374,6 +374,8 @@ struct hnae3_ae_dev { * Set the max tx rate of specified vf. * set_vf_mac * Configure the default MAC for specified VF + * get_module_eeprom + * Get the optical module eeprom info. 
*/ struct hnae3_ae_ops { int (*init_ae_dev)(struct hnae3_ae_dev *ae_dev); @@ -548,6 +550,8 @@ struct hnae3_ae_ops { int (*set_vf_rate)(struct hnae3_handle *handle, int vf, int min_tx_rate, int max_tx_rate, bool force); int (*set_vf_mac)(struct hnae3_handle *handle, int vf, u8 *p); + int (*get_module_eeprom)(struct hnae3_handle *handle, u32 offset, + u32 len, u8 *data); }; struct hnae3_dcb_ops { diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c index 4d9c85f049dc..1a105f2f87a4 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c @@ -4,6 +4,7 @@ #include <linux/etherdevice.h> #include <linux/string.h> #include <linux/phy.h> +#include <linux/sfp.h> #include "hns3_enet.h" @@ -12,6 +13,11 @@ struct hns3_stats { int stats_offset; }; +struct hns3_sfp_type { + u8 type; + u8 ext_type; +}; + /* tqp related stats */ #define HNS3_TQP_STAT(_string, _member) { \ .stats_string = _string, \ @@ -1386,6 +1392,73 @@ static int hns3_set_fecparam(struct net_device *netdev, return ops->set_fec(handle, fec_mode); } +static int hns3_get_module_info(struct net_device *netdev, + struct ethtool_modinfo *modinfo) +{ +#define HNS3_SFF_8636_V1_3 0x03 + + struct hnae3_handle *handle = hns3_get_handle(netdev); + const struct hnae3_ae_ops *ops = handle->ae_algo->ops; + struct hns3_sfp_type sfp_type; + int ret; + + if (handle->pdev->revision == 0x20 || !ops->get_module_eeprom) + return -EOPNOTSUPP; + + memset(&sfp_type, 0, sizeof(sfp_type)); + ret = ops->get_module_eeprom(handle, 0, sizeof(sfp_type) / sizeof(u8), + (u8 *)&sfp_type); + if (ret) + return ret; + + switch (sfp_type.type) { + case SFF8024_ID_SFP: + modinfo->type = ETH_MODULE_SFF_8472; + modinfo->eeprom_len = ETH_MODULE_SFF_8472_LEN; + break; + case SFF8024_ID_QSFP_8438: + modinfo->type = ETH_MODULE_SFF_8436; + modinfo->eeprom_len = ETH_MODULE_SFF_8436_MAX_LEN; + break; + case SFF8024_ID_QSFP_8436_8636: + 
if (sfp_type.ext_type < HNS3_SFF_8636_V1_3) { + modinfo->type = ETH_MODULE_SFF_8436; + modinfo->eeprom_len = ETH_MODULE_SFF_8436_MAX_LEN; + } else { + modinfo->type = ETH_MODULE_SFF_8636; + modinfo->eeprom_len = ETH_MODULE_SFF_8636_MAX_LEN; + } + break; + case SFF8024_ID_QSFP28_8636: + modinfo->type = ETH_MODULE_SFF_8636; + modinfo->eeprom_len = ETH_MODULE_SFF_8636_MAX_LEN; + break; + default: + netdev_err(netdev, "Optical module unknown: %#x\n", + sfp_type.type); + return -EINVAL; + } + + return 0; +} + +static int hns3_get_module_eeprom(struct net_device *netdev, + struct ethtool_eeprom *ee, u8 *data) +{ + struct hnae3_handle *handle = hns3_get_handle(netdev); + const struct hnae3_ae_ops *ops = handle->ae_algo->ops; + + if (handle->pdev->revision == 0x20 || !ops->get_module_eeprom) + return -EOPNOTSUPP; + + if (!ee->len) + return -EINVAL; + + memset(data, 0, ee->len); + + return ops->get_module_eeprom(handle, ee->offset, ee->len, data); +} + #define HNS3_ETHTOOL_COALESCE (ETHTOOL_COALESCE_USECS | \ ETHTOOL_COALESCE_USE_ADAPTIVE | \ ETHTOOL_COALESCE_RX_USECS_HIGH | \ @@ -1449,6 +1522,8 @@ static const struct ethtool_ops hns3_ethtool_ops = { .set_msglevel = hns3_set_msglevel, .get_fecparam = hns3_get_fecparam, .set_fecparam = hns3_set_fecparam, + .get_module_info = hns3_get_module_info, + .get_module_eeprom = hns3_get_module_eeprom, }; void hns3_ethtool_set_ops(struct net_device *netdev) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h index 90e422efe590..9a9d752aedc5 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h @@ -270,6 +270,8 @@ enum hclge_opcode_type { HCLGE_OPC_M7_COMPAT_CFG = 0x701A, /* SFP command */ + HCLGE_OPC_GET_SFP_EEPROM = 0x7100, + HCLGE_OPC_GET_SFP_EXIST = 0x7101, HCLGE_OPC_GET_SFP_INFO = 0x7104, /* Error INT commands */ @@ -1054,6 +1056,19 @@ struct hclge_firmware_compat_cmd { u8 rsv[20]; }; 
+#define HCLGE_SFP_INFO_CMD_NUM 6 +#define HCLGE_SFP_INFO_BD0_LEN 20 +#define HCLGE_SFP_INFO_BDX_LEN 24 +#define HCLGE_SFP_INFO_MAX_LEN \ + (HCLGE_SFP_INFO_BD0_LEN + \ + (HCLGE_SFP_INFO_CMD_NUM - 1) * HCLGE_SFP_INFO_BDX_LEN) + +struct hclge_sfp_info_bd0_cmd { + __le16 offset; + __le16 read_len; + u8 data[HCLGE_SFP_INFO_BD0_LEN]; +}; + int hclge_cmd_init(struct hclge_dev *hdev); static inline void hclge_write_reg(void __iomem *base, u32 reg, u32 value) { diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index e2fec832fdf0..71a54ddb51f5 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -11119,6 +11119,107 @@ static void hclge_sync_promisc_mode(struct hclge_dev *hdev) } } +static bool hclge_module_existed(struct hclge_dev *hdev) +{ + struct hclge_desc desc; + u32 existed; + int ret; + + hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_GET_SFP_EXIST, true); + ret = hclge_cmd_send(&hdev->hw, &desc, 1); + if (ret) { + dev_err(&hdev->pdev->dev, + "failed to get SFP exist state, ret = %d\n", ret); + return false; + } + + existed = le32_to_cpu(desc.data[0]); + + return existed != 0; +} + +/* need 6 bds(total 140 bytes) in one reading + * return the number of bytes actually read, 0 means read failed. + */ +static u16 hclge_get_sfp_eeprom_info(struct hclge_dev *hdev, u32 offset, + u32 len, u8 *data) +{ + struct hclge_desc desc[HCLGE_SFP_INFO_CMD_NUM]; + struct hclge_sfp_info_bd0_cmd *sfp_info_bd0; + u16 read_len; + u16 copy_len; + int ret; + int i; + + /* setup all 6 bds to read module eeprom info. */ + for (i = 0; i < HCLGE_SFP_INFO_CMD_NUM; i++) { + hclge_cmd_setup_basic_desc(&desc[i], HCLGE_OPC_GET_SFP_EEPROM, + true); + + /* bd0~bd4 need next flag */ + if (i < HCLGE_SFP_INFO_CMD_NUM - 1) + desc[i].flag |= cpu_to_le16(HCLGE_CMD_FLAG_NEXT); + } + + /* setup bd0, this bd contains offset and read length. 
*/ + sfp_info_bd0 = (struct hclge_sfp_info_bd0_cmd *)desc[0].data; + sfp_info_bd0->offset = cpu_to_le16((u16)offset); + read_len = min_t(u16, len, HCLGE_SFP_INFO_MAX_LEN); + sfp_info_bd0->read_len = cpu_to_le16(read_len); + + ret = hclge_cmd_send(&hdev->hw, desc, i); + if (ret) { + dev_err(&hdev->pdev->dev, + "failed to get SFP eeprom info, ret = %d\n", ret); + return 0; + } + + /* copy sfp info from bd0 to out buffer. */ + copy_len = min_t(u16, len, HCLGE_SFP_INFO_BD0_LEN); + memcpy(data, sfp_info_bd0->data, copy_len); + read_len = copy_len; + + /* copy sfp info from bd1~bd5 to out buffer if needed. */ + for (i = 1; i < HCLGE_SFP_INFO_CMD_NUM; i++) { + if (read_len >= len) + return read_len; + + copy_len = min_t(u16, len - read_len, HCLGE_SFP_INFO_BDX_LEN); + memcpy(data + read_len, desc[i].data, copy_len); + read_len += copy_len; + } + + return read_len; +} + +static int hclge_get_module_eeprom(struct hnae3_handle *handle, u32 offset, + u32 len, u8 *data) +{ + struct hclge_vport *vport = hclge_get_vport(handle); + struct hclge_dev *hdev = vport->back; + u32 read_len = 0; + u16 data_len; + + if (hdev->hw.mac.media_type != HNAE3_MEDIA_TYPE_FIBER) + return -EOPNOTSUPP; + + if (!hclge_module_existed(hdev)) + return -ENXIO; + + while (read_len < len) { + data_len = hclge_get_sfp_eeprom_info(hdev, + offset + read_len, + len - read_len, + data + read_len); + if (!data_len) + return -EIO; + + read_len += data_len; + } + + return 0; +} + static const struct hnae3_ae_ops hclge_ops = { .init_ae_dev = hclge_init_ae_dev, .uninit_ae_dev = hclge_uninit_ae_dev, @@ -11211,6 +11312,7 @@ static const struct hnae3_ae_ops hclge_ops = { .set_vf_trust = hclge_set_vf_trust, .set_vf_rate = hclge_set_vf_rate, .set_vf_mac = hclge_set_vf_mac, + .get_module_eeprom = hclge_get_module_eeprom, }; static struct hnae3_ae_algo ae_algo = { diff --git a/drivers/net/ethernet/mellanox/mlxsw/Makefile b/drivers/net/ethernet/mellanox/mlxsw/Makefile index 0e86a581d45b..4aeabb35c943 100644 --- 
a/drivers/net/ethernet/mellanox/mlxsw/Makefile +++ b/drivers/net/ethernet/mellanox/mlxsw/Makefile @@ -21,6 +21,7 @@ mlxsw_spectrum-objs := spectrum.o spectrum_buffers.o \ spectrum_acl_atcam.o spectrum_acl_erp.o \ spectrum1_acl_tcam.o spectrum2_acl_tcam.o \ spectrum_acl_bloom_filter.o spectrum_acl.o \ + spectrum_flow.o spectrum_matchall.o \ spectrum_flower.o spectrum_cnt.o \ spectrum_fid.o spectrum_ipip.o \ spectrum_acl_flex_actions.o \ diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index 24ca8d5bc564..f78bde8bc16e 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -25,9 +25,7 @@ #include <linux/log2.h> #include <net/switchdev.h> #include <net/pkt_cls.h> -#include <net/tc_act/tc_mirred.h> #include <net/netevent.h> -#include <net/tc_act/tc_sample.h> #include <net/addrconf.h> #include "spectrum.h" @@ -582,16 +580,6 @@ static int mlxsw_sp_base_mac_get(struct mlxsw_sp *mlxsw_sp) return 0; } -static int mlxsw_sp_port_sample_set(struct mlxsw_sp_port *mlxsw_sp_port, - bool enable, u32 rate) -{ - struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; - char mpsc_pl[MLXSW_REG_MPSC_LEN]; - - mlxsw_reg_mpsc_pack(mpsc_pl, mlxsw_sp_port->local_port, enable, rate); - return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(mpsc), mpsc_pl); -} - static int mlxsw_sp_port_admin_status_set(struct mlxsw_sp_port *mlxsw_sp_port, bool is_up) { @@ -1362,412 +1350,6 @@ static int mlxsw_sp_port_kill_vid(struct net_device *dev, return 0; } -static struct mlxsw_sp_port_mall_tc_entry * -mlxsw_sp_port_mall_tc_entry_find(struct mlxsw_sp_port *port, - unsigned long cookie) { - struct mlxsw_sp_port_mall_tc_entry *mall_tc_entry; - - list_for_each_entry(mall_tc_entry, &port->mall_tc_list, list) - if (mall_tc_entry->cookie == cookie) - return mall_tc_entry; - - return NULL; -} - -static int -mlxsw_sp_port_add_cls_matchall_mirror(struct mlxsw_sp_port *mlxsw_sp_port, - struct 
mlxsw_sp_port_mall_mirror_tc_entry *mirror, - const struct flow_action_entry *act, - bool ingress) -{ - enum mlxsw_sp_span_type span_type; - - if (!act->dev) { - netdev_err(mlxsw_sp_port->dev, "Could not find requested device\n"); - return -EINVAL; - } - - mirror->ingress = ingress; - span_type = ingress ? MLXSW_SP_SPAN_INGRESS : MLXSW_SP_SPAN_EGRESS; - return mlxsw_sp_span_mirror_add(mlxsw_sp_port, act->dev, span_type, - true, &mirror->span_id); -} - -static void -mlxsw_sp_port_del_cls_matchall_mirror(struct mlxsw_sp_port *mlxsw_sp_port, - struct mlxsw_sp_port_mall_mirror_tc_entry *mirror) -{ - enum mlxsw_sp_span_type span_type; - - span_type = mirror->ingress ? - MLXSW_SP_SPAN_INGRESS : MLXSW_SP_SPAN_EGRESS; - mlxsw_sp_span_mirror_del(mlxsw_sp_port, mirror->span_id, - span_type, true); -} - -static int -mlxsw_sp_port_add_cls_matchall_sample(struct mlxsw_sp_port *mlxsw_sp_port, - struct tc_cls_matchall_offload *cls, - const struct flow_action_entry *act, - bool ingress) -{ - int err; - - if (!mlxsw_sp_port->sample) - return -EOPNOTSUPP; - if (rtnl_dereference(mlxsw_sp_port->sample->psample_group)) { - netdev_err(mlxsw_sp_port->dev, "sample already active\n"); - return -EEXIST; - } - if (act->sample.rate > MLXSW_REG_MPSC_RATE_MAX) { - netdev_err(mlxsw_sp_port->dev, "sample rate not supported\n"); - return -EOPNOTSUPP; - } - - rcu_assign_pointer(mlxsw_sp_port->sample->psample_group, - act->sample.psample_group); - mlxsw_sp_port->sample->truncate = act->sample.truncate; - mlxsw_sp_port->sample->trunc_size = act->sample.trunc_size; - mlxsw_sp_port->sample->rate = act->sample.rate; - - err = mlxsw_sp_port_sample_set(mlxsw_sp_port, true, act->sample.rate); - if (err) - goto err_port_sample_set; - return 0; - -err_port_sample_set: - RCU_INIT_POINTER(mlxsw_sp_port->sample->psample_group, NULL); - return err; -} - -static void -mlxsw_sp_port_del_cls_matchall_sample(struct mlxsw_sp_port *mlxsw_sp_port) -{ - if (!mlxsw_sp_port->sample) - return; - - 
mlxsw_sp_port_sample_set(mlxsw_sp_port, false, 1); - RCU_INIT_POINTER(mlxsw_sp_port->sample->psample_group, NULL); -} - -static int mlxsw_sp_port_add_cls_matchall(struct mlxsw_sp_port *mlxsw_sp_port, - struct tc_cls_matchall_offload *f, - bool ingress) -{ - struct mlxsw_sp_port_mall_tc_entry *mall_tc_entry; - __be16 protocol = f->common.protocol; - struct flow_action_entry *act; - int err; - - if (!flow_offload_has_one_action(&f->rule->action)) { - netdev_err(mlxsw_sp_port->dev, "only singular actions are supported\n"); - return -EOPNOTSUPP; - } - - mall_tc_entry = kzalloc(sizeof(*mall_tc_entry), GFP_KERNEL); - if (!mall_tc_entry) - return -ENOMEM; - mall_tc_entry->cookie = f->cookie; - - act = &f->rule->action.entries[0]; - - if (act->id == FLOW_ACTION_MIRRED && protocol == htons(ETH_P_ALL)) { - struct mlxsw_sp_port_mall_mirror_tc_entry *mirror; - - mall_tc_entry->type = MLXSW_SP_PORT_MALL_MIRROR; - mirror = &mall_tc_entry->mirror; - err = mlxsw_sp_port_add_cls_matchall_mirror(mlxsw_sp_port, - mirror, act, - ingress); - } else if (act->id == FLOW_ACTION_SAMPLE && - protocol == htons(ETH_P_ALL)) { - mall_tc_entry->type = MLXSW_SP_PORT_MALL_SAMPLE; - err = mlxsw_sp_port_add_cls_matchall_sample(mlxsw_sp_port, f, - act, ingress); - } else { - err = -EOPNOTSUPP; - } - - if (err) - goto err_add_action; - - list_add_tail(&mall_tc_entry->list, &mlxsw_sp_port->mall_tc_list); - return 0; - -err_add_action: - kfree(mall_tc_entry); - return err; -} - -static void mlxsw_sp_port_del_cls_matchall(struct mlxsw_sp_port *mlxsw_sp_port, - struct tc_cls_matchall_offload *f) -{ - struct mlxsw_sp_port_mall_tc_entry *mall_tc_entry; - - mall_tc_entry = mlxsw_sp_port_mall_tc_entry_find(mlxsw_sp_port, - f->cookie); - if (!mall_tc_entry) { - netdev_dbg(mlxsw_sp_port->dev, "tc entry not found on port\n"); - return; - } - list_del(&mall_tc_entry->list); - - switch (mall_tc_entry->type) { - case MLXSW_SP_PORT_MALL_MIRROR: - mlxsw_sp_port_del_cls_matchall_mirror(mlxsw_sp_port, - 
&mall_tc_entry->mirror); - break; - case MLXSW_SP_PORT_MALL_SAMPLE: - mlxsw_sp_port_del_cls_matchall_sample(mlxsw_sp_port); - break; - default: - WARN_ON(1); - } - - kfree(mall_tc_entry); -} - -static int mlxsw_sp_setup_tc_cls_matchall(struct mlxsw_sp_port *mlxsw_sp_port, - struct tc_cls_matchall_offload *f, - bool ingress) -{ - switch (f->command) { - case TC_CLSMATCHALL_REPLACE: - return mlxsw_sp_port_add_cls_matchall(mlxsw_sp_port, f, - ingress); - case TC_CLSMATCHALL_DESTROY: - mlxsw_sp_port_del_cls_matchall(mlxsw_sp_port, f); - return 0; - default: - return -EOPNOTSUPP; - } -} - -static int -mlxsw_sp_setup_tc_cls_flower(struct mlxsw_sp_acl_block *acl_block, - struct flow_cls_offload *f) -{ - struct mlxsw_sp *mlxsw_sp = mlxsw_sp_acl_block_mlxsw_sp(acl_block); - - switch (f->command) { - case FLOW_CLS_REPLACE: - return mlxsw_sp_flower_replace(mlxsw_sp, acl_block, f); - case FLOW_CLS_DESTROY: - mlxsw_sp_flower_destroy(mlxsw_sp, acl_block, f); - return 0; - case FLOW_CLS_STATS: - return mlxsw_sp_flower_stats(mlxsw_sp, acl_block, f); - case FLOW_CLS_TMPLT_CREATE: - return mlxsw_sp_flower_tmplt_create(mlxsw_sp, acl_block, f); - case FLOW_CLS_TMPLT_DESTROY: - mlxsw_sp_flower_tmplt_destroy(mlxsw_sp, acl_block, f); - return 0; - default: - return -EOPNOTSUPP; - } -} - -static int mlxsw_sp_setup_tc_block_cb_matchall(enum tc_setup_type type, - void *type_data, - void *cb_priv, bool ingress) -{ - struct mlxsw_sp_port *mlxsw_sp_port = cb_priv; - - switch (type) { - case TC_SETUP_CLSMATCHALL: - if (!tc_cls_can_offload_and_chain0(mlxsw_sp_port->dev, - type_data)) - return -EOPNOTSUPP; - - return mlxsw_sp_setup_tc_cls_matchall(mlxsw_sp_port, type_data, - ingress); - case TC_SETUP_CLSFLOWER: - return 0; - default: - return -EOPNOTSUPP; - } -} - -static int mlxsw_sp_setup_tc_block_cb_matchall_ig(enum tc_setup_type type, - void *type_data, - void *cb_priv) -{ - return mlxsw_sp_setup_tc_block_cb_matchall(type, type_data, - cb_priv, true); -} - -static int 
mlxsw_sp_setup_tc_block_cb_matchall_eg(enum tc_setup_type type, - void *type_data, - void *cb_priv) -{ - return mlxsw_sp_setup_tc_block_cb_matchall(type, type_data, - cb_priv, false); -} - -static int mlxsw_sp_setup_tc_block_cb_flower(enum tc_setup_type type, - void *type_data, void *cb_priv) -{ - struct mlxsw_sp_acl_block *acl_block = cb_priv; - - switch (type) { - case TC_SETUP_CLSMATCHALL: - return 0; - case TC_SETUP_CLSFLOWER: - if (mlxsw_sp_acl_block_disabled(acl_block)) - return -EOPNOTSUPP; - - return mlxsw_sp_setup_tc_cls_flower(acl_block, type_data); - default: - return -EOPNOTSUPP; - } -} - -static void mlxsw_sp_tc_block_flower_release(void *cb_priv) -{ - struct mlxsw_sp_acl_block *acl_block = cb_priv; - - mlxsw_sp_acl_block_destroy(acl_block); -} - -static LIST_HEAD(mlxsw_sp_block_cb_list); - -static int -mlxsw_sp_setup_tc_block_flower_bind(struct mlxsw_sp_port *mlxsw_sp_port, - struct flow_block_offload *f, bool ingress) -{ - struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; - struct mlxsw_sp_acl_block *acl_block; - struct flow_block_cb *block_cb; - bool register_block = false; - int err; - - block_cb = flow_block_cb_lookup(f->block, - mlxsw_sp_setup_tc_block_cb_flower, - mlxsw_sp); - if (!block_cb) { - acl_block = mlxsw_sp_acl_block_create(mlxsw_sp, f->net); - if (!acl_block) - return -ENOMEM; - block_cb = flow_block_cb_alloc(mlxsw_sp_setup_tc_block_cb_flower, - mlxsw_sp, acl_block, - mlxsw_sp_tc_block_flower_release); - if (IS_ERR(block_cb)) { - mlxsw_sp_acl_block_destroy(acl_block); - err = PTR_ERR(block_cb); - goto err_cb_register; - } - register_block = true; - } else { - acl_block = flow_block_cb_priv(block_cb); - } - flow_block_cb_incref(block_cb); - err = mlxsw_sp_acl_block_bind(mlxsw_sp, acl_block, - mlxsw_sp_port, ingress, f->extack); - if (err) - goto err_block_bind; - - if (ingress) - mlxsw_sp_port->ing_acl_block = acl_block; - else - mlxsw_sp_port->eg_acl_block = acl_block; - - if (register_block) { - flow_block_cb_add(block_cb, f); - 
list_add_tail(&block_cb->driver_list, &mlxsw_sp_block_cb_list); - } - - return 0; - -err_block_bind: - if (!flow_block_cb_decref(block_cb)) - flow_block_cb_free(block_cb); -err_cb_register: - return err; -} - -static void -mlxsw_sp_setup_tc_block_flower_unbind(struct mlxsw_sp_port *mlxsw_sp_port, - struct flow_block_offload *f, bool ingress) -{ - struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; - struct mlxsw_sp_acl_block *acl_block; - struct flow_block_cb *block_cb; - int err; - - block_cb = flow_block_cb_lookup(f->block, - mlxsw_sp_setup_tc_block_cb_flower, - mlxsw_sp); - if (!block_cb) - return; - - if (ingress) - mlxsw_sp_port->ing_acl_block = NULL; - else - mlxsw_sp_port->eg_acl_block = NULL; - - acl_block = flow_block_cb_priv(block_cb); - err = mlxsw_sp_acl_block_unbind(mlxsw_sp, acl_block, - mlxsw_sp_port, ingress); - if (!err && !flow_block_cb_decref(block_cb)) { - flow_block_cb_remove(block_cb, f); - list_del(&block_cb->driver_list); - } -} - -static int mlxsw_sp_setup_tc_block(struct mlxsw_sp_port *mlxsw_sp_port, - struct flow_block_offload *f) -{ - struct flow_block_cb *block_cb; - flow_setup_cb_t *cb; - bool ingress; - int err; - - if (f->binder_type == FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS) { - cb = mlxsw_sp_setup_tc_block_cb_matchall_ig; - ingress = true; - } else if (f->binder_type == FLOW_BLOCK_BINDER_TYPE_CLSACT_EGRESS) { - cb = mlxsw_sp_setup_tc_block_cb_matchall_eg; - ingress = false; - } else { - return -EOPNOTSUPP; - } - - f->driver_block_list = &mlxsw_sp_block_cb_list; - - switch (f->command) { - case FLOW_BLOCK_BIND: - if (flow_block_cb_is_busy(cb, mlxsw_sp_port, - &mlxsw_sp_block_cb_list)) - return -EBUSY; - - block_cb = flow_block_cb_alloc(cb, mlxsw_sp_port, - mlxsw_sp_port, NULL); - if (IS_ERR(block_cb)) - return PTR_ERR(block_cb); - err = mlxsw_sp_setup_tc_block_flower_bind(mlxsw_sp_port, f, - ingress); - if (err) { - flow_block_cb_free(block_cb); - return err; - } - flow_block_cb_add(block_cb, f); - 
list_add_tail(&block_cb->driver_list, &mlxsw_sp_block_cb_list); - return 0; - case FLOW_BLOCK_UNBIND: - mlxsw_sp_setup_tc_block_flower_unbind(mlxsw_sp_port, - f, ingress); - block_cb = flow_block_cb_lookup(f->block, cb, mlxsw_sp_port); - if (!block_cb) - return -ENOENT; - - flow_block_cb_remove(block_cb, f); - list_del(&block_cb->driver_list); - return 0; - default: - return -EOPNOTSUPP; - } -} - static int mlxsw_sp_setup_tc(struct net_device *dev, enum tc_setup_type type, void *type_data) { @@ -1791,23 +1373,21 @@ static int mlxsw_sp_setup_tc(struct net_device *dev, enum tc_setup_type type, } } - static int mlxsw_sp_feature_hw_tc(struct net_device *dev, bool enable) { struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev); if (!enable) { - if (mlxsw_sp_acl_block_rule_count(mlxsw_sp_port->ing_acl_block) || - mlxsw_sp_acl_block_rule_count(mlxsw_sp_port->eg_acl_block) || - !list_empty(&mlxsw_sp_port->mall_tc_list)) { + if (mlxsw_sp_flow_block_rule_count(mlxsw_sp_port->ing_flow_block) || + mlxsw_sp_flow_block_rule_count(mlxsw_sp_port->eg_flow_block)) { netdev_err(dev, "Active offloaded tc filters, can't turn hw_tc_offload off\n"); return -EINVAL; } - mlxsw_sp_acl_block_disable_inc(mlxsw_sp_port->ing_acl_block); - mlxsw_sp_acl_block_disable_inc(mlxsw_sp_port->eg_acl_block); + mlxsw_sp_flow_block_disable_inc(mlxsw_sp_port->ing_flow_block); + mlxsw_sp_flow_block_disable_inc(mlxsw_sp_port->eg_flow_block); } else { - mlxsw_sp_acl_block_disable_dec(mlxsw_sp_port->ing_acl_block); - mlxsw_sp_acl_block_disable_dec(mlxsw_sp_port->eg_acl_block); + mlxsw_sp_flow_block_disable_dec(mlxsw_sp_port->ing_flow_block); + mlxsw_sp_flow_block_disable_dec(mlxsw_sp_port->eg_flow_block); } return 0; } @@ -3695,7 +3275,6 @@ static int mlxsw_sp_port_create(struct mlxsw_sp *mlxsw_sp, u8 local_port, mlxsw_sp_port->mapping = *port_mapping; mlxsw_sp_port->link.autoneg = 1; INIT_LIST_HEAD(&mlxsw_sp_port->vlans_list); - INIT_LIST_HEAD(&mlxsw_sp_port->mall_tc_list); mlxsw_sp_port->pcpu_stats = 
netdev_alloc_pcpu_stats(struct mlxsw_sp_port_pcpu_stats); @@ -3704,13 +3283,6 @@ static int mlxsw_sp_port_create(struct mlxsw_sp *mlxsw_sp, u8 local_port, goto err_alloc_stats; } - mlxsw_sp_port->sample = kzalloc(sizeof(*mlxsw_sp_port->sample), - GFP_KERNEL); - if (!mlxsw_sp_port->sample) { - err = -ENOMEM; - goto err_alloc_sample; - } - INIT_DELAYED_WORK(&mlxsw_sp_port->periodic_hw_stats.update_dw, &update_stats_cache); @@ -3897,8 +3469,6 @@ err_dev_addr_init: err_port_swid_set: mlxsw_sp_port_module_unmap(mlxsw_sp_port); err_port_module_map: - kfree(mlxsw_sp_port->sample); -err_alloc_sample: free_percpu(mlxsw_sp_port->pcpu_stats); err_alloc_stats: free_netdev(dev); @@ -3926,7 +3496,6 @@ static void mlxsw_sp_port_remove(struct mlxsw_sp *mlxsw_sp, u8 local_port) mlxsw_sp_port_tc_mc_mode_set(mlxsw_sp_port, false); mlxsw_sp_port_swid_set(mlxsw_sp_port, MLXSW_PORT_SWID_DISABLED_PORT); mlxsw_sp_port_module_unmap(mlxsw_sp_port); - kfree(mlxsw_sp_port->sample); free_percpu(mlxsw_sp_port->pcpu_stats); WARN_ON_ONCE(!list_empty(&mlxsw_sp_port->vlans_list)); free_netdev(mlxsw_sp_port->dev); @@ -4413,7 +3982,7 @@ static void mlxsw_sp_rx_listener_sample_func(struct sk_buff *skb, u8 local_port, { struct mlxsw_sp *mlxsw_sp = priv; struct mlxsw_sp_port *mlxsw_sp_port = mlxsw_sp->ports[local_port]; - struct psample_group *psample_group; + struct mlxsw_sp_port_sample *sample; u32 size; if (unlikely(!mlxsw_sp_port)) { @@ -4421,22 +3990,14 @@ static void mlxsw_sp_rx_listener_sample_func(struct sk_buff *skb, u8 local_port, local_port); goto out; } - if (unlikely(!mlxsw_sp_port->sample)) { - dev_warn_ratelimited(mlxsw_sp->bus_info->dev, "Port %d: sample skb received on unsupported port\n", - local_port); - goto out; - } - - size = mlxsw_sp_port->sample->truncate ? 
- mlxsw_sp_port->sample->trunc_size : skb->len; rcu_read_lock(); - psample_group = rcu_dereference(mlxsw_sp_port->sample->psample_group); - if (!psample_group) + sample = rcu_dereference(mlxsw_sp_port->sample); + if (!sample) goto out_unlock; - psample_sample_packet(psample_group, skb, size, - mlxsw_sp_port->dev->ifindex, 0, - mlxsw_sp_port->sample->rate); + size = sample->truncate ? sample->trunc_size : skb->len; + psample_sample_packet(sample->psample_group, skb, size, + mlxsw_sp_port->dev->ifindex, 0, sample->rate); out_unlock: rcu_read_unlock(); out: diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h index ca56e72cb4b7..a12ca673c224 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h @@ -109,25 +109,6 @@ struct mlxsw_sp_mid { unsigned long *ports_in_mid; /* bits array */ }; -enum mlxsw_sp_port_mall_action_type { - MLXSW_SP_PORT_MALL_MIRROR, - MLXSW_SP_PORT_MALL_SAMPLE, -}; - -struct mlxsw_sp_port_mall_mirror_tc_entry { - int span_id; - bool ingress; -}; - -struct mlxsw_sp_port_mall_tc_entry { - struct list_head list; - unsigned long cookie; - enum mlxsw_sp_port_mall_action_type type; - union { - struct mlxsw_sp_port_mall_mirror_tc_entry mirror; - }; -}; - struct mlxsw_sp_sb; struct mlxsw_sp_bridge; struct mlxsw_sp_router; @@ -211,7 +192,7 @@ struct mlxsw_sp_port_pcpu_stats { }; struct mlxsw_sp_port_sample { - struct psample_group __rcu *psample_group; + struct psample_group *psample_group; u32 trunc_size; u32 rate; bool truncate; @@ -274,21 +255,19 @@ struct mlxsw_sp_port { * the same localport can have * different mapping. 
*/ - /* TC handles */ - struct list_head mall_tc_list; struct { #define MLXSW_HW_STATS_UPDATE_TIME HZ struct rtnl_link_stats64 stats; struct mlxsw_sp_port_xstats xstats; struct delayed_work update_dw; } periodic_hw_stats; - struct mlxsw_sp_port_sample *sample; + struct mlxsw_sp_port_sample __rcu *sample; struct list_head vlans_list; struct mlxsw_sp_port_vlan *default_vlan; struct mlxsw_sp_qdisc_state *qdisc; unsigned acl_rule_count; - struct mlxsw_sp_acl_block *ing_acl_block; - struct mlxsw_sp_acl_block *eg_acl_block; + struct mlxsw_sp_flow_block *ing_flow_block; + struct mlxsw_sp_flow_block *eg_flow_block; struct { struct delayed_work shaper_dw; struct hwtstamp_config hwtstamp_config; @@ -654,17 +633,10 @@ struct mlxsw_sp_acl_rule_info { unsigned int counter_index; }; -struct mlxsw_sp_acl_block; -struct mlxsw_sp_acl_ruleset; - -/* spectrum_acl.c */ -enum mlxsw_sp_acl_profile { - MLXSW_SP_ACL_PROFILE_FLOWER, - MLXSW_SP_ACL_PROFILE_MR, -}; - -struct mlxsw_sp_acl_block { +/* spectrum_flow.c */ +struct mlxsw_sp_flow_block { struct list_head binding_list; + struct list_head mall_list; struct mlxsw_sp_acl_ruleset *ruleset_zero; struct mlxsw_sp *mlxsw_sp; unsigned int rule_count; @@ -676,35 +648,92 @@ struct mlxsw_sp_acl_block { struct net *net; }; +struct mlxsw_sp_flow_block_binding { + struct list_head list; + struct net_device *dev; + struct mlxsw_sp_port *mlxsw_sp_port; + bool ingress; +}; + +static inline struct mlxsw_sp * +mlxsw_sp_flow_block_mlxsw_sp(struct mlxsw_sp_flow_block *block) +{ + return block->mlxsw_sp; +} + +static inline unsigned int +mlxsw_sp_flow_block_rule_count(const struct mlxsw_sp_flow_block *block) +{ + return block ? 
block->rule_count : 0; +} + +static inline void +mlxsw_sp_flow_block_disable_inc(struct mlxsw_sp_flow_block *block) +{ + if (block) + block->disable_count++; +} + +static inline void +mlxsw_sp_flow_block_disable_dec(struct mlxsw_sp_flow_block *block) +{ + if (block) + block->disable_count--; +} + +static inline bool +mlxsw_sp_flow_block_disabled(const struct mlxsw_sp_flow_block *block) +{ + return block->disable_count; +} + +static inline bool +mlxsw_sp_flow_block_is_egress_bound(const struct mlxsw_sp_flow_block *block) +{ + return block->egress_binding_count; +} + +static inline bool +mlxsw_sp_flow_block_is_ingress_bound(const struct mlxsw_sp_flow_block *block) +{ + return block->ingress_binding_count; +} + +static inline bool +mlxsw_sp_flow_block_is_mixed_bound(const struct mlxsw_sp_flow_block *block) +{ + return block->ingress_binding_count && block->egress_binding_count; +} + +struct mlxsw_sp_flow_block *mlxsw_sp_flow_block_create(struct mlxsw_sp *mlxsw_sp, + struct net *net); +void mlxsw_sp_flow_block_destroy(struct mlxsw_sp_flow_block *block); +int mlxsw_sp_setup_tc_block(struct mlxsw_sp_port *mlxsw_sp_port, + struct flow_block_offload *f); + +/* spectrum_acl.c */ +struct mlxsw_sp_acl_ruleset; + +enum mlxsw_sp_acl_profile { + MLXSW_SP_ACL_PROFILE_FLOWER, + MLXSW_SP_ACL_PROFILE_MR, +}; + struct mlxsw_afk *mlxsw_sp_acl_afk(struct mlxsw_sp_acl *acl); -struct mlxsw_sp *mlxsw_sp_acl_block_mlxsw_sp(struct mlxsw_sp_acl_block *block); -unsigned int -mlxsw_sp_acl_block_rule_count(const struct mlxsw_sp_acl_block *block); -void mlxsw_sp_acl_block_disable_inc(struct mlxsw_sp_acl_block *block); -void mlxsw_sp_acl_block_disable_dec(struct mlxsw_sp_acl_block *block); -bool mlxsw_sp_acl_block_disabled(const struct mlxsw_sp_acl_block *block); -struct mlxsw_sp_acl_block *mlxsw_sp_acl_block_create(struct mlxsw_sp *mlxsw_sp, - struct net *net); -void mlxsw_sp_acl_block_destroy(struct mlxsw_sp_acl_block *block); -int mlxsw_sp_acl_block_bind(struct mlxsw_sp *mlxsw_sp, - struct 
mlxsw_sp_acl_block *block, - struct mlxsw_sp_port *mlxsw_sp_port, - bool ingress, - struct netlink_ext_ack *extack); -int mlxsw_sp_acl_block_unbind(struct mlxsw_sp *mlxsw_sp, - struct mlxsw_sp_acl_block *block, - struct mlxsw_sp_port *mlxsw_sp_port, - bool ingress); -bool mlxsw_sp_acl_block_is_egress_bound(const struct mlxsw_sp_acl_block *block); -bool mlxsw_sp_acl_block_is_ingress_bound(const struct mlxsw_sp_acl_block *block); -bool mlxsw_sp_acl_block_is_mixed_bound(const struct mlxsw_sp_acl_block *block); + +int mlxsw_sp_acl_ruleset_bind(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_flow_block *block, + struct mlxsw_sp_flow_block_binding *binding); +void mlxsw_sp_acl_ruleset_unbind(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_flow_block *block, + struct mlxsw_sp_flow_block_binding *binding); struct mlxsw_sp_acl_ruleset * mlxsw_sp_acl_ruleset_lookup(struct mlxsw_sp *mlxsw_sp, - struct mlxsw_sp_acl_block *block, u32 chain_index, + struct mlxsw_sp_flow_block *block, u32 chain_index, enum mlxsw_sp_acl_profile profile); struct mlxsw_sp_acl_ruleset * mlxsw_sp_acl_ruleset_get(struct mlxsw_sp *mlxsw_sp, - struct mlxsw_sp_acl_block *block, u32 chain_index, + struct mlxsw_sp_flow_block *block, u32 chain_index, enum mlxsw_sp_acl_profile profile, struct mlxsw_afk_element_usage *tmplt_elusage); void mlxsw_sp_acl_ruleset_put(struct mlxsw_sp *mlxsw_sp, @@ -736,7 +765,7 @@ int mlxsw_sp_acl_rulei_act_drop(struct mlxsw_sp_acl_rule_info *rulei, int mlxsw_sp_acl_rulei_act_trap(struct mlxsw_sp_acl_rule_info *rulei); int mlxsw_sp_acl_rulei_act_mirror(struct mlxsw_sp *mlxsw_sp, struct mlxsw_sp_acl_rule_info *rulei, - struct mlxsw_sp_acl_block *block, + struct mlxsw_sp_flow_block *block, struct net_device *out_dev, struct netlink_ext_ack *extack); int mlxsw_sp_acl_rulei_act_fwd(struct mlxsw_sp *mlxsw_sp, @@ -857,21 +886,31 @@ extern const struct mlxsw_afa_ops mlxsw_sp2_act_afa_ops; extern const struct mlxsw_afk_ops mlxsw_sp1_afk_ops; extern const struct mlxsw_afk_ops mlxsw_sp2_afk_ops; 
+/* spectrum_matchall.c */ +int mlxsw_sp_mall_replace(struct mlxsw_sp_flow_block *block, + struct tc_cls_matchall_offload *f); +void mlxsw_sp_mall_destroy(struct mlxsw_sp_flow_block *block, + struct tc_cls_matchall_offload *f); +int mlxsw_sp_mall_port_bind(struct mlxsw_sp_flow_block *block, + struct mlxsw_sp_port *mlxsw_sp_port); +void mlxsw_sp_mall_port_unbind(struct mlxsw_sp_flow_block *block, + struct mlxsw_sp_port *mlxsw_sp_port); + /* spectrum_flower.c */ int mlxsw_sp_flower_replace(struct mlxsw_sp *mlxsw_sp, - struct mlxsw_sp_acl_block *block, + struct mlxsw_sp_flow_block *block, struct flow_cls_offload *f); void mlxsw_sp_flower_destroy(struct mlxsw_sp *mlxsw_sp, - struct mlxsw_sp_acl_block *block, + struct mlxsw_sp_flow_block *block, struct flow_cls_offload *f); int mlxsw_sp_flower_stats(struct mlxsw_sp *mlxsw_sp, - struct mlxsw_sp_acl_block *block, + struct mlxsw_sp_flow_block *block, struct flow_cls_offload *f); int mlxsw_sp_flower_tmplt_create(struct mlxsw_sp *mlxsw_sp, - struct mlxsw_sp_acl_block *block, + struct mlxsw_sp_flow_block *block, struct flow_cls_offload *f); void mlxsw_sp_flower_tmplt_destroy(struct mlxsw_sp *mlxsw_sp, - struct mlxsw_sp_acl_block *block, + struct mlxsw_sp_flow_block *block, struct flow_cls_offload *f); /* spectrum_qdisc.c */ diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum2_mr_tcam.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum2_mr_tcam.c index e31ec75ac035..a11d911302f1 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum2_mr_tcam.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum2_mr_tcam.c @@ -9,7 +9,7 @@ struct mlxsw_sp2_mr_tcam { struct mlxsw_sp *mlxsw_sp; - struct mlxsw_sp_acl_block *acl_block; + struct mlxsw_sp_flow_block *flow_block; struct mlxsw_sp_acl_ruleset *ruleset4; struct mlxsw_sp_acl_ruleset *ruleset6; }; @@ -61,7 +61,7 @@ static int mlxsw_sp2_mr_tcam_ipv4_init(struct mlxsw_sp2_mr_tcam *mr_tcam) mlxsw_sp2_mr_tcam_usage_ipv4, ARRAY_SIZE(mlxsw_sp2_mr_tcam_usage_ipv4)); mr_tcam->ruleset4 = 
mlxsw_sp_acl_ruleset_get(mr_tcam->mlxsw_sp, - mr_tcam->acl_block, + mr_tcam->flow_block, MLXSW_SP_L3_PROTO_IPV4, MLXSW_SP_ACL_PROFILE_MR, &elusage); @@ -111,7 +111,7 @@ static int mlxsw_sp2_mr_tcam_ipv6_init(struct mlxsw_sp2_mr_tcam *mr_tcam) mlxsw_sp2_mr_tcam_usage_ipv6, ARRAY_SIZE(mlxsw_sp2_mr_tcam_usage_ipv6)); mr_tcam->ruleset6 = mlxsw_sp_acl_ruleset_get(mr_tcam->mlxsw_sp, - mr_tcam->acl_block, + mr_tcam->flow_block, MLXSW_SP_L3_PROTO_IPV6, MLXSW_SP_ACL_PROFILE_MR, &elusage); @@ -289,8 +289,8 @@ static int mlxsw_sp2_mr_tcam_init(struct mlxsw_sp *mlxsw_sp, void *priv) int err; mr_tcam->mlxsw_sp = mlxsw_sp; - mr_tcam->acl_block = mlxsw_sp_acl_block_create(mlxsw_sp, NULL); - if (!mr_tcam->acl_block) + mr_tcam->flow_block = mlxsw_sp_flow_block_create(mlxsw_sp, NULL); + if (!mr_tcam->flow_block) return -ENOMEM; err = mlxsw_sp2_mr_tcam_ipv4_init(mr_tcam); @@ -306,7 +306,7 @@ static int mlxsw_sp2_mr_tcam_init(struct mlxsw_sp *mlxsw_sp, void *priv) err_ipv6_init: mlxsw_sp2_mr_tcam_ipv4_fini(mr_tcam); err_ipv4_init: - mlxsw_sp_acl_block_destroy(mr_tcam->acl_block); + mlxsw_sp_flow_block_destroy(mr_tcam->flow_block); return err; } @@ -316,7 +316,7 @@ static void mlxsw_sp2_mr_tcam_fini(void *priv) mlxsw_sp2_mr_tcam_ipv6_fini(mr_tcam); mlxsw_sp2_mr_tcam_ipv4_fini(mr_tcam); - mlxsw_sp_acl_block_destroy(mr_tcam->acl_block); + mlxsw_sp_flow_block_destroy(mr_tcam->flow_block); } const struct mlxsw_sp_mr_tcam_ops mlxsw_sp2_mr_tcam_ops = { diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl.c index 01cff711bbd2..c61f78e30397 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl.c @@ -40,15 +40,8 @@ struct mlxsw_afk *mlxsw_sp_acl_afk(struct mlxsw_sp_acl *acl) return acl->afk; } -struct mlxsw_sp_acl_block_binding { - struct list_head list; - struct net_device *dev; - struct mlxsw_sp_port *mlxsw_sp_port; - bool ingress; -}; - struct 
mlxsw_sp_acl_ruleset_ht_key { - struct mlxsw_sp_acl_block *block; + struct mlxsw_sp_flow_block *block; u32 chain_index; const struct mlxsw_sp_acl_profile_ops *ops; }; @@ -94,49 +87,6 @@ struct mlxsw_sp_fid *mlxsw_sp_acl_dummy_fid(struct mlxsw_sp *mlxsw_sp) return mlxsw_sp->acl->dummy_fid; } -struct mlxsw_sp *mlxsw_sp_acl_block_mlxsw_sp(struct mlxsw_sp_acl_block *block) -{ - return block->mlxsw_sp; -} - -unsigned int -mlxsw_sp_acl_block_rule_count(const struct mlxsw_sp_acl_block *block) -{ - return block ? block->rule_count : 0; -} - -void mlxsw_sp_acl_block_disable_inc(struct mlxsw_sp_acl_block *block) -{ - if (block) - block->disable_count++; -} - -void mlxsw_sp_acl_block_disable_dec(struct mlxsw_sp_acl_block *block) -{ - if (block) - block->disable_count--; -} - -bool mlxsw_sp_acl_block_disabled(const struct mlxsw_sp_acl_block *block) -{ - return block->disable_count; -} - -bool mlxsw_sp_acl_block_is_egress_bound(const struct mlxsw_sp_acl_block *block) -{ - return block->egress_binding_count; -} - -bool mlxsw_sp_acl_block_is_ingress_bound(const struct mlxsw_sp_acl_block *block) -{ - return block->ingress_binding_count; -} - -bool mlxsw_sp_acl_block_is_mixed_bound(const struct mlxsw_sp_acl_block *block) -{ - return block->ingress_binding_count && block->egress_binding_count; -} - static bool mlxsw_sp_acl_ruleset_is_singular(const struct mlxsw_sp_acl_ruleset *ruleset) { @@ -144,10 +94,9 @@ mlxsw_sp_acl_ruleset_is_singular(const struct mlxsw_sp_acl_ruleset *ruleset) return ruleset->ref_count == 2; } -static int -mlxsw_sp_acl_ruleset_bind(struct mlxsw_sp *mlxsw_sp, - struct mlxsw_sp_acl_block *block, - struct mlxsw_sp_acl_block_binding *binding) +int mlxsw_sp_acl_ruleset_bind(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_flow_block *block, + struct mlxsw_sp_flow_block_binding *binding) { struct mlxsw_sp_acl_ruleset *ruleset = block->ruleset_zero; const struct mlxsw_sp_acl_profile_ops *ops = ruleset->ht_key.ops; @@ -156,10 +105,9 @@ mlxsw_sp_acl_ruleset_bind(struct 
mlxsw_sp *mlxsw_sp, binding->mlxsw_sp_port, binding->ingress); } -static void -mlxsw_sp_acl_ruleset_unbind(struct mlxsw_sp *mlxsw_sp, - struct mlxsw_sp_acl_block *block, - struct mlxsw_sp_acl_block_binding *binding) +void mlxsw_sp_acl_ruleset_unbind(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_flow_block *block, + struct mlxsw_sp_flow_block_binding *binding) { struct mlxsw_sp_acl_ruleset *ruleset = block->ruleset_zero; const struct mlxsw_sp_acl_profile_ops *ops = ruleset->ht_key.ops; @@ -168,18 +116,12 @@ mlxsw_sp_acl_ruleset_unbind(struct mlxsw_sp *mlxsw_sp, binding->mlxsw_sp_port, binding->ingress); } -static bool -mlxsw_sp_acl_ruleset_block_bound(const struct mlxsw_sp_acl_block *block) -{ - return block->ruleset_zero; -} - static int mlxsw_sp_acl_ruleset_block_bind(struct mlxsw_sp *mlxsw_sp, struct mlxsw_sp_acl_ruleset *ruleset, - struct mlxsw_sp_acl_block *block) + struct mlxsw_sp_flow_block *block) { - struct mlxsw_sp_acl_block_binding *binding; + struct mlxsw_sp_flow_block_binding *binding; int err; block->ruleset_zero = ruleset; @@ -202,122 +144,18 @@ rollback: static void mlxsw_sp_acl_ruleset_block_unbind(struct mlxsw_sp *mlxsw_sp, struct mlxsw_sp_acl_ruleset *ruleset, - struct mlxsw_sp_acl_block *block) + struct mlxsw_sp_flow_block *block) { - struct mlxsw_sp_acl_block_binding *binding; + struct mlxsw_sp_flow_block_binding *binding; list_for_each_entry(binding, &block->binding_list, list) mlxsw_sp_acl_ruleset_unbind(mlxsw_sp, block, binding); block->ruleset_zero = NULL; } -struct mlxsw_sp_acl_block *mlxsw_sp_acl_block_create(struct mlxsw_sp *mlxsw_sp, - struct net *net) -{ - struct mlxsw_sp_acl_block *block; - - block = kzalloc(sizeof(*block), GFP_KERNEL); - if (!block) - return NULL; - INIT_LIST_HEAD(&block->binding_list); - block->mlxsw_sp = mlxsw_sp; - block->net = net; - return block; -} - -void mlxsw_sp_acl_block_destroy(struct mlxsw_sp_acl_block *block) -{ - WARN_ON(!list_empty(&block->binding_list)); - kfree(block); -} - -static struct 
mlxsw_sp_acl_block_binding * -mlxsw_sp_acl_block_lookup(struct mlxsw_sp_acl_block *block, - struct mlxsw_sp_port *mlxsw_sp_port, bool ingress) -{ - struct mlxsw_sp_acl_block_binding *binding; - - list_for_each_entry(binding, &block->binding_list, list) - if (binding->mlxsw_sp_port == mlxsw_sp_port && - binding->ingress == ingress) - return binding; - return NULL; -} - -int mlxsw_sp_acl_block_bind(struct mlxsw_sp *mlxsw_sp, - struct mlxsw_sp_acl_block *block, - struct mlxsw_sp_port *mlxsw_sp_port, - bool ingress, - struct netlink_ext_ack *extack) -{ - struct mlxsw_sp_acl_block_binding *binding; - int err; - - if (WARN_ON(mlxsw_sp_acl_block_lookup(block, mlxsw_sp_port, ingress))) - return -EEXIST; - - if (ingress && block->ingress_blocker_rule_count) { - NL_SET_ERR_MSG_MOD(extack, "Block cannot be bound to ingress because it contains unsupported rules"); - return -EOPNOTSUPP; - } - - if (!ingress && block->egress_blocker_rule_count) { - NL_SET_ERR_MSG_MOD(extack, "Block cannot be bound to egress because it contains unsupported rules"); - return -EOPNOTSUPP; - } - - binding = kzalloc(sizeof(*binding), GFP_KERNEL); - if (!binding) - return -ENOMEM; - binding->mlxsw_sp_port = mlxsw_sp_port; - binding->ingress = ingress; - - if (mlxsw_sp_acl_ruleset_block_bound(block)) { - err = mlxsw_sp_acl_ruleset_bind(mlxsw_sp, block, binding); - if (err) - goto err_ruleset_bind; - } - - if (ingress) - block->ingress_binding_count++; - else - block->egress_binding_count++; - list_add(&binding->list, &block->binding_list); - return 0; - -err_ruleset_bind: - kfree(binding); - return err; -} - -int mlxsw_sp_acl_block_unbind(struct mlxsw_sp *mlxsw_sp, - struct mlxsw_sp_acl_block *block, - struct mlxsw_sp_port *mlxsw_sp_port, - bool ingress) -{ - struct mlxsw_sp_acl_block_binding *binding; - - binding = mlxsw_sp_acl_block_lookup(block, mlxsw_sp_port, ingress); - if (!binding) - return -ENOENT; - - list_del(&binding->list); - - if (ingress) - block->ingress_binding_count--; - else - 
block->egress_binding_count--; - - if (mlxsw_sp_acl_ruleset_block_bound(block)) - mlxsw_sp_acl_ruleset_unbind(mlxsw_sp, block, binding); - - kfree(binding); - return 0; -} - static struct mlxsw_sp_acl_ruleset * mlxsw_sp_acl_ruleset_create(struct mlxsw_sp *mlxsw_sp, - struct mlxsw_sp_acl_block *block, u32 chain_index, + struct mlxsw_sp_flow_block *block, u32 chain_index, const struct mlxsw_sp_acl_profile_ops *ops, struct mlxsw_afk_element_usage *tmplt_elusage) { @@ -388,7 +226,7 @@ static void mlxsw_sp_acl_ruleset_ref_dec(struct mlxsw_sp *mlxsw_sp, static struct mlxsw_sp_acl_ruleset * __mlxsw_sp_acl_ruleset_lookup(struct mlxsw_sp_acl *acl, - struct mlxsw_sp_acl_block *block, u32 chain_index, + struct mlxsw_sp_flow_block *block, u32 chain_index, const struct mlxsw_sp_acl_profile_ops *ops) { struct mlxsw_sp_acl_ruleset_ht_key ht_key; @@ -403,7 +241,7 @@ __mlxsw_sp_acl_ruleset_lookup(struct mlxsw_sp_acl *acl, struct mlxsw_sp_acl_ruleset * mlxsw_sp_acl_ruleset_lookup(struct mlxsw_sp *mlxsw_sp, - struct mlxsw_sp_acl_block *block, u32 chain_index, + struct mlxsw_sp_flow_block *block, u32 chain_index, enum mlxsw_sp_acl_profile profile) { const struct mlxsw_sp_acl_profile_ops *ops; @@ -421,7 +259,7 @@ mlxsw_sp_acl_ruleset_lookup(struct mlxsw_sp *mlxsw_sp, struct mlxsw_sp_acl_ruleset * mlxsw_sp_acl_ruleset_get(struct mlxsw_sp *mlxsw_sp, - struct mlxsw_sp_acl_block *block, u32 chain_index, + struct mlxsw_sp_flow_block *block, u32 chain_index, enum mlxsw_sp_acl_profile profile, struct mlxsw_afk_element_usage *tmplt_elusage) { @@ -584,11 +422,11 @@ int mlxsw_sp_acl_rulei_act_fwd(struct mlxsw_sp *mlxsw_sp, int mlxsw_sp_acl_rulei_act_mirror(struct mlxsw_sp *mlxsw_sp, struct mlxsw_sp_acl_rule_info *rulei, - struct mlxsw_sp_acl_block *block, + struct mlxsw_sp_flow_block *block, struct net_device *out_dev, struct netlink_ext_ack *extack) { - struct mlxsw_sp_acl_block_binding *binding; + struct mlxsw_sp_flow_block_binding *binding; struct mlxsw_sp_port *in_port; if 
(!list_is_singular(&block->binding_list)) { @@ -596,7 +434,7 @@ int mlxsw_sp_acl_rulei_act_mirror(struct mlxsw_sp *mlxsw_sp, return -EOPNOTSUPP; } binding = list_first_entry(&block->binding_list, - struct mlxsw_sp_acl_block_binding, list); + struct mlxsw_sp_flow_block_binding, list); in_port = binding->mlxsw_sp_port; return mlxsw_afa_block_append_mirror(rulei->act_block, @@ -818,7 +656,7 @@ int mlxsw_sp_acl_rule_add(struct mlxsw_sp *mlxsw_sp, { struct mlxsw_sp_acl_ruleset *ruleset = rule->ruleset; const struct mlxsw_sp_acl_profile_ops *ops = ruleset->ht_key.ops; - struct mlxsw_sp_acl_block *block = ruleset->ht_key.block; + struct mlxsw_sp_flow_block *block = ruleset->ht_key.block; int err; err = ops->rule_add(mlxsw_sp, ruleset->priv, rule->priv, rule->rulei); @@ -862,18 +700,17 @@ void mlxsw_sp_acl_rule_del(struct mlxsw_sp *mlxsw_sp, { struct mlxsw_sp_acl_ruleset *ruleset = rule->ruleset; const struct mlxsw_sp_acl_profile_ops *ops = ruleset->ht_key.ops; - struct mlxsw_sp_acl_block *block = ruleset->ht_key.block; + struct mlxsw_sp_flow_block *block = ruleset->ht_key.block; block->egress_blocker_rule_count -= rule->rulei->egress_bind_blocker; block->ingress_blocker_rule_count -= rule->rulei->ingress_bind_blocker; - ruleset->ht_key.block->rule_count--; + block->rule_count--; mutex_lock(&mlxsw_sp->acl->rules_lock); list_del(&rule->list); mutex_unlock(&mlxsw_sp->acl->rules_lock); if (!ruleset->ht_key.chain_index && mlxsw_sp_acl_ruleset_is_singular(ruleset)) - mlxsw_sp_acl_ruleset_block_unbind(mlxsw_sp, ruleset, - ruleset->ht_key.block); + mlxsw_sp_acl_ruleset_block_unbind(mlxsw_sp, ruleset, block); rhashtable_remove_fast(&ruleset->rule_ht, &rule->ht_node, mlxsw_sp_acl_rule_ht_params); ops->rule_del(mlxsw_sp, rule->priv); diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_flow.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_flow.c new file mode 100644 index 000000000000..ecab581ff956 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_flow.c @@ -0,0 
+1,303 @@ +// SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0 +/* Copyright (c) 2017-2020 Mellanox Technologies. All rights reserved */ + +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/errno.h> +#include <linux/list.h> +#include <net/net_namespace.h> + +#include "spectrum.h" + +struct mlxsw_sp_flow_block * +mlxsw_sp_flow_block_create(struct mlxsw_sp *mlxsw_sp, struct net *net) +{ + struct mlxsw_sp_flow_block *block; + + block = kzalloc(sizeof(*block), GFP_KERNEL); + if (!block) + return NULL; + INIT_LIST_HEAD(&block->binding_list); + INIT_LIST_HEAD(&block->mall_list); + block->mlxsw_sp = mlxsw_sp; + block->net = net; + return block; +} + +void mlxsw_sp_flow_block_destroy(struct mlxsw_sp_flow_block *block) +{ + WARN_ON(!list_empty(&block->binding_list)); + kfree(block); +} + +static struct mlxsw_sp_flow_block_binding * +mlxsw_sp_flow_block_lookup(struct mlxsw_sp_flow_block *block, + struct mlxsw_sp_port *mlxsw_sp_port, bool ingress) +{ + struct mlxsw_sp_flow_block_binding *binding; + + list_for_each_entry(binding, &block->binding_list, list) + if (binding->mlxsw_sp_port == mlxsw_sp_port && + binding->ingress == ingress) + return binding; + return NULL; +} + +static bool +mlxsw_sp_flow_block_ruleset_bound(const struct mlxsw_sp_flow_block *block) +{ + return block->ruleset_zero; +} + +static int mlxsw_sp_flow_block_bind(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_flow_block *block, + struct mlxsw_sp_port *mlxsw_sp_port, + bool ingress, + struct netlink_ext_ack *extack) +{ + struct mlxsw_sp_flow_block_binding *binding; + int err; + + if (WARN_ON(mlxsw_sp_flow_block_lookup(block, mlxsw_sp_port, ingress))) + return -EEXIST; + + if (ingress && block->ingress_blocker_rule_count) { + NL_SET_ERR_MSG_MOD(extack, "Block cannot be bound to ingress because it contains unsupported rules"); + return -EOPNOTSUPP; + } + + if (!ingress && block->egress_blocker_rule_count) { + NL_SET_ERR_MSG_MOD(extack, "Block cannot be bound to egress because it contains 
unsupported rules"); + return -EOPNOTSUPP; + } + + err = mlxsw_sp_mall_port_bind(block, mlxsw_sp_port); + if (err) + return err; + + binding = kzalloc(sizeof(*binding), GFP_KERNEL); + if (!binding) { + err = -ENOMEM; + goto err_binding_alloc; + } + binding->mlxsw_sp_port = mlxsw_sp_port; + binding->ingress = ingress; + + if (mlxsw_sp_flow_block_ruleset_bound(block)) { + err = mlxsw_sp_acl_ruleset_bind(mlxsw_sp, block, binding); + if (err) + goto err_ruleset_bind; + } + + if (ingress) + block->ingress_binding_count++; + else + block->egress_binding_count++; + list_add(&binding->list, &block->binding_list); + return 0; + +err_ruleset_bind: + kfree(binding); +err_binding_alloc: + mlxsw_sp_mall_port_unbind(block, mlxsw_sp_port); + + return err; +} + +static int mlxsw_sp_flow_block_unbind(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_flow_block *block, + struct mlxsw_sp_port *mlxsw_sp_port, + bool ingress) +{ + struct mlxsw_sp_flow_block_binding *binding; + + binding = mlxsw_sp_flow_block_lookup(block, mlxsw_sp_port, ingress); + if (!binding) + return -ENOENT; + + list_del(&binding->list); + + if (ingress) + block->ingress_binding_count--; + else + block->egress_binding_count--; + + if (mlxsw_sp_flow_block_ruleset_bound(block)) + mlxsw_sp_acl_ruleset_unbind(mlxsw_sp, block, binding); + + kfree(binding); + + mlxsw_sp_mall_port_unbind(block, mlxsw_sp_port); + + return 0; +} + +static int mlxsw_sp_flow_block_mall_cb(struct mlxsw_sp_flow_block *flow_block, + struct tc_cls_matchall_offload *f) +{ + switch (f->command) { + case TC_CLSMATCHALL_REPLACE: + return mlxsw_sp_mall_replace(flow_block, f); + case TC_CLSMATCHALL_DESTROY: + mlxsw_sp_mall_destroy(flow_block, f); + return 0; + default: + return -EOPNOTSUPP; + } +} + +static int mlxsw_sp_flow_block_flower_cb(struct mlxsw_sp_flow_block *flow_block, + struct flow_cls_offload *f) +{ + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_flow_block_mlxsw_sp(flow_block); + + switch (f->command) { + case FLOW_CLS_REPLACE: + return 
mlxsw_sp_flower_replace(mlxsw_sp, flow_block, f); + case FLOW_CLS_DESTROY: + mlxsw_sp_flower_destroy(mlxsw_sp, flow_block, f); + return 0; + case FLOW_CLS_STATS: + return mlxsw_sp_flower_stats(mlxsw_sp, flow_block, f); + case FLOW_CLS_TMPLT_CREATE: + return mlxsw_sp_flower_tmplt_create(mlxsw_sp, flow_block, f); + case FLOW_CLS_TMPLT_DESTROY: + mlxsw_sp_flower_tmplt_destroy(mlxsw_sp, flow_block, f); + return 0; + default: + return -EOPNOTSUPP; + } +} + +static int mlxsw_sp_flow_block_cb(enum tc_setup_type type, + void *type_data, void *cb_priv) +{ + struct mlxsw_sp_flow_block *flow_block = cb_priv; + + if (mlxsw_sp_flow_block_disabled(flow_block)) + return -EOPNOTSUPP; + + switch (type) { + case TC_SETUP_CLSMATCHALL: + return mlxsw_sp_flow_block_mall_cb(flow_block, type_data); + case TC_SETUP_CLSFLOWER: + return mlxsw_sp_flow_block_flower_cb(flow_block, type_data); + default: + return -EOPNOTSUPP; + } +} + +static void mlxsw_sp_tc_block_release(void *cb_priv) +{ + struct mlxsw_sp_flow_block *flow_block = cb_priv; + + mlxsw_sp_flow_block_destroy(flow_block); +} + +static LIST_HEAD(mlxsw_sp_block_cb_list); + +static int mlxsw_sp_setup_tc_block_bind(struct mlxsw_sp_port *mlxsw_sp_port, + struct flow_block_offload *f, + bool ingress) +{ + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + struct mlxsw_sp_flow_block *flow_block; + struct flow_block_cb *block_cb; + bool register_block = false; + int err; + + block_cb = flow_block_cb_lookup(f->block, mlxsw_sp_flow_block_cb, + mlxsw_sp); + if (!block_cb) { + flow_block = mlxsw_sp_flow_block_create(mlxsw_sp, f->net); + if (!flow_block) + return -ENOMEM; + block_cb = flow_block_cb_alloc(mlxsw_sp_flow_block_cb, + mlxsw_sp, flow_block, + mlxsw_sp_tc_block_release); + if (IS_ERR(block_cb)) { + mlxsw_sp_flow_block_destroy(flow_block); + err = PTR_ERR(block_cb); + goto err_cb_register; + } + register_block = true; + } else { + flow_block = flow_block_cb_priv(block_cb); + } + flow_block_cb_incref(block_cb); + err = 
mlxsw_sp_flow_block_bind(mlxsw_sp, flow_block, + mlxsw_sp_port, ingress, f->extack); + if (err) + goto err_block_bind; + + if (ingress) + mlxsw_sp_port->ing_flow_block = flow_block; + else + mlxsw_sp_port->eg_flow_block = flow_block; + + if (register_block) { + flow_block_cb_add(block_cb, f); + list_add_tail(&block_cb->driver_list, &mlxsw_sp_block_cb_list); + } + + return 0; + +err_block_bind: + if (!flow_block_cb_decref(block_cb)) + flow_block_cb_free(block_cb); +err_cb_register: + return err; +} + +static void mlxsw_sp_setup_tc_block_unbind(struct mlxsw_sp_port *mlxsw_sp_port, + struct flow_block_offload *f, + bool ingress) +{ + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + struct mlxsw_sp_flow_block *flow_block; + struct flow_block_cb *block_cb; + int err; + + block_cb = flow_block_cb_lookup(f->block, mlxsw_sp_flow_block_cb, + mlxsw_sp); + if (!block_cb) + return; + + if (ingress) + mlxsw_sp_port->ing_flow_block = NULL; + else + mlxsw_sp_port->eg_flow_block = NULL; + + flow_block = flow_block_cb_priv(block_cb); + err = mlxsw_sp_flow_block_unbind(mlxsw_sp, flow_block, + mlxsw_sp_port, ingress); + if (!err && !flow_block_cb_decref(block_cb)) { + flow_block_cb_remove(block_cb, f); + list_del(&block_cb->driver_list); + } +} + +int mlxsw_sp_setup_tc_block(struct mlxsw_sp_port *mlxsw_sp_port, + struct flow_block_offload *f) +{ + bool ingress; + + if (f->binder_type == FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS) + ingress = true; + else if (f->binder_type == FLOW_BLOCK_BINDER_TYPE_CLSACT_EGRESS) + ingress = false; + else + return -EOPNOTSUPP; + + f->driver_block_list = &mlxsw_sp_block_cb_list; + + switch (f->command) { + case FLOW_BLOCK_BIND: + return mlxsw_sp_setup_tc_block_bind(mlxsw_sp_port, f, ingress); + case FLOW_BLOCK_UNBIND: + mlxsw_sp_setup_tc_block_unbind(mlxsw_sp_port, f, ingress); + return 0; + default: + return -EOPNOTSUPP; + } +} diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c 
index 51117a5a6bbf..89c2e9820e95 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c @@ -15,7 +15,7 @@ #include "core_acl_flex_keys.h" static int mlxsw_sp_flower_parse_actions(struct mlxsw_sp *mlxsw_sp, - struct mlxsw_sp_acl_block *block, + struct mlxsw_sp_flow_block *block, struct mlxsw_sp_acl_rule_info *rulei, struct flow_action *flow_action, struct netlink_ext_ack *extack) @@ -53,11 +53,11 @@ static int mlxsw_sp_flower_parse_actions(struct mlxsw_sp *mlxsw_sp, case FLOW_ACTION_DROP: { bool ingress; - if (mlxsw_sp_acl_block_is_mixed_bound(block)) { + if (mlxsw_sp_flow_block_is_mixed_bound(block)) { NL_SET_ERR_MSG_MOD(extack, "Drop action is not supported when block is bound to ingress and egress"); return -EOPNOTSUPP; } - ingress = mlxsw_sp_acl_block_is_ingress_bound(block); + ingress = mlxsw_sp_flow_block_is_ingress_bound(block); err = mlxsw_sp_acl_rulei_act_drop(rulei, ingress, act->cookie, extack); if (err) { @@ -106,7 +106,7 @@ static int mlxsw_sp_flower_parse_actions(struct mlxsw_sp *mlxsw_sp, struct mlxsw_sp_fid *fid; u16 fid_index; - if (mlxsw_sp_acl_block_is_egress_bound(block)) { + if (mlxsw_sp_flow_block_is_egress_bound(block)) { NL_SET_ERR_MSG_MOD(extack, "Redirect action is not supported on egress"); return -EOPNOTSUPP; } @@ -190,7 +190,7 @@ static int mlxsw_sp_flower_parse_actions(struct mlxsw_sp *mlxsw_sp, static int mlxsw_sp_flower_parse_meta(struct mlxsw_sp_acl_rule_info *rulei, struct flow_cls_offload *f, - struct mlxsw_sp_acl_block *block) + struct mlxsw_sp_flow_block *block) { struct flow_rule *rule = flow_cls_offload_flow_rule(f); struct mlxsw_sp_port *mlxsw_sp_port; @@ -371,7 +371,7 @@ static int mlxsw_sp_flower_parse_ip(struct mlxsw_sp *mlxsw_sp, } static int mlxsw_sp_flower_parse(struct mlxsw_sp *mlxsw_sp, - struct mlxsw_sp_acl_block *block, + struct mlxsw_sp_flow_block *block, struct mlxsw_sp_acl_rule_info *rulei, struct flow_cls_offload *f) { @@ -460,7 +460,7 @@ 
static int mlxsw_sp_flower_parse(struct mlxsw_sp *mlxsw_sp, struct flow_match_vlan match; flow_rule_match_vlan(rule, &match); - if (mlxsw_sp_acl_block_is_egress_bound(block)) { + if (mlxsw_sp_flow_block_is_egress_bound(block)) { NL_SET_ERR_MSG_MOD(f->common.extack, "vlan_id key is not supported on egress"); return -EOPNOTSUPP; } @@ -505,7 +505,7 @@ static int mlxsw_sp_flower_parse(struct mlxsw_sp *mlxsw_sp, } int mlxsw_sp_flower_replace(struct mlxsw_sp *mlxsw_sp, - struct mlxsw_sp_acl_block *block, + struct mlxsw_sp_flow_block *block, struct flow_cls_offload *f) { struct mlxsw_sp_acl_rule_info *rulei; @@ -552,7 +552,7 @@ err_rule_create: } void mlxsw_sp_flower_destroy(struct mlxsw_sp *mlxsw_sp, - struct mlxsw_sp_acl_block *block, + struct mlxsw_sp_flow_block *block, struct flow_cls_offload *f) { struct mlxsw_sp_acl_ruleset *ruleset; @@ -574,7 +574,7 @@ void mlxsw_sp_flower_destroy(struct mlxsw_sp *mlxsw_sp, } int mlxsw_sp_flower_stats(struct mlxsw_sp *mlxsw_sp, - struct mlxsw_sp_acl_block *block, + struct mlxsw_sp_flow_block *block, struct flow_cls_offload *f) { enum flow_action_hw_stats used_hw_stats = FLOW_ACTION_HW_STATS_DISABLED; @@ -611,7 +611,7 @@ err_rule_get_stats: } int mlxsw_sp_flower_tmplt_create(struct mlxsw_sp *mlxsw_sp, - struct mlxsw_sp_acl_block *block, + struct mlxsw_sp_flow_block *block, struct flow_cls_offload *f) { struct mlxsw_sp_acl_ruleset *ruleset; @@ -632,7 +632,7 @@ int mlxsw_sp_flower_tmplt_create(struct mlxsw_sp *mlxsw_sp, } void mlxsw_sp_flower_tmplt_destroy(struct mlxsw_sp *mlxsw_sp, - struct mlxsw_sp_acl_block *block, + struct mlxsw_sp_flow_block *block, struct flow_cls_offload *f) { struct mlxsw_sp_acl_ruleset *ruleset; diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_matchall.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_matchall.c new file mode 100644 index 000000000000..889da63072be --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_matchall.c @@ -0,0 +1,278 @@ +// SPDX-License-Identifier: BSD-3-Clause OR 
GPL-2.0 +/* Copyright (c) 2017-2020 Mellanox Technologies. All rights reserved */ + +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/netdevice.h> +#include <net/flow_offload.h> + +#include "spectrum.h" +#include "spectrum_span.h" +#include "reg.h" + +enum mlxsw_sp_mall_action_type { + MLXSW_SP_MALL_ACTION_TYPE_MIRROR, + MLXSW_SP_MALL_ACTION_TYPE_SAMPLE, +}; + +struct mlxsw_sp_mall_mirror_entry { + const struct net_device *to_dev; + int span_id; +}; + +struct mlxsw_sp_mall_entry { + struct list_head list; + unsigned long cookie; + enum mlxsw_sp_mall_action_type type; + bool ingress; + union { + struct mlxsw_sp_mall_mirror_entry mirror; + struct mlxsw_sp_port_sample sample; + }; + struct rcu_head rcu; +}; + +static struct mlxsw_sp_mall_entry * +mlxsw_sp_mall_entry_find(struct mlxsw_sp_flow_block *block, unsigned long cookie) +{ + struct mlxsw_sp_mall_entry *mall_entry; + + list_for_each_entry(mall_entry, &block->mall_list, list) + if (mall_entry->cookie == cookie) + return mall_entry; + + return NULL; +} + +static int +mlxsw_sp_mall_port_mirror_add(struct mlxsw_sp_port *mlxsw_sp_port, + struct mlxsw_sp_mall_entry *mall_entry) +{ + enum mlxsw_sp_span_type span_type; + + if (!mall_entry->mirror.to_dev) { + netdev_err(mlxsw_sp_port->dev, "Could not find requested device\n"); + return -EINVAL; + } + + span_type = mall_entry->ingress ? MLXSW_SP_SPAN_INGRESS : + MLXSW_SP_SPAN_EGRESS; + return mlxsw_sp_span_mirror_add(mlxsw_sp_port, + mall_entry->mirror.to_dev, + span_type, true, + &mall_entry->mirror.span_id); +} + +static void +mlxsw_sp_mall_port_mirror_del(struct mlxsw_sp_port *mlxsw_sp_port, + struct mlxsw_sp_mall_entry *mall_entry) +{ + enum mlxsw_sp_span_type span_type; + + span_type = mall_entry->ingress ? 
MLXSW_SP_SPAN_INGRESS : + MLXSW_SP_SPAN_EGRESS; + mlxsw_sp_span_mirror_del(mlxsw_sp_port, mall_entry->mirror.span_id, + span_type, true); +} + +static int mlxsw_sp_mall_port_sample_set(struct mlxsw_sp_port *mlxsw_sp_port, + bool enable, u32 rate) +{ + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + char mpsc_pl[MLXSW_REG_MPSC_LEN]; + + mlxsw_reg_mpsc_pack(mpsc_pl, mlxsw_sp_port->local_port, enable, rate); + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(mpsc), mpsc_pl); +} + +static int +mlxsw_sp_mall_port_sample_add(struct mlxsw_sp_port *mlxsw_sp_port, + struct mlxsw_sp_mall_entry *mall_entry) +{ + int err; + + if (rtnl_dereference(mlxsw_sp_port->sample)) { + netdev_err(mlxsw_sp_port->dev, "sample already active\n"); + return -EEXIST; + } + rcu_assign_pointer(mlxsw_sp_port->sample, &mall_entry->sample); + + err = mlxsw_sp_mall_port_sample_set(mlxsw_sp_port, true, + mall_entry->sample.rate); + if (err) + goto err_port_sample_set; + return 0; + +err_port_sample_set: + RCU_INIT_POINTER(mlxsw_sp_port->sample, NULL); + return err; +} + +static void +mlxsw_sp_mall_port_sample_del(struct mlxsw_sp_port *mlxsw_sp_port) +{ + if (!mlxsw_sp_port->sample) + return; + + mlxsw_sp_mall_port_sample_set(mlxsw_sp_port, false, 1); + RCU_INIT_POINTER(mlxsw_sp_port->sample, NULL); +} + +static int +mlxsw_sp_mall_port_rule_add(struct mlxsw_sp_port *mlxsw_sp_port, + struct mlxsw_sp_mall_entry *mall_entry) +{ + switch (mall_entry->type) { + case MLXSW_SP_MALL_ACTION_TYPE_MIRROR: + return mlxsw_sp_mall_port_mirror_add(mlxsw_sp_port, mall_entry); + case MLXSW_SP_MALL_ACTION_TYPE_SAMPLE: + return mlxsw_sp_mall_port_sample_add(mlxsw_sp_port, mall_entry); + default: + WARN_ON(1); + return -EINVAL; + } +} + +static void +mlxsw_sp_mall_port_rule_del(struct mlxsw_sp_port *mlxsw_sp_port, + struct mlxsw_sp_mall_entry *mall_entry) +{ + switch (mall_entry->type) { + case MLXSW_SP_MALL_ACTION_TYPE_MIRROR: + mlxsw_sp_mall_port_mirror_del(mlxsw_sp_port, mall_entry); + break; + case 
MLXSW_SP_MALL_ACTION_TYPE_SAMPLE: + mlxsw_sp_mall_port_sample_del(mlxsw_sp_port); + break; + default: + WARN_ON(1); + } +} + +int mlxsw_sp_mall_replace(struct mlxsw_sp_flow_block *block, + struct tc_cls_matchall_offload *f) +{ + struct mlxsw_sp_flow_block_binding *binding; + struct mlxsw_sp_mall_entry *mall_entry; + __be16 protocol = f->common.protocol; + struct flow_action_entry *act; + int err; + + if (!flow_offload_has_one_action(&f->rule->action)) { + NL_SET_ERR_MSG(f->common.extack, "Only singular actions are supported"); + return -EOPNOTSUPP; + } + + if (f->common.chain_index) { + NL_SET_ERR_MSG(f->common.extack, "Only chain 0 is supported"); + return -EOPNOTSUPP; + } + + if (mlxsw_sp_flow_block_is_mixed_bound(block)) { + NL_SET_ERR_MSG(f->common.extack, "Only not mixed bound blocks are supported"); + return -EOPNOTSUPP; + } + + mall_entry = kzalloc(sizeof(*mall_entry), GFP_KERNEL); + if (!mall_entry) + return -ENOMEM; + mall_entry->cookie = f->cookie; + mall_entry->ingress = mlxsw_sp_flow_block_is_ingress_bound(block); + + act = &f->rule->action.entries[0]; + + if (act->id == FLOW_ACTION_MIRRED && protocol == htons(ETH_P_ALL)) { + mall_entry->type = MLXSW_SP_MALL_ACTION_TYPE_MIRROR; + mall_entry->mirror.to_dev = act->dev; + } else if (act->id == FLOW_ACTION_SAMPLE && + protocol == htons(ETH_P_ALL)) { + if (act->sample.rate > MLXSW_REG_MPSC_RATE_MAX) { + NL_SET_ERR_MSG(f->common.extack, "Sample rate not supported"); + err = -EOPNOTSUPP; + goto errout; + } + mall_entry->type = MLXSW_SP_MALL_ACTION_TYPE_SAMPLE; + mall_entry->sample.psample_group = act->sample.psample_group; + mall_entry->sample.truncate = act->sample.truncate; + mall_entry->sample.trunc_size = act->sample.trunc_size; + mall_entry->sample.rate = act->sample.rate; + } else { + err = -EOPNOTSUPP; + goto errout; + } + + list_for_each_entry(binding, &block->binding_list, list) { + err = mlxsw_sp_mall_port_rule_add(binding->mlxsw_sp_port, + mall_entry); + if (err) + goto rollback; + } + + 
block->rule_count++; + if (mall_entry->ingress) + block->egress_blocker_rule_count++; + else + block->ingress_blocker_rule_count++; + list_add_tail(&mall_entry->list, &block->mall_list); + return 0; + +rollback: + list_for_each_entry_continue_reverse(binding, &block->binding_list, + list) + mlxsw_sp_mall_port_rule_del(binding->mlxsw_sp_port, mall_entry); +errout: + kfree(mall_entry); + return err; +} + +void mlxsw_sp_mall_destroy(struct mlxsw_sp_flow_block *block, + struct tc_cls_matchall_offload *f) +{ + struct mlxsw_sp_flow_block_binding *binding; + struct mlxsw_sp_mall_entry *mall_entry; + + mall_entry = mlxsw_sp_mall_entry_find(block, f->cookie); + if (!mall_entry) { + NL_SET_ERR_MSG(f->common.extack, "Entry not found"); + return; + } + + list_del(&mall_entry->list); + if (mall_entry->ingress) + block->egress_blocker_rule_count--; + else + block->ingress_blocker_rule_count--; + block->rule_count--; + list_for_each_entry(binding, &block->binding_list, list) + mlxsw_sp_mall_port_rule_del(binding->mlxsw_sp_port, mall_entry); + kfree_rcu(mall_entry, rcu); /* sample RX packets may be in-flight */ +} + +int mlxsw_sp_mall_port_bind(struct mlxsw_sp_flow_block *block, + struct mlxsw_sp_port *mlxsw_sp_port) +{ + struct mlxsw_sp_mall_entry *mall_entry; + int err; + + list_for_each_entry(mall_entry, &block->mall_list, list) { + err = mlxsw_sp_mall_port_rule_add(mlxsw_sp_port, mall_entry); + if (err) + goto rollback; + } + return 0; + +rollback: + list_for_each_entry_continue_reverse(mall_entry, &block->mall_list, + list) + mlxsw_sp_mall_port_rule_del(mlxsw_sp_port, mall_entry); + return err; +} + +void mlxsw_sp_mall_port_unbind(struct mlxsw_sp_flow_block *block, + struct mlxsw_sp_port *mlxsw_sp_port) +{ + struct mlxsw_sp_mall_entry *mall_entry; + + list_for_each_entry(mall_entry, &block->mall_list, list) + mlxsw_sp_mall_port_rule_del(mlxsw_sp_port, mall_entry); +} diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index 
4c616701856a..68d5255568a5 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -388,10 +388,12 @@ enum rtl_register_content { /* rx_mode_bits */ AcceptErr = 0x20, AcceptRunt = 0x10, +#define RX_CONFIG_ACCEPT_ERR_MASK 0x30 AcceptBroadcast = 0x08, AcceptMulticast = 0x04, AcceptMyPhys = 0x02, AcceptAllPhys = 0x01, +#define RX_CONFIG_ACCEPT_OK_MASK 0x0f #define RX_CONFIG_ACCEPT_MASK 0x3f /* TxConfigBits */ @@ -1497,19 +1499,15 @@ static netdev_features_t rtl8169_fix_features(struct net_device *dev, return features; } -static int rtl8169_set_features(struct net_device *dev, - netdev_features_t features) +static void rtl_set_rx_config_features(struct rtl8169_private *tp, + netdev_features_t features) { - struct rtl8169_private *tp = netdev_priv(dev); - u32 rx_config; - - rtl_lock_work(tp); + u32 rx_config = RTL_R32(tp, RxConfig); - rx_config = RTL_R32(tp, RxConfig); if (features & NETIF_F_RXALL) - rx_config |= (AcceptErr | AcceptRunt); + rx_config |= RX_CONFIG_ACCEPT_ERR_MASK; else - rx_config &= ~(AcceptErr | AcceptRunt); + rx_config &= ~RX_CONFIG_ACCEPT_ERR_MASK; if (rtl_is_8125(tp)) { if (features & NETIF_F_HW_VLAN_CTAG_RX) @@ -1519,6 +1517,16 @@ static int rtl8169_set_features(struct net_device *dev, } RTL_W32(tp, RxConfig, rx_config); +} + +static int rtl8169_set_features(struct net_device *dev, + netdev_features_t features) +{ + struct rtl8169_private *tp = netdev_priv(dev); + + rtl_lock_work(tp); + + rtl_set_rx_config_features(tp, features); if (features & NETIF_F_RXCSUM) tp->cp_cmd |= RxChkSum; @@ -2395,8 +2403,6 @@ static void rtl_pll_power_up(struct rtl8169_private *tp) static void rtl_init_rxcfg(struct rtl8169_private *tp) { - u32 vlan; - switch (tp->mac_version) { case RTL_GIGA_MAC_VER_02 ... RTL_GIGA_MAC_VER_06: case RTL_GIGA_MAC_VER_10 ... 
RTL_GIGA_MAC_VER_17: @@ -2411,9 +2417,7 @@ static void rtl_init_rxcfg(struct rtl8169_private *tp) RTL_W32(tp, RxConfig, RX128_INT_EN | RX_MULTI_EN | RX_DMA_BURST | RX_EARLY_OFF); break; case RTL_GIGA_MAC_VER_60 ... RTL_GIGA_MAC_VER_61: - /* VLAN flags are controlled by NETIF_F_HW_VLAN_CTAG_RX */ - vlan = RTL_R32(tp, RxConfig) & RX_VLAN_8125; - RTL_W32(tp, RxConfig, vlan | RX_FETCH_DFLT_8125 | RX_DMA_BURST); + RTL_W32(tp, RxConfig, RX_FETCH_DFLT_8125 | RX_DMA_BURST); break; default: RTL_W32(tp, RxConfig, RX128_INT_EN | RX_DMA_BURST); @@ -2680,14 +2684,11 @@ static void rtl_set_rx_mode(struct net_device *dev) } } - if (dev->features & NETIF_F_RXALL) - rx_mode |= (AcceptErr | AcceptRunt); - RTL_W32(tp, MAR0 + 4, mc_filter[1]); RTL_W32(tp, MAR0 + 0, mc_filter[0]); tmp = RTL_R32(tp, RxConfig); - RTL_W32(tp, RxConfig, (tmp & ~RX_CONFIG_ACCEPT_MASK) | rx_mode); + RTL_W32(tp, RxConfig, (tmp & ~RX_CONFIG_ACCEPT_OK_MASK) | rx_mode); } DECLARE_RTL_COND(rtl_csiar_cond) @@ -3845,7 +3846,6 @@ static void rtl_hw_start(struct rtl8169_private *tp) { rtl_unlock_config_regs(tp); - tp->cp_cmd &= CPCMD_MASK; RTL_W16(tp, CPlusCmd, tp->cp_cmd); if (tp->mac_version <= RTL_GIGA_MAC_VER_06) @@ -3867,6 +3867,7 @@ static void rtl_hw_start(struct rtl8169_private *tp) RTL_W8(tp, ChipCmd, CmdTxEnb | CmdRxEnb); rtl_init_rxcfg(tp); rtl_set_tx_config_registers(tp); + rtl_set_rx_config_features(tp, tp->dev->features); rtl_set_rx_mode(tp->dev); rtl_irq_enable(tp); } @@ -5196,7 +5197,8 @@ static int r8169_mdio_register(struct rtl8169_private *tp) /* Most chip versions fail with the genphy driver. * Therefore ensure that the dedicated PHY driver is loaded. 
*/ - dev_err(&pdev->dev, "realtek.ko not loaded, maybe it needs to be added to initramfs?\n"); + dev_err(&pdev->dev, "no dedicated PHY driver found for PHY ID 0x%08x, maybe realtek.ko needs to be added to initramfs?\n", + tp->phydev->phy_id); return -EUNATCH; } @@ -5424,7 +5426,7 @@ static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) tp->mac_version = chipset; - tp->cp_cmd = RTL_R16(tp, CPlusCmd); + tp->cp_cmd = RTL_R16(tp, CPlusCmd) & CPCMD_MASK; if (sizeof(dma_addr_t) > 4 && tp->mac_version >= RTL_GIGA_MAC_VER_18 && !dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64))) diff --git a/drivers/net/ethernet/ti/k3-cppi-desc-pool.c b/drivers/net/ethernet/ti/k3-cppi-desc-pool.c index ad7cfc1316ce..38cc12f9f133 100644 --- a/drivers/net/ethernet/ti/k3-cppi-desc-pool.c +++ b/drivers/net/ethernet/ti/k3-cppi-desc-pool.c @@ -64,8 +64,8 @@ k3_cppi_desc_pool_create_name(struct device *dev, size_t size, return ERR_PTR(-ENOMEM); pool->gen_pool = gen_pool_create(ilog2(pool->desc_size), -1); - if (IS_ERR(pool->gen_pool)) { - ret = PTR_ERR(pool->gen_pool); + if (!pool->gen_pool) { + ret = -ENOMEM; dev_err(pool->dev, "pool create failed %d\n", ret); kfree_const(pool_name); goto gen_pool_create_fail; diff --git a/drivers/net/hamradio/Kconfig b/drivers/net/hamradio/Kconfig index 8e05b5c31a77..fe409819b56d 100644 --- a/drivers/net/hamradio/Kconfig +++ b/drivers/net/hamradio/Kconfig @@ -30,7 +30,7 @@ config 6PACK Note that this driver is still experimental and might cause problems. For details about the features and the usage of the - driver, read <file:Documentation/networking/6pack.txt>. + driver, read <file:Documentation/networking/6pack.rst>. To compile this driver as a module, choose M here: the module will be called 6pack. @@ -127,7 +127,7 @@ config BAYCOM_SER_FDX your serial interface chip. To configure the driver, use the sethdlc utility available in the standard ax25 utilities package. 
For information on the modems, see <http://www.baycom.de/> and - <file:Documentation/networking/baycom.txt>. + <file:Documentation/networking/baycom.rst>. To compile this driver as a module, choose M here: the module will be called baycom_ser_fdx. This is recommended. @@ -145,7 +145,7 @@ config BAYCOM_SER_HDX the driver, use the sethdlc utility available in the standard ax25 utilities package. For information on the modems, see <http://www.baycom.de/> and - <file:Documentation/networking/baycom.txt>. + <file:Documentation/networking/baycom.rst>. To compile this driver as a module, choose M here: the module will be called baycom_ser_hdx. This is recommended. @@ -160,7 +160,7 @@ config BAYCOM_PAR par96 designs. To configure the driver, use the sethdlc utility available in the standard ax25 utilities package. For information on the modems, see <http://www.baycom.de/> and the file - <file:Documentation/networking/baycom.txt>. + <file:Documentation/networking/baycom.rst>. To compile this driver as a module, choose M here: the module will be called baycom_par. This is recommended. @@ -175,7 +175,7 @@ config BAYCOM_EPP designs. To configure the driver, use the sethdlc utility available in the standard ax25 utilities package. For information on the modems, see <http://www.baycom.de/> and the file - <file:Documentation/networking/baycom.txt>. + <file:Documentation/networking/baycom.rst>. To compile this driver as a module, choose M here: the module will be called baycom_epp. This is recommended. 
diff --git a/drivers/net/phy/bcm54140.c b/drivers/net/phy/bcm54140.c index 7341f0126cc4..c009ac2856a5 100644 --- a/drivers/net/phy/bcm54140.c +++ b/drivers/net/phy/bcm54140.c @@ -660,7 +660,7 @@ static int bcm54140_config_init(struct phy_device *phydev) BCM54140_RDB_C_PWR_ISOLATE, 0); } -int bcm54140_did_interrupt(struct phy_device *phydev) +static int bcm54140_did_interrupt(struct phy_device *phydev) { int ret; @@ -669,7 +669,7 @@ int bcm54140_did_interrupt(struct phy_device *phydev) return (ret < 0) ? 0 : ret; } -int bcm54140_ack_intr(struct phy_device *phydev) +static int bcm54140_ack_intr(struct phy_device *phydev) { int reg; @@ -681,7 +681,7 @@ int bcm54140_ack_intr(struct phy_device *phydev) return 0; } -int bcm54140_config_intr(struct phy_device *phydev) +static int bcm54140_config_intr(struct phy_device *phydev) { struct bcm54140_priv *priv = phydev->priv; static const u16 port_to_imr_bit[] = { diff --git a/drivers/net/wan/Kconfig b/drivers/net/wan/Kconfig index dbc0e3f7a3e2..3e21726c36e8 100644 --- a/drivers/net/wan/Kconfig +++ b/drivers/net/wan/Kconfig @@ -336,7 +336,7 @@ config DLCI To use frame relay, you need supporting hardware (called FRAD) and certain programs from the net-tools package as explained in - <file:Documentation/networking/framerelay.txt>. + <file:Documentation/networking/framerelay.rst>. To compile this driver as a module, choose M here: the module will be called dlci. @@ -361,7 +361,7 @@ config SDLA These are multi-protocol cards, but only Frame Relay is supported by the driver at this time. Please read - <file:Documentation/networking/framerelay.txt>. + <file:Documentation/networking/framerelay.rst>. To compile this driver as a module, choose M here: the module will be called sdla. 
diff --git a/drivers/ptp/ptp_ines.c b/drivers/ptp/ptp_ines.c index 52d77db39829..7711651ff19e 100644 --- a/drivers/ptp/ptp_ines.c +++ b/drivers/ptp/ptp_ines.c @@ -783,16 +783,10 @@ static struct mii_timestamping_ctrl ines_ctrl = { static int ines_ptp_ctrl_probe(struct platform_device *pld) { struct ines_clock *clock; - struct resource *res; void __iomem *addr; int err = 0; - res = platform_get_resource(pld, IORESOURCE_MEM, 0); - if (!res) { - dev_err(&pld->dev, "missing memory resource\n"); - return -EINVAL; - } - addr = devm_ioremap_resource(&pld->dev, res); + addr = devm_platform_ioremap_resource(pld, 0); if (IS_ERR(addr)) { err = PTR_ERR(addr); goto out; diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h index 9e57c4411734..b3a8d3054af0 100644 --- a/include/linux/if_bridge.h +++ b/include/linux/if_bridge.h @@ -47,6 +47,8 @@ struct br_ip_list { #define BR_BCAST_FLOOD BIT(14) #define BR_NEIGH_SUPPRESS BIT(15) #define BR_ISOLATED BIT(16) +#define BR_MRP_AWARE BIT(17) +#define BR_MRP_LOST_CONT BIT(18) #define BR_DEFAULT_AGEING_TIME (300 * HZ) diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 9947eb1e9eb6..e525f003e619 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -123,7 +123,7 @@ int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg); int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags, struct netlink_ext_ack *extack); int ip6_ins_rt(struct net *net, struct fib6_info *f6i); -int ip6_del_rt(struct net *net, struct fib6_info *f6i); +int ip6_del_rt(struct net *net, struct fib6_info *f6i, bool skip_notify); void rt6_flush_exceptions(struct fib6_info *f6i); void rt6_age_exceptions(struct fib6_info *f6i, struct fib6_gc_args *gc_args, diff --git a/include/net/ipv6_stubs.h b/include/net/ipv6_stubs.h index 3e7d2c0e79ca..a5f7c12c326a 100644 --- a/include/net/ipv6_stubs.h +++ b/include/net/ipv6_stubs.h @@ -48,7 +48,7 @@ struct ipv6_stub { struct netlink_ext_ack *extack); void 
(*fib6_nh_release)(struct fib6_nh *fib6_nh); void (*fib6_update_sernum)(struct net *net, struct fib6_info *rt); - int (*ip6_del_rt)(struct net *net, struct fib6_info *rt); + int (*ip6_del_rt)(struct net *net, struct fib6_info *rt, bool skip_notify); void (*fib6_rt_update)(struct net *net, struct fib6_info *rt, struct nl_info *info); diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 154b8f01499b..5acdb4d414c4 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -111,6 +111,8 @@ struct netns_ipv4 { int sysctl_tcp_early_demux; int sysctl_udp_early_demux; + int sysctl_nexthop_compat_mode; + int sysctl_fwmark_reflect; int sysctl_tcp_fwmark_accept; #ifdef CONFIG_NET_L3_MASTER_DEV diff --git a/include/net/switchdev.h b/include/net/switchdev.h index aee86a189432..ae7aeb0d1f9c 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -40,6 +40,10 @@ enum switchdev_attr_id { SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING, SWITCHDEV_ATTR_ID_BRIDGE_MC_DISABLED, SWITCHDEV_ATTR_ID_BRIDGE_MROUTER, +#if IS_ENABLED(CONFIG_BRIDGE_MRP) + SWITCHDEV_ATTR_ID_MRP_PORT_STATE, + SWITCHDEV_ATTR_ID_MRP_PORT_ROLE, +#endif }; struct switchdev_attr { @@ -55,6 +59,11 @@ struct switchdev_attr { clock_t ageing_time; /* BRIDGE_AGEING_TIME */ bool vlan_filtering; /* BRIDGE_VLAN_FILTERING */ bool mc_disabled; /* MC_DISABLED */ +#if IS_ENABLED(CONFIG_BRIDGE_MRP) + u8 mrp_port_state; /* MRP_PORT_STATE */ + u8 mrp_port_role; /* MRP_PORT_ROLE */ + u8 mrp_ring_state; /* MRP_RING_STATE */ +#endif } u; }; @@ -63,6 +72,12 @@ enum switchdev_obj_id { SWITCHDEV_OBJ_ID_PORT_VLAN, SWITCHDEV_OBJ_ID_PORT_MDB, SWITCHDEV_OBJ_ID_HOST_MDB, +#if IS_ENABLED(CONFIG_BRIDGE_MRP) + SWITCHDEV_OBJ_ID_MRP, + SWITCHDEV_OBJ_ID_RING_TEST_MRP, + SWITCHDEV_OBJ_ID_RING_ROLE_MRP, + SWITCHDEV_OBJ_ID_RING_STATE_MRP, +#endif }; struct switchdev_obj { @@ -94,6 +109,53 @@ struct switchdev_obj_port_mdb { #define SWITCHDEV_OBJ_PORT_MDB(OBJ) \ container_of((OBJ), struct switchdev_obj_port_mdb, obj) 
+ +#if IS_ENABLED(CONFIG_BRIDGE_MRP) +/* SWITCHDEV_OBJ_ID_MRP */ +struct switchdev_obj_mrp { + struct switchdev_obj obj; + struct net_device *p_port; + struct net_device *s_port; + u32 ring_id; +}; + +#define SWITCHDEV_OBJ_MRP(OBJ) \ + container_of((OBJ), struct switchdev_obj_mrp, obj) + +/* SWITCHDEV_OBJ_ID_RING_TEST_MRP */ +struct switchdev_obj_ring_test_mrp { + struct switchdev_obj obj; + /* The value is in us and a value of 0 means stop */ + u32 interval; + u8 max_miss; + u32 ring_id; + u32 period; +}; + +#define SWITCHDEV_OBJ_RING_TEST_MRP(OBJ) \ + container_of((OBJ), struct switchdev_obj_ring_test_mrp, obj) + +/* SWITCHDEV_OBJ_ID_RING_ROLE_MRP */ +struct switchdev_obj_ring_role_mrp { + struct switchdev_obj obj; + u8 ring_role; + u32 ring_id; +}; + +#define SWITCHDEV_OBJ_RING_ROLE_MRP(OBJ) \ + container_of((OBJ), struct switchdev_obj_ring_role_mrp, obj) + +struct switchdev_obj_ring_state_mrp { + struct switchdev_obj obj; + u8 ring_state; + u32 ring_id; +}; + +#define SWITCHDEV_OBJ_RING_STATE_MRP(OBJ) \ + container_of((OBJ), struct switchdev_obj_ring_state_mrp, obj) + +#endif + typedef int switchdev_obj_dump_cb_t(struct switchdev_obj *obj); enum switchdev_notifier_type { diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index bfe621ea51b3..bd8c95488f16 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -120,6 +120,7 @@ enum { IFLA_BRIDGE_MODE, IFLA_BRIDGE_VLAN_INFO, IFLA_BRIDGE_VLAN_TUNNEL_INFO, + IFLA_BRIDGE_MRP, __IFLA_BRIDGE_MAX, }; #define IFLA_BRIDGE_MAX (__IFLA_BRIDGE_MAX - 1) @@ -157,6 +158,47 @@ struct bridge_vlan_xstats { __u32 pad2; }; +enum { + IFLA_BRIDGE_MRP_UNSPEC, + IFLA_BRIDGE_MRP_INSTANCE, + IFLA_BRIDGE_MRP_PORT_STATE, + IFLA_BRIDGE_MRP_PORT_ROLE, + IFLA_BRIDGE_MRP_RING_STATE, + IFLA_BRIDGE_MRP_RING_ROLE, + IFLA_BRIDGE_MRP_START_TEST, + __IFLA_BRIDGE_MRP_MAX, +}; + +struct br_mrp_instance { + __u32 ring_id; + __u32 p_ifindex; + __u32 s_ifindex; +}; + +struct br_mrp_port_role { + 
__u32 ring_id; + __u32 role; +}; + +struct br_mrp_ring_state { + __u32 ring_id; + __u32 ring_state; +}; + +struct br_mrp_ring_role { + __u32 ring_id; + __u32 ring_role; +}; + +struct br_mrp_start_test { + __u32 ring_id; + __u32 interval; + __u32 max_miss; + __u32 period; +}; + +#define IFLA_BRIDGE_MRP_MAX (__IFLA_BRIDGE_MRP_MAX - 1) + struct bridge_stp_xstats { __u64 transition_blk; __u64 transition_fwd; diff --git a/include/uapi/linux/if_ether.h b/include/uapi/linux/if_ether.h index f6ceb2e63d1e..d6de2b167448 100644 --- a/include/uapi/linux/if_ether.h +++ b/include/uapi/linux/if_ether.h @@ -92,6 +92,7 @@ #define ETH_P_PREAUTH 0x88C7 /* 802.11 Preauthentication */ #define ETH_P_TIPC 0x88CA /* TIPC */ #define ETH_P_LLDP 0x88CC /* Link Layer Discovery Protocol */ +#define ETH_P_MRP 0x88E3 /* Media Redundancy Protocol */ #define ETH_P_MACSEC 0x88E5 /* 802.1ae MACsec */ #define ETH_P_8021AH 0x88E7 /* 802.1ah Backbone Service Tag */ #define ETH_P_MVRP 0x88F5 /* 802.1Q MVRP */ diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 127c704eeba9..a009365ad67b 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -343,6 +343,7 @@ enum { IFLA_BRPORT_NEIGH_SUPPRESS, IFLA_BRPORT_ISOLATED, IFLA_BRPORT_BACKUP_PORT, + IFLA_BRPORT_MRP_RING_OPEN, __IFLA_BRPORT_MAX }; #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1) diff --git a/include/uapi/linux/mrp_bridge.h b/include/uapi/linux/mrp_bridge.h new file mode 100644 index 000000000000..2600cdf5a284 --- /dev/null +++ b/include/uapi/linux/mrp_bridge.h @@ -0,0 +1,84 @@ +/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ + +#ifndef _UAPI_LINUX_MRP_BRIDGE_H_ +#define _UAPI_LINUX_MRP_BRIDGE_H_ + +#include <linux/types.h> +#include <linux/if_ether.h> + +#define MRP_MAX_FRAME_LENGTH 200 +#define MRP_DEFAULT_PRIO 0x8000 +#define MRP_DOMAIN_UUID_LENGTH 16 +#define MRP_VERSION 1 +#define MRP_FRAME_PRIO 7 + +enum br_mrp_ring_role_type { + BR_MRP_RING_ROLE_DISABLED, + 
BR_MRP_RING_ROLE_MRC, + BR_MRP_RING_ROLE_MRM, +}; + +enum br_mrp_ring_state_type { + BR_MRP_RING_STATE_OPEN, + BR_MRP_RING_STATE_CLOSED, +}; + +enum br_mrp_port_state_type { + BR_MRP_PORT_STATE_DISABLED, + BR_MRP_PORT_STATE_BLOCKED, + BR_MRP_PORT_STATE_FORWARDING, + BR_MRP_PORT_STATE_NOT_CONNECTED, +}; + +enum br_mrp_port_role_type { + BR_MRP_PORT_ROLE_PRIMARY, + BR_MRP_PORT_ROLE_SECONDARY, + BR_MRP_PORT_ROLE_NONE, +}; + +enum br_mrp_tlv_header_type { + BR_MRP_TLV_HEADER_END = 0x0, + BR_MRP_TLV_HEADER_COMMON = 0x1, + BR_MRP_TLV_HEADER_RING_TEST = 0x2, + BR_MRP_TLV_HEADER_RING_TOPO = 0x3, + BR_MRP_TLV_HEADER_RING_LINK_DOWN = 0x4, + BR_MRP_TLV_HEADER_RING_LINK_UP = 0x5, +}; + +struct br_mrp_tlv_hdr { + __u8 type; + __u8 length; +}; + +struct br_mrp_end_hdr { + struct br_mrp_tlv_hdr hdr; +}; + +struct br_mrp_common_hdr { + __u16 seq_id; + __u8 domain[MRP_DOMAIN_UUID_LENGTH]; +}; + +struct br_mrp_ring_test_hdr { + __u16 prio; + __u8 sa[ETH_ALEN]; + __u16 port_role; + __u16 state; + __u16 transitions; + __u32 timestamp; +}; + +struct br_mrp_ring_topo_hdr { + __u16 prio; + __u8 sa[ETH_ALEN]; + __u16 interval; +}; + +struct br_mrp_ring_link_hdr { + __u8 sa[ETH_ALEN]; + __u16 port_role; + __u16 interval; + __u16 blocked; +}; + +#endif diff --git a/net/Kconfig b/net/Kconfig index df8d8c9bd021..8b1f85820a6b 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -86,7 +86,7 @@ config INET "Sysctl support" below, you can change various aspects of the behavior of the TCP/IP code by writing to the (virtual) files in /proc/sys/net/ipv4/*; the options are explained in the file - <file:Documentation/networking/ip-sysctl.txt>. + <file:Documentation/networking/ip-sysctl.rst>. Short answer: say Y. diff --git a/net/atm/Kconfig b/net/atm/Kconfig index 271f682e8438..e61dcc9f85b2 100644 --- a/net/atm/Kconfig +++ b/net/atm/Kconfig @@ -16,7 +16,7 @@ config ATM of your ATM card below. Note that you need a set of user-space programs to actually make use - of ATM. 
See the file <file:Documentation/networking/atm.txt> for + of ATM. See the file <file:Documentation/networking/atm.rst> for further details. config ATM_CLIP diff --git a/net/ax25/Kconfig b/net/ax25/Kconfig index 043fd5437809..97d686d115c0 100644 --- a/net/ax25/Kconfig +++ b/net/ax25/Kconfig @@ -40,7 +40,7 @@ config AX25 radio as well as information about how to configure an AX.25 port is contained in the AX25-HOWTO, available from <http://www.tldp.org/docs.html#howto>. You might also want to - check out the file <file:Documentation/networking/ax25.txt> in the + check out the file <file:Documentation/networking/ax25.rst> in the kernel source. More information about digital amateur radio in general is on the WWW at <http://www.tapr.org/>. @@ -88,7 +88,7 @@ config NETROM users as well as information about how to configure an AX.25 port is contained in the Linux Ham Wiki, available from <http://www.linux-ax25.org>. You also might want to check out the - file <file:Documentation/networking/ax25.txt>. More information about + file <file:Documentation/networking/ax25.rst>. More information about digital amateur radio in general is on the WWW at <http://www.tapr.org/>. @@ -107,7 +107,7 @@ config ROSE users as well as information about how to configure an AX.25 port is contained in the Linux Ham Wiki, available from <http://www.linux-ax25.org>. You also might want to check out the - file <file:Documentation/networking/ax25.txt>. More information about + file <file:Documentation/networking/ax25.rst>. More information about digital amateur radio in general is on the WWW at <http://www.tapr.org/>. 
diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index a7c8dd7ae513..e87f19c82e8d 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -280,7 +280,7 @@ batadv_iv_ogm_emit_send_time(const struct batadv_priv *bat_priv) unsigned int msecs; msecs = atomic_read(&bat_priv->orig_interval) - BATADV_JITTER; - msecs += prandom_u32() % (2 * BATADV_JITTER); + msecs += prandom_u32_max(2 * BATADV_JITTER); return jiffies + msecs_to_jiffies(msecs); } @@ -288,7 +288,7 @@ batadv_iv_ogm_emit_send_time(const struct batadv_priv *bat_priv) /* when do we schedule a ogm packet to be sent */ static unsigned long batadv_iv_ogm_fwd_send_time(void) { - return jiffies + msecs_to_jiffies(prandom_u32() % (BATADV_JITTER / 2)); + return jiffies + msecs_to_jiffies(prandom_u32_max(BATADV_JITTER / 2)); } /* apply hop penalty for a normal link */ diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c index 1e3172db7492..353e49c40e7f 100644 --- a/net/batman-adv/bat_v_elp.c +++ b/net/batman-adv/bat_v_elp.c @@ -49,7 +49,7 @@ static void batadv_v_elp_start_timer(struct batadv_hard_iface *hard_iface) unsigned int msecs; msecs = atomic_read(&hard_iface->bat_v.elp_interval) - BATADV_JITTER; - msecs += prandom_u32() % (2 * BATADV_JITTER); + msecs += prandom_u32_max(2 * BATADV_JITTER); queue_delayed_work(batadv_event_workqueue, &hard_iface->bat_v.elp_wq, msecs_to_jiffies(msecs)); diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c index 969466218999..0959d32be65c 100644 --- a/net/batman-adv/bat_v_ogm.c +++ b/net/batman-adv/bat_v_ogm.c @@ -88,7 +88,7 @@ static void batadv_v_ogm_start_queue_timer(struct batadv_hard_iface *hard_iface) unsigned int msecs = BATADV_MAX_AGGREGATION_MS * 1000; /* msecs * [0.9, 1.1] */ - msecs += prandom_u32() % (msecs / 5) - (msecs / 10); + msecs += prandom_u32_max(msecs / 5) - (msecs / 10); queue_delayed_work(batadv_event_workqueue, &hard_iface->bat_v.aggr_wq, msecs_to_jiffies(msecs / 1000)); } @@ -107,7 
+107,7 @@ static void batadv_v_ogm_start_timer(struct batadv_priv *bat_priv) return; msecs = atomic_read(&bat_priv->orig_interval) - BATADV_JITTER; - msecs += prandom_u32() % (2 * BATADV_JITTER); + msecs += prandom_u32_max(2 * BATADV_JITTER); queue_delayed_work(batadv_event_workqueue, &bat_priv->bat_v.ogm_wq, msecs_to_jiffies(msecs)); } diff --git a/net/batman-adv/distributed-arp-table.h b/net/batman-adv/distributed-arp-table.h index 2bff2f4a325c..4e031661682a 100644 --- a/net/batman-adv/distributed-arp-table.h +++ b/net/batman-adv/distributed-arp-table.h @@ -163,11 +163,6 @@ static inline void batadv_dat_init_own_addr(struct batadv_priv *bat_priv, { } -static inline void batadv_arp_change_timeout(struct net_device *soft_iface, - const char *name) -{ -} - static inline int batadv_dat_init(struct batadv_priv *bat_priv) { return 0; diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index 2a234d0ad445..61d8dbe8c954 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -13,7 +13,7 @@ #define BATADV_DRIVER_DEVICE "batman-adv" #ifndef BATADV_SOURCE_VERSION -#define BATADV_SOURCE_VERSION "2020.1" +#define BATADV_SOURCE_VERSION "2020.2" #endif /* B.A.T.M.A.N. 
parameters */ diff --git a/net/batman-adv/trace.h b/net/batman-adv/trace.h index f631b1e01b89..a87547570b4e 100644 --- a/net/batman-adv/trace.h +++ b/net/batman-adv/trace.h @@ -15,7 +15,6 @@ #include <linux/percpu.h> #include <linux/printk.h> #include <linux/tracepoint.h> -#include <linux/types.h> #undef TRACE_SYSTEM #define TRACE_SYSTEM batadv diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index 4a17a66cc572..d152b8e81f61 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -1086,7 +1086,7 @@ struct batadv_priv_bla { * struct batadv_priv_debug_log - debug logging data */ struct batadv_priv_debug_log { - /** @log_buff: buffer holding the logs (ring bufer) */ + /** @log_buff: buffer holding the logs (ring buffer) */ char log_buff[BATADV_LOG_BUF_LEN]; /** @log_start: index of next character to read */ diff --git a/net/bridge/Kconfig b/net/bridge/Kconfig index e4fb050e2078..51a6414145d2 100644 --- a/net/bridge/Kconfig +++ b/net/bridge/Kconfig @@ -61,3 +61,15 @@ config BRIDGE_VLAN_FILTERING Say N to exclude this support and reduce the binary size. If unsure, say Y. + +config BRIDGE_MRP + bool "MRP protocol" + depends on BRIDGE + default n + help + If you say Y here, then the Ethernet bridge will be able to run MRP + protocol to detect loops + + Say N to exclude this support and reduce the binary size. + + If unsure, say N. 
diff --git a/net/bridge/Makefile b/net/bridge/Makefile index 49da7ae6f077..ccb394236fbd 100644 --- a/net/bridge/Makefile +++ b/net/bridge/Makefile @@ -25,3 +25,5 @@ bridge-$(CONFIG_BRIDGE_VLAN_FILTERING) += br_vlan.o br_vlan_tunnel.o br_vlan_opt bridge-$(CONFIG_NET_SWITCHDEV) += br_switchdev.o obj-$(CONFIG_NETFILTER) += netfilter/ + +bridge-$(CONFIG_BRIDGE_MRP) += br_mrp_switchdev.o br_mrp.o br_mrp_netlink.o diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index 0e3dbc5f3c34..8ec1362588af 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -463,6 +463,9 @@ void br_dev_setup(struct net_device *dev) spin_lock_init(&br->lock); INIT_LIST_HEAD(&br->port_list); INIT_HLIST_HEAD(&br->fdb_list); +#if IS_ENABLED(CONFIG_BRIDGE_MRP) + INIT_LIST_HEAD(&br->mrp_list); +#endif spin_lock_init(&br->hash_lock); br->bridge_id.prio[0] = 0x80; diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index 4fe30b182ee7..ca685c0cdf95 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -333,6 +333,8 @@ static void del_nbp(struct net_bridge_port *p) br_stp_disable_port(p); spin_unlock_bh(&br->lock); + br_mrp_port_del(br, p); + br_ifinfo_notify(RTM_DELLINK, NULL, p); list_del_rcu(&p->list); diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index fcc260840028..d5c34f36f0f4 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -342,6 +342,9 @@ rx_handler_result_t br_handle_frame(struct sk_buff **pskb) } } + if (unlikely(br_mrp_process(p, skb))) + return RX_HANDLER_PASS; + forward: switch (p->state) { case BR_STATE_FORWARDING: diff --git a/net/bridge/br_ioctl.c b/net/bridge/br_ioctl.c index ae22d784b88a..5e71fc8b826f 100644 --- a/net/bridge/br_ioctl.c +++ b/net/bridge/br_ioctl.c @@ -242,8 +242,7 @@ static int old_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) if (!ns_capable(dev_net(dev)->user_ns, CAP_NET_ADMIN)) return -EPERM; - br_stp_set_enabled(br, args[1]); - ret = 0; + ret = br_stp_set_enabled(br, args[1], NULL); break; 
case BRCTL_SET_BRIDGE_PRIORITY: diff --git a/net/bridge/br_mrp.c b/net/bridge/br_mrp.c new file mode 100644 index 000000000000..d7bc09de4c13 --- /dev/null +++ b/net/bridge/br_mrp.c @@ -0,0 +1,559 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <linux/mrp_bridge.h> +#include "br_private_mrp.h" + +static const u8 mrp_test_dmac[ETH_ALEN] = { 0x1, 0x15, 0x4e, 0x0, 0x0, 0x1 }; + +static struct net_bridge_port *br_mrp_get_port(struct net_bridge *br, + u32 ifindex) +{ + struct net_bridge_port *res = NULL; + struct net_bridge_port *port; + + list_for_each_entry(port, &br->port_list, list) { + if (port->dev->ifindex == ifindex) { + res = port; + break; + } + } + + return res; +} + +static struct br_mrp *br_mrp_find_id(struct net_bridge *br, u32 ring_id) +{ + struct br_mrp *res = NULL; + struct br_mrp *mrp; + + list_for_each_entry_rcu(mrp, &br->mrp_list, list, + lockdep_rtnl_is_held()) { + if (mrp->ring_id == ring_id) { + res = mrp; + break; + } + } + + return res; +} + +static struct br_mrp *br_mrp_find_port(struct net_bridge *br, + struct net_bridge_port *p) +{ + struct br_mrp *res = NULL; + struct br_mrp *mrp; + + list_for_each_entry_rcu(mrp, &br->mrp_list, list, + lockdep_rtnl_is_held()) { + if (rcu_access_pointer(mrp->p_port) == p || + rcu_access_pointer(mrp->s_port) == p) { + res = mrp; + break; + } + } + + return res; +} + +static int br_mrp_next_seq(struct br_mrp *mrp) +{ + mrp->seq_id++; + return mrp->seq_id; +} + +static struct sk_buff *br_mrp_skb_alloc(struct net_bridge_port *p, + const u8 *src, const u8 *dst) +{ + struct ethhdr *eth_hdr; + struct sk_buff *skb; + u16 *version; + + skb = dev_alloc_skb(MRP_MAX_FRAME_LENGTH); + if (!skb) + return NULL; + + skb->dev = p->dev; + skb->protocol = htons(ETH_P_MRP); + skb->priority = MRP_FRAME_PRIO; + skb_reserve(skb, sizeof(*eth_hdr)); + + eth_hdr = skb_push(skb, sizeof(*eth_hdr)); + ether_addr_copy(eth_hdr->h_dest, dst); + ether_addr_copy(eth_hdr->h_source, src); + eth_hdr->h_proto = htons(ETH_P_MRP); + + 
version = skb_put(skb, sizeof(*version)); + *version = cpu_to_be16(MRP_VERSION); + + return skb; +} + +static void br_mrp_skb_tlv(struct sk_buff *skb, + enum br_mrp_tlv_header_type type, + u8 length) +{ + struct br_mrp_tlv_hdr *hdr; + + hdr = skb_put(skb, sizeof(*hdr)); + hdr->type = type; + hdr->length = length; +} + +static void br_mrp_skb_common(struct sk_buff *skb, struct br_mrp *mrp) +{ + struct br_mrp_common_hdr *hdr; + + br_mrp_skb_tlv(skb, BR_MRP_TLV_HEADER_COMMON, sizeof(*hdr)); + + hdr = skb_put(skb, sizeof(*hdr)); + hdr->seq_id = cpu_to_be16(br_mrp_next_seq(mrp)); + memset(hdr->domain, 0xff, MRP_DOMAIN_UUID_LENGTH); +} + +static struct sk_buff *br_mrp_alloc_test_skb(struct br_mrp *mrp, + struct net_bridge_port *p, + enum br_mrp_port_role_type port_role) +{ + struct br_mrp_ring_test_hdr *hdr = NULL; + struct sk_buff *skb = NULL; + + if (!p) + return NULL; + + skb = br_mrp_skb_alloc(p, p->dev->dev_addr, mrp_test_dmac); + if (!skb) + return NULL; + + br_mrp_skb_tlv(skb, BR_MRP_TLV_HEADER_RING_TEST, sizeof(*hdr)); + hdr = skb_put(skb, sizeof(*hdr)); + + hdr->prio = cpu_to_be16(MRP_DEFAULT_PRIO); + ether_addr_copy(hdr->sa, p->br->dev->dev_addr); + hdr->port_role = cpu_to_be16(port_role); + hdr->state = cpu_to_be16(mrp->ring_state); + hdr->transitions = cpu_to_be16(mrp->ring_transitions); + hdr->timestamp = cpu_to_be32(jiffies_to_msecs(jiffies)); + + br_mrp_skb_common(skb, mrp); + br_mrp_skb_tlv(skb, BR_MRP_TLV_HEADER_END, 0x0); + + return skb; +} + +static void br_mrp_test_work_expired(struct work_struct *work) +{ + struct delayed_work *del_work = to_delayed_work(work); + struct br_mrp *mrp = container_of(del_work, struct br_mrp, test_work); + struct net_bridge_port *p; + bool notify_open = false; + struct sk_buff *skb; + + if (time_before_eq(mrp->test_end, jiffies)) + return; + + if (mrp->test_count_miss < mrp->test_max_miss) { + mrp->test_count_miss++; + } else { + /* Notify that the ring is open only if the ring state is + * closed, otherwise it would 
continue to notify at every + * interval. + */ + if (mrp->ring_state == BR_MRP_RING_STATE_CLOSED) + notify_open = true; + } + + rcu_read_lock(); + + p = rcu_dereference(mrp->p_port); + if (p) { + skb = br_mrp_alloc_test_skb(mrp, p, BR_MRP_PORT_ROLE_PRIMARY); + if (!skb) + goto out; + + skb_reset_network_header(skb); + dev_queue_xmit(skb); + + if (notify_open && !mrp->ring_role_offloaded) + br_mrp_port_open(p->dev, true); + } + + p = rcu_dereference(mrp->s_port); + if (p) { + skb = br_mrp_alloc_test_skb(mrp, p, BR_MRP_PORT_ROLE_SECONDARY); + if (!skb) + goto out; + + skb_reset_network_header(skb); + dev_queue_xmit(skb); + + if (notify_open && !mrp->ring_role_offloaded) + br_mrp_port_open(p->dev, true); + } + +out: + rcu_read_unlock(); + + queue_delayed_work(system_wq, &mrp->test_work, + usecs_to_jiffies(mrp->test_interval)); +} + +/* Deletes the MRP instance. + * note: called under rtnl_lock + */ +static void br_mrp_del_impl(struct net_bridge *br, struct br_mrp *mrp) +{ + struct net_bridge_port *p; + + /* Stop sending MRP_Test frames */ + cancel_delayed_work_sync(&mrp->test_work); + br_mrp_switchdev_send_ring_test(br, mrp, 0, 0, 0); + + br_mrp_switchdev_del(br, mrp); + + /* Reset the ports */ + p = rtnl_dereference(mrp->p_port); + if (p) { + spin_lock_bh(&br->lock); + p->state = BR_STATE_FORWARDING; + p->flags &= ~BR_MRP_AWARE; + spin_unlock_bh(&br->lock); + br_mrp_port_switchdev_set_state(p, BR_STATE_FORWARDING); + rcu_assign_pointer(mrp->p_port, NULL); + } + + p = rtnl_dereference(mrp->s_port); + if (p) { + spin_lock_bh(&br->lock); + p->state = BR_STATE_FORWARDING; + p->flags &= ~BR_MRP_AWARE; + spin_unlock_bh(&br->lock); + br_mrp_port_switchdev_set_state(p, BR_STATE_FORWARDING); + rcu_assign_pointer(mrp->s_port, NULL); + } + + list_del_rcu(&mrp->list); + kfree_rcu(mrp, rcu); +} + +/* Adds a new MRP instance. 
+ * note: called under rtnl_lock + */ +int br_mrp_add(struct net_bridge *br, struct br_mrp_instance *instance) +{ + struct net_bridge_port *p; + struct br_mrp *mrp; + int err; + + /* If the ring exists, it is not possible to create another one with the + * same ring_id + */ + mrp = br_mrp_find_id(br, instance->ring_id); + if (mrp) + return -EINVAL; + + if (!br_mrp_get_port(br, instance->p_ifindex) || + !br_mrp_get_port(br, instance->s_ifindex)) + return -EINVAL; + + mrp = kzalloc(sizeof(*mrp), GFP_KERNEL); + if (!mrp) + return -ENOMEM; + + mrp->ring_id = instance->ring_id; + + p = br_mrp_get_port(br, instance->p_ifindex); + spin_lock_bh(&br->lock); + p->state = BR_STATE_FORWARDING; + p->flags |= BR_MRP_AWARE; + spin_unlock_bh(&br->lock); + rcu_assign_pointer(mrp->p_port, p); + + p = br_mrp_get_port(br, instance->s_ifindex); + spin_lock_bh(&br->lock); + p->state = BR_STATE_FORWARDING; + p->flags |= BR_MRP_AWARE; + spin_unlock_bh(&br->lock); + rcu_assign_pointer(mrp->s_port, p); + + INIT_DELAYED_WORK(&mrp->test_work, br_mrp_test_work_expired); + list_add_tail_rcu(&mrp->list, &br->mrp_list); + + err = br_mrp_switchdev_add(br, mrp); + if (err) + goto delete_mrp; + + return 0; + +delete_mrp: + br_mrp_del_impl(br, mrp); + + return err; +} + +/* Deletes the MRP instance from which the port is part of + * note: called under rtnl_lock + */ +void br_mrp_port_del(struct net_bridge *br, struct net_bridge_port *p) +{ + struct br_mrp *mrp = br_mrp_find_port(br, p); + + /* If the port is not part of a MRP instance just bail out */ + if (!mrp) + return; + + br_mrp_del_impl(br, mrp); +} + +/* Deletes existing MRP instance based on ring_id + * note: called under rtnl_lock + */ +int br_mrp_del(struct net_bridge *br, struct br_mrp_instance *instance) +{ + struct br_mrp *mrp = br_mrp_find_id(br, instance->ring_id); + + if (!mrp) + return -EINVAL; + + br_mrp_del_impl(br, mrp); + + return 0; +} + +/* Set port state, port state can be forwarding, blocked or disabled + * note: already 
called with rtnl_lock + */ +int br_mrp_set_port_state(struct net_bridge_port *p, + enum br_mrp_port_state_type state) +{ + if (!p || !(p->flags & BR_MRP_AWARE)) + return -EINVAL; + + spin_lock_bh(&p->br->lock); + + if (state == BR_MRP_PORT_STATE_FORWARDING) + p->state = BR_STATE_FORWARDING; + else + p->state = BR_STATE_BLOCKING; + + spin_unlock_bh(&p->br->lock); + + br_mrp_port_switchdev_set_state(p, state); + + return 0; +} + +/* Set port role, port role can be primary or secondary + * note: already called with rtnl_lock + */ +int br_mrp_set_port_role(struct net_bridge_port *p, + struct br_mrp_port_role *role) +{ + struct br_mrp *mrp; + + if (!p || !(p->flags & BR_MRP_AWARE)) + return -EINVAL; + + mrp = br_mrp_find_id(p->br, role->ring_id); + + if (!mrp) + return -EINVAL; + + if (role->role == BR_MRP_PORT_ROLE_PRIMARY) + rcu_assign_pointer(mrp->p_port, p); + else + rcu_assign_pointer(mrp->s_port, p); + + br_mrp_port_switchdev_set_role(p, role->role); + + return 0; +} + +/* Set ring state, ring state can be only Open or Closed + * note: already called with rtnl_lock + */ +int br_mrp_set_ring_state(struct net_bridge *br, + struct br_mrp_ring_state *state) +{ + struct br_mrp *mrp = br_mrp_find_id(br, state->ring_id); + + if (!mrp) + return -EINVAL; + + if (mrp->ring_state == BR_MRP_RING_STATE_CLOSED && + state->ring_state != BR_MRP_RING_STATE_CLOSED) + mrp->ring_transitions++; + + mrp->ring_state = state->ring_state; + + br_mrp_switchdev_set_ring_state(br, mrp, state->ring_state); + + return 0; +} + +/* Set ring role, ring role can be only MRM(Media Redundancy Manager) or + * MRC(Media Redundancy Client). 
+ * note: already called with rtnl_lock + */ +int br_mrp_set_ring_role(struct net_bridge *br, + struct br_mrp_ring_role *role) +{ + struct br_mrp *mrp = br_mrp_find_id(br, role->ring_id); + int err; + + if (!mrp) + return -EINVAL; + + mrp->ring_role = role->ring_role; + + /* If there is an error just bailed out */ + err = br_mrp_switchdev_set_ring_role(br, mrp, role->ring_role); + if (err && err != -EOPNOTSUPP) + return err; + + /* Now detect if the HW actually applied the role or not. If the HW + * applied the role it means that the SW will not to do those operations + * anymore. For example if the role ir MRM then the HW will notify the + * SW when ring is open, but if the is not pushed to the HW the SW will + * need to detect when the ring is open + */ + mrp->ring_role_offloaded = err == -EOPNOTSUPP ? 0 : 1; + + return 0; +} + +/* Start to generate MRP test frames, the frames are generated by HW and if it + * fails, they are generated by the SW. + * note: already called with rtnl_lock + */ +int br_mrp_start_test(struct net_bridge *br, + struct br_mrp_start_test *test) +{ + struct br_mrp *mrp = br_mrp_find_id(br, test->ring_id); + + if (!mrp) + return -EINVAL; + + /* Try to push it to the HW and if it fails then continue to generate in + * SW and if that also fails then return error + */ + if (!br_mrp_switchdev_send_ring_test(br, mrp, test->interval, + test->max_miss, test->period)) + return 0; + + mrp->test_interval = test->interval; + mrp->test_end = jiffies + usecs_to_jiffies(test->period); + mrp->test_max_miss = test->max_miss; + mrp->test_count_miss = 0; + queue_delayed_work(system_wq, &mrp->test_work, + usecs_to_jiffies(test->interval)); + + return 0; +} + +/* Process only MRP Test frame. 
All the other MRP frames are processed by + * userspace application + * note: already called with rcu_read_lock + */ +static void br_mrp_mrm_process(struct br_mrp *mrp, struct net_bridge_port *port, + struct sk_buff *skb) +{ + const struct br_mrp_tlv_hdr *hdr; + struct br_mrp_tlv_hdr _hdr; + + /* Each MRP header starts with a version field which is 16 bits. + * Therefore skip the version and get directly the TLV header. + */ + hdr = skb_header_pointer(skb, sizeof(uint16_t), sizeof(_hdr), &_hdr); + if (!hdr) + return; + + if (hdr->type != BR_MRP_TLV_HEADER_RING_TEST) + return; + + mrp->test_count_miss = 0; + + /* Notify the userspace that the ring is closed only when the ring is + * not closed + */ + if (mrp->ring_state != BR_MRP_RING_STATE_CLOSED) + br_mrp_port_open(port->dev, false); +} + +/* This will just forward the frame to the other mrp ring port(MRC role) or will + * not do anything. + * note: already called with rcu_read_lock + */ +static int br_mrp_rcv(struct net_bridge_port *p, + struct sk_buff *skb, struct net_device *dev) +{ + struct net_device *s_dev, *p_dev, *d_dev; + struct net_bridge_port *p_port, *s_port; + struct net_bridge *br; + struct sk_buff *nskb; + struct br_mrp *mrp; + + /* If port is disabled don't accept any frames */ + if (p->state == BR_STATE_DISABLED) + return 0; + + br = p->br; + mrp = br_mrp_find_port(br, p); + if (unlikely(!mrp)) + return 0; + + p_port = rcu_dereference(mrp->p_port); + if (!p_port) + return 0; + + s_port = rcu_dereference(mrp->s_port); + if (!s_port) + return 0; + + /* If the role is MRM then don't forward the frames */ + if (mrp->ring_role == BR_MRP_RING_ROLE_MRM) { + br_mrp_mrm_process(mrp, p, skb); + return 1; + } + + /* Clone the frame and forward it on the other MRP port */ + nskb = skb_clone(skb, GFP_ATOMIC); + if (!nskb) + return 0; + + p_dev = p_port->dev; + s_dev = s_port->dev; + + if (p_dev == dev) + d_dev = s_dev; + else + d_dev = p_dev; + + nskb->dev = d_dev; + skb_push(nskb, ETH_HLEN); + 
dev_queue_xmit(nskb); + + return 1; +} + +/* Check if the frame was received on a port that is part of MRP ring + * and if the frame has MRP eth. In that case process the frame otherwise do + * normal forwarding. + * note: already called with rcu_read_lock + */ +int br_mrp_process(struct net_bridge_port *p, struct sk_buff *skb) +{ + /* If there is no MRP instance do normal forwarding */ + if (likely(!(p->flags & BR_MRP_AWARE))) + goto out; + + if (unlikely(skb->protocol == htons(ETH_P_MRP))) + return br_mrp_rcv(p, skb, p->dev); + +out: + return 0; +} + +bool br_mrp_enabled(struct net_bridge *br) +{ + return !list_empty(&br->mrp_list); +} diff --git a/net/bridge/br_mrp_netlink.c b/net/bridge/br_mrp_netlink.c new file mode 100644 index 000000000000..503896638be0 --- /dev/null +++ b/net/bridge/br_mrp_netlink.c @@ -0,0 +1,120 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <net/genetlink.h> + +#include <uapi/linux/mrp_bridge.h> +#include "br_private.h" +#include "br_private_mrp.h" + +static const struct nla_policy br_mrp_policy[IFLA_BRIDGE_MRP_MAX + 1] = { + [IFLA_BRIDGE_MRP_UNSPEC] = { .type = NLA_REJECT }, + [IFLA_BRIDGE_MRP_INSTANCE] = { .type = NLA_EXACT_LEN, + .len = sizeof(struct br_mrp_instance)}, + [IFLA_BRIDGE_MRP_PORT_STATE] = { .type = NLA_U32 }, + [IFLA_BRIDGE_MRP_PORT_ROLE] = { .type = NLA_EXACT_LEN, + .len = sizeof(struct br_mrp_port_role)}, + [IFLA_BRIDGE_MRP_RING_STATE] = { .type = NLA_EXACT_LEN, + .len = sizeof(struct br_mrp_ring_state)}, + [IFLA_BRIDGE_MRP_RING_ROLE] = { .type = NLA_EXACT_LEN, + .len = sizeof(struct br_mrp_ring_role)}, + [IFLA_BRIDGE_MRP_START_TEST] = { .type = NLA_EXACT_LEN, + .len = sizeof(struct br_mrp_start_test)}, +}; + +int br_mrp_parse(struct net_bridge *br, struct net_bridge_port *p, + struct nlattr *attr, int cmd, struct netlink_ext_ack *extack) +{ + struct nlattr *tb[IFLA_BRIDGE_MRP_MAX + 1]; + int err; + + if (br->stp_enabled != BR_NO_STP) { + NL_SET_ERR_MSG_MOD(extack, "MRP can't be enabled if STP is already 
enabled\n"); + return -EINVAL; + } + + err = nla_parse_nested(tb, IFLA_BRIDGE_MRP_MAX, attr, + br_mrp_policy, extack); + if (err) + return err; + + if (tb[IFLA_BRIDGE_MRP_INSTANCE]) { + struct br_mrp_instance *instance = + nla_data(tb[IFLA_BRIDGE_MRP_INSTANCE]); + + if (cmd == RTM_SETLINK) + err = br_mrp_add(br, instance); + else + err = br_mrp_del(br, instance); + if (err) + return err; + } + + if (tb[IFLA_BRIDGE_MRP_PORT_STATE]) { + enum br_mrp_port_state_type state = + nla_get_u32(tb[IFLA_BRIDGE_MRP_PORT_STATE]); + + err = br_mrp_set_port_state(p, state); + if (err) + return err; + } + + if (tb[IFLA_BRIDGE_MRP_PORT_ROLE]) { + struct br_mrp_port_role *role = + nla_data(tb[IFLA_BRIDGE_MRP_PORT_ROLE]); + + err = br_mrp_set_port_role(p, role); + if (err) + return err; + } + + if (tb[IFLA_BRIDGE_MRP_RING_STATE]) { + struct br_mrp_ring_state *state = + nla_data(tb[IFLA_BRIDGE_MRP_RING_STATE]); + + err = br_mrp_set_ring_state(br, state); + if (err) + return err; + } + + if (tb[IFLA_BRIDGE_MRP_RING_ROLE]) { + struct br_mrp_ring_role *role = + nla_data(tb[IFLA_BRIDGE_MRP_RING_ROLE]); + + err = br_mrp_set_ring_role(br, role); + if (err) + return err; + } + + if (tb[IFLA_BRIDGE_MRP_START_TEST]) { + struct br_mrp_start_test *test = + nla_data(tb[IFLA_BRIDGE_MRP_START_TEST]); + + err = br_mrp_start_test(br, test); + if (err) + return err; + } + + return 0; +} + +int br_mrp_port_open(struct net_device *dev, u8 loc) +{ + struct net_bridge_port *p; + int err = 0; + + p = br_port_get_rcu(dev); + if (!p) { + err = -EINVAL; + goto out; + } + + if (loc) + p->flags |= BR_MRP_LOST_CONT; + else + p->flags &= ~BR_MRP_LOST_CONT; + + br_ifinfo_notify(RTM_NEWLINK, NULL, p); + +out: + return err; +} diff --git a/net/bridge/br_mrp_switchdev.c b/net/bridge/br_mrp_switchdev.c new file mode 100644 index 000000000000..51cb1d5a24b4 --- /dev/null +++ b/net/bridge/br_mrp_switchdev.c @@ -0,0 +1,140 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <net/switchdev.h> + +#include 
"br_private_mrp.h" + +int br_mrp_switchdev_add(struct net_bridge *br, struct br_mrp *mrp) +{ + struct switchdev_obj_mrp mrp_obj = { + .obj.orig_dev = br->dev, + .obj.id = SWITCHDEV_OBJ_ID_MRP, + .p_port = rtnl_dereference(mrp->p_port)->dev, + .s_port = rtnl_dereference(mrp->s_port)->dev, + .ring_id = mrp->ring_id, + }; + int err; + + err = switchdev_port_obj_add(br->dev, &mrp_obj.obj, NULL); + + if (err && err != -EOPNOTSUPP) + return err; + + return 0; +} + +int br_mrp_switchdev_del(struct net_bridge *br, struct br_mrp *mrp) +{ + struct switchdev_obj_mrp mrp_obj = { + .obj.orig_dev = br->dev, + .obj.id = SWITCHDEV_OBJ_ID_MRP, + .p_port = NULL, + .s_port = NULL, + .ring_id = mrp->ring_id, + }; + int err; + + err = switchdev_port_obj_del(br->dev, &mrp_obj.obj); + + if (err && err != -EOPNOTSUPP) + return err; + + return 0; +} + +int br_mrp_switchdev_set_ring_role(struct net_bridge *br, + struct br_mrp *mrp, + enum br_mrp_ring_role_type role) +{ + struct switchdev_obj_ring_role_mrp mrp_role = { + .obj.orig_dev = br->dev, + .obj.id = SWITCHDEV_OBJ_ID_RING_ROLE_MRP, + .ring_role = role, + .ring_id = mrp->ring_id, + }; + int err; + + if (role == BR_MRP_RING_ROLE_DISABLED) + err = switchdev_port_obj_del(br->dev, &mrp_role.obj); + else + err = switchdev_port_obj_add(br->dev, &mrp_role.obj, NULL); + + return err; +} + +int br_mrp_switchdev_send_ring_test(struct net_bridge *br, + struct br_mrp *mrp, u32 interval, + u8 max_miss, u32 period) +{ + struct switchdev_obj_ring_test_mrp test = { + .obj.orig_dev = br->dev, + .obj.id = SWITCHDEV_OBJ_ID_RING_TEST_MRP, + .interval = interval, + .max_miss = max_miss, + .ring_id = mrp->ring_id, + .period = period, + }; + int err; + + if (interval == 0) + err = switchdev_port_obj_del(br->dev, &test.obj); + else + err = switchdev_port_obj_add(br->dev, &test.obj, NULL); + + return err; +} + +int br_mrp_switchdev_set_ring_state(struct net_bridge *br, + struct br_mrp *mrp, + enum br_mrp_ring_state_type state) +{ + struct 
switchdev_obj_ring_state_mrp mrp_state = { + .obj.orig_dev = br->dev, + .obj.id = SWITCHDEV_OBJ_ID_RING_STATE_MRP, + .ring_state = state, + .ring_id = mrp->ring_id, + }; + int err; + + err = switchdev_port_obj_add(br->dev, &mrp_state.obj, NULL); + + if (err && err != -EOPNOTSUPP) + return err; + + return 0; +} + +int br_mrp_port_switchdev_set_state(struct net_bridge_port *p, + enum br_mrp_port_state_type state) +{ + struct switchdev_attr attr = { + .orig_dev = p->dev, + .id = SWITCHDEV_ATTR_ID_MRP_PORT_STATE, + .u.mrp_port_state = state, + }; + int err; + + err = switchdev_port_attr_set(p->dev, &attr); + if (err && err != -EOPNOTSUPP) + br_warn(p->br, "error setting offload MRP state on port %u(%s)\n", + (unsigned int)p->port_no, p->dev->name); + + return err; +} + +int br_mrp_port_switchdev_set_role(struct net_bridge_port *p, + enum br_mrp_port_role_type role) +{ + struct switchdev_attr attr = { + .orig_dev = p->dev, + .id = SWITCHDEV_ATTR_ID_MRP_PORT_ROLE, + .u.mrp_port_role = role, + }; + int err; + + err = switchdev_port_attr_set(p->dev, &attr); + if (err && err != -EOPNOTSUPP) + return err; + + return 0; +} diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 43dab4066f91..a774e19c41bb 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -151,6 +151,7 @@ static inline size_t br_port_info_size(void) + nla_total_size(sizeof(u8)) /* IFLA_BRPORT_MULTICAST_ROUTER */ #endif + nla_total_size(sizeof(u16)) /* IFLA_BRPORT_GROUP_FWD_MASK */ + + nla_total_size(sizeof(u8)) /* IFLA_BRPORT_MRP_RING_OPEN */ + 0; } @@ -213,6 +214,8 @@ static int br_port_fill_attrs(struct sk_buff *skb, nla_put_u16(skb, IFLA_BRPORT_GROUP_FWD_MASK, p->group_fwd_mask) || nla_put_u8(skb, IFLA_BRPORT_NEIGH_SUPPRESS, !!(p->flags & BR_NEIGH_SUPPRESS)) || + nla_put_u8(skb, IFLA_BRPORT_MRP_RING_OPEN, !!(p->flags & + BR_MRP_LOST_CONT)) || nla_put_u8(skb, IFLA_BRPORT_ISOLATED, !!(p->flags & BR_ISOLATED))) return -EMSGSIZE; @@ -669,6 +672,11 @@ static int br_afspec(struct 
net_bridge *br, if (err) return err; break; + case IFLA_BRIDGE_MRP: + err = br_mrp_parse(br, p, attr, cmd, extack); + if (err) + return err; + break; } } @@ -1101,7 +1109,9 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[], if (data[IFLA_BR_STP_STATE]) { u32 stp_enabled = nla_get_u32(data[IFLA_BR_STP_STATE]); - br_stp_set_enabled(br, stp_enabled); + err = br_stp_set_enabled(br, stp_enabled, extack); + if (err) + return err; } if (data[IFLA_BR_PRIORITY]) { diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 1f97703a52ff..c35647cb138a 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -428,6 +428,10 @@ struct net_bridge { int offload_fwd_mark; #endif struct hlist_head fdb_list; + +#if IS_ENABLED(CONFIG_BRIDGE_MRP) + struct list_head __rcu mrp_list; +#endif }; struct br_input_skb_cb { @@ -1279,7 +1283,8 @@ int br_set_ageing_time(struct net_bridge *br, clock_t ageing_time); /* br_stp_if.c */ void br_stp_enable_bridge(struct net_bridge *br); void br_stp_disable_bridge(struct net_bridge *br); -void br_stp_set_enabled(struct net_bridge *br, unsigned long val); +int br_stp_set_enabled(struct net_bridge *br, unsigned long val, + struct netlink_ext_ack *extack); void br_stp_enable_port(struct net_bridge_port *p); void br_stp_disable_port(struct net_bridge_port *p); bool br_stp_recalculate_bridge_id(struct net_bridge *br); @@ -1304,6 +1309,37 @@ unsigned long br_timer_value(const struct timer_list *timer); extern int (*br_fdb_test_addr_hook)(struct net_device *dev, unsigned char *addr); #endif +/* br_mrp.c */ +#if IS_ENABLED(CONFIG_BRIDGE_MRP) +int br_mrp_parse(struct net_bridge *br, struct net_bridge_port *p, + struct nlattr *attr, int cmd, struct netlink_ext_ack *extack); +int br_mrp_process(struct net_bridge_port *p, struct sk_buff *skb); +bool br_mrp_enabled(struct net_bridge *br); +void br_mrp_port_del(struct net_bridge *br, struct net_bridge_port *p); +#else +static inline int br_mrp_parse(struct net_bridge 
*br, struct net_bridge_port *p, + struct nlattr *attr, int cmd, + struct netlink_ext_ack *extack) +{ + return -EOPNOTSUPP; +} + +static inline int br_mrp_process(struct net_bridge_port *p, struct sk_buff *skb) +{ + return 0; +} + +static inline bool br_mrp_enabled(struct net_bridge *br) +{ + return 0; +} + +static inline void br_mrp_port_del(struct net_bridge *br, + struct net_bridge_port *p) +{ +} +#endif + /* br_netlink.c */ extern struct rtnl_link_ops br_link_ops; int br_netlink_init(void); diff --git a/net/bridge/br_private_mrp.h b/net/bridge/br_private_mrp.h new file mode 100644 index 000000000000..2921a4b59f8e --- /dev/null +++ b/net/bridge/br_private_mrp.h @@ -0,0 +1,63 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#ifndef _BR_PRIVATE_MRP_H_ +#define _BR_PRIVATE_MRP_H_ + +#include "br_private.h" +#include <uapi/linux/mrp_bridge.h> + +struct br_mrp { + /* list of mrp instances */ + struct list_head __rcu list; + + struct net_bridge_port __rcu *p_port; + struct net_bridge_port __rcu *s_port; + + u32 ring_id; + + enum br_mrp_ring_role_type ring_role; + u8 ring_role_offloaded; + enum br_mrp_ring_state_type ring_state; + u32 ring_transitions; + + struct delayed_work test_work; + u32 test_interval; + unsigned long test_end; + u32 test_count_miss; + u32 test_max_miss; + + u32 seq_id; + + struct rcu_head rcu; +}; + +/* br_mrp.c */ +int br_mrp_add(struct net_bridge *br, struct br_mrp_instance *instance); +int br_mrp_del(struct net_bridge *br, struct br_mrp_instance *instance); +int br_mrp_set_port_state(struct net_bridge_port *p, + enum br_mrp_port_state_type state); +int br_mrp_set_port_role(struct net_bridge_port *p, + struct br_mrp_port_role *role); +int br_mrp_set_ring_state(struct net_bridge *br, + struct br_mrp_ring_state *state); +int br_mrp_set_ring_role(struct net_bridge *br, struct br_mrp_ring_role *role); +int br_mrp_start_test(struct net_bridge *br, struct br_mrp_start_test *test); + +/* br_mrp_switchdev.c */ +int br_mrp_switchdev_add(struct 
net_bridge *br, struct br_mrp *mrp); +int br_mrp_switchdev_del(struct net_bridge *br, struct br_mrp *mrp); +int br_mrp_switchdev_set_ring_role(struct net_bridge *br, struct br_mrp *mrp, + enum br_mrp_ring_role_type role); +int br_mrp_switchdev_set_ring_state(struct net_bridge *br, struct br_mrp *mrp, + enum br_mrp_ring_state_type state); +int br_mrp_switchdev_send_ring_test(struct net_bridge *br, struct br_mrp *mrp, + u32 interval, u8 max_miss, u32 period); +int br_mrp_port_switchdev_set_state(struct net_bridge_port *p, + enum br_mrp_port_state_type state); +int br_mrp_port_switchdev_set_role(struct net_bridge_port *p, + enum br_mrp_port_role_type role); + +/* br_mrp_netlink.c */ +int br_mrp_port_open(struct net_device *dev, u8 loc); + +#endif /* _BR_PRIVATE_MRP_H */ diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c index 1f14b8455345..3e88be7aa269 100644 --- a/net/bridge/br_stp.c +++ b/net/bridge/br_stp.c @@ -36,6 +36,12 @@ void br_set_state(struct net_bridge_port *p, unsigned int state) }; int err; + /* Don't change the state of the ports if they are driven by a different + * protocol. 
+ */ + if (p->flags & BR_MRP_AWARE) + return; + p->state = state; err = switchdev_port_attr_set(p->dev, &attr); if (err && err != -EOPNOTSUPP) diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c index d174d3a566aa..a42850b7eb9a 100644 --- a/net/bridge/br_stp_if.c +++ b/net/bridge/br_stp_if.c @@ -196,10 +196,17 @@ static void br_stp_stop(struct net_bridge *br) br->stp_enabled = BR_NO_STP; } -void br_stp_set_enabled(struct net_bridge *br, unsigned long val) +int br_stp_set_enabled(struct net_bridge *br, unsigned long val, + struct netlink_ext_ack *extack) { ASSERT_RTNL(); + if (br_mrp_enabled(br)) { + NL_SET_ERR_MSG_MOD(extack, + "STP can't be enabled if MRP is already enabled\n"); + return -EINVAL; + } + if (val) { if (br->stp_enabled == BR_NO_STP) br_stp_start(br); @@ -207,6 +214,8 @@ void br_stp_set_enabled(struct net_bridge *br, unsigned long val) if (br->stp_enabled != BR_NO_STP) br_stp_stop(br); } + + return 0; } /* called under bridge lock */ diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c index 9ab0f00b1081..7db06e3f642a 100644 --- a/net/bridge/br_sysfs_br.c +++ b/net/bridge/br_sysfs_br.c @@ -126,9 +126,7 @@ static ssize_t stp_state_show(struct device *d, static int set_stp_state(struct net_bridge *br, unsigned long val) { - br_stp_set_enabled(br, val); - - return 0; + return br_stp_set_enabled(br, val, NULL); } static ssize_t stp_state_store(struct device *d, diff --git a/net/ceph/Kconfig b/net/ceph/Kconfig index 2e8e6f904920..d7bec7adc267 100644 --- a/net/ceph/Kconfig +++ b/net/ceph/Kconfig @@ -39,6 +39,6 @@ config CEPH_LIB_USE_DNS_RESOLVER be resolved using the CONFIG_DNS_RESOLVER facility. For information on how to use CONFIG_DNS_RESOLVER consult - Documentation/networking/dns_resolver.txt + Documentation/networking/dns_resolver.rst If unsure, say N. 
diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c index 1d653fbfcf52..e491b083b348 100644 --- a/net/core/gen_stats.c +++ b/net/core/gen_stats.c @@ -6,7 +6,7 @@ * Jamal Hadi Salim * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * - * See Documentation/networking/gen_stats.txt + * See Documentation/networking/gen_stats.rst */ #include <linux/types.h> diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 849380a622ef..15b366a1a958 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -69,10 +69,11 @@ module_param(carrier_timeout, uint, 0644); #define np_notice(np, fmt, ...) \ pr_notice("%s: " fmt, np->name, ##__VA_ARGS__) -static int netpoll_start_xmit(struct sk_buff *skb, struct net_device *dev, - struct netdev_queue *txq) +static netdev_tx_t netpoll_start_xmit(struct sk_buff *skb, + struct net_device *dev, + struct netdev_queue *txq) { - int status = NETDEV_TX_OK; + netdev_tx_t status = NETDEV_TX_OK; netdev_features_t features; features = netif_skb_features(skb); @@ -307,7 +308,7 @@ static int netpoll_owner_active(struct net_device *dev) void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, struct net_device *dev) { - int status = NETDEV_TX_BUSY; + netdev_tx_t status = NETDEV_TX_BUSY; unsigned long tries; /* It is up to the caller to keep npinfo alive. */ struct netpoll_info *npinfo; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index d6f4f4a9e8ba..2269199c5891 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3997,8 +3997,8 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, struct ndmsg *ndm; struct nlattr *tb[NDA_MAX+1]; struct net_device *dev; - int err = -EINVAL; __u8 *addr; + int err; u16 vid; if (!netlink_capable(skb, CAP_NET_ADMIN)) diff --git a/net/decnet/Kconfig b/net/decnet/Kconfig index 0935453ccfd5..8f98fb2f2ec9 100644 --- a/net/decnet/Kconfig +++ b/net/decnet/Kconfig @@ -15,7 +15,7 @@ config DECNET <http://linux-decnet.sourceforge.net/>. 
More detailed documentation is available in - <file:Documentation/networking/decnet.txt>. + <file:Documentation/networking/decnet.rst>. Be sure to say Y to "/proc file system support" and "Sysctl support" below when using DECnet, since you will need sysctl support to aid @@ -40,4 +40,4 @@ config DECNET_ROUTER filtering" option will be required for the forthcoming routing daemon to work. - See <file:Documentation/networking/decnet.txt> for more information. + See <file:Documentation/networking/decnet.rst> for more information. diff --git a/net/dns_resolver/Kconfig b/net/dns_resolver/Kconfig index 0a1c2238b4bd..255df9b6e9e8 100644 --- a/net/dns_resolver/Kconfig +++ b/net/dns_resolver/Kconfig @@ -19,7 +19,7 @@ config DNS_RESOLVER SMB2 later. DNS Resolver is supported by the userspace upcall helper "/sbin/dns.resolver" via /etc/request-key.conf. - See <file:Documentation/networking/dns_resolver.txt> for further + See <file:Documentation/networking/dns_resolver.rst> for further information. To compile this as a module, choose M here: the module will be called diff --git a/net/dns_resolver/dns_key.c b/net/dns_resolver/dns_key.c index ad53eb31d40f..3aced951d5ab 100644 --- a/net/dns_resolver/dns_key.c +++ b/net/dns_resolver/dns_key.c @@ -1,6 +1,6 @@ /* Key type used to cache DNS lookups made by the kernel * - * See Documentation/networking/dns_resolver.txt + * See Documentation/networking/dns_resolver.rst * * Copyright (c) 2007 Igor Mammedov * Author(s): Igor Mammedov (niallain@gmail.com) diff --git a/net/dns_resolver/dns_query.c b/net/dns_resolver/dns_query.c index cab4e0df924f..82b084cc1cc6 100644 --- a/net/dns_resolver/dns_query.c +++ b/net/dns_resolver/dns_query.c @@ -1,7 +1,7 @@ /* Upcall routine, designed to work as a key type and working through * /sbin/request-key to contact userspace when handling DNS queries. 
* - * See Documentation/networking/dns_resolver.txt + * See Documentation/networking/dns_resolver.rst * * Copyright (c) 2007 Igor Mammedov * Author(s): Igor Mammedov (niallain@gmail.com) diff --git a/net/hsr/hsr_main.h b/net/hsr/hsr_main.h index 7321cf8d6d2c..f74193465bf5 100644 --- a/net/hsr/hsr_main.h +++ b/net/hsr/hsr_main.h @@ -62,15 +62,6 @@ struct hsr_tag { * with the path field in-between, which seems strange. I'm guessing the MAC * address definition is in error. */ -static inline u16 get_hsr_tag_path(struct hsr_tag *ht) -{ - return ntohs(ht->path_and_LSDU_size) >> 12; -} - -static inline u16 get_hsr_tag_LSDU_size(struct hsr_tag *ht) -{ - return ntohs(ht->path_and_LSDU_size) & 0x0FFF; -} static inline void set_hsr_tag_path(struct hsr_tag *ht, u16 path) { @@ -103,16 +94,6 @@ struct hsr_sup_payload { unsigned char macaddress_A[ETH_ALEN]; } __packed; -static inline u16 get_hsr_stag_path(struct hsr_sup_tag *hst) -{ - return get_hsr_tag_path((struct hsr_tag *)hst); -} - -static inline u16 get_hsr_stag_HSR_ver(struct hsr_sup_tag *hst) -{ - return get_hsr_tag_LSDU_size((struct hsr_tag *)hst); -} - static inline void set_hsr_stag_path(struct hsr_sup_tag *hst, u16 path) { set_hsr_tag_path((struct hsr_tag *)hst, path); diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 25a8888826b8..5da4733067fb 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -49,7 +49,7 @@ config IP_ADVANCED_ROUTER Note that some distributions enable it in startup scripts. For details about rp_filter strict and loose mode read - <file:Documentation/networking/ip-sysctl.txt>. + <file:Documentation/networking/ip-sysctl.rst>. If unsure, say N here. 
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index c618e242490f..6177c4ba0037 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1835,6 +1835,7 @@ static __net_init int inet_init_net(struct net *net) net->ipv4.sysctl_ip_early_demux = 1; net->ipv4.sysctl_udp_early_demux = 1; net->ipv4.sysctl_tcp_early_demux = 1; + net->ipv4.sysctl_nexthop_compat_mode = 1; #ifdef CONFIG_SYSCTL net->ipv4.sysctl_ip_prot_sock = PROT_SOCK; #endif diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 55ca2e521828..e53871e4a097 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -1780,6 +1780,8 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, goto nla_put_failure; if (nexthop_is_blackhole(fi->nh)) rtm->rtm_type = RTN_BLACKHOLE; + if (!fi->fib_net->ipv4.sysctl_nexthop_compat_mode) + goto offload; } if (nhs == 1) { @@ -1805,6 +1807,7 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, goto nla_put_failure; } +offload: if (fri->offload) rtm->rtm_flags |= RTM_F_OFFLOAD; if (fri->trap) diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index fc61f51d87a3..956a806649f7 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -853,7 +853,7 @@ static bool icmp_unreach(struct sk_buff *skb) case ICMP_FRAG_NEEDED: /* for documentation of the ip_no_pmtu_disc * values please see - * Documentation/networking/ip-sysctl.txt + * Documentation/networking/ip-sysctl.rst */ switch (net->ipv4.sysctl_ip_no_pmtu_disc) { default: diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index fdfca534d094..3957364d556c 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -784,7 +784,8 @@ static void __remove_nexthop_fib(struct net *net, struct nexthop *nh) list_for_each_entry_safe(f6i, tmp, &nh->f6i_list, nh_list) { /* __ip6_del_rt does a release, so do a hold here */ fib6_info_hold(f6i); - ipv6_stub->ip6_del_rt(net, f6i); + ipv6_stub->ip6_del_rt(net, f6i, + !net->ipv4.sysctl_nexthop_compat_mode); } } @@ -1041,7 
+1042,7 @@ out: if (!rc) { nh_base_seq_inc(net); nexthop_notify(RTM_NEWNEXTHOP, new_nh, &cfg->nlinfo); - if (replace_notify) + if (replace_notify && net->ipv4.sysctl_nexthop_compat_mode) nexthop_replace_notify(net, new_nh, &cfg->nlinfo); } diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 81b267e990a1..95ad71e76cc3 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -711,6 +711,15 @@ static struct ctl_table ipv4_net_table[] = { .proc_handler = proc_tcp_early_demux }, { + .procname = "nexthop_compat_mode", + .data = &init_net.ipv4.sysctl_nexthop_compat_mode, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { .procname = "ip_default_ttl", .data = &init_net.ipv4.sysctl_ip_default_ttl, .maxlen = sizeof(int), diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig index 2ccaee98fddb..5a6111da26c4 100644 --- a/net/ipv6/Kconfig +++ b/net/ipv6/Kconfig @@ -13,7 +13,7 @@ menuconfig IPV6 For general information about IPv6, see <https://en.wikipedia.org/wiki/IPv6>. 
For specific information about IPv6 under Linux, see - Documentation/networking/ipv6.txt and read the HOWTO at + Documentation/networking/ipv6.rst and read the HOWTO at <http://www.tldp.org/HOWTO/Linux+IPv6-HOWTO/> To compile this protocol support as a module, choose M here: the diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 27b4fb6e452b..2c4f20ec1e2a 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1238,7 +1238,7 @@ cleanup_prefix_route(struct inet6_ifaddr *ifp, unsigned long expires, ifp->idev->dev, 0, RTF_DEFAULT, true); if (f6i) { if (del_rt) - ip6_del_rt(dev_net(ifp->idev->dev), f6i); + ip6_del_rt(dev_net(ifp->idev->dev), f6i, false); else { if (!(f6i->fib6_flags & RTF_EXPIRES)) fib6_set_expires(f6i, expires); @@ -2718,7 +2718,7 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao) if (rt) { /* Autoconf prefix route */ if (valid_lft == 0) { - ip6_del_rt(net, rt); + ip6_del_rt(net, rt, false); rt = NULL; } else if (addrconf_finite_timeout(rt_expires)) { /* not infinity */ @@ -3813,7 +3813,7 @@ restart: spin_unlock_bh(&ifa->lock); if (rt) - ip6_del_rt(net, rt); + ip6_del_rt(net, rt, false); if (state != INET6_IFADDR_STATE_DEAD) { __ipv6_ifa_notify(RTM_DELADDR, ifa); @@ -4652,7 +4652,7 @@ static int modify_prefix_route(struct inet6_ifaddr *ifp, prio = ifp->rt_priority ? : IP6_RT_PRIO_ADDRCONF; if (f6i->fib6_metric != prio) { /* delete old one */ - ip6_del_rt(dev_net(ifp->idev->dev), f6i); + ip6_del_rt(dev_net(ifp->idev->dev), f6i, false); /* add new one */ addrconf_prefix_route(modify_peer ? 
&ifp->peer_addr : &ifp->addr, @@ -6073,10 +6073,10 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) ifp->idev->dev, 0, 0, false); if (rt) - ip6_del_rt(net, rt); + ip6_del_rt(net, rt, false); } if (ifp->rt) { - ip6_del_rt(net, ifp->rt); + ip6_del_rt(net, ifp->rt, false); ifp->rt = NULL; } rt_genid_bump_ipv6(net); diff --git a/net/ipv6/addrconf_core.c b/net/ipv6/addrconf_core.c index ea00ce3d4117..9ebf3fe0d2b1 100644 --- a/net/ipv6/addrconf_core.c +++ b/net/ipv6/addrconf_core.c @@ -185,7 +185,8 @@ static int eafnosupport_fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, return -EAFNOSUPPORT; } -static int eafnosupport_ip6_del_rt(struct net *net, struct fib6_info *rt) +static int eafnosupport_ip6_del_rt(struct net *net, struct fib6_info *rt, + bool skip_notify) { return -EAFNOSUPPORT; } diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c index fed91ab7ec46..893261230ffc 100644 --- a/net/ipv6/anycast.c +++ b/net/ipv6/anycast.c @@ -364,7 +364,7 @@ int __ipv6_dev_ac_dec(struct inet6_dev *idev, const struct in6_addr *addr) ipv6_del_acaddr_hash(aca); addrconf_leave_solict(idev, &aca->aca_addr); - ip6_del_rt(dev_net(idev->dev), aca->aca_rt); + ip6_del_rt(dev_net(idev->dev), aca->aca_rt, false); aca_put(aca); return 0; @@ -393,7 +393,7 @@ void ipv6_ac_destroy_dev(struct inet6_dev *idev) addrconf_leave_solict(idev, &aca->aca_addr); - ip6_del_rt(dev_net(idev->dev), aca->aca_rt); + ip6_del_rt(dev_net(idev->dev), aca->aca_rt, false); aca_put(aca); diff --git a/net/ipv6/ila/ila.h b/net/ipv6/ila/ila.h index bb6fc0d54dae..ad5f6f6ba333 100644 --- a/net/ipv6/ila/ila.h +++ b/net/ipv6/ila/ila.h @@ -68,11 +68,6 @@ static inline struct ila_addr *ila_a2i(struct in6_addr *addr) return (struct ila_addr *)addr; } -static inline bool ila_addr_is_ila(struct ila_addr *iaddr) -{ - return (iaddr->ident.type != ILA_ATYPE_IID); -} - struct ila_params { struct ila_locator locator; struct ila_locator locator_match; diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 
1ecd4e9b0bdf..2d09c4da03ee 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1302,7 +1302,7 @@ static void ndisc_router_discovery(struct sk_buff *skb) } } if (rt && lifetime == 0) { - ip6_del_rt(net, rt); + ip6_del_rt(net, rt, false); rt = NULL; } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 310cbddaa533..803212aae4ca 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -984,7 +984,7 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, gwaddr, dev); if (rt && !lifetime) { - ip6_del_rt(net, rt); + ip6_del_rt(net, rt, false); rt = NULL; } @@ -3729,9 +3729,12 @@ out: return err; } -int ip6_del_rt(struct net *net, struct fib6_info *rt) +int ip6_del_rt(struct net *net, struct fib6_info *rt, bool skip_notify) { - struct nl_info info = { .nl_net = net }; + struct nl_info info = { + .nl_net = net, + .skip_notify = skip_notify + }; return __ip6_del_rt(rt, &info); } @@ -4252,7 +4255,7 @@ restart: (!idev || idev->cnf.accept_ra != 2) && fib6_info_hold_safe(rt)) { rcu_read_unlock(); - ip6_del_rt(net, rt); + ip6_del_rt(net, rt, false); goto restart; } } @@ -5554,7 +5557,8 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, if (nexthop_is_blackhole(rt->nh)) rtm->rtm_type = RTN_BLACKHOLE; - if (rt6_fill_node_nexthop(skb, rt->nh, &nh_flags) < 0) + if (net->ipv4.sysctl_nexthop_compat_mode && + rt6_fill_node_nexthop(skb, rt->nh, &nh_flags) < 0) goto nla_put_failure; rtm->rtm_flags |= nh_flags; diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 6fd44bdb0fc3..e859e3f420d9 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -338,36 +338,53 @@ static void smc_copy_sock_settings_to_smc(struct smc_sock *smc) } /* register a new rmb, send confirm_rkey msg to register with peer */ -static int smc_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc, - bool conf_rkey) +static int smcr_link_reg_rmb(struct smc_link *link, + struct smc_buf_desc *rmb_desc, bool conf_rkey) { - if (!rmb_desc->wr_reg) { + if 
(!rmb_desc->is_reg_mr[link->link_idx]) { /* register memory region for new rmb */ - if (smc_wr_reg_send(link, rmb_desc->mr_rx[SMC_SINGLE_LINK])) { - rmb_desc->regerr = 1; + if (smc_wr_reg_send(link, rmb_desc->mr_rx[link->link_idx])) { + rmb_desc->is_reg_err = true; return -EFAULT; } - rmb_desc->wr_reg = 1; + rmb_desc->is_reg_mr[link->link_idx] = true; } if (!conf_rkey) return 0; + /* exchange confirm_rkey msg with peer */ - if (smc_llc_do_confirm_rkey(link, rmb_desc)) { - rmb_desc->regerr = 1; - return -EFAULT; + if (!rmb_desc->is_conf_rkey) { + if (smc_llc_do_confirm_rkey(link, rmb_desc)) { + rmb_desc->is_reg_err = true; + return -EFAULT; + } + rmb_desc->is_conf_rkey = true; } return 0; } -static int smc_clnt_conf_first_link(struct smc_sock *smc) +/* register the new rmb on all links */ +static int smcr_lgr_reg_rmbs(struct smc_link_group *lgr, + struct smc_buf_desc *rmb_desc) { - struct net *net = sock_net(smc->clcsock->sk); - struct smc_link_group *lgr = smc->conn.lgr; - struct smc_link *link; + int i, rc; + + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (lgr->lnk[i].state != SMC_LNK_ACTIVE) + continue; + rc = smcr_link_reg_rmb(&lgr->lnk[i], rmb_desc, true); + if (rc) + return rc; + } + return 0; +} + +static int smcr_clnt_conf_first_link(struct smc_sock *smc) +{ + struct smc_link *link = smc->conn.lnk; int rest; int rc; - link = &lgr->lnk[SMC_SINGLE_LINK]; /* receive CONFIRM LINK request from server over RoCE fabric */ rest = wait_for_completion_interruptible_timeout( &link->llc_confirm, @@ -389,7 +406,7 @@ static int smc_clnt_conf_first_link(struct smc_sock *smc) smc_wr_remember_qp_attr(link); - if (smc_reg_rmb(link, smc->conn.rmb_desc, false)) + if (smcr_link_reg_rmb(link, smc->conn.rmb_desc, false)) return SMC_CLC_DECL_ERR_REGRMB; /* send CONFIRM LINK response over RoCE fabric */ @@ -415,7 +432,7 @@ static int smc_clnt_conf_first_link(struct smc_sock *smc) if (rc < 0) return SMC_CLC_DECL_TIMEOUT_AL; - smc_llc_link_active(link, 
net->ipv4.sysctl_tcp_keepalive_time); + smc_llc_link_active(link); return 0; } @@ -610,7 +627,7 @@ static int smc_connect_rdma(struct smc_sock *smc, mutex_unlock(&smc_client_lgr_pending); return reason_code; } - link = &smc->conn.lgr->lnk[SMC_SINGLE_LINK]; + link = smc->conn.lnk; smc_conn_save_peer_info(smc, aclc); @@ -622,7 +639,7 @@ static int smc_connect_rdma(struct smc_sock *smc, if (ini->cln_first_contact == SMC_FIRST_CONTACT) smc_link_save_peer_info(link, aclc); - if (smc_rmb_rtoken_handling(&smc->conn, aclc)) + if (smc_rmb_rtoken_handling(&smc->conn, link, aclc)) return smc_connect_abort(smc, SMC_CLC_DECL_ERR_RTOK, ini->cln_first_contact); @@ -634,7 +651,7 @@ static int smc_connect_rdma(struct smc_sock *smc, return smc_connect_abort(smc, SMC_CLC_DECL_ERR_RDYLNK, ini->cln_first_contact); } else { - if (smc_reg_rmb(link, smc->conn.rmb_desc, true)) + if (smcr_lgr_reg_rmbs(smc->conn.lgr, smc->conn.rmb_desc)) return smc_connect_abort(smc, SMC_CLC_DECL_ERR_REGRMB, ini->cln_first_contact); } @@ -649,7 +666,7 @@ static int smc_connect_rdma(struct smc_sock *smc, if (ini->cln_first_contact == SMC_FIRST_CONTACT) { /* QP confirmation over RoCE fabric */ - reason_code = smc_clnt_conf_first_link(smc); + reason_code = smcr_clnt_conf_first_link(smc); if (reason_code) return smc_connect_abort(smc, reason_code, ini->cln_first_contact); @@ -999,17 +1016,13 @@ void smc_close_non_accepted(struct sock *sk) sock_put(sk); /* final sock_put */ } -static int smc_serv_conf_first_link(struct smc_sock *smc) +static int smcr_serv_conf_first_link(struct smc_sock *smc) { - struct net *net = sock_net(smc->clcsock->sk); - struct smc_link_group *lgr = smc->conn.lgr; - struct smc_link *link; + struct smc_link *link = smc->conn.lnk; int rest; int rc; - link = &lgr->lnk[SMC_SINGLE_LINK]; - - if (smc_reg_rmb(link, smc->conn.rmb_desc, false)) + if (smcr_link_reg_rmb(link, smc->conn.rmb_desc, false)) return SMC_CLC_DECL_ERR_REGRMB; /* send CONFIRM LINK request to client over the RoCE fabric */ @@ 
-1050,7 +1063,7 @@ static int smc_serv_conf_first_link(struct smc_sock *smc) return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_AL : rc; } - smc_llc_link_active(link, net->ipv4.sysctl_tcp_keepalive_time); + smc_llc_link_active(link); return 0; } @@ -1194,10 +1207,10 @@ static int smc_listen_ism_init(struct smc_sock *new_smc, /* listen worker: register buffers */ static int smc_listen_rdma_reg(struct smc_sock *new_smc, int local_contact) { - struct smc_link *link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK]; + struct smc_connection *conn = &new_smc->conn; if (local_contact != SMC_FIRST_CONTACT) { - if (smc_reg_rmb(link, new_smc->conn.rmb_desc, true)) + if (smcr_lgr_reg_rmbs(conn->lgr, conn->rmb_desc)) return SMC_CLC_DECL_ERR_REGRMB; } smc_rmb_sync_sg_for_device(&new_smc->conn); @@ -1210,13 +1223,13 @@ static int smc_listen_rdma_finish(struct smc_sock *new_smc, struct smc_clc_msg_accept_confirm *cclc, int local_contact) { - struct smc_link *link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK]; + struct smc_link *link = new_smc->conn.lnk; int reason_code = 0; if (local_contact == SMC_FIRST_CONTACT) smc_link_save_peer_info(link, cclc); - if (smc_rmb_rtoken_handling(&new_smc->conn, cclc)) { + if (smc_rmb_rtoken_handling(&new_smc->conn, link, cclc)) { reason_code = SMC_CLC_DECL_ERR_RTOK; goto decline; } @@ -1227,7 +1240,7 @@ static int smc_listen_rdma_finish(struct smc_sock *new_smc, goto decline; } /* QP confirmation over RoCE fabric */ - reason_code = smc_serv_conf_first_link(new_smc); + reason_code = smcr_serv_conf_first_link(new_smc); if (reason_code) goto decline; } diff --git a/net/smc/smc.h b/net/smc/smc.h index be11ba41190f..1a084afa7372 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -121,6 +121,7 @@ enum smc_urg_state { struct smc_connection { struct rb_node alert_node; struct smc_link_group *lgr; /* link group of connection */ + struct smc_link *lnk; /* assigned SMC-R link */ u32 alert_token_local; /* unique conn. 
id */ u8 peer_rmbe_idx; /* from tcp handshake */ int peer_rmbe_size; /* size of peer rx buffer */ diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index 164f1584861b..f64589d823aa 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -57,7 +57,7 @@ int smc_cdc_get_free_slot(struct smc_connection *conn, struct smc_rdma_wr **wr_rdma_buf, struct smc_cdc_tx_pend **pend) { - struct smc_link *link = &conn->lgr->lnk[SMC_SINGLE_LINK]; + struct smc_link *link = conn->lnk; int rc; rc = smc_wr_tx_get_free_slot(link, smc_cdc_tx_handler, wr_buf, @@ -91,12 +91,10 @@ int smc_cdc_msg_send(struct smc_connection *conn, struct smc_wr_buf *wr_buf, struct smc_cdc_tx_pend *pend) { + struct smc_link *link = conn->lnk; union smc_host_cursor cfed; - struct smc_link *link; int rc; - link = &conn->lgr->lnk[SMC_SINGLE_LINK]; - smc_cdc_add_pending_send(conn, pend); conn->tx_cdc_seq++; @@ -165,7 +163,7 @@ static void smc_cdc_tx_dismisser(struct smc_wr_tx_pend_priv *tx_pend) void smc_cdc_tx_dismiss_slots(struct smc_connection *conn) { - struct smc_link *link = &conn->lgr->lnk[SMC_SINGLE_LINK]; + struct smc_link *link = conn->lnk; smc_wr_tx_dismiss_slots(link, SMC_CDC_MSG_TYPE, smc_cdc_tx_filter, smc_cdc_tx_dismisser, diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index ea0068f0173c..d5627df24215 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -496,7 +496,7 @@ int smc_clc_send_confirm(struct smc_sock *smc) sizeof(SMCD_EYECATCHER)); } else { /* SMC-R specific settings */ - link = &conn->lgr->lnk[SMC_SINGLE_LINK]; + link = conn->lnk; memcpy(cclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); cclc.hdr.path = SMC_TYPE_R; @@ -508,13 +508,13 @@ int smc_clc_send_confirm(struct smc_sock *smc) ETH_ALEN); hton24(cclc.qpn, link->roce_qp->qp_num); cclc.rmb_rkey = - htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey); + htonl(conn->rmb_desc->mr_rx[link->link_idx]->rkey); cclc.rmbe_idx = 1; /* for now: 1 RMB = 1 RMBE */ cclc.rmbe_alert_token = htonl(conn->alert_token_local); 
cclc.qp_mtu = min(link->path_mtu, link->peer_mtu); cclc.rmbe_size = conn->rmbe_size_short; cclc.rmb_dma_addr = cpu_to_be64((u64)sg_dma_address - (conn->rmb_desc->sgt[SMC_SINGLE_LINK].sgl)); + (conn->rmb_desc->sgt[link->link_idx].sgl)); hton24(cclc.psn, link->psn_initial); memcpy(cclc.smcr_trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); @@ -572,7 +572,7 @@ int smc_clc_send_accept(struct smc_sock *new_smc, int srv_first_contact) memcpy(aclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); aclc.hdr.path = SMC_TYPE_R; - link = &conn->lgr->lnk[SMC_SINGLE_LINK]; + link = conn->lnk; memcpy(aclc.lcl.id_for_peer, local_systemid, sizeof(local_systemid)); memcpy(&aclc.lcl.gid, link->gid, SMC_GID_SIZE); @@ -580,13 +580,13 @@ int smc_clc_send_accept(struct smc_sock *new_smc, int srv_first_contact) ETH_ALEN); hton24(aclc.qpn, link->roce_qp->qp_num); aclc.rmb_rkey = - htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey); + htonl(conn->rmb_desc->mr_rx[link->link_idx]->rkey); aclc.rmbe_idx = 1; /* as long as 1 RMB = 1 RMBE */ aclc.rmbe_alert_token = htonl(conn->alert_token_local); aclc.qp_mtu = link->path_mtu; aclc.rmbe_size = conn->rmbe_size_short, aclc.rmb_dma_addr = cpu_to_be64((u64)sg_dma_address - (conn->rmb_desc->sgt[SMC_SINGLE_LINK].sgl)); + (conn->rmb_desc->sgt[link->link_idx].sgl)); hton24(aclc.psn, link->psn_initial); memcpy(aclc.smcr_trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h index ca209272e5fa..4f2e150a2be1 100644 --- a/net/smc/smc_clc.h +++ b/net/smc/smc_clc.h @@ -44,6 +44,7 @@ #define SMC_CLC_DECL_DIFFPREFIX 0x03070000 /* IP prefix / subnet mismatch */ #define SMC_CLC_DECL_GETVLANERR 0x03080000 /* err to get vlan id of ip device*/ #define SMC_CLC_DECL_ISMVLANERR 0x03090000 /* err to reg vlan id on ism dev */ +#define SMC_CLC_DECL_NOACTLINK 0x030a0000 /* no active smc-r link in lgr */ #define SMC_CLC_DECL_SYNCERR 0x04000000 /* synchronization error */ #define SMC_CLC_DECL_PEERDECL 
0x05000000 /* peer declined during handshake */ #define SMC_CLC_DECL_INTERR 0x09990000 /* internal error */ diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 824c5211b027..db49f8cd5c95 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -116,7 +116,7 @@ static void smc_lgr_add_alert_token(struct smc_connection *conn) * Requires @conns_lock * Note that '0' is a reserved value and not assigned. */ -static void smc_lgr_register_conn(struct smc_connection *conn) +static int smc_lgr_register_conn(struct smc_connection *conn) { struct smc_sock *smc = container_of(conn, struct smc_sock, conn); static atomic_t nexttoken = ATOMIC_INIT(0); @@ -131,7 +131,24 @@ static void smc_lgr_register_conn(struct smc_connection *conn) conn->alert_token_local = 0; } smc_lgr_add_alert_token(conn); + + /* assign the new connection to a link */ + if (!conn->lgr->is_smcd) { + struct smc_link *lnk; + int i; + + /* tbd - link balancing */ + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + lnk = &conn->lgr->lnk[i]; + if (lnk->state == SMC_LNK_ACTIVATING || + lnk->state == SMC_LNK_ACTIVE) + conn->lnk = lnk; + } + if (!conn->lnk) + return SMC_CLC_DECL_NOACTLINK; + } conn->lgr->conns_num++; + return 0; } /* Unregister connection and reset the alert token of the given connection< @@ -179,7 +196,7 @@ void smc_lgr_cleanup_early(struct smc_connection *conn) * of the DELETE LINK sequence from server; or as server to * initiate the delete processing. See smc_llc_rx_delete_link(). 
*/ -static int smc_link_send_delete(struct smc_link *lnk, bool orderly) +static int smcr_link_send_delete(struct smc_link *lnk, bool orderly) { if (lnk->state == SMC_LNK_ACTIVE && !smc_llc_send_delete_link(lnk, SMC_LLC_REQ, orderly)) { @@ -197,8 +214,8 @@ static void smc_lgr_free_work(struct work_struct *work) struct smc_link_group, free_work); spinlock_t *lgr_lock; - struct smc_link *lnk; bool conns; + int i; smc_lgr_list_head(lgr, &lgr_lock); spin_lock_bh(lgr_lock); @@ -215,25 +232,38 @@ static void smc_lgr_free_work(struct work_struct *work) } list_del_init(&lgr->list); /* remove from smc_lgr_list */ - lnk = &lgr->lnk[SMC_SINGLE_LINK]; if (!lgr->is_smcd && !lgr->terminating) { - /* try to send del link msg, on error free lgr immediately */ - if (lnk->state == SMC_LNK_ACTIVE && - !smc_link_send_delete(lnk, true)) { - /* reschedule in case we never receive a response */ - smc_lgr_schedule_free_work(lgr); + bool do_wait = false; + + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + struct smc_link *lnk = &lgr->lnk[i]; + /* try to send del link msg, on err free immediately */ + if (lnk->state == SMC_LNK_ACTIVE && + !smcr_link_send_delete(lnk, true)) { + /* reschedule in case we never receive a resp */ + smc_lgr_schedule_free_work(lgr); + do_wait = true; + } + } + if (do_wait) { spin_unlock_bh(lgr_lock); - return; + return; /* wait for resp, see smc_llc_rx_delete_link */ } } lgr->freeing = 1; /* this instance does the freeing, no new schedule */ spin_unlock_bh(lgr_lock); cancel_delayed_work(&lgr->free_work); - if (!lgr->is_smcd && lnk->state != SMC_LNK_INACTIVE) - smc_llc_link_inactive(lnk); if (lgr->is_smcd && !lgr->terminating) smc_ism_signal_shutdown(lgr); + if (!lgr->is_smcd) { + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + struct smc_link *lnk = &lgr->lnk[i]; + + if (smc_link_usable(lnk)) + lnk->state = SMC_LNK_INACTIVE; + } + } smc_lgr_free(lgr); } @@ -245,6 +275,87 @@ static void smc_lgr_terminate_work(struct work_struct *work) __smc_lgr_terminate(lgr, true); 
} +/* return next unique link id for the lgr */ +static u8 smcr_next_link_id(struct smc_link_group *lgr) +{ + u8 link_id; + int i; + + while (1) { + link_id = ++lgr->next_link_id; + if (!link_id) /* skip zero as link_id */ + link_id = ++lgr->next_link_id; + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (smc_link_usable(&lgr->lnk[i]) && + lgr->lnk[i].link_id == link_id) + continue; + } + break; + } + return link_id; +} + +static int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, + u8 link_idx, struct smc_init_info *ini) +{ + u8 rndvec[3]; + int rc; + + get_device(&ini->ib_dev->ibdev->dev); + atomic_inc(&ini->ib_dev->lnk_cnt); + lnk->state = SMC_LNK_ACTIVATING; + lnk->link_id = smcr_next_link_id(lgr); + lnk->lgr = lgr; + lnk->link_idx = link_idx; + lnk->smcibdev = ini->ib_dev; + lnk->ibport = ini->ib_port; + lnk->path_mtu = ini->ib_dev->pattr[ini->ib_port - 1].active_mtu; + if (!ini->ib_dev->initialized) { + rc = (int)smc_ib_setup_per_ibdev(ini->ib_dev); + if (rc) + goto out; + } + get_random_bytes(rndvec, sizeof(rndvec)); + lnk->psn_initial = rndvec[0] + (rndvec[1] << 8) + + (rndvec[2] << 16); + rc = smc_ib_determine_gid(lnk->smcibdev, lnk->ibport, + ini->vlan_id, lnk->gid, &lnk->sgid_index); + if (rc) + goto out; + rc = smc_llc_link_init(lnk); + if (rc) + goto out; + rc = smc_wr_alloc_link_mem(lnk); + if (rc) + goto clear_llc_lnk; + rc = smc_ib_create_protection_domain(lnk); + if (rc) + goto free_link_mem; + rc = smc_ib_create_queue_pair(lnk); + if (rc) + goto dealloc_pd; + rc = smc_wr_create_link(lnk); + if (rc) + goto destroy_qp; + return 0; + +destroy_qp: + smc_ib_destroy_queue_pair(lnk); +dealloc_pd: + smc_ib_dealloc_protection_domain(lnk); +free_link_mem: + smc_wr_free_link_mem(lnk); +clear_llc_lnk: + smc_llc_link_clear(lnk); +out: + put_device(&ini->ib_dev->ibdev->dev); + memset(lnk, 0, sizeof(struct smc_link)); + lnk->state = SMC_LNK_UNUSED; + if (!atomic_dec_return(&ini->ib_dev->lnk_cnt)) + wake_up(&ini->ib_dev->lnks_deleted); + 
return rc; +} + /* create a new SMC link group */ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) { @@ -252,7 +363,7 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) struct list_head *lgr_list; struct smc_link *lnk; spinlock_t *lgr_lock; - u8 rndvec[3]; + u8 link_idx; int rc = 0; int i; @@ -274,13 +385,14 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) lgr->freefast = 0; lgr->freeing = 0; lgr->vlan_id = ini->vlan_id; - rwlock_init(&lgr->sndbufs_lock); - rwlock_init(&lgr->rmbs_lock); + mutex_init(&lgr->sndbufs_lock); + mutex_init(&lgr->rmbs_lock); rwlock_init(&lgr->conns_lock); for (i = 0; i < SMC_RMBE_SIZES; i++) { INIT_LIST_HEAD(&lgr->sndbufs[i]); INIT_LIST_HEAD(&lgr->rmbs[i]); } + lgr->next_link_id = 0; smc_lgr_list.num += SMC_LGR_NUM_INCR; memcpy(&lgr->id, (u8 *)&smc_lgr_list.num, SMC_LGR_ID_SIZE); INIT_DELAYED_WORK(&lgr->free_work, smc_lgr_free_work); @@ -297,48 +409,19 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) atomic_inc(&ini->ism_dev->lgr_cnt); } else { /* SMC-R specific settings */ - get_device(&ini->ib_dev->ibdev->dev); lgr->role = smc->listen_smc ? 
SMC_SERV : SMC_CLNT; memcpy(lgr->peer_systemid, ini->ib_lcl->id_for_peer, SMC_SYSTEMID_LEN); + smc_llc_lgr_init(lgr, smc); - lnk = &lgr->lnk[SMC_SINGLE_LINK]; - /* initialize link */ - lnk->state = SMC_LNK_ACTIVATING; - lnk->link_id = SMC_SINGLE_LINK; - lnk->smcibdev = ini->ib_dev; - lnk->ibport = ini->ib_port; - lgr_list = &smc_lgr_list.list; - lgr_lock = &smc_lgr_list.lock; - lnk->path_mtu = - ini->ib_dev->pattr[ini->ib_port - 1].active_mtu; - if (!ini->ib_dev->initialized) - smc_ib_setup_per_ibdev(ini->ib_dev); - get_random_bytes(rndvec, sizeof(rndvec)); - lnk->psn_initial = rndvec[0] + (rndvec[1] << 8) + - (rndvec[2] << 16); - rc = smc_ib_determine_gid(lnk->smcibdev, lnk->ibport, - ini->vlan_id, lnk->gid, - &lnk->sgid_index); - if (rc) - goto free_lgr; - rc = smc_llc_link_init(lnk); + link_idx = SMC_SINGLE_LINK; + lnk = &lgr->lnk[link_idx]; + rc = smcr_link_init(lgr, lnk, link_idx, ini); if (rc) goto free_lgr; - rc = smc_wr_alloc_link_mem(lnk); - if (rc) - goto clear_llc_lnk; - rc = smc_ib_create_protection_domain(lnk); - if (rc) - goto free_link_mem; - rc = smc_ib_create_queue_pair(lnk); - if (rc) - goto dealloc_pd; - rc = smc_wr_create_link(lnk); - if (rc) - goto destroy_qp; + lgr_list = &smc_lgr_list.list; + lgr_lock = &smc_lgr_list.lock; atomic_inc(&lgr_cnt); - atomic_inc(&ini->ib_dev->lnk_cnt); } smc->conn.lgr = lgr; spin_lock_bh(lgr_lock); @@ -346,14 +429,6 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) spin_unlock_bh(lgr_lock); return 0; -destroy_qp: - smc_ib_destroy_queue_pair(lnk); -dealloc_pd: - smc_ib_dealloc_protection_domain(lnk); -free_link_mem: - smc_wr_free_link_mem(lnk); -clear_llc_lnk: - smc_llc_link_clear(lnk); free_lgr: kfree(lgr); ism_put_vlan: @@ -369,29 +444,37 @@ out: return rc; } +static void smcr_buf_unuse(struct smc_buf_desc *rmb_desc, + struct smc_link *lnk) +{ + struct smc_link_group *lgr = lnk->lgr; + + if (rmb_desc->is_conf_rkey && !list_empty(&lgr->list)) { + /* unregister rmb with peer */ + 
smc_llc_do_delete_rkey(lnk, rmb_desc); + rmb_desc->is_conf_rkey = false; + } + if (rmb_desc->is_reg_err) { + /* buf registration failed, reuse not possible */ + mutex_lock(&lgr->rmbs_lock); + list_del(&rmb_desc->list); + mutex_unlock(&lgr->rmbs_lock); + + smc_buf_free(lgr, true, rmb_desc); + } else { + rmb_desc->used = 0; + } +} + static void smc_buf_unuse(struct smc_connection *conn, struct smc_link_group *lgr) { if (conn->sndbuf_desc) conn->sndbuf_desc->used = 0; - if (conn->rmb_desc) { - if (!conn->rmb_desc->regerr) { - if (!lgr->is_smcd && !list_empty(&lgr->list)) { - /* unregister rmb with peer */ - smc_llc_do_delete_rkey( - &lgr->lnk[SMC_SINGLE_LINK], - conn->rmb_desc); - } - conn->rmb_desc->used = 0; - } else { - /* buf registration failed, reuse not possible */ - write_lock_bh(&lgr->rmbs_lock); - list_del(&conn->rmb_desc->list); - write_unlock_bh(&lgr->rmbs_lock); - - smc_buf_free(lgr, true, conn->rmb_desc); - } - } + if (conn->rmb_desc && lgr->is_smcd) + conn->rmb_desc->used = 0; + else if (conn->rmb_desc) + smcr_buf_unuse(conn->rmb_desc, conn->lnk); } /* remove a finished connection from its link group */ @@ -417,8 +500,12 @@ void smc_conn_free(struct smc_connection *conn) smc_lgr_schedule_free_work(lgr); } -static void smc_link_clear(struct smc_link *lnk) +static void smcr_link_clear(struct smc_link *lnk) { + struct smc_ib_device *smcibdev; + + if (lnk->peer_qpn == 0) + return; lnk->peer_qpn = 0; smc_llc_link_clear(lnk); smc_ib_modify_qp_reset(lnk); @@ -426,26 +513,35 @@ static void smc_link_clear(struct smc_link *lnk) smc_ib_destroy_queue_pair(lnk); smc_ib_dealloc_protection_domain(lnk); smc_wr_free_link_mem(lnk); - if (!atomic_dec_return(&lnk->smcibdev->lnk_cnt)) - wake_up(&lnk->smcibdev->lnks_deleted); + put_device(&lnk->smcibdev->ibdev->dev); + smcibdev = lnk->smcibdev; + memset(lnk, 0, sizeof(struct smc_link)); + lnk->state = SMC_LNK_UNUSED; + if (!atomic_dec_return(&smcibdev->lnk_cnt)) + wake_up(&smcibdev->lnks_deleted); } static void 
smcr_buf_free(struct smc_link_group *lgr, bool is_rmb, struct smc_buf_desc *buf_desc) { - struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK]; + struct smc_link *lnk; + int i; - if (is_rmb) { - if (buf_desc->mr_rx[SMC_SINGLE_LINK]) - smc_ib_put_memory_region( - buf_desc->mr_rx[SMC_SINGLE_LINK]); - smc_ib_buf_unmap_sg(lnk->smcibdev, buf_desc, - DMA_FROM_DEVICE); - } else { - smc_ib_buf_unmap_sg(lnk->smcibdev, buf_desc, - DMA_TO_DEVICE); + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + lnk = &lgr->lnk[i]; + if (!buf_desc->is_map_ib[lnk->link_idx]) + continue; + if (is_rmb) { + if (buf_desc->mr_rx[lnk->link_idx]) + smc_ib_put_memory_region( + buf_desc->mr_rx[lnk->link_idx]); + smc_ib_buf_unmap_sg(lnk, buf_desc, DMA_FROM_DEVICE); + } else { + smc_ib_buf_unmap_sg(lnk, buf_desc, DMA_TO_DEVICE); + } + sg_free_table(&buf_desc->sgt[lnk->link_idx]); } - sg_free_table(&buf_desc->sgt[SMC_SINGLE_LINK]); + if (buf_desc->pages) __free_pages(buf_desc->pages, buf_desc->order); kfree(buf_desc); @@ -503,6 +599,8 @@ static void smc_lgr_free_bufs(struct smc_link_group *lgr) /* remove a link group */ static void smc_lgr_free(struct smc_link_group *lgr) { + int i; + smc_lgr_free_bufs(lgr); if (lgr->is_smcd) { if (!lgr->terminating) { @@ -512,8 +610,11 @@ static void smc_lgr_free(struct smc_link_group *lgr) if (!atomic_dec_return(&lgr->smcd->lgr_cnt)) wake_up(&lgr->smcd->lgrs_deleted); } else { - smc_link_clear(&lgr->lnk[SMC_SINGLE_LINK]); - put_device(&lgr->lnk[SMC_SINGLE_LINK].smcibdev->ibdev->dev); + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (lgr->lnk[i].state != SMC_LNK_UNUSED) + smcr_link_clear(&lgr->lnk[i]); + } + smc_llc_lgr_clear(lgr); if (!atomic_dec_return(&lgr_cnt)) wake_up(&lgrs_deleted); } @@ -581,16 +682,20 @@ static void smc_conn_kill(struct smc_connection *conn, bool soft) static void smc_lgr_cleanup(struct smc_link_group *lgr) { + int i; + if (lgr->is_smcd) { smc_ism_signal_shutdown(lgr); smcd_unregister_all_dmbs(lgr); smc_ism_put_vlan(lgr->smcd, lgr->vlan_id); 
put_device(&lgr->smcd->dev); } else { - struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK]; + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + struct smc_link *lnk = &lgr->lnk[i]; - if (lnk->state != SMC_LNK_INACTIVE) - smc_llc_link_inactive(lnk); + if (smc_link_usable(lnk)) + lnk->state = SMC_LNK_INACTIVE; + } } } @@ -609,8 +714,6 @@ static void __smc_lgr_terminate(struct smc_link_group *lgr, bool soft) if (!soft) cancel_delayed_work_sync(&lgr->free_work); lgr->terminating = 1; - if (!lgr->is_smcd) - smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]); /* kill remaining link group connections */ read_lock_bh(&lgr->conns_lock); @@ -656,14 +759,22 @@ void smc_port_terminate(struct smc_ib_device *smcibdev, u8 ibport) { struct smc_link_group *lgr, *l; LIST_HEAD(lgr_free_list); + int i; spin_lock_bh(&smc_lgr_list.lock); list_for_each_entry_safe(lgr, l, &smc_lgr_list.list, list) { - if (!lgr->is_smcd && - lgr->lnk[SMC_SINGLE_LINK].smcibdev == smcibdev && - lgr->lnk[SMC_SINGLE_LINK].ibport == ibport) { - list_move(&lgr->list, &lgr_free_list); - lgr->freeing = 1; + if (lgr->is_smcd) + continue; + /* tbd - terminate only when no more links are active */ + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (!smc_link_usable(&lgr->lnk[i]) || + lgr->lnk[i].state == SMC_LNK_DELETING) + continue; + if (lgr->lnk[i].smcibdev == smcibdev && + lgr->lnk[i].ibport == ibport) { + list_move(&lgr->list, &lgr_free_list); + lgr->freeing = 1; + } } } spin_unlock_bh(&smc_lgr_list.lock); @@ -728,6 +839,7 @@ void smc_smcr_terminate_all(struct smc_ib_device *smcibdev) { struct smc_link_group *lgr, *lg; LIST_HEAD(lgr_free_list); + int i; spin_lock_bh(&smc_lgr_list.lock); if (!smcibdev) { @@ -736,9 +848,12 @@ void smc_smcr_terminate_all(struct smc_ib_device *smcibdev) lgr->freeing = 1; } else { list_for_each_entry_safe(lgr, lg, &smc_lgr_list.list, list) { - if (lgr->lnk[SMC_SINGLE_LINK].smcibdev == smcibdev) { - list_move(&lgr->list, &lgr_free_list); - lgr->freeing = 1; + for (i = 0; i < 
SMC_LINKS_PER_LGR_MAX; i++) { + if (lgr->lnk[i].smcibdev == smcibdev) { + list_move(&lgr->list, &lgr_free_list); + lgr->freeing = 1; + break; + } } } } @@ -810,15 +925,21 @@ static bool smcr_lgr_match(struct smc_link_group *lgr, struct smc_clc_msg_local *lcl, enum smc_lgr_role role, u32 clcqpn) { - return !memcmp(lgr->peer_systemid, lcl->id_for_peer, - SMC_SYSTEMID_LEN) && - !memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_gid, &lcl->gid, - SMC_GID_SIZE) && - !memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_mac, lcl->mac, - sizeof(lcl->mac)) && - lgr->role == role && - (lgr->role == SMC_SERV || - lgr->lnk[SMC_SINGLE_LINK].peer_qpn == clcqpn); + int i; + + if (memcmp(lgr->peer_systemid, lcl->id_for_peer, SMC_SYSTEMID_LEN) || + lgr->role != role) + return false; + + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (lgr->lnk[i].state != SMC_LNK_ACTIVE) + continue; + if ((lgr->role == SMC_SERV || lgr->lnk[i].peer_qpn == clcqpn) && + !memcmp(lgr->lnk[i].peer_gid, &lcl->gid, SMC_GID_SIZE) && + !memcmp(lgr->lnk[i].peer_mac, lcl->mac, sizeof(lcl->mac))) + return true; + } + return false; } static bool smcd_lgr_match(struct smc_link_group *lgr, @@ -859,15 +980,17 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) /* link group found */ ini->cln_first_contact = SMC_REUSE_CONTACT; conn->lgr = lgr; - smc_lgr_register_conn(conn); /* add smc conn to lgr */ - if (delayed_work_pending(&lgr->free_work)) - cancel_delayed_work(&lgr->free_work); + rc = smc_lgr_register_conn(conn); /* add conn to lgr */ write_unlock_bh(&lgr->conns_lock); + if (!rc && delayed_work_pending(&lgr->free_work)) + cancel_delayed_work(&lgr->free_work); break; } write_unlock_bh(&lgr->conns_lock); } spin_unlock_bh(lgr_lock); + if (rc) + return rc; if (role == SMC_CLNT && !ini->srv_first_contact && ini->cln_first_contact == SMC_FIRST_CONTACT) { @@ -885,8 +1008,10 @@ create: goto out; lgr = conn->lgr; write_lock_bh(&lgr->conns_lock); - smc_lgr_register_conn(conn); /* add smc conn to lgr */ + rc = 
smc_lgr_register_conn(conn); /* add smc conn to lgr */ write_unlock_bh(&lgr->conns_lock); + if (rc) + goto out; } conn->local_tx_ctrl.common.type = SMC_CDC_MSG_TYPE; conn->local_tx_ctrl.len = SMC_WR_TX_SIZE; @@ -934,19 +1059,19 @@ int smc_uncompress_bufsize(u8 compressed) * buffer size; if not available, return NULL */ static struct smc_buf_desc *smc_buf_get_slot(int compressed_bufsize, - rwlock_t *lock, + struct mutex *lock, struct list_head *buf_list) { struct smc_buf_desc *buf_slot; - read_lock_bh(lock); + mutex_lock(lock); list_for_each_entry(buf_slot, buf_list, list) { if (cmpxchg(&buf_slot->used, 0, 1) == 0) { - read_unlock_bh(lock); + mutex_unlock(lock); return buf_slot; } } - read_unlock_bh(lock); + mutex_unlock(lock); return NULL; } @@ -959,12 +1084,55 @@ static inline int smc_rmb_wnd_update_limit(int rmbe_size) return min_t(int, rmbe_size / 10, SOCK_MIN_SNDBUF / 2); } +/* map an rmb buf to a link */ +static int smcr_buf_map_link(struct smc_buf_desc *buf_desc, bool is_rmb, + struct smc_link *lnk) +{ + int rc; + + if (buf_desc->is_map_ib[lnk->link_idx]) + return 0; + + rc = sg_alloc_table(&buf_desc->sgt[lnk->link_idx], 1, GFP_KERNEL); + if (rc) + return rc; + sg_set_buf(buf_desc->sgt[lnk->link_idx].sgl, + buf_desc->cpu_addr, buf_desc->len); + + /* map sg table to DMA address */ + rc = smc_ib_buf_map_sg(lnk, buf_desc, + is_rmb ? DMA_FROM_DEVICE : DMA_TO_DEVICE); + /* SMC protocol depends on mapping to one DMA address only */ + if (rc != 1) { + rc = -EAGAIN; + goto free_table; + } + + /* create a new memory region for the RMB */ + if (is_rmb) { + rc = smc_ib_get_memory_region(lnk->roce_pd, + IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_LOCAL_WRITE, + buf_desc, lnk->link_idx); + if (rc) + goto buf_unmap; + smc_ib_sync_sg_for_device(lnk, buf_desc, DMA_FROM_DEVICE); + } + buf_desc->is_map_ib[lnk->link_idx] = true; + return 0; + +buf_unmap: + smc_ib_buf_unmap_sg(lnk, buf_desc, + is_rmb ? 
DMA_FROM_DEVICE : DMA_TO_DEVICE); +free_table: + sg_free_table(&buf_desc->sgt[lnk->link_idx]); + return rc; +} + static struct smc_buf_desc *smcr_new_buf_create(struct smc_link_group *lgr, bool is_rmb, int bufsize) { struct smc_buf_desc *buf_desc; - struct smc_link *lnk; - int rc; /* try to alloc a new buffer */ buf_desc = kzalloc(sizeof(*buf_desc), GFP_KERNEL); @@ -981,41 +1149,31 @@ static struct smc_buf_desc *smcr_new_buf_create(struct smc_link_group *lgr, return ERR_PTR(-EAGAIN); } buf_desc->cpu_addr = (void *)page_address(buf_desc->pages); + buf_desc->len = bufsize; + return buf_desc; +} - /* build the sg table from the pages */ - lnk = &lgr->lnk[SMC_SINGLE_LINK]; - rc = sg_alloc_table(&buf_desc->sgt[SMC_SINGLE_LINK], 1, - GFP_KERNEL); - if (rc) { - smc_buf_free(lgr, is_rmb, buf_desc); - return ERR_PTR(rc); - } - sg_set_buf(buf_desc->sgt[SMC_SINGLE_LINK].sgl, - buf_desc->cpu_addr, bufsize); +/* map buf_desc on all usable links, + * unused buffers stay mapped as long as the link is up + */ +static int smcr_buf_map_usable_links(struct smc_link_group *lgr, + struct smc_buf_desc *buf_desc, bool is_rmb) +{ + int i, rc = 0; - /* map sg table to DMA address */ - rc = smc_ib_buf_map_sg(lnk->smcibdev, buf_desc, - is_rmb ? 
DMA_FROM_DEVICE : DMA_TO_DEVICE); - /* SMC protocol depends on mapping to one DMA address only */ - if (rc != 1) { - smc_buf_free(lgr, is_rmb, buf_desc); - return ERR_PTR(-EAGAIN); - } + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + struct smc_link *lnk = &lgr->lnk[i]; - /* create a new memory region for the RMB */ - if (is_rmb) { - rc = smc_ib_get_memory_region(lnk->roce_pd, - IB_ACCESS_REMOTE_WRITE | - IB_ACCESS_LOCAL_WRITE, - buf_desc); - if (rc) { - smc_buf_free(lgr, is_rmb, buf_desc); - return ERR_PTR(rc); + if (!smc_link_usable(lnk)) + continue; + if (smcr_buf_map_link(buf_desc, is_rmb, lnk)) { + smcr_buf_unuse(buf_desc, lnk); + rc = -ENOMEM; + goto out; } } - - buf_desc->len = bufsize; - return buf_desc; +out: + return rc; } #define SMCD_DMBE_SIZES 7 /* 0 -> 16KB, 1 -> 32KB, .. 6 -> 1MB */ @@ -1062,8 +1220,8 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) struct smc_link_group *lgr = conn->lgr; struct list_head *buf_list; int bufsize, bufsize_short; + struct mutex *lock; /* lock buffer list */ int sk_buf_size; - rwlock_t *lock; if (is_rmb) /* use socket recv buffer size (w/o overhead) as start value */ @@ -1104,15 +1262,21 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) continue; buf_desc->used = 1; - write_lock_bh(lock); + mutex_lock(lock); list_add(&buf_desc->list, buf_list); - write_unlock_bh(lock); + mutex_unlock(lock); break; /* found */ } if (IS_ERR(buf_desc)) return -ENOMEM; + if (!is_smcd) { + if (smcr_buf_map_usable_links(lgr, buf_desc, is_rmb)) { + return -ENOMEM; + } + } + if (is_rmb) { conn->rmb_desc = buf_desc; conn->rmbe_size_short = bufsize_short; @@ -1132,42 +1296,44 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) void smc_sndbuf_sync_sg_for_cpu(struct smc_connection *conn) { - struct smc_link_group *lgr = conn->lgr; - - if (!conn->lgr || conn->lgr->is_smcd) + if (!conn->lgr || conn->lgr->is_smcd || !smc_link_usable(conn->lnk)) return; - 
smc_ib_sync_sg_for_cpu(lgr->lnk[SMC_SINGLE_LINK].smcibdev, - conn->sndbuf_desc, DMA_TO_DEVICE); + smc_ib_sync_sg_for_cpu(conn->lnk, conn->sndbuf_desc, DMA_TO_DEVICE); } void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn) { - struct smc_link_group *lgr = conn->lgr; - - if (!conn->lgr || conn->lgr->is_smcd) + if (!conn->lgr || conn->lgr->is_smcd || !smc_link_usable(conn->lnk)) return; - smc_ib_sync_sg_for_device(lgr->lnk[SMC_SINGLE_LINK].smcibdev, - conn->sndbuf_desc, DMA_TO_DEVICE); + smc_ib_sync_sg_for_device(conn->lnk, conn->sndbuf_desc, DMA_TO_DEVICE); } void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn) { - struct smc_link_group *lgr = conn->lgr; + int i; if (!conn->lgr || conn->lgr->is_smcd) return; - smc_ib_sync_sg_for_cpu(lgr->lnk[SMC_SINGLE_LINK].smcibdev, - conn->rmb_desc, DMA_FROM_DEVICE); + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (!smc_link_usable(&conn->lgr->lnk[i])) + continue; + smc_ib_sync_sg_for_cpu(&conn->lgr->lnk[i], conn->rmb_desc, + DMA_FROM_DEVICE); + } } void smc_rmb_sync_sg_for_device(struct smc_connection *conn) { - struct smc_link_group *lgr = conn->lgr; + int i; if (!conn->lgr || conn->lgr->is_smcd) return; - smc_ib_sync_sg_for_device(lgr->lnk[SMC_SINGLE_LINK].smcibdev, - conn->rmb_desc, DMA_FROM_DEVICE); + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (!smc_link_usable(&conn->lgr->lnk[i])) + continue; + smc_ib_sync_sg_for_device(&conn->lgr->lnk[i], conn->rmb_desc, + DMA_FROM_DEVICE); + } } /* create the send and receive buffer for an SMC socket; @@ -1203,15 +1369,16 @@ static inline int smc_rmb_reserve_rtoken_idx(struct smc_link_group *lgr) } /* add a new rtoken from peer */ -int smc_rtoken_add(struct smc_link_group *lgr, __be64 nw_vaddr, __be32 nw_rkey) +int smc_rtoken_add(struct smc_link *lnk, __be64 nw_vaddr, __be32 nw_rkey) { + struct smc_link_group *lgr = smc_get_lgr(lnk); u64 dma_addr = be64_to_cpu(nw_vaddr); u32 rkey = ntohl(nw_rkey); int i; for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) { - if 
((lgr->rtokens[i][SMC_SINGLE_LINK].rkey == rkey) && - (lgr->rtokens[i][SMC_SINGLE_LINK].dma_addr == dma_addr) && + if (lgr->rtokens[i][lnk->link_idx].rkey == rkey && + lgr->rtokens[i][lnk->link_idx].dma_addr == dma_addr && test_bit(i, lgr->rtokens_used_mask)) { /* already in list */ return i; @@ -1220,23 +1387,25 @@ int smc_rtoken_add(struct smc_link_group *lgr, __be64 nw_vaddr, __be32 nw_rkey) i = smc_rmb_reserve_rtoken_idx(lgr); if (i < 0) return i; - lgr->rtokens[i][SMC_SINGLE_LINK].rkey = rkey; - lgr->rtokens[i][SMC_SINGLE_LINK].dma_addr = dma_addr; + lgr->rtokens[i][lnk->link_idx].rkey = rkey; + lgr->rtokens[i][lnk->link_idx].dma_addr = dma_addr; return i; } -/* delete an rtoken */ -int smc_rtoken_delete(struct smc_link_group *lgr, __be32 nw_rkey) +/* delete an rtoken from all links */ +int smc_rtoken_delete(struct smc_link *lnk, __be32 nw_rkey) { + struct smc_link_group *lgr = smc_get_lgr(lnk); u32 rkey = ntohl(nw_rkey); - int i; + int i, j; for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) { - if (lgr->rtokens[i][SMC_SINGLE_LINK].rkey == rkey && + if (lgr->rtokens[i][lnk->link_idx].rkey == rkey && test_bit(i, lgr->rtokens_used_mask)) { - lgr->rtokens[i][SMC_SINGLE_LINK].rkey = 0; - lgr->rtokens[i][SMC_SINGLE_LINK].dma_addr = 0; - + for (j = 0; j < SMC_LINKS_PER_LGR_MAX; j++) { + lgr->rtokens[i][j].rkey = 0; + lgr->rtokens[i][j].dma_addr = 0; + } clear_bit(i, lgr->rtokens_used_mask); return 0; } @@ -1246,9 +1415,10 @@ int smc_rtoken_delete(struct smc_link_group *lgr, __be32 nw_rkey) /* save rkey and dma_addr received from peer during clc handshake */ int smc_rmb_rtoken_handling(struct smc_connection *conn, + struct smc_link *lnk, struct smc_clc_msg_accept_confirm *clc) { - conn->rtoken_idx = smc_rtoken_add(conn->lgr, clc->rmb_dma_addr, + conn->rtoken_idx = smc_rtoken_add(lnk, clc->rmb_dma_addr, clc->rmb_rkey); if (conn->rtoken_idx < 0) return conn->rtoken_idx; diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 8041db20c753..b5781511063d 100644 --- 
a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -32,6 +32,7 @@ enum smc_lgr_role { /* possible roles of a link group */ }; enum smc_link_state { /* possible states of a link */ + SMC_LNK_UNUSED, /* link is unused */ SMC_LNK_INACTIVE, /* link is inactive */ SMC_LNK_ACTIVATING, /* link is being activated */ SMC_LNK_ACTIVE, /* link is active */ @@ -115,9 +116,10 @@ struct smc_link { u8 peer_mac[ETH_ALEN]; /* = gid[8:10||13:15] */ u8 peer_gid[SMC_GID_SIZE]; /* gid of peer*/ u8 link_id; /* unique # within link group */ + u8 link_idx; /* index in lgr link array */ + struct smc_link_group *lgr; /* parent link group */ enum smc_link_state state; /* state of link */ - struct workqueue_struct *llc_wq; /* single thread work queue */ struct completion llc_confirm; /* wait for rx of conf link */ struct completion llc_confirm_resp; /* wait 4 rx of cnf lnk rsp */ int llc_confirm_rc; /* rc from confirm link msg */ @@ -127,10 +129,10 @@ struct smc_link { struct delayed_work llc_testlink_wrk; /* testlink worker */ struct completion llc_testlink_resp; /* wait for rx of testlink */ int llc_testlink_time; /* testlink interval */ - struct completion llc_confirm_rkey; /* wait 4 rx of cnf rkey */ - int llc_confirm_rkey_rc; /* rc from cnf rkey msg */ - struct completion llc_delete_rkey; /* wait 4 rx of del rkey */ - int llc_delete_rkey_rc; /* rc from del rkey msg */ + struct completion llc_confirm_rkey_resp; /* w4 rx of cnf rkey */ + int llc_confirm_rkey_resp_rc; /* rc from cnf rkey */ + struct completion llc_delete_rkey_resp; /* w4 rx of del rkey */ + int llc_delete_rkey_resp_rc; /* rc from del rkey */ struct mutex llc_delete_rkey_mutex; /* serialize usage */ }; @@ -150,25 +152,32 @@ struct smc_buf_desc { struct page *pages; int len; /* length of buffer */ u32 used; /* currently used / unused */ - u8 wr_reg : 1; /* mem region registered */ - u8 regerr : 1; /* err during registration */ union { struct { /* SMC-R */ - struct sg_table sgt[SMC_LINKS_PER_LGR_MAX]; - /* virtual buffer */ - 
struct ib_mr *mr_rx[SMC_LINKS_PER_LGR_MAX]; - /* for rmb only: memory region - * incl. rkey provided to peer - */ - u32 order; /* allocation order */ + struct sg_table sgt[SMC_LINKS_PER_LGR_MAX]; + /* virtual buffer */ + struct ib_mr *mr_rx[SMC_LINKS_PER_LGR_MAX]; + /* for rmb only: memory region + * incl. rkey provided to peer + */ + u32 order; /* allocation order */ + + u8 is_conf_rkey; + /* confirm_rkey done */ + u8 is_reg_mr[SMC_LINKS_PER_LGR_MAX]; + /* mem region registered */ + u8 is_map_ib[SMC_LINKS_PER_LGR_MAX]; + /* mem region mapped to lnk */ + u8 is_reg_err; + /* buffer registration err */ }; struct { /* SMC-D */ - unsigned short sba_idx; - /* SBA index number */ - u64 token; - /* DMB token number */ - dma_addr_t dma_addr; - /* DMA address */ + unsigned short sba_idx; + /* SBA index number */ + u64 token; + /* DMB token number */ + dma_addr_t dma_addr; + /* DMA address */ }; }; }; @@ -196,9 +205,9 @@ struct smc_link_group { unsigned short vlan_id; /* vlan id of link group */ struct list_head sndbufs[SMC_RMBE_SIZES];/* tx buffers */ - rwlock_t sndbufs_lock; /* protects tx buffers */ + struct mutex sndbufs_lock; /* protects tx buffers */ struct list_head rmbs[SMC_RMBE_SIZES]; /* rx buffers */ - rwlock_t rmbs_lock; /* protects rx buffers */ + struct mutex rmbs_lock; /* protects rx buffers */ u8 id[SMC_LGR_ID_SIZE]; /* unique lgr id */ struct delayed_work free_work; /* delayed freeing of an lgr */ @@ -222,6 +231,15 @@ struct smc_link_group { /* remote addr/key pairs */ DECLARE_BITMAP(rtokens_used_mask, SMC_RMBS_PER_LGR_MAX); /* used rtoken elements */ + u8 next_link_id; + struct list_head llc_event_q; + /* queue for llc events */ + spinlock_t llc_event_q_lock; + /* protects llc_event_q */ + struct work_struct llc_event_work; + /* llc event worker */ + int llc_testlink_time; + /* link keep alive time */ }; struct { /* SMC-D */ u64 peer_gid; @@ -285,6 +303,14 @@ static inline struct smc_connection *smc_lgr_find_conn( return res; } +/* returns true if the 
specified link is usable */ +static inline bool smc_link_usable(struct smc_link *lnk) +{ + if (lnk->state == SMC_LNK_UNUSED || lnk->state == SMC_LNK_INACTIVE) + return false; + return true; +} + struct smc_sock; struct smc_clc_msg_accept_confirm; struct smc_clc_msg_local; @@ -299,10 +325,10 @@ void smc_smcd_terminate_all(struct smcd_dev *dev); void smc_smcr_terminate_all(struct smc_ib_device *smcibdev); int smc_buf_create(struct smc_sock *smc, bool is_smcd); int smc_uncompress_bufsize(u8 compressed); -int smc_rmb_rtoken_handling(struct smc_connection *conn, +int smc_rmb_rtoken_handling(struct smc_connection *conn, struct smc_link *link, struct smc_clc_msg_accept_confirm *clc); -int smc_rtoken_add(struct smc_link_group *lgr, __be64 nw_vaddr, __be32 nw_rkey); -int smc_rtoken_delete(struct smc_link_group *lgr, __be32 nw_rkey); +int smc_rtoken_add(struct smc_link *lnk, __be64 nw_vaddr, __be32 nw_rkey); +int smc_rtoken_delete(struct smc_link *lnk, __be32 nw_rkey); void smc_sndbuf_sync_sg_for_cpu(struct smc_connection *conn); void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn); void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn); @@ -317,6 +343,6 @@ void smc_core_exit(void); static inline struct smc_link_group *smc_get_lgr(struct smc_link *link) { - return container_of(link, struct smc_link_group, lnk[SMC_SINGLE_LINK]); + return link->lgr; } #endif diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 04b6fefb8bce..c090678a3e5a 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -389,15 +389,15 @@ void smc_ib_put_memory_region(struct ib_mr *mr) ib_dereg_mr(mr); } -static int smc_ib_map_mr_sg(struct smc_buf_desc *buf_slot) +static int smc_ib_map_mr_sg(struct smc_buf_desc *buf_slot, u8 link_idx) { unsigned int offset = 0; int sg_num; /* map the largest prefix of a dma mapped SG list */ - sg_num = ib_map_mr_sg(buf_slot->mr_rx[SMC_SINGLE_LINK], - buf_slot->sgt[SMC_SINGLE_LINK].sgl, - buf_slot->sgt[SMC_SINGLE_LINK].orig_nents, + sg_num = 
ib_map_mr_sg(buf_slot->mr_rx[link_idx], + buf_slot->sgt[link_idx].sgl, + buf_slot->sgt[link_idx].orig_nents, &offset, PAGE_SIZE); return sg_num; @@ -405,29 +405,29 @@ static int smc_ib_map_mr_sg(struct smc_buf_desc *buf_slot) /* Allocate a memory region and map the dma mapped SG list of buf_slot */ int smc_ib_get_memory_region(struct ib_pd *pd, int access_flags, - struct smc_buf_desc *buf_slot) + struct smc_buf_desc *buf_slot, u8 link_idx) { - if (buf_slot->mr_rx[SMC_SINGLE_LINK]) + if (buf_slot->mr_rx[link_idx]) return 0; /* already done */ - buf_slot->mr_rx[SMC_SINGLE_LINK] = + buf_slot->mr_rx[link_idx] = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, 1 << buf_slot->order); - if (IS_ERR(buf_slot->mr_rx[SMC_SINGLE_LINK])) { + if (IS_ERR(buf_slot->mr_rx[link_idx])) { int rc; - rc = PTR_ERR(buf_slot->mr_rx[SMC_SINGLE_LINK]); - buf_slot->mr_rx[SMC_SINGLE_LINK] = NULL; + rc = PTR_ERR(buf_slot->mr_rx[link_idx]); + buf_slot->mr_rx[link_idx] = NULL; return rc; } - if (smc_ib_map_mr_sg(buf_slot) != 1) + if (smc_ib_map_mr_sg(buf_slot, link_idx) != 1) return -EINVAL; return 0; } /* synchronize buffer usage for cpu access */ -void smc_ib_sync_sg_for_cpu(struct smc_ib_device *smcibdev, +void smc_ib_sync_sg_for_cpu(struct smc_link *lnk, struct smc_buf_desc *buf_slot, enum dma_data_direction data_direction) { @@ -435,11 +435,11 @@ void smc_ib_sync_sg_for_cpu(struct smc_ib_device *smcibdev, unsigned int i; /* for now there is just one DMA address */ - for_each_sg(buf_slot->sgt[SMC_SINGLE_LINK].sgl, sg, - buf_slot->sgt[SMC_SINGLE_LINK].nents, i) { + for_each_sg(buf_slot->sgt[lnk->link_idx].sgl, sg, + buf_slot->sgt[lnk->link_idx].nents, i) { if (!sg_dma_len(sg)) break; - ib_dma_sync_single_for_cpu(smcibdev->ibdev, + ib_dma_sync_single_for_cpu(lnk->smcibdev->ibdev, sg_dma_address(sg), sg_dma_len(sg), data_direction); @@ -447,7 +447,7 @@ void smc_ib_sync_sg_for_cpu(struct smc_ib_device *smcibdev, } /* synchronize buffer usage for device access */ -void smc_ib_sync_sg_for_device(struct 
smc_ib_device *smcibdev, +void smc_ib_sync_sg_for_device(struct smc_link *lnk, struct smc_buf_desc *buf_slot, enum dma_data_direction data_direction) { @@ -455,11 +455,11 @@ void smc_ib_sync_sg_for_device(struct smc_ib_device *smcibdev, unsigned int i; /* for now there is just one DMA address */ - for_each_sg(buf_slot->sgt[SMC_SINGLE_LINK].sgl, sg, - buf_slot->sgt[SMC_SINGLE_LINK].nents, i) { + for_each_sg(buf_slot->sgt[lnk->link_idx].sgl, sg, + buf_slot->sgt[lnk->link_idx].nents, i) { if (!sg_dma_len(sg)) break; - ib_dma_sync_single_for_device(smcibdev->ibdev, + ib_dma_sync_single_for_device(lnk->smcibdev->ibdev, sg_dma_address(sg), sg_dma_len(sg), data_direction); @@ -467,15 +467,15 @@ void smc_ib_sync_sg_for_device(struct smc_ib_device *smcibdev, } /* Map a new TX or RX buffer SG-table to DMA */ -int smc_ib_buf_map_sg(struct smc_ib_device *smcibdev, +int smc_ib_buf_map_sg(struct smc_link *lnk, struct smc_buf_desc *buf_slot, enum dma_data_direction data_direction) { int mapped_nents; - mapped_nents = ib_dma_map_sg(smcibdev->ibdev, - buf_slot->sgt[SMC_SINGLE_LINK].sgl, - buf_slot->sgt[SMC_SINGLE_LINK].orig_nents, + mapped_nents = ib_dma_map_sg(lnk->smcibdev->ibdev, + buf_slot->sgt[lnk->link_idx].sgl, + buf_slot->sgt[lnk->link_idx].orig_nents, data_direction); if (!mapped_nents) return -ENOMEM; @@ -483,18 +483,18 @@ int smc_ib_buf_map_sg(struct smc_ib_device *smcibdev, return mapped_nents; } -void smc_ib_buf_unmap_sg(struct smc_ib_device *smcibdev, +void smc_ib_buf_unmap_sg(struct smc_link *lnk, struct smc_buf_desc *buf_slot, enum dma_data_direction data_direction) { - if (!buf_slot->sgt[SMC_SINGLE_LINK].sgl->dma_address) + if (!buf_slot->sgt[lnk->link_idx].sgl->dma_address) return; /* already unmapped */ - ib_dma_unmap_sg(smcibdev->ibdev, - buf_slot->sgt[SMC_SINGLE_LINK].sgl, - buf_slot->sgt[SMC_SINGLE_LINK].orig_nents, + ib_dma_unmap_sg(lnk->smcibdev->ibdev, + buf_slot->sgt[lnk->link_idx].sgl, + buf_slot->sgt[lnk->link_idx].orig_nents, data_direction); - 
buf_slot->sgt[SMC_SINGLE_LINK].sgl->dma_address = 0; + buf_slot->sgt[lnk->link_idx].sgl->dma_address = 0; } long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev) @@ -579,8 +579,9 @@ static void smc_ib_add_dev(struct ib_device *ibdev) i++) { set_bit(i, &smcibdev->port_event_mask); /* determine pnetids of the port */ - smc_pnetid_by_dev_port(ibdev->dev.parent, i, - smcibdev->pnetid[i]); + if (smc_pnetid_by_dev_port(ibdev->dev.parent, i, + smcibdev->pnetid[i])) + smc_pnetid_by_table_ib(smcibdev, i + 1); } schedule_work(&smcibdev->port_event_work); } diff --git a/net/smc/smc_ib.h b/net/smc/smc_ib.h index 5c2b115d36da..e6a696ae15f3 100644 --- a/net/smc/smc_ib.h +++ b/net/smc/smc_ib.h @@ -59,10 +59,10 @@ struct smc_link; int smc_ib_register_client(void) __init; void smc_ib_unregister_client(void); bool smc_ib_port_active(struct smc_ib_device *smcibdev, u8 ibport); -int smc_ib_buf_map_sg(struct smc_ib_device *smcibdev, +int smc_ib_buf_map_sg(struct smc_link *lnk, struct smc_buf_desc *buf_slot, enum dma_data_direction data_direction); -void smc_ib_buf_unmap_sg(struct smc_ib_device *smcibdev, +void smc_ib_buf_unmap_sg(struct smc_link *lnk, struct smc_buf_desc *buf_slot, enum dma_data_direction data_direction); void smc_ib_dealloc_protection_domain(struct smc_link *lnk); @@ -74,12 +74,12 @@ int smc_ib_modify_qp_rts(struct smc_link *lnk); int smc_ib_modify_qp_reset(struct smc_link *lnk); long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev); int smc_ib_get_memory_region(struct ib_pd *pd, int access_flags, - struct smc_buf_desc *buf_slot); + struct smc_buf_desc *buf_slot, u8 link_idx); void smc_ib_put_memory_region(struct ib_mr *mr); -void smc_ib_sync_sg_for_cpu(struct smc_ib_device *smcibdev, +void smc_ib_sync_sg_for_cpu(struct smc_link *lnk, struct smc_buf_desc *buf_slot, enum dma_data_direction data_direction); -void smc_ib_sync_sg_for_device(struct smc_ib_device *smcibdev, +void smc_ib_sync_sg_for_device(struct smc_link *lnk, struct smc_buf_desc *buf_slot, enum 
dma_data_direction data_direction); int smc_ib_determine_gid(struct smc_ib_device *smcibdev, u8 ibport, diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c index 5c4727d5066e..32be2da2cb85 100644 --- a/net/smc/smc_ism.c +++ b/net/smc/smc_ism.c @@ -296,7 +296,8 @@ struct smcd_dev *smcd_alloc_dev(struct device *parent, const char *name, device_initialize(&smcd->dev); dev_set_name(&smcd->dev, name); smcd->ops = ops; - smc_pnetid_by_dev_port(parent, 0, smcd->pnetid); + if (smc_pnetid_by_dev_port(parent, 0, smcd->pnetid)) + smc_pnetid_by_table_smcd(smcd); spin_lock_init(&smcd->lock); spin_lock_init(&smcd->lgr_lock); diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 0e52aab53d97..e715dd6735ee 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -134,6 +134,12 @@ union smc_llc_msg { #define SMC_LLC_FLAG_RESP 0x80 +struct smc_llc_qentry { + struct list_head list; + struct smc_link *link; + union smc_llc_msg msg; +}; + /********************************** send *************************************/ struct smc_llc_tx_pend { @@ -231,9 +237,9 @@ static int smc_llc_send_confirm_rkey(struct smc_link *link, rkeyllc->hd.common.type = SMC_LLC_CONFIRM_RKEY; rkeyllc->hd.length = sizeof(struct smc_llc_msg_confirm_rkey); rkeyllc->rtoken[0].rmb_key = - htonl(rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey); + htonl(rmb_desc->mr_rx[link->link_idx]->rkey); rkeyllc->rtoken[0].rmb_vaddr = cpu_to_be64( - (u64)sg_dma_address(rmb_desc->sgt[SMC_SINGLE_LINK].sgl)); + (u64)sg_dma_address(rmb_desc->sgt[link->link_idx].sgl)); /* send llc message */ rc = smc_wr_tx_send(link, pend); return rc; @@ -256,7 +262,7 @@ static int smc_llc_send_delete_rkey(struct smc_link *link, rkeyllc->hd.common.type = SMC_LLC_DELETE_RKEY; rkeyllc->hd.length = sizeof(struct smc_llc_msg_delete_rkey); rkeyllc->num_rkeys = 1; - rkeyllc->rkey[0] = htonl(rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey); + rkeyllc->rkey[0] = htonl(rmb_desc->mr_rx[link->link_idx]->rkey); /* send llc message */ rc = smc_wr_tx_send(link, pend); return rc; 
@@ -356,46 +362,20 @@ static int smc_llc_send_test_link(struct smc_link *link, u8 user_data[16]) return rc; } -struct smc_llc_send_work { - struct work_struct work; - struct smc_link *link; - int llclen; - union smc_llc_msg llcbuf; -}; - -/* worker that sends a prepared message */ -static void smc_llc_send_message_work(struct work_struct *work) +/* schedule an llc send on link, may wait for buffers */ +static int smc_llc_send_message(struct smc_link *link, void *llcbuf) { - struct smc_llc_send_work *llcwrk = container_of(work, - struct smc_llc_send_work, work); struct smc_wr_tx_pend_priv *pend; struct smc_wr_buf *wr_buf; int rc; - if (llcwrk->link->state == SMC_LNK_INACTIVE) - goto out; - rc = smc_llc_add_pending_send(llcwrk->link, &wr_buf, &pend); + if (!smc_link_usable(link)) + return -ENOLINK; + rc = smc_llc_add_pending_send(link, &wr_buf, &pend); if (rc) - goto out; - memcpy(wr_buf, &llcwrk->llcbuf, llcwrk->llclen); - smc_wr_tx_send(llcwrk->link, pend); -out: - kfree(llcwrk); -} - -/* copy llcbuf and schedule an llc send on link */ -static int smc_llc_send_message(struct smc_link *link, void *llcbuf, int llclen) -{ - struct smc_llc_send_work *wrk = kmalloc(sizeof(*wrk), GFP_ATOMIC); - - if (!wrk) - return -ENOMEM; - INIT_WORK(&wrk->work, smc_llc_send_message_work); - wrk->link = link; - wrk->llclen = llclen; - memcpy(&wrk->llcbuf, llcbuf, llclen); - queue_work(link->llc_wq, &wrk->work); - return 0; + return rc; + memcpy(wr_buf, llcbuf, sizeof(union smc_llc_msg)); + return smc_wr_tx_send(link, pend); } /********************************* receive ***********************************/ @@ -404,27 +384,17 @@ static void smc_llc_rx_confirm_link(struct smc_link *link, struct smc_llc_msg_confirm_link *llc) { struct smc_link_group *lgr = smc_get_lgr(link); - int conf_rc; + int conf_rc = 0; /* RMBE eyecatchers are not supported */ - if (llc->hd.flags & SMC_LLC_FLAG_NO_RMBE_EYEC) - conf_rc = 0; - else + if (!(llc->hd.flags & SMC_LLC_FLAG_NO_RMBE_EYEC)) conf_rc = ENOTSUPP; - 
if (llc->hd.flags & SMC_LLC_FLAG_RESP) { - if (lgr->role == SMC_SERV && - link->state == SMC_LNK_ACTIVATING) { - link->llc_confirm_resp_rc = conf_rc; - complete(&link->llc_confirm_resp); - } - } else { - if (lgr->role == SMC_CLNT && - link->state == SMC_LNK_ACTIVATING) { - link->llc_confirm_rc = conf_rc; - link->link_id = llc->link_num; - complete(&link->llc_confirm); - } + if (lgr->role == SMC_CLNT && + link->state == SMC_LNK_ACTIVATING) { + link->llc_confirm_rc = conf_rc; + link->link_id = llc->link_num; + complete(&link->llc_confirm); } } @@ -433,27 +403,22 @@ static void smc_llc_rx_add_link(struct smc_link *link, { struct smc_link_group *lgr = smc_get_lgr(link); - if (llc->hd.flags & SMC_LLC_FLAG_RESP) { - if (link->state == SMC_LNK_ACTIVATING) - complete(&link->llc_add_resp); - } else { - if (link->state == SMC_LNK_ACTIVATING) { - complete(&link->llc_add); - return; - } + if (link->state == SMC_LNK_ACTIVATING) { + complete(&link->llc_add); + return; + } - if (lgr->role == SMC_SERV) { - smc_llc_prep_add_link(llc, link, - link->smcibdev->mac[link->ibport - 1], - link->gid, SMC_LLC_REQ); + if (lgr->role == SMC_SERV) { + smc_llc_prep_add_link(llc, link, + link->smcibdev->mac[link->ibport - 1], + link->gid, SMC_LLC_REQ); - } else { - smc_llc_prep_add_link(llc, link, - link->smcibdev->mac[link->ibport - 1], - link->gid, SMC_LLC_RESP); - } - smc_llc_send_message(link, llc, sizeof(*llc)); + } else { + smc_llc_prep_add_link(llc, link, + link->smcibdev->mac[link->ibport - 1], + link->gid, SMC_LLC_RESP); } + smc_llc_send_message(link, llc); } static void smc_llc_rx_delete_link(struct smc_link *link, @@ -461,34 +426,24 @@ static void smc_llc_rx_delete_link(struct smc_link *link, { struct smc_link_group *lgr = smc_get_lgr(link); - if (llc->hd.flags & SMC_LLC_FLAG_RESP) { - if (lgr->role == SMC_SERV) - smc_lgr_schedule_free_work_fast(lgr); + smc_lgr_forget(lgr); + smc_llc_link_deleting(link); + if (lgr->role == SMC_SERV) { + /* client asks to delete this link, send request 
*/ + smc_llc_prep_delete_link(llc, link, SMC_LLC_REQ, true); } else { - smc_lgr_forget(lgr); - smc_llc_link_deleting(link); - if (lgr->role == SMC_SERV) { - /* client asks to delete this link, send request */ - smc_llc_prep_delete_link(llc, link, SMC_LLC_REQ, true); - } else { - /* server requests to delete this link, send response */ - smc_llc_prep_delete_link(llc, link, SMC_LLC_RESP, true); - } - smc_llc_send_message(link, llc, sizeof(*llc)); - smc_lgr_terminate_sched(lgr); + /* server requests to delete this link, send response */ + smc_llc_prep_delete_link(llc, link, SMC_LLC_RESP, true); } + smc_llc_send_message(link, llc); + smc_lgr_terminate_sched(lgr); } static void smc_llc_rx_test_link(struct smc_link *link, struct smc_llc_msg_test_link *llc) { - if (llc->hd.flags & SMC_LLC_FLAG_RESP) { - if (link->state == SMC_LNK_ACTIVE) - complete(&link->llc_testlink_resp); - } else { - llc->hd.flags |= SMC_LLC_FLAG_RESP; - smc_llc_send_message(link, llc, sizeof(*llc)); - } + llc->hd.flags |= SMC_LLC_FLAG_RESP; + smc_llc_send_message(link, llc); } static void smc_llc_rx_confirm_rkey(struct smc_link *link, @@ -496,34 +451,24 @@ static void smc_llc_rx_confirm_rkey(struct smc_link *link, { int rc; - if (llc->hd.flags & SMC_LLC_FLAG_RESP) { - link->llc_confirm_rkey_rc = llc->hd.flags & - SMC_LLC_FLAG_RKEY_NEG; - complete(&link->llc_confirm_rkey); - } else { - rc = smc_rtoken_add(smc_get_lgr(link), - llc->rtoken[0].rmb_vaddr, - llc->rtoken[0].rmb_key); + rc = smc_rtoken_add(link, + llc->rtoken[0].rmb_vaddr, + llc->rtoken[0].rmb_key); - /* ignore rtokens for other links, we have only one link */ + /* ignore rtokens for other links, we have only one link */ - llc->hd.flags |= SMC_LLC_FLAG_RESP; - if (rc < 0) - llc->hd.flags |= SMC_LLC_FLAG_RKEY_NEG; - smc_llc_send_message(link, llc, sizeof(*llc)); - } + llc->hd.flags |= SMC_LLC_FLAG_RESP; + if (rc < 0) + llc->hd.flags |= SMC_LLC_FLAG_RKEY_NEG; + smc_llc_send_message(link, llc); } static void smc_llc_rx_confirm_rkey_cont(struct 
smc_link *link, struct smc_llc_msg_confirm_rkey_cont *llc) { - if (llc->hd.flags & SMC_LLC_FLAG_RESP) { - /* unused as long as we don't send this type of msg */ - } else { - /* ignore rtokens for other links, we have only one link */ - llc->hd.flags |= SMC_LLC_FLAG_RESP; - smc_llc_send_message(link, llc, sizeof(*llc)); - } + /* ignore rtokens for other links, we have only one link */ + llc->hd.flags |= SMC_LLC_FLAG_RESP; + smc_llc_send_message(link, llc); } static void smc_llc_rx_delete_rkey(struct smc_link *link, @@ -532,38 +477,41 @@ static void smc_llc_rx_delete_rkey(struct smc_link *link, u8 err_mask = 0; int i, max; - if (llc->hd.flags & SMC_LLC_FLAG_RESP) { - link->llc_delete_rkey_rc = llc->hd.flags & - SMC_LLC_FLAG_RKEY_NEG; - complete(&link->llc_delete_rkey); - } else { - max = min_t(u8, llc->num_rkeys, SMC_LLC_DEL_RKEY_MAX); - for (i = 0; i < max; i++) { - if (smc_rtoken_delete(smc_get_lgr(link), llc->rkey[i])) - err_mask |= 1 << (SMC_LLC_DEL_RKEY_MAX - 1 - i); - } + max = min_t(u8, llc->num_rkeys, SMC_LLC_DEL_RKEY_MAX); + for (i = 0; i < max; i++) { + if (smc_rtoken_delete(link, llc->rkey[i])) + err_mask |= 1 << (SMC_LLC_DEL_RKEY_MAX - 1 - i); + } - if (err_mask) { - llc->hd.flags |= SMC_LLC_FLAG_RKEY_NEG; - llc->err_mask = err_mask; - } + if (err_mask) { + llc->hd.flags |= SMC_LLC_FLAG_RKEY_NEG; + llc->err_mask = err_mask; + } + + llc->hd.flags |= SMC_LLC_FLAG_RESP; + smc_llc_send_message(link, llc); +} + +/* flush the llc event queue */ +static void smc_llc_event_flush(struct smc_link_group *lgr) +{ + struct smc_llc_qentry *qentry, *q; - llc->hd.flags |= SMC_LLC_FLAG_RESP; - smc_llc_send_message(link, llc, sizeof(*llc)); + spin_lock_bh(&lgr->llc_event_q_lock); + list_for_each_entry_safe(qentry, q, &lgr->llc_event_q, list) { + list_del_init(&qentry->list); + kfree(qentry); } + spin_unlock_bh(&lgr->llc_event_q_lock); } -static void smc_llc_rx_handler(struct ib_wc *wc, void *buf) +static void smc_llc_event_handler(struct smc_llc_qentry *qentry) { - struct 
smc_link *link = (struct smc_link *)wc->qp->qp_context; - union smc_llc_msg *llc = buf; + union smc_llc_msg *llc = &qentry->msg; + struct smc_link *link = qentry->link; - if (wc->byte_len < sizeof(*llc)) - return; /* short message */ - if (llc->raw.hdr.length != sizeof(*llc)) - return; /* invalid message */ - if (link->state == SMC_LNK_INACTIVE) - return; /* link not active, drop msg */ + if (!smc_link_usable(link)) + goto out; switch (llc->raw.hdr.common.type) { case SMC_LLC_TEST_LINK: @@ -588,6 +536,103 @@ static void smc_llc_rx_handler(struct ib_wc *wc, void *buf) smc_llc_rx_delete_rkey(link, &llc->delete_rkey); break; } +out: + kfree(qentry); +} + +/* worker to process llc messages on the event queue */ +static void smc_llc_event_work(struct work_struct *work) +{ + struct smc_link_group *lgr = container_of(work, struct smc_link_group, + llc_event_work); + struct smc_llc_qentry *qentry; + +again: + spin_lock_bh(&lgr->llc_event_q_lock); + if (!list_empty(&lgr->llc_event_q)) { + qentry = list_first_entry(&lgr->llc_event_q, + struct smc_llc_qentry, list); + list_del_init(&qentry->list); + spin_unlock_bh(&lgr->llc_event_q_lock); + smc_llc_event_handler(qentry); + goto again; + } + spin_unlock_bh(&lgr->llc_event_q_lock); +} + +/* process llc responses in tasklet context */ +static void smc_llc_rx_response(struct smc_link *link, union smc_llc_msg *llc) +{ + int rc = 0; + + switch (llc->raw.hdr.common.type) { + case SMC_LLC_TEST_LINK: + if (link->state == SMC_LNK_ACTIVE) + complete(&link->llc_testlink_resp); + break; + case SMC_LLC_CONFIRM_LINK: + if (!(llc->raw.hdr.flags & SMC_LLC_FLAG_NO_RMBE_EYEC)) + rc = ENOTSUPP; + if (link->lgr->role == SMC_SERV && + link->state == SMC_LNK_ACTIVATING) { + link->llc_confirm_resp_rc = rc; + complete(&link->llc_confirm_resp); + } + break; + case SMC_LLC_ADD_LINK: + if (link->state == SMC_LNK_ACTIVATING) + complete(&link->llc_add_resp); + break; + case SMC_LLC_DELETE_LINK: + if (link->lgr->role == SMC_SERV) + 
smc_lgr_schedule_free_work_fast(link->lgr); + break; + case SMC_LLC_CONFIRM_RKEY: + link->llc_confirm_rkey_resp_rc = llc->raw.hdr.flags & + SMC_LLC_FLAG_RKEY_NEG; + complete(&link->llc_confirm_rkey_resp); + break; + case SMC_LLC_CONFIRM_RKEY_CONT: + /* unused as long as we don't send this type of msg */ + break; + case SMC_LLC_DELETE_RKEY: + link->llc_delete_rkey_resp_rc = llc->raw.hdr.flags & + SMC_LLC_FLAG_RKEY_NEG; + complete(&link->llc_delete_rkey_resp); + break; + } +} + +/* copy received msg and add it to the event queue */ +static void smc_llc_rx_handler(struct ib_wc *wc, void *buf) +{ + struct smc_link *link = (struct smc_link *)wc->qp->qp_context; + struct smc_link_group *lgr = link->lgr; + struct smc_llc_qentry *qentry; + union smc_llc_msg *llc = buf; + unsigned long flags; + + if (wc->byte_len < sizeof(*llc)) + return; /* short message */ + if (llc->raw.hdr.length != sizeof(*llc)) + return; /* invalid message */ + + /* process responses immediately */ + if (llc->raw.hdr.flags & SMC_LLC_FLAG_RESP) { + smc_llc_rx_response(link, llc); + return; + } + + qentry = kmalloc(sizeof(*qentry), GFP_ATOMIC); + if (!qentry) + return; + qentry->link = link; + INIT_LIST_HEAD(&qentry->list); + memcpy(&qentry->msg, llc, sizeof(union smc_llc_msg)); + spin_lock_irqsave(&lgr->llc_event_q_lock, flags); + list_add_tail(&qentry->list, &lgr->llc_event_q); + spin_unlock_irqrestore(&lgr->llc_event_q_lock, flags); + schedule_work(&link->lgr->llc_event_work); } /***************************** worker, utils *********************************/ @@ -613,43 +658,55 @@ static void smc_llc_testlink_work(struct work_struct *work) /* receive TEST LINK response over RoCE fabric */ rc = wait_for_completion_interruptible_timeout(&link->llc_testlink_resp, SMC_LLC_WAIT_TIME); + if (link->state != SMC_LNK_ACTIVE) + return; /* link state changed */ if (rc <= 0) { smc_lgr_terminate_sched(smc_get_lgr(link)); return; } next_interval = link->llc_testlink_time; out: - queue_delayed_work(link->llc_wq, 
&link->llc_testlink_wrk, - next_interval); + schedule_delayed_work(&link->llc_testlink_wrk, next_interval); +} + +void smc_llc_lgr_init(struct smc_link_group *lgr, struct smc_sock *smc) +{ + struct net *net = sock_net(smc->clcsock->sk); + + INIT_WORK(&lgr->llc_event_work, smc_llc_event_work); + INIT_LIST_HEAD(&lgr->llc_event_q); + spin_lock_init(&lgr->llc_event_q_lock); + lgr->llc_testlink_time = net->ipv4.sysctl_tcp_keepalive_time; +} + +/* called after lgr was removed from lgr_list */ +void smc_llc_lgr_clear(struct smc_link_group *lgr) +{ + smc_llc_event_flush(lgr); + cancel_work_sync(&lgr->llc_event_work); } int smc_llc_link_init(struct smc_link *link) { - struct smc_link_group *lgr = smc_get_lgr(link); - link->llc_wq = alloc_ordered_workqueue("llc_wq-%x:%x)", WQ_MEM_RECLAIM, - *((u32 *)lgr->id), - link->link_id); - if (!link->llc_wq) - return -ENOMEM; init_completion(&link->llc_confirm); init_completion(&link->llc_confirm_resp); init_completion(&link->llc_add); init_completion(&link->llc_add_resp); - init_completion(&link->llc_confirm_rkey); - init_completion(&link->llc_delete_rkey); + init_completion(&link->llc_confirm_rkey_resp); + init_completion(&link->llc_delete_rkey_resp); mutex_init(&link->llc_delete_rkey_mutex); init_completion(&link->llc_testlink_resp); INIT_DELAYED_WORK(&link->llc_testlink_wrk, smc_llc_testlink_work); return 0; } -void smc_llc_link_active(struct smc_link *link, int testlink_time) +void smc_llc_link_active(struct smc_link *link) { link->state = SMC_LNK_ACTIVE; - if (testlink_time) { - link->llc_testlink_time = testlink_time * HZ; - queue_delayed_work(link->llc_wq, &link->llc_testlink_wrk, - link->llc_testlink_time); + if (link->lgr->llc_testlink_time) { + link->llc_testlink_time = link->lgr->llc_testlink_time * HZ; + schedule_delayed_work(&link->llc_testlink_wrk, + link->llc_testlink_time); } } @@ -659,20 +716,13 @@ void smc_llc_link_deleting(struct smc_link *link) smc_wr_wakeup_tx_wait(link); } -/* called in tasklet context */ -void 
smc_llc_link_inactive(struct smc_link *link) -{ - link->state = SMC_LNK_INACTIVE; - cancel_delayed_work(&link->llc_testlink_wrk); - smc_wr_wakeup_reg_wait(link); - smc_wr_wakeup_tx_wait(link); -} - /* called in worker context */ void smc_llc_link_clear(struct smc_link *link) { - flush_workqueue(link->llc_wq); - destroy_workqueue(link->llc_wq); + complete(&link->llc_testlink_resp); + cancel_delayed_work_sync(&link->llc_testlink_wrk); + smc_wr_wakeup_reg_wait(link); + smc_wr_wakeup_tx_wait(link); } /* register a new rtoken at the remote peer */ @@ -682,14 +732,14 @@ int smc_llc_do_confirm_rkey(struct smc_link *link, int rc; /* protected by mutex smc_create_lgr_pending */ - reinit_completion(&link->llc_confirm_rkey); + reinit_completion(&link->llc_confirm_rkey_resp); rc = smc_llc_send_confirm_rkey(link, rmb_desc); if (rc) return rc; /* receive CONFIRM RKEY response from server over RoCE fabric */ - rc = wait_for_completion_interruptible_timeout(&link->llc_confirm_rkey, - SMC_LLC_WAIT_TIME); - if (rc <= 0 || link->llc_confirm_rkey_rc) + rc = wait_for_completion_interruptible_timeout( + &link->llc_confirm_rkey_resp, SMC_LLC_WAIT_TIME); + if (rc <= 0 || link->llc_confirm_rkey_resp_rc) return -EFAULT; return 0; } @@ -703,14 +753,14 @@ int smc_llc_do_delete_rkey(struct smc_link *link, mutex_lock(&link->llc_delete_rkey_mutex); if (link->state != SMC_LNK_ACTIVE) goto out; - reinit_completion(&link->llc_delete_rkey); + reinit_completion(&link->llc_delete_rkey_resp); rc = smc_llc_send_delete_rkey(link, rmb_desc); if (rc) goto out; /* receive DELETE RKEY response from server over RoCE fabric */ - rc = wait_for_completion_interruptible_timeout(&link->llc_delete_rkey, - SMC_LLC_WAIT_TIME); - if (rc <= 0 || link->llc_delete_rkey_rc) + rc = wait_for_completion_interruptible_timeout( + &link->llc_delete_rkey_resp, SMC_LLC_WAIT_TIME); + if (rc <= 0 || link->llc_delete_rkey_resp_rc) rc = -EFAULT; else rc = 0; diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h index 
461c0c3ef76e..66063f22166b 100644 --- a/net/smc/smc_llc.h +++ b/net/smc/smc_llc.h @@ -35,6 +35,17 @@ enum smc_llc_msg_type { SMC_LLC_DELETE_RKEY = 0x09, }; +/* returns a usable link of the link group, or NULL */ +static inline struct smc_link *smc_llc_usable_link(struct smc_link_group *lgr) +{ + int i; + + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) + if (smc_link_usable(&lgr->lnk[i])) + return &lgr->lnk[i]; + return NULL; +} + /* transmit */ int smc_llc_send_confirm_link(struct smc_link *lnk, enum smc_llc_reqresp reqresp); @@ -42,10 +53,11 @@ int smc_llc_send_add_link(struct smc_link *link, u8 mac[], u8 gid[], enum smc_llc_reqresp reqresp); int smc_llc_send_delete_link(struct smc_link *link, enum smc_llc_reqresp reqresp, bool orderly); +void smc_llc_lgr_init(struct smc_link_group *lgr, struct smc_sock *smc); +void smc_llc_lgr_clear(struct smc_link_group *lgr); int smc_llc_link_init(struct smc_link *link); -void smc_llc_link_active(struct smc_link *link, int testlink_time); +void smc_llc_link_active(struct smc_link *link); void smc_llc_link_deleting(struct smc_link *link); -void smc_llc_link_inactive(struct smc_link *link); void smc_llc_link_clear(struct smc_link *link); int smc_llc_do_confirm_rkey(struct smc_link *link, struct smc_buf_desc *rmb_desc); diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c index 2a5ed47c3e08..bd01c71b827a 100644 --- a/net/smc/smc_pnet.c +++ b/net/smc/smc_pnet.c @@ -50,29 +50,26 @@ static struct nla_policy smc_pnet_policy[SMC_PNETID_MAX + 1] = { static struct genl_family smc_pnet_nl_family; -/** - * struct smc_user_pnetentry - pnet identifier name entry for/from user - * @list: List node. - * @pnet_name: Pnet identifier name - * @ndev: pointer to network device. - * @smcibdev: Pointer to IB device. - * @ib_port: Port of IB device. - * @smcd_dev: Pointer to smcd device. 
- */ -struct smc_user_pnetentry { - struct list_head list; - char pnet_name[SMC_MAX_PNETID_LEN + 1]; - struct net_device *ndev; - struct smc_ib_device *smcibdev; - u8 ib_port; - struct smcd_dev *smcd_dev; +enum smc_pnet_nametype { + SMC_PNET_ETH = 1, + SMC_PNET_IB = 2, }; /* pnet entry stored in pnet table */ struct smc_pnetentry { struct list_head list; char pnet_name[SMC_MAX_PNETID_LEN + 1]; - struct net_device *ndev; + enum smc_pnet_nametype type; + union { + struct { + char eth_name[IFNAMSIZ + 1]; + struct net_device *ndev; + }; + struct { + char ib_name[IB_DEVICE_NAME_MAX + 1]; + u8 ib_port; + }; + }; }; /* Check if two given pnetids match */ @@ -106,14 +103,15 @@ static int smc_pnet_remove_by_pnetid(struct net *net, char *pnet_name) sn = net_generic(net, smc_net_id); pnettable = &sn->pnettable; - /* remove netdevices */ + /* remove table entry */ write_lock(&pnettable->lock); list_for_each_entry_safe(pnetelem, tmp_pe, &pnettable->pnetlist, list) { if (!pnet_name || smc_pnet_match(pnetelem->pnet_name, pnet_name)) { list_del(&pnetelem->list); - dev_put(pnetelem->ndev); + if (pnetelem->type == SMC_PNET_ETH && pnetelem->ndev) + dev_put(pnetelem->ndev); kfree(pnetelem); rc = 0; } @@ -155,9 +153,9 @@ static int smc_pnet_remove_by_pnetid(struct net *net, char *pnet_name) return rc; } -/* Remove a pnet entry mentioning a given network device from the pnet table. +/* Add the reference to a given network device to the pnet table. 
*/ -static int smc_pnet_remove_by_ndev(struct net_device *ndev) +static int smc_pnet_add_by_ndev(struct net_device *ndev) { struct smc_pnetentry *pnetelem, *tmp_pe; struct smc_pnettable *pnettable; @@ -171,10 +169,10 @@ static int smc_pnet_remove_by_ndev(struct net_device *ndev) write_lock(&pnettable->lock); list_for_each_entry_safe(pnetelem, tmp_pe, &pnettable->pnetlist, list) { - if (pnetelem->ndev == ndev) { - list_del(&pnetelem->list); - dev_put(pnetelem->ndev); - kfree(pnetelem); + if (pnetelem->type == SMC_PNET_ETH && !pnetelem->ndev && + !strncmp(pnetelem->eth_name, ndev->name, IFNAMSIZ)) { + dev_hold(ndev); + pnetelem->ndev = ndev; rc = 0; break; } @@ -183,80 +181,67 @@ static int smc_pnet_remove_by_ndev(struct net_device *ndev) return rc; } -/* Append a pnetid to the end of the pnet table if not already on this list. +/* Remove the reference to a given network device from the pnet table. */ -static int smc_pnet_enter(struct smc_pnettable *pnettable, - struct smc_user_pnetentry *new_pnetelem) +static int smc_pnet_remove_by_ndev(struct net_device *ndev) { - u8 pnet_null[SMC_MAX_PNETID_LEN] = {0}; - u8 ndev_pnetid[SMC_MAX_PNETID_LEN]; - struct smc_pnetentry *tmp_pnetelem; - struct smc_pnetentry *pnetelem; - bool new_smcddev = false; - struct net_device *ndev; - bool new_netdev = true; - bool new_ibdev = false; - - if (new_pnetelem->smcibdev) { - struct smc_ib_device *ib_dev = new_pnetelem->smcibdev; - int ib_port = new_pnetelem->ib_port; + struct smc_pnetentry *pnetelem, *tmp_pe; + struct smc_pnettable *pnettable; + struct net *net = dev_net(ndev); + struct smc_net *sn; + int rc = -ENOENT; - spin_lock(&smc_ib_devices.lock); - if (smc_pnet_match(ib_dev->pnetid[ib_port - 1], pnet_null)) { - memcpy(ib_dev->pnetid[ib_port - 1], - new_pnetelem->pnet_name, SMC_MAX_PNETID_LEN); - ib_dev->pnetid_by_user[ib_port - 1] = true; - new_ibdev = true; - } - spin_unlock(&smc_ib_devices.lock); - } - if (new_pnetelem->smcd_dev) { - struct smcd_dev *smcd_dev = 
new_pnetelem->smcd_dev; + /* get pnettable for namespace */ + sn = net_generic(net, smc_net_id); + pnettable = &sn->pnettable; - spin_lock(&smcd_dev_list.lock); - if (smc_pnet_match(smcd_dev->pnetid, pnet_null)) { - memcpy(smcd_dev->pnetid, new_pnetelem->pnet_name, - SMC_MAX_PNETID_LEN); - smcd_dev->pnetid_by_user = true; - new_smcddev = true; + write_lock(&pnettable->lock); + list_for_each_entry_safe(pnetelem, tmp_pe, &pnettable->pnetlist, list) { + if (pnetelem->type == SMC_PNET_ETH && pnetelem->ndev == ndev) { + dev_put(pnetelem->ndev); + pnetelem->ndev = NULL; + rc = 0; + break; } - spin_unlock(&smcd_dev_list.lock); } + write_unlock(&pnettable->lock); + return rc; +} - if (!new_pnetelem->ndev) - return (new_ibdev || new_smcddev) ? 0 : -EEXIST; +/* Apply pnetid to ib device when no pnetid is set. + */ +static bool smc_pnet_apply_ib(struct smc_ib_device *ib_dev, u8 ib_port, + char *pnet_name) +{ + u8 pnet_null[SMC_MAX_PNETID_LEN] = {0}; + bool applied = false; - /* check if (base) netdev already has a pnetid. If there is one, we do - * not want to add a pnet table entry - */ - ndev = pnet_find_base_ndev(new_pnetelem->ndev); - if (!smc_pnetid_by_dev_port(ndev->dev.parent, ndev->dev_port, - ndev_pnetid)) - return (new_ibdev || new_smcddev) ? 0 : -EEXIST; + spin_lock(&smc_ib_devices.lock); + if (smc_pnet_match(ib_dev->pnetid[ib_port - 1], pnet_null)) { + memcpy(ib_dev->pnetid[ib_port - 1], pnet_name, + SMC_MAX_PNETID_LEN); + ib_dev->pnetid_by_user[ib_port - 1] = true; + applied = true; + } + spin_unlock(&smc_ib_devices.lock); + return applied; +} - /* add a new netdev entry to the pnet table if there isn't one */ - tmp_pnetelem = kzalloc(sizeof(*pnetelem), GFP_KERNEL); - if (!tmp_pnetelem) - return -ENOMEM; - memcpy(tmp_pnetelem->pnet_name, new_pnetelem->pnet_name, - SMC_MAX_PNETID_LEN); - tmp_pnetelem->ndev = new_pnetelem->ndev; +/* Apply pnetid to smcd device when no pnetid is set. 
+ */ +static bool smc_pnet_apply_smcd(struct smcd_dev *smcd_dev, char *pnet_name) +{ + u8 pnet_null[SMC_MAX_PNETID_LEN] = {0}; + bool applied = false; - write_lock(&pnettable->lock); - list_for_each_entry(pnetelem, &pnettable->pnetlist, list) { - if (pnetelem->ndev == new_pnetelem->ndev) - new_netdev = false; - } - if (new_netdev) { - dev_hold(tmp_pnetelem->ndev); - list_add_tail(&tmp_pnetelem->list, &pnettable->pnetlist); - write_unlock(&pnettable->lock); - } else { - write_unlock(&pnettable->lock); - kfree(tmp_pnetelem); + spin_lock(&smcd_dev_list.lock); + if (smc_pnet_match(smcd_dev->pnetid, pnet_null)) { + memcpy(smcd_dev->pnetid, pnet_name, SMC_MAX_PNETID_LEN); + smcd_dev->pnetid_by_user = true; + applied = true; } - - return (new_netdev || new_ibdev || new_smcddev) ? 0 : -EEXIST; + spin_unlock(&smcd_dev_list.lock); + return applied; } /* The limit for pnetid is 16 characters. @@ -323,57 +308,167 @@ out: return smcd_dev; } -/* Parse the supplied netlink attributes and fill a pnetentry structure. - * For ethernet and infiniband device names verify that the devices exist. +static int smc_pnet_add_eth(struct smc_pnettable *pnettable, struct net *net, + char *eth_name, char *pnet_name) +{ + struct smc_pnetentry *tmp_pe, *new_pe; + struct net_device *ndev, *base_ndev; + u8 ndev_pnetid[SMC_MAX_PNETID_LEN]; + bool new_netdev; + int rc; + + /* check if (base) netdev already has a pnetid. 
If there is one, we do + * not want to add a pnet table entry + */ + rc = -EEXIST; + ndev = dev_get_by_name(net, eth_name); /* dev_hold() */ + if (ndev) { + base_ndev = pnet_find_base_ndev(ndev); + if (!smc_pnetid_by_dev_port(base_ndev->dev.parent, + base_ndev->dev_port, ndev_pnetid)) + goto out_put; + } + + /* add a new netdev entry to the pnet table if there isn't one */ + rc = -ENOMEM; + new_pe = kzalloc(sizeof(*new_pe), GFP_KERNEL); + if (!new_pe) + goto out_put; + new_pe->type = SMC_PNET_ETH; + memcpy(new_pe->pnet_name, pnet_name, SMC_MAX_PNETID_LEN); + strncpy(new_pe->eth_name, eth_name, IFNAMSIZ); + new_pe->ndev = ndev; + + rc = -EEXIST; + new_netdev = true; + write_lock(&pnettable->lock); + list_for_each_entry(tmp_pe, &pnettable->pnetlist, list) { + if (tmp_pe->type == SMC_PNET_ETH && + !strncmp(tmp_pe->eth_name, eth_name, IFNAMSIZ)) { + new_netdev = false; + break; + } + } + if (new_netdev) { + list_add_tail(&new_pe->list, &pnettable->pnetlist); + write_unlock(&pnettable->lock); + } else { + write_unlock(&pnettable->lock); + kfree(new_pe); + goto out_put; + } + return 0; + +out_put: + if (ndev) + dev_put(ndev); + return rc; +} + +static int smc_pnet_add_ib(struct smc_pnettable *pnettable, char *ib_name, + u8 ib_port, char *pnet_name) +{ + struct smc_pnetentry *tmp_pe, *new_pe; + struct smc_ib_device *ib_dev; + bool smcddev_applied = true; + bool ibdev_applied = true; + struct smcd_dev *smcd_dev; + bool new_ibdev; + + /* try to apply the pnetid to active devices */ + ib_dev = smc_pnet_find_ib(ib_name); + if (ib_dev) + ibdev_applied = smc_pnet_apply_ib(ib_dev, ib_port, pnet_name); + smcd_dev = smc_pnet_find_smcd(ib_name); + if (smcd_dev) + smcddev_applied = smc_pnet_apply_smcd(smcd_dev, pnet_name); + /* Apply fails when a device has a hardware-defined pnetid set, do not + * add a pnet table entry in that case. 
+ */ + if (!ibdev_applied || !smcddev_applied) + return -EEXIST; + + /* add a new ib entry to the pnet table if there isn't one */ + new_pe = kzalloc(sizeof(*new_pe), GFP_KERNEL); + if (!new_pe) + return -ENOMEM; + new_pe->type = SMC_PNET_IB; + memcpy(new_pe->pnet_name, pnet_name, SMC_MAX_PNETID_LEN); + strncpy(new_pe->ib_name, ib_name, IB_DEVICE_NAME_MAX); + new_pe->ib_port = ib_port; + + new_ibdev = true; + write_lock(&pnettable->lock); + list_for_each_entry(tmp_pe, &pnettable->pnetlist, list) { + if (tmp_pe->type == SMC_PNET_IB && + !strncmp(tmp_pe->ib_name, ib_name, IB_DEVICE_NAME_MAX)) { + new_ibdev = false; + break; + } + } + if (new_ibdev) { + list_add_tail(&new_pe->list, &pnettable->pnetlist); + write_unlock(&pnettable->lock); + } else { + write_unlock(&pnettable->lock); + kfree(new_pe); + } + return (new_ibdev) ? 0 : -EEXIST; +} + +/* Append a pnetid to the end of the pnet table if not already on this list. */ -static int smc_pnet_fill_entry(struct net *net, - struct smc_user_pnetentry *pnetelem, - struct nlattr *tb[]) +static int smc_pnet_enter(struct net *net, struct nlattr *tb[]) { - char *string, *ibname; + char pnet_name[SMC_MAX_PNETID_LEN + 1]; + struct smc_pnettable *pnettable; + bool new_netdev = false; + bool new_ibdev = false; + struct smc_net *sn; + u8 ibport = 1; + char *string; int rc; - memset(pnetelem, 0, sizeof(*pnetelem)); - INIT_LIST_HEAD(&pnetelem->list); + /* get pnettable for namespace */ + sn = net_generic(net, smc_net_id); + pnettable = &sn->pnettable; rc = -EINVAL; if (!tb[SMC_PNETID_NAME]) goto error; string = (char *)nla_data(tb[SMC_PNETID_NAME]); - if (!smc_pnetid_valid(string, pnetelem->pnet_name)) + if (!smc_pnetid_valid(string, pnet_name)) goto error; - rc = -EINVAL; if (tb[SMC_PNETID_ETHNAME]) { string = (char *)nla_data(tb[SMC_PNETID_ETHNAME]); - pnetelem->ndev = dev_get_by_name(net, string); - if (!pnetelem->ndev) + rc = smc_pnet_add_eth(pnettable, net, string, pnet_name); + if (!rc) + new_netdev = true; + else if (rc != 
-EEXIST) goto error; } /* if this is not the initial namespace, stop here */ if (net != &init_net) - return 0; + return new_netdev ? 0 : -EEXIST; rc = -EINVAL; if (tb[SMC_PNETID_IBNAME]) { - ibname = (char *)nla_data(tb[SMC_PNETID_IBNAME]); - ibname = strim(ibname); - pnetelem->smcibdev = smc_pnet_find_ib(ibname); - pnetelem->smcd_dev = smc_pnet_find_smcd(ibname); - if (!pnetelem->smcibdev && !pnetelem->smcd_dev) - goto error; - if (pnetelem->smcibdev) { - if (!tb[SMC_PNETID_IBPORT]) - goto error; - pnetelem->ib_port = nla_get_u8(tb[SMC_PNETID_IBPORT]); - if (pnetelem->ib_port < 1 || - pnetelem->ib_port > SMC_MAX_PORTS) + string = (char *)nla_data(tb[SMC_PNETID_IBNAME]); + string = strim(string); + if (tb[SMC_PNETID_IBPORT]) { + ibport = nla_get_u8(tb[SMC_PNETID_IBPORT]); + if (ibport < 1 || ibport > SMC_MAX_PORTS) goto error; } + rc = smc_pnet_add_ib(pnettable, string, ibport, pnet_name); + if (!rc) + new_ibdev = true; + else if (rc != -EEXIST) + goto error; } - - return 0; + return (new_netdev || new_ibdev) ? 
0 : -EEXIST; error: return rc; @@ -381,28 +476,22 @@ error: /* Convert an smc_pnetentry to a netlink attribute sequence */ static int smc_pnet_set_nla(struct sk_buff *msg, - struct smc_user_pnetentry *pnetelem) + struct smc_pnetentry *pnetelem) { if (nla_put_string(msg, SMC_PNETID_NAME, pnetelem->pnet_name)) return -1; - if (pnetelem->ndev) { + if (pnetelem->type == SMC_PNET_ETH) { if (nla_put_string(msg, SMC_PNETID_ETHNAME, - pnetelem->ndev->name)) + pnetelem->eth_name)) return -1; } else { if (nla_put_string(msg, SMC_PNETID_ETHNAME, "n/a")) return -1; } - if (pnetelem->smcibdev) { - if (nla_put_string(msg, SMC_PNETID_IBNAME, - dev_name(pnetelem->smcibdev->ibdev->dev.parent)) || + if (pnetelem->type == SMC_PNET_IB) { + if (nla_put_string(msg, SMC_PNETID_IBNAME, pnetelem->ib_name) || nla_put_u8(msg, SMC_PNETID_IBPORT, pnetelem->ib_port)) return -1; - } else if (pnetelem->smcd_dev) { - if (nla_put_string(msg, SMC_PNETID_IBNAME, - dev_name(&pnetelem->smcd_dev->dev)) || - nla_put_u8(msg, SMC_PNETID_IBPORT, 1)) - return -1; } else { if (nla_put_string(msg, SMC_PNETID_IBNAME, "n/a") || nla_put_u8(msg, SMC_PNETID_IBPORT, 0xff)) @@ -415,21 +504,8 @@ static int smc_pnet_set_nla(struct sk_buff *msg, static int smc_pnet_add(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); - struct smc_user_pnetentry pnetelem; - struct smc_pnettable *pnettable; - struct smc_net *sn; - int rc; - - /* get pnettable for namespace */ - sn = net_generic(net, smc_net_id); - pnettable = &sn->pnettable; - rc = smc_pnet_fill_entry(net, &pnetelem, info->attrs); - if (!rc) - rc = smc_pnet_enter(pnettable, &pnetelem); - if (pnetelem.ndev) - dev_put(pnetelem.ndev); - return rc; + return smc_pnet_enter(net, info->attrs); } static int smc_pnet_del(struct sk_buff *skb, struct genl_info *info) @@ -450,7 +526,7 @@ static int smc_pnet_dump_start(struct netlink_callback *cb) static int smc_pnet_dumpinfo(struct sk_buff *skb, u32 portid, u32 seq, u32 flags, - struct 
smc_user_pnetentry *pnetelem) + struct smc_pnetentry *pnetelem) { void *hdr; @@ -469,91 +545,32 @@ static int smc_pnet_dumpinfo(struct sk_buff *skb, static int _smc_pnet_dump(struct net *net, struct sk_buff *skb, u32 portid, u32 seq, u8 *pnetid, int start_idx) { - struct smc_user_pnetentry tmp_entry; struct smc_pnettable *pnettable; struct smc_pnetentry *pnetelem; - struct smc_ib_device *ibdev; - struct smcd_dev *smcd_dev; struct smc_net *sn; int idx = 0; - int ibport; /* get pnettable for namespace */ sn = net_generic(net, smc_net_id); pnettable = &sn->pnettable; - /* dump netdevices */ + /* dump pnettable entries */ read_lock(&pnettable->lock); list_for_each_entry(pnetelem, &pnettable->pnetlist, list) { if (pnetid && !smc_pnet_match(pnetelem->pnet_name, pnetid)) continue; if (idx++ < start_idx) continue; - memset(&tmp_entry, 0, sizeof(tmp_entry)); - memcpy(&tmp_entry.pnet_name, pnetelem->pnet_name, - SMC_MAX_PNETID_LEN); - tmp_entry.ndev = pnetelem->ndev; + /* if this is not the initial namespace, dump only netdev */ + if (net != &init_net && pnetelem->type != SMC_PNET_ETH) + continue; if (smc_pnet_dumpinfo(skb, portid, seq, NLM_F_MULTI, - &tmp_entry)) { + pnetelem)) { --idx; break; } } read_unlock(&pnettable->lock); - - /* if this is not the initial namespace, stop here */ - if (net != &init_net) - return idx; - - /* dump ib devices */ - spin_lock(&smc_ib_devices.lock); - list_for_each_entry(ibdev, &smc_ib_devices.list, list) { - for (ibport = 0; ibport < SMC_MAX_PORTS; ibport++) { - if (ibdev->pnetid_by_user[ibport]) { - if (pnetid && - !smc_pnet_match(ibdev->pnetid[ibport], - pnetid)) - continue; - if (idx++ < start_idx) - continue; - memset(&tmp_entry, 0, sizeof(tmp_entry)); - memcpy(&tmp_entry.pnet_name, - ibdev->pnetid[ibport], - SMC_MAX_PNETID_LEN); - tmp_entry.smcibdev = ibdev; - tmp_entry.ib_port = ibport + 1; - if (smc_pnet_dumpinfo(skb, portid, seq, - NLM_F_MULTI, - &tmp_entry)) { - --idx; - break; - } - } - } - } - spin_unlock(&smc_ib_devices.lock); - 
- /* dump smcd devices */ - spin_lock(&smcd_dev_list.lock); - list_for_each_entry(smcd_dev, &smcd_dev_list.list, list) { - if (smcd_dev->pnetid_by_user) { - if (pnetid && !smc_pnet_match(smcd_dev->pnetid, pnetid)) - continue; - if (idx++ < start_idx) - continue; - memset(&tmp_entry, 0, sizeof(tmp_entry)); - memcpy(&tmp_entry.pnet_name, smcd_dev->pnetid, - SMC_MAX_PNETID_LEN); - tmp_entry.smcd_dev = smcd_dev; - if (smc_pnet_dumpinfo(skb, portid, seq, NLM_F_MULTI, - &tmp_entry)) { - --idx; - break; - } - } - } - spin_unlock(&smcd_dev_list.lock); - return idx; } @@ -659,6 +676,9 @@ static int smc_pnet_netdev_event(struct notifier_block *this, case NETDEV_UNREGISTER: smc_pnet_remove_by_ndev(event_dev); return NOTIFY_OK; + case NETDEV_REGISTER: + smc_pnet_add_by_ndev(event_dev); + return NOTIFY_OK; default: return NOTIFY_DONE; } @@ -744,7 +764,7 @@ static int smc_pnet_find_ndev_pnetid_by_table(struct net_device *ndev, read_lock(&pnettable->lock); list_for_each_entry(pnetelem, &pnettable->pnetlist, list) { - if (ndev == pnetelem->ndev) { + if (pnetelem->type == SMC_PNET_ETH && ndev == pnetelem->ndev) { /* get pnetid of netdev device */ memcpy(pnetid, pnetelem->pnet_name, SMC_MAX_PNETID_LEN); rc = 0; @@ -755,6 +775,34 @@ static int smc_pnet_find_ndev_pnetid_by_table(struct net_device *ndev, return rc; } +/* find a roce device for the given pnetid */ +static void _smc_pnet_find_roce_by_pnetid(u8 *pnet_id, + struct smc_init_info *ini) +{ + struct smc_ib_device *ibdev; + int i; + + ini->ib_dev = NULL; + spin_lock(&smc_ib_devices.lock); + list_for_each_entry(ibdev, &smc_ib_devices.list, list) { + for (i = 1; i <= SMC_MAX_PORTS; i++) { + if (!rdma_is_port_valid(ibdev->ibdev, i)) + continue; + if (smc_pnet_match(ibdev->pnetid[i - 1], pnet_id) && + smc_ib_port_active(ibdev, i) && + !test_bit(i - 1, ibdev->ports_going_away) && + !smc_ib_determine_gid(ibdev, i, ini->vlan_id, + ini->ib_gid, NULL)) { + ini->ib_dev = ibdev; + ini->ib_port = i; + goto out; + } + } + } +out: + 
spin_unlock(&smc_ib_devices.lock); +} + /* if handshake network device belongs to a roce device, return its * IB device and port */ @@ -801,8 +849,6 @@ static void smc_pnet_find_roce_by_pnetid(struct net_device *ndev, struct smc_init_info *ini) { u8 ndev_pnetid[SMC_MAX_PNETID_LEN]; - struct smc_ib_device *ibdev; - int i; ndev = pnet_find_base_ndev(ndev); if (smc_pnetid_by_dev_port(ndev->dev.parent, ndev->dev_port, @@ -811,25 +857,7 @@ static void smc_pnet_find_roce_by_pnetid(struct net_device *ndev, smc_pnet_find_rdma_dev(ndev, ini); return; /* pnetid could not be determined */ } - - spin_lock(&smc_ib_devices.lock); - list_for_each_entry(ibdev, &smc_ib_devices.list, list) { - for (i = 1; i <= SMC_MAX_PORTS; i++) { - if (!rdma_is_port_valid(ibdev->ibdev, i)) - continue; - if (smc_pnet_match(ibdev->pnetid[i - 1], ndev_pnetid) && - smc_ib_port_active(ibdev, i) && - !test_bit(i - 1, ibdev->ports_going_away) && - !smc_ib_determine_gid(ibdev, i, ini->vlan_id, - ini->ib_gid, NULL)) { - ini->ib_dev = ibdev; - ini->ib_port = i; - goto out; - } - } - } -out: - spin_unlock(&smc_ib_devices.lock); + _smc_pnet_find_roce_by_pnetid(ndev_pnetid, ini); } static void smc_pnet_find_ism_by_pnetid(struct net_device *ndev, @@ -895,3 +923,60 @@ out_rel: out: return; } + +/* Lookup and apply a pnet table entry to the given ib device. 
+ */ +int smc_pnetid_by_table_ib(struct smc_ib_device *smcibdev, u8 ib_port) +{ + char *ib_name = smcibdev->ibdev->name; + struct smc_pnettable *pnettable; + struct smc_pnetentry *tmp_pe; + struct smc_net *sn; + int rc = -ENOENT; + + /* get pnettable for init namespace */ + sn = net_generic(&init_net, smc_net_id); + pnettable = &sn->pnettable; + + read_lock(&pnettable->lock); + list_for_each_entry(tmp_pe, &pnettable->pnetlist, list) { + if (tmp_pe->type == SMC_PNET_IB && + !strncmp(tmp_pe->ib_name, ib_name, IB_DEVICE_NAME_MAX) && + tmp_pe->ib_port == ib_port) { + smc_pnet_apply_ib(smcibdev, ib_port, tmp_pe->pnet_name); + rc = 0; + break; + } + } + read_unlock(&pnettable->lock); + + return rc; +} + +/* Lookup and apply a pnet table entry to the given smcd device. + */ +int smc_pnetid_by_table_smcd(struct smcd_dev *smcddev) +{ + const char *ib_name = dev_name(&smcddev->dev); + struct smc_pnettable *pnettable; + struct smc_pnetentry *tmp_pe; + struct smc_net *sn; + int rc = -ENOENT; + + /* get pnettable for init namespace */ + sn = net_generic(&init_net, smc_net_id); + pnettable = &sn->pnettable; + + read_lock(&pnettable->lock); + list_for_each_entry(tmp_pe, &pnettable->pnetlist, list) { + if (tmp_pe->type == SMC_PNET_IB && + !strncmp(tmp_pe->ib_name, ib_name, IB_DEVICE_NAME_MAX)) { + smc_pnet_apply_smcd(smcddev, tmp_pe->pnet_name); + rc = 0; + break; + } + } + read_unlock(&pnettable->lock); + + return rc; +} diff --git a/net/smc/smc_pnet.h b/net/smc/smc_pnet.h index 4564e4d69c2e..ea207f8fc6f7 100644 --- a/net/smc/smc_pnet.h +++ b/net/smc/smc_pnet.h @@ -46,5 +46,7 @@ void smc_pnet_exit(void); void smc_pnet_net_exit(struct net *net); void smc_pnet_find_roce_resource(struct sock *sk, struct smc_init_info *ini); void smc_pnet_find_ism_resource(struct sock *sk, struct smc_init_info *ini); +int smc_pnetid_by_table_ib(struct smc_ib_device *smcibdev, u8 ib_port); +int smc_pnetid_by_table_smcd(struct smcd_dev *smcd); #endif diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c 
index 9f1ade86d70e..d74bfe6a90f1 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -269,19 +269,18 @@ static int smc_tx_rdma_write(struct smc_connection *conn, int peer_rmbe_offset, int num_sges, struct ib_rdma_wr *rdma_wr) { struct smc_link_group *lgr = conn->lgr; - struct smc_link *link; + struct smc_link *link = conn->lnk; int rc; - link = &lgr->lnk[SMC_SINGLE_LINK]; rdma_wr->wr.wr_id = smc_wr_tx_get_next_wr_id(link); rdma_wr->wr.num_sge = num_sges; rdma_wr->remote_addr = - lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].dma_addr + + lgr->rtokens[conn->rtoken_idx][link->link_idx].dma_addr + /* RMBE within RMB */ conn->tx_off + /* offset within RMBE */ peer_rmbe_offset; - rdma_wr->rkey = lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].rkey; + rdma_wr->rkey = lgr->rtokens[conn->rtoken_idx][link->link_idx].rkey; rc = ib_post_send(link->roce_qp, &rdma_wr->wr, NULL); if (rc) smc_lgr_terminate_sched(lgr); @@ -310,8 +309,10 @@ static int smcr_tx_rdma_writes(struct smc_connection *conn, size_t len, size_t dst_off, size_t dst_len, struct smc_rdma_wr *wr_rdma_buf) { + struct smc_link *link = conn->lnk; + dma_addr_t dma_addr = - sg_dma_address(conn->sndbuf_desc->sgt[SMC_SINGLE_LINK].sgl); + sg_dma_address(conn->sndbuf_desc->sgt[link->link_idx].sgl); int src_len_sum = src_len, dst_len_sum = dst_len; int sent_count = src_off; int srcchunk, dstchunk; @@ -507,7 +508,7 @@ static int smcr_tx_sndbuf_nonempty(struct smc_connection *conn) if (!pflags->urg_data_present) { rc = smc_tx_rdma_writes(conn, wr_rdma_buf); if (rc) { - smc_wr_tx_put_slot(&conn->lgr->lnk[SMC_SINGLE_LINK], + smc_wr_tx_put_slot(conn->lnk, (struct smc_wr_tx_pend_priv *)pend); goto out_unlock; } diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index 337ee52ad3d3..93223628c002 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -207,7 +207,7 @@ int smc_wr_tx_get_free_slot(struct smc_link *link, } else { rc = wait_event_interruptible_timeout( link->wr_tx_wait, - link->state == SMC_LNK_INACTIVE || + 
!smc_link_usable(link) || lgr->terminating || (smc_wr_tx_get_free_slot_index(link, &idx) != -EBUSY), SMC_WR_TX_WAIT_FREE_SLOT_TIME); diff --git a/tools/bpf/bpf_asm.c b/tools/bpf/bpf_asm.c index e5f95e3eede3..0063c3c029e7 100644 --- a/tools/bpf/bpf_asm.c +++ b/tools/bpf/bpf_asm.c @@ -11,7 +11,7 @@ * * How to get into it: * - * 1) read Documentation/networking/filter.txt + * 1) read Documentation/networking/filter.rst * 2) Run `bpf_asm [-c] <filter-prog file>` to translate into binary * blob that is loadable with xt_bpf, cls_bpf et al. Note: -c will * pretty print a C-like construct. diff --git a/tools/bpf/bpf_dbg.c b/tools/bpf/bpf_dbg.c index 9d3766e653a9..a0ebcdf59c31 100644 --- a/tools/bpf/bpf_dbg.c +++ b/tools/bpf/bpf_dbg.c @@ -13,7 +13,7 @@ * for making a verdict when multiple simple BPF programs are combined * into one in order to prevent parsing same headers multiple times. * - * More on how to debug BPF opcodes see Documentation/networking/filter.txt + * More on how to debug BPF opcodes see Documentation/networking/filter.rst * which is the main document on BPF. 
Mini howto for getting started: * * 1) `./bpf_dbg` to enter the shell (shell cmds denoted with '>'): diff --git a/tools/include/uapi/linux/if_link.h b/tools/include/uapi/linux/if_link.h index ca6665ea758a..cafedbbfefbe 100644 --- a/tools/include/uapi/linux/if_link.h +++ b/tools/include/uapi/linux/if_link.h @@ -343,6 +343,7 @@ enum { IFLA_BRPORT_NEIGH_SUPPRESS, IFLA_BRPORT_ISOLATED, IFLA_BRPORT_BACKUP_PORT, + IFLA_BRPORT_MRP_RING_OPEN, __IFLA_BRPORT_MAX }; #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1) diff --git a/tools/testing/selftests/kselftest_harness.h b/tools/testing/selftests/kselftest_harness.h index 2bb8c81fc0b4..c9f03ef93338 100644 --- a/tools/testing/selftests/kselftest_harness.h +++ b/tools/testing/selftests/kselftest_harness.h @@ -168,9 +168,17 @@ #define __TEST_IMPL(test_name, _signal) \ static void test_name(struct __test_metadata *_metadata); \ + static inline void wrapper_##test_name( \ + struct __test_metadata *_metadata, \ + struct __fixture_variant_metadata *variant) \ + { \ + test_name(_metadata); \ + } \ static struct __test_metadata _##test_name##_object = \ - { .name = "global." #test_name, \ - .fn = &test_name, .termsig = _signal, \ + { .name = #test_name, \ + .fn = &wrapper_##test_name, \ + .fixture = &_fixture_global, \ + .termsig = _signal, \ .timeout = TEST_TIMEOUT_DEFAULT, }; \ static void __attribute__((constructor)) _register_##test_name(void) \ { \ @@ -212,10 +220,13 @@ * populated and cleaned up using FIXTURE_SETUP() and FIXTURE_TEARDOWN(). 
*/ #define FIXTURE(fixture_name) \ + FIXTURE_VARIANT(fixture_name); \ + static struct __fixture_metadata _##fixture_name##_fixture_object = \ + { .name = #fixture_name, }; \ static void __attribute__((constructor)) \ _register_##fixture_name##_data(void) \ { \ - __fixture_count++; \ + __register_fixture(&_##fixture_name##_fixture_object); \ } \ FIXTURE_DATA(fixture_name) @@ -241,7 +252,10 @@ #define FIXTURE_SETUP(fixture_name) \ void fixture_name##_setup( \ struct __test_metadata __attribute__((unused)) *_metadata, \ - FIXTURE_DATA(fixture_name) __attribute__((unused)) *self) + FIXTURE_DATA(fixture_name) __attribute__((unused)) *self, \ + const FIXTURE_VARIANT(fixture_name) \ + __attribute__((unused)) *variant) + /** * FIXTURE_TEARDOWN(fixture_name) * *_metadata* is included so that EXPECT_* and ASSERT_* work correctly. @@ -264,6 +278,59 @@ FIXTURE_DATA(fixture_name) __attribute__((unused)) *self) /** + * FIXTURE_VARIANT(fixture_name) - Optionally called once per fixture + * to declare fixture variant + * + * @fixture_name: fixture name + * + * .. code-block:: c + * + * FIXTURE_VARIANT(datatype name) { + * type property1; + * ... + * }; + * + * Defines type of constant parameters provided to FIXTURE_SETUP() and TEST_F() + * as *variant*. Variants allow the same tests to be run with different + * arguments. + */ +#define FIXTURE_VARIANT(fixture_name) struct _fixture_variant_##fixture_name + +/** + * FIXTURE_VARIANT_ADD(fixture_name, variant_name) - Called once per fixture + * variant to setup and register the data + * + * @fixture_name: fixture name + * @variant_name: name of the parameter set + * + * .. code-block:: c + * + * FIXTURE_ADD(datatype name) { + * .property1 = val1; + * ... + * }; + * + * Defines a variant of the test fixture, provided to FIXTURE_SETUP() and + * TEST_F() as *variant*. Tests of each fixture will be run once for each + * variant. 
+ */ +#define FIXTURE_VARIANT_ADD(fixture_name, variant_name) \ + extern FIXTURE_VARIANT(fixture_name) \ + _##fixture_name##_##variant_name##_variant; \ + static struct __fixture_variant_metadata \ + _##fixture_name##_##variant_name##_object = \ + { .name = #variant_name, \ + .data = &_##fixture_name##_##variant_name##_variant}; \ + static void __attribute__((constructor)) \ + _register_##fixture_name##_##variant_name(void) \ + { \ + __register_fixture_variant(&_##fixture_name##_fixture_object, \ + &_##fixture_name##_##variant_name##_object); \ + } \ + FIXTURE_VARIANT(fixture_name) \ + _##fixture_name##_##variant_name##_variant = + +/** * TEST_F(fixture_name, test_name) - Emits test registration and helpers for * fixture-based test cases * @@ -293,24 +360,27 @@ #define __TEST_F_IMPL(fixture_name, test_name, signal, tmout) \ static void fixture_name##_##test_name( \ struct __test_metadata *_metadata, \ - FIXTURE_DATA(fixture_name) *self); \ + FIXTURE_DATA(fixture_name) *self, \ + const FIXTURE_VARIANT(fixture_name) *variant); \ static inline void wrapper_##fixture_name##_##test_name( \ - struct __test_metadata *_metadata) \ + struct __test_metadata *_metadata, \ + struct __fixture_variant_metadata *variant) \ { \ /* fixture data is alloced, setup, and torn down per call. */ \ FIXTURE_DATA(fixture_name) self; \ memset(&self, 0, sizeof(FIXTURE_DATA(fixture_name))); \ - fixture_name##_setup(_metadata, &self); \ + fixture_name##_setup(_metadata, &self, variant->data); \ /* Let setup failure terminate early. */ \ if (!_metadata->passed) \ return; \ - fixture_name##_##test_name(_metadata, &self); \ + fixture_name##_##test_name(_metadata, &self, variant->data); \ fixture_name##_teardown(_metadata, &self); \ } \ static struct __test_metadata \ _##fixture_name##_##test_name##_object = { \ - .name = #fixture_name "." 
#test_name, \ + .name = #test_name, \ .fn = &wrapper_##fixture_name##_##test_name, \ + .fixture = &_##fixture_name##_fixture_object, \ .termsig = signal, \ .timeout = tmout, \ }; \ @@ -321,7 +391,9 @@ } \ static void fixture_name##_##test_name( \ struct __test_metadata __attribute__((unused)) *_metadata, \ - FIXTURE_DATA(fixture_name) __attribute__((unused)) *self) + FIXTURE_DATA(fixture_name) __attribute__((unused)) *self, \ + const FIXTURE_VARIANT(fixture_name) \ + __attribute__((unused)) *variant) /** * TEST_HARNESS_MAIN - Simple wrapper to run the test harness @@ -631,11 +703,74 @@ } \ } while (0); OPTIONAL_HANDLER(_assert) +/* List helpers */ +#define __LIST_APPEND(head, item) \ +{ \ + /* Circular linked list where only prev is circular. */ \ + if (head == NULL) { \ + head = item; \ + item->next = NULL; \ + item->prev = item; \ + return; \ + } \ + if (__constructor_order == _CONSTRUCTOR_ORDER_FORWARD) { \ + item->next = NULL; \ + item->prev = head->prev; \ + item->prev->next = item; \ + head->prev = item; \ + } else { \ + item->next = head; \ + item->next->prev = item; \ + item->prev = item; \ + head = item; \ + } \ +} + +struct __test_metadata; +struct __fixture_variant_metadata; + +/* Contains all the information about a fixture. 
*/ +struct __fixture_metadata { + const char *name; + struct __test_metadata *tests; + struct __fixture_variant_metadata *variant; + struct __fixture_metadata *prev, *next; +} _fixture_global __attribute__((unused)) = { + .name = "global", + .prev = &_fixture_global, +}; + +static struct __fixture_metadata *__fixture_list = &_fixture_global; +static int __constructor_order; + +#define _CONSTRUCTOR_ORDER_FORWARD 1 +#define _CONSTRUCTOR_ORDER_BACKWARD -1 + +static inline void __register_fixture(struct __fixture_metadata *f) +{ + __LIST_APPEND(__fixture_list, f); +} + +struct __fixture_variant_metadata { + const char *name; + const void *data; + struct __fixture_variant_metadata *prev, *next; +}; + +static inline void +__register_fixture_variant(struct __fixture_metadata *f, + struct __fixture_variant_metadata *variant) +{ + __LIST_APPEND(f->variant, variant); +} + /* Contains all the information for test execution and status checking. */ struct __test_metadata { const char *name; - void (*fn)(struct __test_metadata *); + void (*fn)(struct __test_metadata *, + struct __fixture_variant_metadata *); pid_t pid; /* pid of test when being run */ + struct __fixture_metadata *fixture; int termsig; int passed; int trigger; /* extra handler after the evaluation */ @@ -646,15 +781,6 @@ struct __test_metadata { struct __test_metadata *prev, *next; }; -/* Storage for the (global) tests to be run. */ -static struct __test_metadata *__test_list; -static unsigned int __test_count; -static unsigned int __fixture_count; -static int __constructor_order; - -#define _CONSTRUCTOR_ORDER_FORWARD 1 -#define _CONSTRUCTOR_ORDER_BACKWARD -1 - /* * Since constructors are called in reverse order, reverse the test * list so tests are run in source declaration order. @@ -666,25 +792,7 @@ static int __constructor_order; */ static inline void __register_test(struct __test_metadata *t) { - __test_count++; - /* Circular linked list where only prev is circular. 
*/ - if (__test_list == NULL) { - __test_list = t; - t->next = NULL; - t->prev = t; - return; - } - if (__constructor_order == _CONSTRUCTOR_ORDER_FORWARD) { - t->next = NULL; - t->prev = __test_list->prev; - t->prev->next = t; - __test_list->prev = t; - } else { - t->next = __test_list; - t->next->prev = t; - t->prev = t; - __test_list = t; - } + __LIST_APPEND(t->fixture->tests, t); } static inline int __bail(int for_realz, bool no_print, __u8 step) @@ -790,43 +898,67 @@ void __wait_for_test(struct __test_metadata *t) } } -void __run_test(struct __test_metadata *t) +void __run_test(struct __fixture_metadata *f, + struct __fixture_variant_metadata *variant, + struct __test_metadata *t) { + /* reset test struct */ t->passed = 1; t->trigger = 0; - printf("[ RUN ] %s\n", t->name); + t->step = 0; + t->no_print = 0; + + printf("[ RUN ] %s%s%s.%s\n", + f->name, variant->name[0] ? "." : "", variant->name, t->name); t->pid = fork(); if (t->pid < 0) { printf("ERROR SPAWNING TEST CHILD\n"); t->passed = 0; } else if (t->pid == 0) { - t->fn(t); + t->fn(t, variant); /* return the step that failed or 0 */ _exit(t->passed ? 0 : t->step); } else { __wait_for_test(t); } - printf("[ %4s ] %s\n", (t->passed ? "OK" : "FAIL"), t->name); + printf("[ %4s ] %s%s%s.%s\n", (t->passed ? "OK" : "FAIL"), + f->name, variant->name[0] ? "." : "", variant->name, t->name); } static int test_harness_run(int __attribute__((unused)) argc, char __attribute__((unused)) **argv) { + struct __fixture_variant_metadata no_variant = { .name = "", }; + struct __fixture_variant_metadata *v; + struct __fixture_metadata *f; struct __test_metadata *t; int ret = 0; + unsigned int case_count = 0, test_count = 0; unsigned int count = 0; unsigned int pass_count = 0; + for (f = __fixture_list; f; f = f->next) { + for (v = f->variant ?: &no_variant; v; v = v->next) { + case_count++; + for (t = f->tests; t; t = t->next) + test_count++; + } + } + /* TODO(wad) add optional arguments similar to gtest. 
*/ printf("[==========] Running %u tests from %u test cases.\n", - __test_count, __fixture_count + 1); - for (t = __test_list; t; t = t->next) { - count++; - __run_test(t); - if (t->passed) - pass_count++; - else - ret = 1; + test_count, case_count); + for (f = __fixture_list; f; f = f->next) { + for (v = f->variant ?: &no_variant; v; v = v->next) { + for (t = f->tests; t; t = t->next) { + count++; + __run_test(f, v, t); + if (t->passed) + pass_count++; + else + ret = 1; + } + } } printf("[==========] %u / %u tests passed.\n", pass_count, count); printf("[ %s ]\n", (ret ? "FAILED" : "PASSED")); diff --git a/tools/testing/selftests/net/fib_nexthops.sh b/tools/testing/selftests/net/fib_nexthops.sh index b785241127df..dd0e5fec6367 100755 --- a/tools/testing/selftests/net/fib_nexthops.sh +++ b/tools/testing/selftests/net/fib_nexthops.sh @@ -19,8 +19,8 @@ ret=0 ksft_skip=4 # all tests in this script. Can be overridden with -t option -IPV4_TESTS="ipv4_fcnal ipv4_grp_fcnal ipv4_withv6_fcnal ipv4_fcnal_runtime" -IPV6_TESTS="ipv6_fcnal ipv6_grp_fcnal ipv6_fcnal_runtime" +IPV4_TESTS="ipv4_fcnal ipv4_grp_fcnal ipv4_withv6_fcnal ipv4_fcnal_runtime ipv4_compat_mode" +IPV6_TESTS="ipv6_fcnal ipv6_grp_fcnal ipv6_fcnal_runtime ipv6_compat_mode" ALL_TESTS="basic ${IPV4_TESTS} ${IPV6_TESTS}" TESTS="${ALL_TESTS}" @@ -253,6 +253,33 @@ check_route6() check_output "${out}" "${expected}" } +start_ip_monitor() +{ + local mtype=$1 + + # start the monitor in the background + tmpfile=`mktemp /var/run/nexthoptestXXX` + mpid=`($IP monitor $mtype > $tmpfile & echo $!) 2>/dev/null` + sleep 0.2 + echo "$mpid $tmpfile" +} + +stop_ip_monitor() +{ + local mpid=$1 + local tmpfile=$2 + local el=$3 + + # check the monitor results + kill $mpid + lines=`wc -l $tmpfile | cut "-d " -f1` + test $lines -eq $el + rc=$? 
+ rm -rf $tmpfile + + return $rc +} + ################################################################################ # basic operations (add, delete, replace) on nexthops and nexthop groups # @@ -883,6 +910,173 @@ ipv4_fcnal_runtime() log_test $? 0 "IPv4 route with MPLS encap, v6 gw - check" } +sysctl_nexthop_compat_mode_check() +{ + local sysctlname="net.ipv4.nexthop_compat_mode" + local lprefix=$1 + + IPE="ip netns exec me" + + $IPE sysctl -q $sysctlname 2>&1 >/dev/null + if [ $? -ne 0 ]; then + echo "SKIP: kernel lacks nexthop compat mode sysctl control" + return $ksft_skip + fi + + out=$($IPE sysctl $sysctlname 2>/dev/null) + log_test $? 0 "$lprefix default nexthop compat mode check" + check_output "${out}" "$sysctlname = 1" +} + +sysctl_nexthop_compat_mode_set() +{ + local sysctlname="net.ipv4.nexthop_compat_mode" + local mode=$1 + local lprefix=$2 + + IPE="ip netns exec me" + + out=$($IPE sysctl -w $sysctlname=$mode) + log_test $? 0 "$lprefix set compat mode - $mode" + check_output "${out}" "net.ipv4.nexthop_compat_mode = $mode" +} + +ipv6_compat_mode() +{ + local rc + + echo + echo "IPv6 nexthop api compat mode test" + echo "--------------------------------" + + sysctl_nexthop_compat_mode_check "IPv6" + if [ $? -eq $ksft_skip ]; then + return $ksft_skip + fi + + run_cmd "$IP nexthop add id 62 via 2001:db8:91::2 dev veth1" + run_cmd "$IP nexthop add id 63 via 2001:db8:91::3 dev veth1" + run_cmd "$IP nexthop add id 122 group 62/63" + ipmout=$(start_ip_monitor route) + + run_cmd "$IP -6 ro add 2001:db8:101::1/128 nhid 122" + # route add notification should contain expanded nexthops + stop_ip_monitor $ipmout 3 + log_test $? 0 "IPv6 compat mode on - route add notification" + + # route dump should contain expanded nexthops + check_route6 "2001:db8:101::1" "2001:db8:101::1 nhid 122 metric 1024 pref medium nexthop via 2001:db8:91::2 dev veth1 weight 1 nexthop via 2001:db8:91::3 dev veth1 weight 1" + log_test $? 
0 "IPv6 compat mode on - route dump" + + # change in nexthop group should generate route notification + run_cmd "$IP nexthop add id 64 via 2001:db8:91::4 dev veth1" + ipmout=$(start_ip_monitor route) + run_cmd "$IP nexthop replace id 122 group 62/64" + stop_ip_monitor $ipmout 3 + + log_test $? 0 "IPv6 compat mode on - nexthop change" + + # set compat mode off + sysctl_nexthop_compat_mode_set 0 "IPv6" + + run_cmd "$IP -6 ro del 2001:db8:101::1/128 nhid 122" + + run_cmd "$IP nexthop add id 62 via 2001:db8:91::2 dev veth1" + run_cmd "$IP nexthop add id 63 via 2001:db8:91::3 dev veth1" + run_cmd "$IP nexthop add id 122 group 62/63" + ipmout=$(start_ip_monitor route) + + run_cmd "$IP -6 ro add 2001:db8:101::1/128 nhid 122" + # route add notification should not contain expanded nexthops + stop_ip_monitor $ipmout 1 + log_test $? 0 "IPv6 compat mode off - route add notification" + + # route dump should not contain expanded nexthops + check_route6 "2001:db8:101::1" "2001:db8:101::1 nhid 122 metric 1024 pref medium" + log_test $? 0 "IPv6 compat mode off - route dump" + + # change in nexthop group should not generate route notification + run_cmd "$IP nexthop add id 64 via 2001:db8:91::4 dev veth1" + ipmout=$(start_ip_monitor route) + run_cmd "$IP nexthop replace id 122 group 62/64" + stop_ip_monitor $ipmout 0 + log_test $? 0 "IPv6 compat mode off - nexthop change" + + # nexthop delete should not generate route notification + ipmout=$(start_ip_monitor route) + run_cmd "$IP nexthop del id 122" + stop_ip_monitor $ipmout 0 + log_test $? 0 "IPv6 compat mode off - nexthop delete" + + # set compat mode back on + sysctl_nexthop_compat_mode_set 1 "IPv6" +} + +ipv4_compat_mode() +{ + local rc + + echo + echo "IPv4 nexthop api compat mode" + echo "----------------------------" + + sysctl_nexthop_compat_mode_check "IPv4" + if [ $? 
-eq $ksft_skip ]; then + return $ksft_skip + fi + + run_cmd "$IP nexthop add id 21 via 172.16.1.2 dev veth1" + run_cmd "$IP nexthop add id 22 via 172.16.1.2 dev veth1" + run_cmd "$IP nexthop add id 122 group 21/22" + ipmout=$(start_ip_monitor route) + + run_cmd "$IP ro add 172.16.101.1/32 nhid 122" + stop_ip_monitor $ipmout 3 + + # route add notification should contain expanded nexthops + log_test $? 0 "IPv4 compat mode on - route add notification" + + # route dump should contain expanded nexthops + check_route "172.16.101.1" "172.16.101.1 nhid 122 nexthop via 172.16.1.2 dev veth1 weight 1 nexthop via 172.16.1.2 dev veth1 weight 1" + log_test $? 0 "IPv4 compat mode on - route dump" + + # change in nexthop group should generate route notification + run_cmd "$IP nexthop add id 23 via 172.16.1.3 dev veth1" + ipmout=$(start_ip_monitor route) + run_cmd "$IP nexthop replace id 122 group 21/23" + stop_ip_monitor $ipmout 3 + log_test $? 0 "IPv4 compat mode on - nexthop change" + + sysctl_nexthop_compat_mode_set 0 "IPv4" + + # cleanup + run_cmd "$IP ro del 172.16.101.1/32 nhid 122" + + ipmout=$(start_ip_monitor route) + run_cmd "$IP ro add 172.16.101.1/32 nhid 122" + stop_ip_monitor $ipmout 1 + # route add notification should not contain expanded nexthops + log_test $? 0 "IPv4 compat mode off - route add notification" + + # route dump should not contain expanded nexthops + check_route "172.16.101.1" "172.16.101.1 nhid 122" + log_test $? 0 "IPv4 compat mode off - route dump" + + # change in nexthop group should not generate route notification + ipmout=$(start_ip_monitor route) + run_cmd "$IP nexthop replace id 122 group 21/22" + stop_ip_monitor $ipmout 0 + log_test $? 0 "IPv4 compat mode off - nexthop change" + + # nexthop delete should not generate route notification + ipmout=$(start_ip_monitor route) + run_cmd "$IP nexthop del id 122" + stop_ip_monitor $ipmout 0 + log_test $? 
0 "IPv4 compat mode off - nexthop delete" + + sysctl_nexthop_compat_mode_set 1 "IPv4" +} + basic() { echo diff --git a/tools/testing/selftests/net/forwarding/tc_actions.sh b/tools/testing/selftests/net/forwarding/tc_actions.sh index 813d02d1939d..d9eca227136b 100755 --- a/tools/testing/selftests/net/forwarding/tc_actions.sh +++ b/tools/testing/selftests/net/forwarding/tc_actions.sh @@ -2,7 +2,8 @@ # SPDX-License-Identifier: GPL-2.0 ALL_TESTS="gact_drop_and_ok_test mirred_egress_redirect_test \ - mirred_egress_mirror_test gact_trap_test" + mirred_egress_mirror_test matchall_mirred_egress_mirror_test \ + gact_trap_test" NUM_NETIFS=4 source tc_common.sh source lib.sh @@ -50,6 +51,9 @@ switch_destroy() mirred_egress_test() { local action=$1 + local protocol=$2 + local classifier=$3 + local classifier_args=$4 RET=0 @@ -62,9 +66,9 @@ mirred_egress_test() tc_check_packets "dev $h2 ingress" 101 1 check_fail $? "Matched without redirect rule inserted" - tc filter add dev $swp1 ingress protocol ip pref 1 handle 101 flower \ - $tcflags dst_ip 192.0.2.2 action mirred egress $action \ - dev $swp2 + tc filter add dev $swp1 ingress protocol $protocol pref 1 handle 101 \ + $classifier $tcflags $classifier_args \ + action mirred egress $action dev $swp2 $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac -A 192.0.2.1 -B 192.0.2.2 \ -t ip -q @@ -72,10 +76,11 @@ mirred_egress_test() tc_check_packets "dev $h2 ingress" 101 1 check_err $? 
"Did not match incoming $action packet" - tc filter del dev $swp1 ingress protocol ip pref 1 handle 101 flower + tc filter del dev $swp1 ingress protocol $protocol pref 1 handle 101 \ + $classifier tc filter del dev $h2 ingress protocol ip pref 1 handle 101 flower - log_test "mirred egress $action ($tcflags)" + log_test "mirred egress $classifier $action ($tcflags)" } gact_drop_and_ok_test() @@ -187,12 +192,17 @@ cleanup() mirred_egress_redirect_test() { - mirred_egress_test "redirect" + mirred_egress_test "redirect" "ip" "flower" "dst_ip 192.0.2.2" } mirred_egress_mirror_test() { - mirred_egress_test "mirror" + mirred_egress_test "mirror" "ip" "flower" "dst_ip 192.0.2.2" +} + +matchall_mirred_egress_mirror_test() +{ + mirred_egress_test "mirror" "all" "matchall" "" } trap cleanup EXIT diff --git a/tools/testing/selftests/net/tls.c b/tools/testing/selftests/net/tls.c index 0ea44d975b6c..c5282e62df75 100644 --- a/tools/testing/selftests/net/tls.c +++ b/tools/testing/selftests/net/tls.c @@ -101,6 +101,21 @@ FIXTURE(tls) bool notls; }; +FIXTURE_VARIANT(tls) +{ + unsigned int tls_version; +}; + +FIXTURE_VARIANT_ADD(tls, 12) +{ + .tls_version = TLS_1_2_VERSION, +}; + +FIXTURE_VARIANT_ADD(tls, 13) +{ + .tls_version = TLS_1_3_VERSION, +}; + FIXTURE_SETUP(tls) { struct tls12_crypto_info_aes_gcm_128 tls12; @@ -112,7 +127,7 @@ FIXTURE_SETUP(tls) len = sizeof(addr); memset(&tls12, 0, sizeof(tls12)); - tls12.info.version = TLS_1_3_VERSION; + tls12.info.version = variant->tls_version; tls12.info.cipher_type = TLS_CIPHER_AES_GCM_128; addr.sin_family = AF_INET; @@ -733,7 +748,7 @@ TEST_F(tls, bidir) struct tls12_crypto_info_aes_gcm_128 tls12; memset(&tls12, 0, sizeof(tls12)); - tls12.info.version = TLS_1_3_VERSION; + tls12.info.version = variant->tls_version; tls12.info.cipher_type = TLS_CIPHER_AES_GCM_128; ret = setsockopt(self->fd, SOL_TLS, TLS_RX, &tls12, @@ -1258,78 +1273,4 @@ TEST(keysizes) { close(cfd); } -TEST(tls12) { - int fd, cfd; - bool notls; - - struct 
tls12_crypto_info_aes_gcm_128 tls12; - struct sockaddr_in addr; - socklen_t len; - int sfd, ret; - - notls = false; - len = sizeof(addr); - - memset(&tls12, 0, sizeof(tls12)); - tls12.info.version = TLS_1_2_VERSION; - tls12.info.cipher_type = TLS_CIPHER_AES_GCM_128; - - addr.sin_family = AF_INET; - addr.sin_addr.s_addr = htonl(INADDR_ANY); - addr.sin_port = 0; - - fd = socket(AF_INET, SOCK_STREAM, 0); - sfd = socket(AF_INET, SOCK_STREAM, 0); - - ret = bind(sfd, &addr, sizeof(addr)); - ASSERT_EQ(ret, 0); - ret = listen(sfd, 10); - ASSERT_EQ(ret, 0); - - ret = getsockname(sfd, &addr, &len); - ASSERT_EQ(ret, 0); - - ret = connect(fd, &addr, sizeof(addr)); - ASSERT_EQ(ret, 0); - - ret = setsockopt(fd, IPPROTO_TCP, TCP_ULP, "tls", sizeof("tls")); - if (ret != 0) { - notls = true; - printf("Failure setting TCP_ULP, testing without tls\n"); - } - - if (!notls) { - ret = setsockopt(fd, SOL_TLS, TLS_TX, &tls12, - sizeof(tls12)); - ASSERT_EQ(ret, 0); - } - - cfd = accept(sfd, &addr, &len); - ASSERT_GE(cfd, 0); - - if (!notls) { - ret = setsockopt(cfd, IPPROTO_TCP, TCP_ULP, "tls", - sizeof("tls")); - ASSERT_EQ(ret, 0); - - ret = setsockopt(cfd, SOL_TLS, TLS_RX, &tls12, - sizeof(tls12)); - ASSERT_EQ(ret, 0); - } - - close(sfd); - - char const *test_str = "test_read"; - int send_len = 10; - char buf[10]; - - send_len = strlen(test_str) + 1; - EXPECT_EQ(send(fd, test_str, send_len, 0), send_len); - EXPECT_NE(recv(cfd, buf, send_len, 0), -1); - EXPECT_EQ(memcmp(buf, test_str, send_len), 0); - - close(fd); - close(cfd); -} - TEST_HARNESS_MAIN |