1 files changed, 322 insertions, 48 deletions
diff --git a/src/erasure-code/isa/isa-l/include/erasure_code.h b/src/erasure-code/isa/isa-l/include/erasure_code.h
index 0f3b6db0825..53e480f0193 100644
--- a/src/erasure-code/isa/isa-l/include/erasure_code.h
+++ b/src/erasure-code/isa/isa-l/include/erasure_code.h
@@ -1,5 +1,5 @@
 /**********************************************************************
-  Copyright(c) 2011-2014 Intel Corporation All rights reserved.
+  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
 
   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
@@ -74,73 +74,128 @@ extern "C" {
 void ec_init_tables(int k, int rows, unsigned char* a, unsigned char* gftbls);
 
 /**
- * @brief Generate or decode erasure codes on blocks of data.
+ * @brief Generate or decode erasure codes on blocks of data, runs appropriate version.
  *
  * Given a list of source data blocks, generate one or multiple blocks of
  * encoded data as specified by a matrix of GF(2^8) coefficients. When given a
  * suitable set of coefficients, this function will perform the fast generation
  * or decoding of Reed-Solomon type erasure codes.
  *
- * @requires SSE4.1
+ * This function determines what instruction sets are enabled and
+ * selects the appropriate version at runtime.
+ *
  * @param len    Length of each block of data (vector) of source or dest data.
  * @param k      The number of vector sources or rows in the generator matrix
  * 		 for coding.
  * @param rows   The number of output vectors to concurrently encode/decode.
  * @param gftbls Pointer to array of input tables generated from coding
- *               coefficients in ec_init_tables(). Must be of size 32*k*rows
+ * 		 coefficients in ec_init_tables(). Must be of size 32*k*rows
  * @param data   Array of pointers to source input buffers.
  * @param coding Array of pointers to coded output buffers.
  * @returns none
  */
 
-void ec_encode_data_sse(int len, int k, int rows, unsigned char *gftbls, unsigned char **data, unsigned char **coding);
+void ec_encode_data(int len, int k, int rows, unsigned char *gftbls, unsigned char **data,
+		    unsigned char **coding);
 
+/**
+ * @brief Generate or decode erasure codes on blocks of data.
+ *
+ * Arch specific version of ec_encode_data() with same parameters.
+ * @requires SSE4.1
+ */
+void ec_encode_data_sse(int len, int k, int rows, unsigned char *gftbls, unsigned char **data,
+			unsigned char **coding);
 
 /**
- * @brief Generate or decode erasure codes on blocks of data, runs appropriate version.
+ * @brief Generate or decode erasure codes on blocks of data.
  *
- * Given a list of source data blocks, generate one or multiple blocks of
- * encoded data as specified by a matrix of GF(2^8) coefficients. When given a
- * suitable set of coefficients, this function will perform the fast generation
- * or decoding of Reed-Solomon type erasure codes.
+ * Arch specific version of ec_encode_data() with same parameters.
+ * @requires AVX
+ */
+void ec_encode_data_avx(int len, int k, int rows, unsigned char *gftbls, unsigned char **data,
+			unsigned char **coding);
+
+/**
+ * @brief Generate or decode erasure codes on blocks of data.
  *
- * This function determines what instruction sets are enabled and
- * selects the appropriate version at runtime.
+ * Arch specific version of ec_encode_data() with same parameters.
+ * @requires AVX2
+ */
+void ec_encode_data_avx2(int len, int k, int rows, unsigned char *gftbls, unsigned char **data,
+			 unsigned char **coding);
+
+/**
+ * @brief Generate or decode erasure codes on blocks of data, runs baseline version.
+ *
+ * Baseline version of ec_encode_data() with same parameters.
+ */
+void ec_encode_data_base(int len, int srcs, int dests, unsigned char *v, unsigned char **src,
+			 unsigned char **dest);
+
+/**
+ * @brief Generate update for encode or decode of erasure codes from single source, runs appropriate version.
+ *
+ * Given one source data block, update one or multiple blocks of encoded data as
+ * specified by a matrix of GF(2^8) coefficients. When given a suitable set of
+ * coefficients, this function will perform the fast generation or decoding of
+ * Reed-Solomon type erasure codes from one input source at a time.
+ *
+ * This function determines what instruction sets are enabled and selects the
+ * appropriate version at runtime.
  *
  * @param len    Length of each block of data (vector) of source or dest data.
  * @param k      The number of vector sources or rows in the generator matrix
  * 		 for coding.
  * @param rows   The number of output vectors to concurrently encode/decode.
- * @param gftbls Pointer to array of input tables generated from coding
+ * @param vec_i  The vector index corresponding to the single input source.
+ * @param g_tbls Pointer to array of input tables generated from coding
  * 		 coefficients in ec_init_tables(). Must be of size 32*k*rows
- * @param data   Array of pointers to source input buffers.
+ * @param data   Pointer to single input source used to update output parity.
  * @param coding Array of pointers to coded output buffers.
  * @returns none
  */
+void ec_encode_data_update(int len, int k, int rows, int vec_i, unsigned char *g_tbls,
+			   unsigned char *data, unsigned char **coding);
 
-void ec_encode_data(int len, int k, int rows, unsigned char *gftbls, unsigned char **data, unsigned char **coding);
+/**
+ * @brief Generate update for encode or decode of erasure codes from single source.
+ *
+ * Arch specific version of ec_encode_data_update() with same parameters.
+ * @requires SSE4.1
+ */
 
+void ec_encode_data_update_sse(int len, int k, int rows, int vec_i, unsigned char *g_tbls,
+			       unsigned char *data, unsigned char **coding);
 
 /**
- * @brief Generate or decode erasure codes on blocks of data, runs baseline version.
+ * @brief Generate update for encode or decode of erasure codes from single source.
  *
- * Given a list of source data blocks, generate one or multiple blocks of
- * encoded data as specified by a matrix of GF(2^8) coefficients.  When given a
- * suitable set of coefficients, this function will perform the fast generation
- * or decoding of Reed-Solomon type erasure codes.
+ * Arch specific version of ec_encode_data_update() with same parameters.
+ * @requires AVX
+ */
+
+void ec_encode_data_update_avx(int len, int k, int rows, int vec_i, unsigned char *g_tbls,
+			       unsigned char *data, unsigned char **coding);
+
+/**
+ * @brief Generate update for encode or decode of erasure codes from single source.
  *
- * @param len    Length of each block of data (vector) of source or dest data.
- * @param srcs   The number of vector sources or rows in the generator matrix
- * 		 for coding.
- * @param dests  The number of output vectors to concurrently encode/decode.
- * @param v      Pointer to array of input tables generated from coding
- * 		 coefficients in ec_init_tables(). Must be of size 32*k*rows
- * @param src    Array of pointers to source input buffers.
- * @param dest   Array of pointers to coded output buffers.
- * @returns none
+ * Arch specific version of ec_encode_data_update() with same parameters.
+ * @requires AVX2
+ */
+
+void ec_encode_data_update_avx2(int len, int k, int rows, int vec_i, unsigned char *g_tbls,
+				unsigned char *data, unsigned char **coding);
+
+/**
+ * @brief Generate update for encode or decode of erasure codes from single source.
+ *
+ * Baseline version of ec_encode_data_update() with same parameters.
  */
 
-void ec_encode_data_base(int len, int srcs, int dests, unsigned char *v, unsigned char **src, unsigned char **dest);
+void ec_encode_data_update_base(int len, int k, int rows, int vec_i, unsigned char *v,
+				unsigned char *data, unsigned char **dest);
 
 
 /**
@@ -150,8 +205,8 @@ void ec_encode_data_base(int len, int srcs, int dests, unsigned char *v, unsigne
  * set of coefficients to produce each byte of the output. Can be used for
  * erasure coding encode and decode. Function requires pre-calculation of a
  * 32*vlen byte constant array based on the input coefficients.
- *
  * @requires SSE4.1
+ *
  * @param len    Length of each vector in bytes. Must be >= 16.
  * @param vlen   Number of vector sources.
  * @param gftbls Pointer to 32*vlen byte array of pre-calculated constants based
@@ -171,8 +226,8 @@ void gf_vect_dot_prod_sse(int len, int vlen, unsigned char *gftbls,
  * set of coefficients to produce each byte of the output. Can be used for
  * erasure coding encode and decode. Function requires pre-calculation of a
  * 32*vlen byte constant array based on the input coefficients.
- *
  * @requires AVX
+ *
  * @param len    Length of each vector in bytes. Must be >= 16.
  * @param vlen   Number of vector sources.
  * @param gftbls Pointer to 32*vlen byte array of pre-calculated constants based
@@ -192,8 +247,8 @@ void gf_vect_dot_prod_avx(int len, int vlen, unsigned char *gftbls,
  * set of coefficients to produce each byte of the output. Can be used for
  * erasure coding encode and decode. Function requires pre-calculation of a
  * 32*vlen byte constant array based on the input coefficients.
- *
  * @requires AVX2
+ *
  * @param len    Length of each vector in bytes. Must be >= 32.
  * @param vlen   Number of vector sources.
  * @param gftbls Pointer to 32*vlen byte array of pre-calculated constants based
@@ -214,8 +269,8 @@ void gf_vect_dot_prod_avx2(int len, int vlen, unsigned char *gftbls,
  * sets of coefficients to produce each byte of the outputs. Can be used for
  * erasure coding encode and decode. Function requires pre-calculation of a
  * 2*32*vlen byte constant array based on the two sets of input coefficients.
- *
  * @requires SSE4.1
+ *
  * @param len    Length of each vector in bytes. Must be >= 16.
  * @param vlen   Number of vector sources.
  * @param gftbls Pointer to 2*32*vlen byte array of pre-calculated constants
@@ -236,8 +291,8 @@ void gf_2vect_dot_prod_sse(int len, int vlen, unsigned char *gftbls,
  * sets of coefficients to produce each byte of the outputs. Can be used for
  * erasure coding encode and decode. Function requires pre-calculation of a
  * 2*32*vlen byte constant array based on the two sets of input coefficients.
- *
  * @requires AVX
+ *
  * @param len    Length of each vector in bytes. Must be >= 16.
  * @param vlen   Number of vector sources.
  * @param gftbls Pointer to 2*32*vlen byte array of pre-calculated constants
@@ -258,8 +313,8 @@ void gf_2vect_dot_prod_avx(int len, int vlen, unsigned char *gftbls,
  * sets of coefficients to produce each byte of the outputs. Can be used for
  * erasure coding encode and decode. Function requires pre-calculation of a
  * 2*32*vlen byte constant array based on the two sets of input coefficients.
- *
  * @requires AVX2
+ *
  * @param len    Length of each vector in bytes. Must be >= 32.
  * @param vlen   Number of vector sources.
  * @param gftbls Pointer to 2*32*vlen byte array of pre-calculated constants
@@ -280,8 +335,8 @@ void gf_2vect_dot_prod_avx2(int len, int vlen, unsigned char *gftbls,
  * sets of coefficients to produce each byte of the outputs. Can be used for
  * erasure coding encode and decode. Function requires pre-calculation of a
  * 3*32*vlen byte constant array based on the three sets of input coefficients.
- *
  * @requires SSE4.1
+ *
  * @param len    Length of each vector in bytes. Must be >= 16.
  * @param vlen   Number of vector sources.
  * @param gftbls Pointer to 3*32*vlen byte array of pre-calculated constants
@@ -302,8 +357,8 @@ void gf_3vect_dot_prod_sse(int len, int vlen, unsigned char *gftbls,
  * sets of coefficients to produce each byte of the outputs. Can be used for
  * erasure coding encode and decode. Function requires pre-calculation of a
  * 3*32*vlen byte constant array based on the three sets of input coefficients.
- *
  * @requires AVX
+ *
  * @param len    Length of each vector in bytes. Must be >= 16.
  * @param vlen   Number of vector sources.
  * @param gftbls Pointer to 3*32*vlen byte array of pre-calculated constants
@@ -324,8 +379,8 @@ void gf_3vect_dot_prod_avx(int len, int vlen, unsigned char *gftbls,
  * sets of coefficients to produce each byte of the outputs. Can be used for
  * erasure coding encode and decode. Function requires pre-calculation of a
  * 3*32*vlen byte constant array based on the three sets of input coefficients.
- *
  * @requires AVX2
+ *
  * @param len    Length of each vector in bytes. Must be >= 32.
  * @param vlen   Number of vector sources.
  * @param gftbls Pointer to 3*32*vlen byte array of pre-calculated constants
@@ -346,8 +401,8 @@ void gf_3vect_dot_prod_avx2(int len, int vlen, unsigned char *gftbls,
  * sets of coefficients to produce each byte of the outputs. Can be used for
  * erasure coding encode and decode. Function requires pre-calculation of a
  * 4*32*vlen byte constant array based on the four sets of input coefficients.
- *
  * @requires SSE4.1
+ *
  * @param len    Length of each vector in bytes. Must be >= 16.
  * @param vlen   Number of vector sources.
  * @param gftbls Pointer to 4*32*vlen byte array of pre-calculated constants
@@ -368,8 +423,8 @@ void gf_4vect_dot_prod_sse(int len, int vlen, unsigned char *gftbls,
  * sets of coefficients to produce each byte of the outputs. Can be used for
  * erasure coding encode and decode. Function requires pre-calculation of a
  * 4*32*vlen byte constant array based on the four sets of input coefficients.
- *
  * @requires AVX
+ *
  * @param len    Length of each vector in bytes. Must be >= 16.
  * @param vlen   Number of vector sources.
  * @param gftbls Pointer to 4*32*vlen byte array of pre-calculated constants
@@ -390,8 +445,8 @@ void gf_4vect_dot_prod_avx(int len, int vlen, unsigned char *gftbls,
  * sets of coefficients to produce each byte of the outputs. Can be used for
  * erasure coding encode and decode. Function requires pre-calculation of a
  * 4*32*vlen byte constant array based on the four sets of input coefficients.
- *
  * @requires AVX2
+ *
  * @param len    Length of each vector in bytes. Must be >= 32.
  * @param vlen   Number of vector sources.
  * @param gftbls Pointer to 4*32*vlen byte array of pre-calculated constants
@@ -412,8 +467,8 @@ void gf_4vect_dot_prod_avx2(int len, int vlen, unsigned char *gftbls,
  * sets of coefficients to produce each byte of the outputs. Can be used for
  * erasure coding encode and decode. Function requires pre-calculation of a
  * 5*32*vlen byte constant array based on the five sets of input coefficients.
- *
  * @requires SSE4.1
+ *
  * @param len    Length of each vector in bytes. Must >= 16.
  * @param vlen   Number of vector sources.
  * @param gftbls Pointer to 5*32*vlen byte array of pre-calculated constants
@@ -434,8 +489,8 @@ void gf_5vect_dot_prod_sse(int len, int vlen, unsigned char *gftbls,
  * sets of coefficients to produce each byte of the outputs. Can be used for
  * erasure coding encode and decode. Function requires pre-calculation of a
  * 5*32*vlen byte constant array based on the five sets of input coefficients.
- *
  * @requires AVX
+ *
  * @param len    Length of each vector in bytes. Must >= 16.
  * @param vlen   Number of vector sources.
  * @param gftbls Pointer to 5*32*vlen byte array of pre-calculated constants
@@ -456,8 +511,8 @@ void gf_5vect_dot_prod_avx(int len, int vlen, unsigned char *gftbls,
  * sets of coefficients to produce each byte of the outputs. Can be used for
  * erasure coding encode and decode. Function requires pre-calculation of a
  * 5*32*vlen byte constant array based on the five sets of input coefficients.
- *
  * @requires AVX2
+ *
  * @param len    Length of each vector in bytes. Must >= 32.
  * @param vlen   Number of vector sources.
  * @param gftbls Pointer to 5*32*vlen byte array of pre-calculated constants
@@ -478,8 +533,8 @@ void gf_5vect_dot_prod_avx2(int len, int vlen, unsigned char *gftbls,
  * sets of coefficients to produce each byte of the outputs. Can be used for
  * erasure coding encode and decode. Function requires pre-calculation of a
  * 6*32*vlen byte constant array based on the six sets of input coefficients.
- *
  * @requires SSE4.1
+ *
  * @param len    Length of each vector in bytes. Must be >= 16.
  * @param vlen   Number of vector sources.
  * @param gftbls Pointer to 6*32*vlen byte array of pre-calculated constants
@@ -500,8 +555,8 @@ void gf_6vect_dot_prod_sse(int len, int vlen, unsigned char *gftbls,
  * sets of coefficients to produce each byte of the outputs. Can be used for
  * erasure coding encode and decode. Function requires pre-calculation of a
  * 6*32*vlen byte constant array based on the six sets of input coefficients.
- *
  * @requires AVX
+ *
  * @param len    Length of each vector in bytes. Must be >= 16.
  * @param vlen   Number of vector sources.
  * @param gftbls Pointer to 6*32*vlen byte array of pre-calculated constants
@@ -522,8 +577,8 @@ void gf_6vect_dot_prod_avx(int len, int vlen, unsigned char *gftbls,
  * sets of coefficients to produce each byte of the outputs. Can be used for
  * erasure coding encode and decode. Function requires pre-calculation of a
  * 6*32*vlen byte constant array based on the six sets of input coefficients.
- *
  * @requires AVX2
+ *
  * @param len    Length of each vector in bytes. Must be >= 32.
  * @param vlen   Number of vector sources.
  * @param gftbls Pointer to 6*32*vlen byte array of pre-calculated constants
@@ -582,6 +637,224 @@ void gf_vect_dot_prod_base(int len, int vlen, unsigned char *gftbls,
 void gf_vect_dot_prod(int len, int vlen, unsigned char *gftbls,
                         unsigned char **src, unsigned char *dest);
 
+
+/**
+ * @brief GF(2^8) vector multiply accumulate, runs appropriate version.
+ *
+ * Does a GF(2^8) multiply across each byte of input source with expanded
+ * constant and add to destination array. Can be used for erasure coding encode
+ * and decode update when only one source is available at a time. Function
+ * requires pre-calculation of a 32*vec byte constant array based on the input
+ * coefficients.
+ *
+ * This function determines what instruction sets are enabled and selects the
+ * appropriate version at runtime.
+ *
+ * @param len    Length of each vector in bytes. Must be >= 32.
+ * @param vec    The number of vector sources or rows in the generator matrix
+ * 		 for coding.
+ * @param vec_i  The vector index corresponding to the single input source.
+ * @param gftbls Pointer to array of input tables generated from coding
+ * 		 coefficients in ec_init_tables(). Must be of size 32*vec.
+ * @param src    Pointer to source input array.
+ * @param dest   Pointer to destination data array.
+ * @returns none
+ */
+
+void gf_vect_mad(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+		 unsigned char *dest);
+
+/**
+ * @brief GF(2^8) vector multiply accumulate, arch specific version.
+ *
+ * Arch specific version of gf_vect_mad() with same parameters.
+ * @requires SSE4.1
+ */
+
+void gf_vect_mad_sse(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+		     unsigned char *dest);
+/**
+ * @brief GF(2^8) vector multiply accumulate, arch specific version.
+ *
+ * Arch specific version of gf_vect_mad() with same parameters.
+ * @requires AVX
+ */
+
+void gf_vect_mad_avx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+		     unsigned char *dest);
+
+/**
+ * @brief GF(2^8) vector multiply accumulate, arch specific version.
+ *
+ * Arch specific version of gf_vect_mad() with same parameters.
+ * @requires AVX2
+ */
+
+void gf_vect_mad_avx2(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+		      unsigned char *dest);
+
+/**
+ * @brief GF(2^8) vector multiply accumulate, baseline version.
+ *
+ * Baseline version of gf_vect_mad() with same parameters.
+ */
+
+void gf_vect_mad_base(int len, int vec, int vec_i, unsigned char *v, unsigned char *src,
+		      unsigned char *dest);
+
+/**
+ * @brief GF(2^8) vector multiply with 2 accumulate.  SSE version.
+ *
+ * Does a GF(2^8) multiply across each byte of input source with expanded
+ * constants and add to destination arrays. Can be used for erasure coding
+ * encode and decode update when only one source is available at a
+ * time. Function requires pre-calculation of a 32*vec byte constant array based
+ * on the input coefficients.
+ * @requires SSE4.1
+ *
+ * @param len    Length of each vector in bytes. Must be >= 32.
+ * @param vec    The number of vector sources or rows in the generator matrix
+ * 		 for coding.
+ * @param vec_i  The vector index corresponding to the single input source.
+ * @param gftbls Pointer to array of input tables generated from coding
+ * 		 coefficients in ec_init_tables(). Must be of size 32*vec.
+ * @param src    Pointer to source input array.
+ * @param dest   Array of pointers to destination input/outputs.
+ * @returns none
+ */
+
+void gf_2vect_mad_sse(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+		      unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector multiply with 2 accumulate. AVX version of gf_2vect_mad_sse().
+ * @requires AVX
+ */
+void gf_2vect_mad_avx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+		      unsigned char **dest);
+/**
+ * @brief GF(2^8) vector multiply with 2 accumulate. AVX2 version of gf_2vect_mad_sse().
+ * @requires AVX2
+ */
+void gf_2vect_mad_avx2(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+		       unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector multiply with 3 accumulate. SSE version.
+ *
+ * Does a GF(2^8) multiply across each byte of input source with expanded
+ * constants and add to destination arrays. Can be used for erasure coding
+ * encode and decode update when only one source is available at a
+ * time. Function requires pre-calculation of a 32*vec byte constant array based
+ * on the input coefficients.
+ * @requires SSE4.1
+ *
+ * @param len    Length of each vector in bytes. Must be >= 32.
+ * @param vec    The number of vector sources or rows in the generator matrix
+ * 		 for coding.
+ * @param vec_i  The vector index corresponding to the single input source.
+ * @param gftbls Pointer to array of input tables generated from coding
+ * 		 coefficients in ec_init_tables(). Must be of size 32*vec.
+ * @param src    Pointer to source input array.
+ * @param dest   Array of pointers to destination input/outputs.
+ * @returns none
+ */
+
+void gf_3vect_mad_sse(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+		      unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector multiply with 3 accumulate. AVX version of gf_3vect_mad_sse().
+ * @requires AVX
+ */
+void gf_3vect_mad_avx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+		      unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector multiply with 3 accumulate. AVX2 version of gf_3vect_mad_sse().
+ * @requires AVX2
+ */
+void gf_3vect_mad_avx2(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+		       unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector multiply with 4 accumulate. SSE version.
+ *
+ * Does a GF(2^8) multiply across each byte of input source with expanded
+ * constants and add to destination arrays. Can be used for erasure coding
+ * encode and decode update when only one source is available at a
+ * time. Function requires pre-calculation of a 32*vec byte constant array based
+ * on the input coefficients.
+ * @requires SSE4.1
+ *
+ * @param len    Length of each vector in bytes. Must be >= 32.
+ * @param vec    The number of vector sources or rows in the generator matrix
+ * 		 for coding.
+ * @param vec_i  The vector index corresponding to the single input source.
+ * @param gftbls Pointer to array of input tables generated from coding
+ * 		 coefficients in ec_init_tables(). Must be of size 32*vec.
+ * @param src    Pointer to source input array.
+ * @param dest   Array of pointers to destination input/outputs.
+ * @returns none
+ */
+
+void gf_4vect_mad_sse(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+		      unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector multiply with 4 accumulate. AVX version of gf_4vect_mad_sse().
+ * @requires AVX
+ */
+void gf_4vect_mad_avx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+		      unsigned char **dest);
+/**
+ * @brief GF(2^8) vector multiply with 4 accumulate. AVX2 version of gf_4vect_mad_sse().
+ * @requires AVX2
+ */
+void gf_4vect_mad_avx2(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+		       unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector multiply with 5 accumulate. SSE version.
+ * @requires SSE4.1
+ */
+void gf_5vect_mad_sse(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+		      unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector multiply with 5 accumulate. AVX version.
+ * @requires AVX
+ */
+void gf_5vect_mad_avx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+		      unsigned char **dest);
+/**
+ * @brief GF(2^8) vector multiply with 5 accumulate. AVX2 version.
+ * @requires AVX2
+ */
+void gf_5vect_mad_avx2(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+		       unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector multiply with 6 accumulate. SSE version.
+ * @requires SSE4.1
+ */
+void gf_6vect_mad_sse(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+		      unsigned char **dest);
+/**
+ * @brief GF(2^8) vector multiply with 6 accumulate. AVX version.
+ * @requires AVX
+ */
+void gf_6vect_mad_avx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+		      unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector multiply with 6 accumulate. AVX2 version.
+ * @requires AVX2
+ */
+void gf_6vect_mad_avx2(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+		       unsigned char **dest);
+
+
 /**********************************************************************
  * The remaining are lib support functions used in GF(2^8) operations.
  */
@@ -650,6 +923,7 @@ void gf_gen_cauchy1_matrix(unsigned char *a, int m, int k);
 
 int gf_invert_matrix(unsigned char *in, unsigned char *out, const int n);
 
+
 /*************************************************************/
 
 #ifdef __cplusplus