path: root/src/erasure-code
author    Yuan Zhou <yuan.zhou@intel.com>    2015-03-30 07:39:31 +0200
committer Yuan Zhou <yuan.zhou@intel.com>    2015-03-30 07:39:31 +0200
commit    59aa6700fa7ae63d28a8045c2a11719f8f222e17 (patch)
tree      8a87c086e45e42622b6bcf19071155fdcdbe2ecd /src/erasure-code
parent    Merge pull request #4209 from ceph/wip-java (diff)
download  ceph-59aa6700fa7ae63d28a8045c2a11719f8f222e17.tar.xz
          ceph-59aa6700fa7ae63d28a8045c2a11719f8f222e17.zip
erasure-code: Update ISA-L to 2.13
ISA-L 2.13 brings better performance on Avoton (20%); there is no impact on Xeon platforms. The details are in the release notes. There is a new API, ec_encode_data_update(), for incremental encoding and decoding; the other high-level APIs remain the same as in 2.10.

Signed-off-by: Yuan Zhou <yuan.zhou@intel.com>
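For context, a minimal sketch of how the new incremental API might be called (the buffer setup, k/p values, gf_gen_rs_matrix() choice and include path are illustrative assumptions, not part of this commit; the prototypes match isa-l/include/erasure_code.h):

/* Sketch only: call pattern for the new incremental encode API. */
#include <stdlib.h>
#include <string.h>
#include "erasure_code.h"   /* assumed -I .../isa/isa-l/include as in Makefile.am */

int encode_incrementally(unsigned char **data, unsigned char **coding,
                         int k, int p, int len)
{
	unsigned char *a = malloc(k * (k + p));      /* (k+p) x k encode matrix */
	unsigned char *g_tbls = malloc(k * p * 32);  /* expanded GF tables */
	int i;

	if (a == NULL || g_tbls == NULL)
		return -1;

	gf_gen_rs_matrix(a, k + p, k);               /* first k rows are identity */
	ec_init_tables(k, p, &a[k * k], g_tbls);     /* expand the p coding rows */

	for (i = 0; i < p; i++)
		memset(coding[i], 0, len);           /* updates accumulate into parity */

	/* Feed one source buffer at a time; ec_encode_data() would instead
	 * take all k buffers in a single call. */
	for (i = 0; i < k; i++)
		ec_encode_data_update(len, k, p, i, g_tbls, data[i], coding);

	free(a);
	free(g_tbls);
	return 0;
}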
Diffstat (limited to 'src/erasure-code')
-rw-r--r--src/erasure-code/isa/Makefile.am20
-rw-r--r--src/erasure-code/isa/isa-l/erasure_code/ec_base.c30
-rw-r--r--src/erasure-code/isa/isa-l/erasure_code/ec_base.h2
-rw-r--r--src/erasure-code/isa/isa-l/erasure_code/ec_highlevel_func.c121
-rw-r--r--src/erasure-code/isa/isa-l/erasure_code/ec_multibinary.asm.s248
-rw-r--r--src/erasure-code/isa/isa-l/erasure_code/gf_2vect_dot_prod_avx.asm.s151
-rw-r--r--src/erasure-code/isa/isa-l/erasure_code/gf_2vect_dot_prod_avx2.asm.s158
-rw-r--r--src/erasure-code/isa/isa-l/erasure_code/gf_2vect_dot_prod_sse.asm.s153
-rw-r--r--src/erasure-code/isa/isa-l/erasure_code/gf_2vect_mad_avx.asm.s242
-rw-r--r--src/erasure-code/isa/isa-l/erasure_code/gf_2vect_mad_avx2.asm.s253
-rw-r--r--src/erasure-code/isa/isa-l/erasure_code/gf_2vect_mad_sse.asm.s245
-rw-r--r--src/erasure-code/isa/isa-l/erasure_code/gf_3vect_dot_prod_avx.asm.s171
-rw-r--r--src/erasure-code/isa/isa-l/erasure_code/gf_3vect_dot_prod_avx2.asm.s196
-rw-r--r--src/erasure-code/isa/isa-l/erasure_code/gf_3vect_dot_prod_sse.asm.s171
-rw-r--r--src/erasure-code/isa/isa-l/erasure_code/gf_3vect_mad_avx.asm.s294
-rw-r--r--src/erasure-code/isa/isa-l/erasure_code/gf_3vect_mad_avx2.asm.s323
-rw-r--r--src/erasure-code/isa/isa-l/erasure_code/gf_3vect_mad_sse.asm.s304
-rw-r--r--src/erasure-code/isa/isa-l/erasure_code/gf_4vect_dot_prod_avx.asm.s213
-rw-r--r--src/erasure-code/isa/isa-l/erasure_code/gf_4vect_dot_prod_avx2.asm.s225
-rw-r--r--src/erasure-code/isa/isa-l/erasure_code/gf_4vect_dot_prod_sse.asm.s233
-rw-r--r--src/erasure-code/isa/isa-l/erasure_code/gf_4vect_mad_avx.asm.s343
-rw-r--r--src/erasure-code/isa/isa-l/erasure_code/gf_4vect_mad_avx2.asm.s348
-rw-r--r--src/erasure-code/isa/isa-l/erasure_code/gf_4vect_mad_sse.asm.s348
-rw-r--r--src/erasure-code/isa/isa-l/erasure_code/gf_5vect_dot_prod_avx.asm.s6
-rw-r--r--src/erasure-code/isa/isa-l/erasure_code/gf_5vect_dot_prod_avx2.asm.s6
-rw-r--r--src/erasure-code/isa/isa-l/erasure_code/gf_5vect_dot_prod_sse.asm.s28
-rw-r--r--src/erasure-code/isa/isa-l/erasure_code/gf_5vect_mad_avx.asm.s371
-rw-r--r--src/erasure-code/isa/isa-l/erasure_code/gf_5vect_mad_avx2.asm.s369
-rw-r--r--src/erasure-code/isa/isa-l/erasure_code/gf_5vect_mad_sse.asm.s379
-rw-r--r--src/erasure-code/isa/isa-l/erasure_code/gf_6vect_dot_prod_avx.asm.s6
-rw-r--r--src/erasure-code/isa/isa-l/erasure_code/gf_6vect_dot_prod_avx2.asm.s6
-rw-r--r--src/erasure-code/isa/isa-l/erasure_code/gf_6vect_dot_prod_sse.asm.s32
-rw-r--r--src/erasure-code/isa/isa-l/erasure_code/gf_6vect_mad_avx.asm.s400
-rw-r--r--src/erasure-code/isa/isa-l/erasure_code/gf_6vect_mad_avx2.asm.s407
-rw-r--r--src/erasure-code/isa/isa-l/erasure_code/gf_6vect_mad_sse.asm.s412
-rw-r--r--src/erasure-code/isa/isa-l/erasure_code/gf_vect_dot_prod_avx.asm.s99
-rw-r--r--src/erasure-code/isa/isa-l/erasure_code/gf_vect_dot_prod_avx2.asm.s103
-rw-r--r--src/erasure-code/isa/isa-l/erasure_code/gf_vect_dot_prod_sse.asm.s98
-rw-r--r--src/erasure-code/isa/isa-l/erasure_code/gf_vect_mad_avx.asm.s202
-rw-r--r--src/erasure-code/isa/isa-l/erasure_code/gf_vect_mad_avx2.asm.s209
-rw-r--r--src/erasure-code/isa/isa-l/erasure_code/gf_vect_mad_sse.asm.s203
-rw-r--r--src/erasure-code/isa/isa-l/erasure_code/gf_vect_mul_avx.asm.s6
-rw-r--r--src/erasure-code/isa/isa-l/erasure_code/gf_vect_mul_sse.asm.s6
-rw-r--r--src/erasure-code/isa/isa-l/include/erasure_code.h370
-rw-r--r--src/erasure-code/isa/isa-l/include/gf_vect_mul.h6
-rw-r--r--src/erasure-code/isa/isa-l/include/reg_sizes.asm2
-rw-r--r--src/erasure-code/isa/isa-l/include/types.h2
47 files changed, 8056 insertions, 464 deletions
diff --git a/src/erasure-code/isa/Makefile.am b/src/erasure-code/isa/Makefile.am
index 649ddaacb07..b36b8a6daf7 100644
--- a/src/erasure-code/isa/Makefile.am
+++ b/src/erasure-code/isa/Makefile.am
@@ -33,6 +33,24 @@ isa_sources = \
erasure-code/isa/isa-l/erasure_code/gf_vect_dot_prod_avx2.asm.s \
erasure-code/isa/isa-l/erasure_code/gf_vect_dot_prod_avx.asm.s \
erasure-code/isa/isa-l/erasure_code/gf_vect_dot_prod_sse.asm.s \
+ erasure-code/isa/isa-l/erasure_code/gf_2vect_mad_avx2.asm.s \
+ erasure-code/isa/isa-l/erasure_code/gf_2vect_mad_avx.asm.s \
+ erasure-code/isa/isa-l/erasure_code/gf_2vect_mad_sse.asm.s \
+ erasure-code/isa/isa-l/erasure_code/gf_3vect_mad_avx2.asm.s \
+ erasure-code/isa/isa-l/erasure_code/gf_3vect_mad_avx.asm.s \
+ erasure-code/isa/isa-l/erasure_code/gf_3vect_mad_sse.asm.s \
+ erasure-code/isa/isa-l/erasure_code/gf_4vect_mad_avx2.asm.s \
+ erasure-code/isa/isa-l/erasure_code/gf_4vect_mad_avx.asm.s \
+ erasure-code/isa/isa-l/erasure_code/gf_4vect_mad_sse.asm.s \
+ erasure-code/isa/isa-l/erasure_code/gf_5vect_mad_avx2.asm.s \
+ erasure-code/isa/isa-l/erasure_code/gf_5vect_mad_avx.asm.s \
+ erasure-code/isa/isa-l/erasure_code/gf_5vect_mad_sse.asm.s \
+ erasure-code/isa/isa-l/erasure_code/gf_6vect_mad_avx2.asm.s \
+ erasure-code/isa/isa-l/erasure_code/gf_6vect_mad_avx.asm.s \
+ erasure-code/isa/isa-l/erasure_code/gf_6vect_mad_sse.asm.s \
+ erasure-code/isa/isa-l/erasure_code/gf_vect_mad_avx2.asm.s \
+ erasure-code/isa/isa-l/erasure_code/gf_vect_mad_avx.asm.s \
+ erasure-code/isa/isa-l/erasure_code/gf_vect_mad_sse.asm.s \
erasure-code/isa/isa-l/erasure_code/gf_vect_mul_avx.asm.s \
erasure-code/isa/isa-l/erasure_code/gf_vect_mul_sse.asm.s \
erasure-code/isa/ErasureCodeIsa.cc \
@@ -49,7 +67,7 @@ libec_isa_la_CXXFLAGS = ${AM_CXXFLAGS} -I $(srcdir)/erasure-code/isa/isa-l/inclu
libec_isa_la_CCASFLAGS = ${AM_CCASFLAGS} -I $(abs_srcdir)/erasure-code/isa/isa-l/include/
libec_isa_la_LIBADD = $(LIBCRUSH) $(PTHREAD_LIBS) $(EXTRALIBS)
-libec_isa_la_LDFLAGS = ${AM_LDFLAGS} -version-info 2:10:0
+libec_isa_la_LDFLAGS = ${AM_LDFLAGS} -version-info 2:13:0
if LINUX
libec_isa_la_LDFLAGS += -export-symbols-regex '.*__erasure_code_.*'
endif
diff --git a/src/erasure-code/isa/isa-l/erasure_code/ec_base.c b/src/erasure-code/isa/isa-l/erasure_code/ec_base.c
index 5e93cb6fe15..3c7e8382ca2 100644
--- a/src/erasure-code/isa/isa-l/erasure_code/ec_base.c
+++ b/src/erasure-code/isa/isa-l/erasure_code/ec_base.c
@@ -1,5 +1,5 @@
/**********************************************************************
- Copyright(c) 2011-2014 Intel Corporation All rights reserved.
+ Copyright(c) 2011-2015 Intel Corporation All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
@@ -275,6 +275,18 @@ void gf_vect_dot_prod_base(int len, int vlen, unsigned char *v,
}
}
+void gf_vect_mad_base(int len, int vec, int vec_i,
+ unsigned char *v, unsigned char *src, unsigned char *dest)
+{
+ int i;
+ unsigned char s;
+ for (i = 0; i < len; i++) {
+ s = dest[i];
+ s ^= gf_mul(src[i], v[vec_i * 32 + 1]);
+ dest[i] = s;
+ }
+}
+
void ec_encode_data_base(int len, int srcs, int dests, unsigned char *v,
unsigned char **src, unsigned char **dest)
{
@@ -292,6 +304,22 @@ void ec_encode_data_base(int len, int srcs, int dests, unsigned char *v,
}
}
+void ec_encode_data_update_base(int len, int k, int rows, int vec_i, unsigned char *v,
+ unsigned char *data, unsigned char **dest)
+{
+ int i, l;
+ unsigned char s;
+
+ for (l = 0; l < rows; l++) {
+ for (i = 0; i < len; i++) {
+ s = dest[l][i];
+ s ^= gf_mul(data[i], v[vec_i * 32 + l * k * 32 + 1]);
+
+ dest[l][i] = s;
+ }
+ }
+}
+
void gf_vect_mul_base(int len, unsigned char *a, unsigned char *src, unsigned char *dest)
{
//2nd element of table array is ref value used to fill it in
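The new base routines above implement a GF(2^8) multiply-and-add into the destination: gf_vect_mad_base() updates one parity buffer, and ec_encode_data_update_base() applies one source buffer across all parity rows. A small sketch of the implied equivalence (the memset and driver loop are assumptions for illustration): applying each of the k sources in turn over zeroed parity buffers reproduces a single full ec_encode_data_base() call.

/* Sketch: a full encode expressed as k incremental updates.  The prototype
 * matches the one added in ec_base.c above. */
#include <string.h>

void ec_encode_data_update_base(int len, int k, int rows, int vec_i,
                                unsigned char *v, unsigned char *data,
                                unsigned char **dest);

static void full_encode_via_updates(int len, int k, int rows,
                                    unsigned char *g_tbls,
                                    unsigned char **data,
                                    unsigned char **coding)
{
	int l, i;

	for (l = 0; l < rows; l++)
		memset(coding[l], 0, len);      /* start from zero parity */

	for (i = 0; i < k; i++)                 /* accumulate one source at a time */
		ec_encode_data_update_base(len, k, rows, i, g_tbls, data[i], coding);
}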
diff --git a/src/erasure-code/isa/isa-l/erasure_code/ec_base.h b/src/erasure-code/isa/isa-l/erasure_code/ec_base.h
index 519ac7a2cac..d69a92d67bc 100644
--- a/src/erasure-code/isa/isa-l/erasure_code/ec_base.h
+++ b/src/erasure-code/isa/isa-l/erasure_code/ec_base.h
@@ -1,5 +1,5 @@
/**********************************************************************
- Copyright(c) 2011-2014 Intel Corporation All rights reserved.
+ Copyright(c) 2011-2015 Intel Corporation All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
diff --git a/src/erasure-code/isa/isa-l/erasure_code/ec_highlevel_func.c b/src/erasure-code/isa/isa-l/erasure_code/ec_highlevel_func.c
index 9cea61e5d52..fe2cdc9ca99 100644
--- a/src/erasure-code/isa/isa-l/erasure_code/ec_highlevel_func.c
+++ b/src/erasure-code/isa/isa-l/erasure_code/ec_highlevel_func.c
@@ -1,5 +1,5 @@
/**********************************************************************
- Copyright(c) 2011-2014 Intel Corporation All rights reserved.
+ Copyright(c) 2011-2015 Intel Corporation All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
@@ -42,7 +42,6 @@ void ec_init_tables(int k, int rows, unsigned char *a, unsigned char *g_tbls)
}
}
-#if __WORDSIZE == 64 || _WIN64 || __x86_64__
void ec_encode_data_sse(int len, int k, int rows, unsigned char *g_tbls, unsigned char **data,
unsigned char **coding)
{
@@ -77,7 +76,6 @@ void ec_encode_data_sse(int len, int k, int rows, unsigned char *g_tbls, unsigne
void ec_encode_data_avx(int len, int k, int rows, unsigned char *g_tbls, unsigned char **data,
unsigned char **coding)
{
-
if (len < 16) {
ec_encode_data_base(len, k, rows, g_tbls, data, coding);
return;
@@ -136,6 +134,123 @@ void ec_encode_data_avx2(int len, int k, int rows, unsigned char *g_tbls, unsign
}
+#if __WORDSIZE == 64 || _WIN64 || __x86_64__
+
+void ec_encode_data_update_sse(int len, int k, int rows, int vec_i, unsigned char *g_tbls,
+ unsigned char *data, unsigned char **coding)
+{
+ if (len < 16) {
+ ec_encode_data_update_base(len, k, rows, vec_i, g_tbls, data, coding);
+ return;
+ }
+
+ while (rows > 6) {
+ gf_6vect_mad_sse(len, k, vec_i, g_tbls, data, coding);
+ g_tbls += 6 * k * 32;
+ coding += 6;
+ rows -= 6;
+ }
+ switch (rows) {
+ case 6:
+ gf_6vect_mad_sse(len, k, vec_i, g_tbls, data, coding);
+ break;
+ case 5:
+ gf_5vect_mad_sse(len, k, vec_i, g_tbls, data, coding);
+ break;
+ case 4:
+ gf_4vect_mad_sse(len, k, vec_i, g_tbls, data, coding);
+ break;
+ case 3:
+ gf_3vect_mad_sse(len, k, vec_i, g_tbls, data, coding);
+ break;
+ case 2:
+ gf_2vect_mad_sse(len, k, vec_i, g_tbls, data, coding);
+ break;
+ case 1:
+ gf_vect_mad_sse(len, k, vec_i, g_tbls, data, *coding);
+ break;
+ case 0:
+ break;
+ }
+
+}
+
+void ec_encode_data_update_avx(int len, int k, int rows, int vec_i, unsigned char *g_tbls,
+ unsigned char *data, unsigned char **coding)
+{
+ if (len < 16) {
+ ec_encode_data_update_base(len, k, rows, vec_i, g_tbls, data, coding);
+ return;
+ }
+ while (rows > 6) {
+ gf_6vect_mad_avx(len, k, vec_i, g_tbls, data, coding);
+ g_tbls += 6 * k * 32;
+ coding += 6;
+ rows -= 6;
+ }
+ switch (rows) {
+ case 6:
+ gf_6vect_mad_avx(len, k, vec_i, g_tbls, data, coding);
+ break;
+ case 5:
+ gf_5vect_mad_avx(len, k, vec_i, g_tbls, data, coding);
+ break;
+ case 4:
+ gf_4vect_mad_avx(len, k, vec_i, g_tbls, data, coding);
+ break;
+ case 3:
+ gf_3vect_mad_avx(len, k, vec_i, g_tbls, data, coding);
+ break;
+ case 2:
+ gf_2vect_mad_avx(len, k, vec_i, g_tbls, data, coding);
+ break;
+ case 1:
+ gf_vect_mad_avx(len, k, vec_i, g_tbls, data, *coding);
+ break;
+ case 0:
+ break;
+ }
+
+}
+
+void ec_encode_data_update_avx2(int len, int k, int rows, int vec_i, unsigned char *g_tbls,
+ unsigned char *data, unsigned char **coding)
+{
+ if (len < 32) {
+ ec_encode_data_update_base(len, k, rows, vec_i, g_tbls, data, coding);
+ return;
+ }
+ while (rows > 6) {
+ gf_6vect_mad_avx2(len, k, vec_i, g_tbls, data, coding);
+ g_tbls += 6 * k * 32;
+ coding += 6;
+ rows -= 6;
+ }
+ switch (rows) {
+ case 6:
+ gf_6vect_mad_avx2(len, k, vec_i, g_tbls, data, coding);
+ break;
+ case 5:
+ gf_5vect_mad_avx2(len, k, vec_i, g_tbls, data, coding);
+ break;
+ case 4:
+ gf_4vect_mad_avx2(len, k, vec_i, g_tbls, data, coding);
+ break;
+ case 3:
+ gf_3vect_mad_avx2(len, k, vec_i, g_tbls, data, coding);
+ break;
+ case 2:
+ gf_2vect_mad_avx2(len, k, vec_i, g_tbls, data, coding);
+ break;
+ case 1:
+ gf_vect_mad_avx2(len, k, vec_i, g_tbls, data, *coding);
+ break;
+ case 0:
+ break;
+ }
+
+}
+
#endif //__WORDSIZE == 64 || _WIN64 || __x86_64__
struct slver {
diff --git a/src/erasure-code/isa/isa-l/erasure_code/ec_multibinary.asm.s b/src/erasure-code/isa/isa-l/erasure_code/ec_multibinary.asm.s
index 54f7301181e..f23db361ca9 100644
--- a/src/erasure-code/isa/isa-l/erasure_code/ec_multibinary.asm.s
+++ b/src/erasure-code/isa/isa-l/erasure_code/ec_multibinary.asm.s
@@ -1,5 +1,5 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-; Copyright(c) 2011-2014 Intel Corporation All rights reserved.
+; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
@@ -28,42 +28,63 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%ifidn __OUTPUT_FORMAT__, elf64
-%define WRT_OPT wrt ..plt
+ %define WRT_OPT wrt ..plt
%else
-%define WRT_OPT
+ %define WRT_OPT
%endif
+%include "reg_sizes.asm"
+
%ifidn __OUTPUT_FORMAT__, elf32
[bits 32]
-%define def_wrd dd
-%define wrd_sz dword
-%define arg1 esi
+ %define def_wrd dd
+ %define wrd_sz dword
+ %define arg1 esi
+ %define arg2 eax
+ %define arg3 ebx
+ %define arg4 ecx
+ %define arg5 edx
%else
-%include "reg_sizes.asm"
-default rel
-[bits 64]
+ default rel
+ [bits 64]
-%define def_wrd dq
-%define wrd_sz qword
-%define arg1 rsi
+ %define def_wrd dq
+ %define wrd_sz qword
+ %define arg1 rsi
+ %define arg2 rax
+ %define arg3 rbx
+ %define arg4 rcx
+ %define arg5 rdx
-extern ec_encode_data_sse
-extern ec_encode_data_avx
-extern ec_encode_data_avx2
-extern gf_vect_mul_sse
-extern gf_vect_mul_avx
-extern gf_vect_dot_prod_sse
-extern gf_vect_dot_prod_avx
-extern gf_vect_dot_prod_avx2
+
+ extern ec_encode_data_update_sse
+ extern ec_encode_data_update_avx
+ extern ec_encode_data_update_avx2
+ extern gf_vect_mul_sse
+ extern gf_vect_mul_avx
+
+ extern gf_vect_mad_sse
+ extern gf_vect_mad_avx
+ extern gf_vect_mad_avx2
%endif
extern gf_vect_mul_base
extern ec_encode_data_base
+extern ec_encode_data_update_base
extern gf_vect_dot_prod_base
+extern gf_vect_mad_base
+
+extern gf_vect_dot_prod_sse
+extern gf_vect_dot_prod_avx
+extern gf_vect_dot_prod_avx2
+extern ec_encode_data_sse
+extern ec_encode_data_avx
+extern ec_encode_data_avx2
+
section .data
;;; *_mbinit are initial values for *_dispatched; is updated on first call.
@@ -78,6 +99,12 @@ gf_vect_mul_dispatched:
gf_vect_dot_prod_dispatched:
def_wrd gf_vect_dot_prod_mbinit
+ec_encode_data_update_dispatched:
+ def_wrd ec_encode_data_update_mbinit
+
+gf_vect_mad_dispatched:
+ def_wrd gf_vect_mad_mbinit
+
section .text
;;;;
; ec_encode_data multibinary function
@@ -91,50 +118,45 @@ ec_encode_data:
ec_encode_data_dispatch_init:
push arg1
-%ifidn __OUTPUT_FORMAT__, elf32 ;; 32-bit check
- lea arg1, [ec_encode_data_base]
-%else
- push rax
- push rbx
- push rcx
- push rdx
+ push arg2
+ push arg3
+ push arg4
+ push arg5
lea arg1, [ec_encode_data_base WRT_OPT] ; Default
mov eax, 1
cpuid
- lea rbx, [ec_encode_data_sse WRT_OPT]
+ lea arg3, [ec_encode_data_sse WRT_OPT]
test ecx, FLAG_CPUID1_ECX_SSE4_1
- cmovne arg1, rbx
+ cmovne arg1, arg3
and ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
cmp ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
- lea rbx, [ec_encode_data_avx WRT_OPT]
+ lea arg3, [ec_encode_data_avx WRT_OPT]
jne _done_ec_encode_data_init
- mov rsi, rbx
+ mov arg1, arg3
;; Try for AVX2
xor ecx, ecx
mov eax, 7
cpuid
test ebx, FLAG_CPUID1_EBX_AVX2
- lea rbx, [ec_encode_data_avx2 WRT_OPT]
- cmovne rsi, rbx
-
+ lea arg3, [ec_encode_data_avx2 WRT_OPT]
+ cmovne arg1, arg3
;; Does it have xmm and ymm support
xor ecx, ecx
xgetbv
and eax, FLAG_XGETBV_EAX_XMM_YMM
cmp eax, FLAG_XGETBV_EAX_XMM_YMM
je _done_ec_encode_data_init
- lea rsi, [ec_encode_data_sse WRT_OPT]
+ lea arg1, [ec_encode_data_sse WRT_OPT]
_done_ec_encode_data_init:
- pop rdx
- pop rcx
- pop rbx
- pop rax
-%endif ;; END 32-bit check
+ pop arg5
+ pop arg4
+ pop arg3
+ pop arg2
mov [ec_encode_data_dispatched], arg1
pop arg1
ret
@@ -190,6 +212,65 @@ _done_gf_vect_mul_dispatch_init:
pop arg1
ret
+;;;;
+; ec_encode_data_update multibinary function
+;;;;
+global ec_encode_data_update:function
+ec_encode_data_update_mbinit:
+ call ec_encode_data_update_dispatch_init
+
+ec_encode_data_update:
+ jmp wrd_sz [ec_encode_data_update_dispatched]
+
+ec_encode_data_update_dispatch_init:
+ push arg1
+%ifidn __OUTPUT_FORMAT__, elf32 ;; 32-bit check
+ lea arg1, [ec_encode_data_update_base]
+%else
+ push rax
+ push rbx
+ push rcx
+ push rdx
+ lea arg1, [ec_encode_data_update_base WRT_OPT] ; Default
+
+ mov eax, 1
+ cpuid
+ lea rbx, [ec_encode_data_update_sse WRT_OPT]
+ test ecx, FLAG_CPUID1_ECX_SSE4_1
+ cmovne arg1, rbx
+
+ and ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
+ cmp ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
+ lea rbx, [ec_encode_data_update_avx WRT_OPT]
+
+ jne _done_ec_encode_data_update_init
+ mov rsi, rbx
+
+ ;; Try for AVX2
+ xor ecx, ecx
+ mov eax, 7
+ cpuid
+ test ebx, FLAG_CPUID1_EBX_AVX2
+ lea rbx, [ec_encode_data_update_avx2 WRT_OPT]
+ cmovne rsi, rbx
+
+ ;; Does it have xmm and ymm support
+ xor ecx, ecx
+ xgetbv
+ and eax, FLAG_XGETBV_EAX_XMM_YMM
+ cmp eax, FLAG_XGETBV_EAX_XMM_YMM
+ je _done_ec_encode_data_update_init
+ lea rsi, [ec_encode_data_update_sse WRT_OPT]
+
+_done_ec_encode_data_update_init:
+ pop rdx
+ pop rcx
+ pop rbx
+ pop rax
+%endif ;; END 32-bit check
+ mov [ec_encode_data_update_dispatched], arg1
+ pop arg1
+ ret
;;;;
; gf_vect_dot_prod multibinary function
@@ -203,26 +284,81 @@ gf_vect_dot_prod:
gf_vect_dot_prod_dispatch_init:
push arg1
+ push arg2
+ push arg3
+ push arg4
+ push arg5
+ lea arg1, [gf_vect_dot_prod_base WRT_OPT] ; Default
+
+ mov eax, 1
+ cpuid
+ lea arg3, [gf_vect_dot_prod_sse WRT_OPT]
+ test ecx, FLAG_CPUID1_ECX_SSE4_1
+ cmovne arg1, arg3
+
+ and ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
+ cmp ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
+ lea arg3, [gf_vect_dot_prod_avx WRT_OPT]
+
+ jne _done_gf_vect_dot_prod_init
+ mov arg1, arg3
+
+ ;; Try for AVX2
+ xor ecx, ecx
+ mov eax, 7
+ cpuid
+ test ebx, FLAG_CPUID1_EBX_AVX2
+ lea arg3, [gf_vect_dot_prod_avx2 WRT_OPT]
+ cmovne arg1, arg3
+ ;; Does it have xmm and ymm support
+ xor ecx, ecx
+ xgetbv
+ and eax, FLAG_XGETBV_EAX_XMM_YMM
+ cmp eax, FLAG_XGETBV_EAX_XMM_YMM
+ je _done_gf_vect_dot_prod_init
+ lea arg1, [gf_vect_dot_prod_sse WRT_OPT]
+
+_done_gf_vect_dot_prod_init:
+ pop arg5
+ pop arg4
+ pop arg3
+ pop arg2
+ mov [gf_vect_dot_prod_dispatched], arg1
+ pop arg1
+ ret
+
+;;;;
+; gf_vect_mad multibinary function
+;;;;
+global gf_vect_mad:function
+gf_vect_mad_mbinit:
+ call gf_vect_mad_dispatch_init
+
+gf_vect_mad:
+ jmp wrd_sz [gf_vect_mad_dispatched]
+
+gf_vect_mad_dispatch_init:
+ push arg1
%ifidn __OUTPUT_FORMAT__, elf32 ;; 32-bit check
- lea arg1, [gf_vect_dot_prod_base]
+ lea arg1, [gf_vect_mad_base]
%else
push rax
push rbx
push rcx
push rdx
- lea arg1, [gf_vect_dot_prod_base WRT_OPT] ; Default
+ lea arg1, [gf_vect_mad_base WRT_OPT] ; Default
mov eax, 1
cpuid
- lea rbx, [gf_vect_dot_prod_sse WRT_OPT]
+ lea rbx, [gf_vect_mad_sse WRT_OPT]
test ecx, FLAG_CPUID1_ECX_SSE4_1
cmovne arg1, rbx
and ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
cmp ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
- lea rbx, [gf_vect_dot_prod_avx WRT_OPT]
+ lea rbx, [gf_vect_mad_avx WRT_OPT]
- jne _done_gf_vect_dot_prod_init
+ jne _done_gf_vect_mad_init
mov rsi, rbx
;; Try for AVX2
@@ -230,7 +366,7 @@ gf_vect_dot_prod_dispatch_init:
mov eax, 7
cpuid
test ebx, FLAG_CPUID1_EBX_AVX2
- lea rbx, [gf_vect_dot_prod_avx2 WRT_OPT]
+ lea rbx, [gf_vect_mad_avx2 WRT_OPT]
cmovne rsi, rbx
;; Does it have xmm and ymm support
@@ -238,16 +374,16 @@ gf_vect_dot_prod_dispatch_init:
xgetbv
and eax, FLAG_XGETBV_EAX_XMM_YMM
cmp eax, FLAG_XGETBV_EAX_XMM_YMM
- je _done_gf_vect_dot_prod_init
- lea rsi, [gf_vect_dot_prod_sse WRT_OPT]
+ je _done_gf_vect_mad_init
+ lea rsi, [gf_vect_mad_sse WRT_OPT]
-_done_gf_vect_dot_prod_init:
+_done_gf_vect_mad_init:
pop rdx
pop rcx
pop rbx
pop rax
%endif ;; END 32-bit check
- mov [gf_vect_dot_prod_dispatched], arg1
+ mov [gf_vect_mad_dispatched], arg1
pop arg1
ret
@@ -260,9 +396,9 @@ global %1_slver
db 0x%3, 0x%2
%endmacro
-;;; func core, ver, snum
-slversion ec_encode_data, 00, 02, 0133
-slversion gf_vect_mul, 00, 02, 0134
-slversion gf_vect_dot_prod, 00, 01, 0138
-; inform linker that this doesn't require executable stack
-section .note.GNU-stack noalloc noexec nowrite progbits
+;;; func core, ver, snum
+slversion ec_encode_data, 00, 03, 0133
+slversion gf_vect_mul, 00, 02, 0134
+slversion ec_encode_data_update, 00, 02, 0212
+slversion gf_vect_dot_prod, 00, 02, 0138
+slversion gf_vect_mad, 00, 01, 0213
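The dispatcher additions above follow the existing multibinary pattern: on the first call the stub probes CPUID (SSE4.1, then AVX with OSXSAVE, then AVX2) plus XGETBV for OS YMM state, caches the chosen routine in *_dispatched, and jumps through it on every later call. A rough C analogue of that selection (the function-pointer type and the __builtin_cpu_supports() probes are assumptions for this sketch; the shipped code is the NASM above, which additionally verifies YMM state via XGETBV):

/* Illustrative C analogue of ec_encode_data_update_dispatch_init. */
typedef void (*ec_update_fn)(int len, int k, int rows, int vec_i,
                             unsigned char *g_tbls, unsigned char *data,
                             unsigned char **coding);

extern void ec_encode_data_update_base(int, int, int, int, unsigned char *,
                                        unsigned char *, unsigned char **);
extern void ec_encode_data_update_sse(int, int, int, int, unsigned char *,
                                      unsigned char *, unsigned char **);
extern void ec_encode_data_update_avx(int, int, int, int, unsigned char *,
                                      unsigned char *, unsigned char **);
extern void ec_encode_data_update_avx2(int, int, int, int, unsigned char *,
                                       unsigned char *, unsigned char **);

static ec_update_fn pick_ec_encode_data_update(void)
{
	ec_update_fn fn = ec_encode_data_update_base;     /* portable default */

	if (__builtin_cpu_supports("sse4.1"))
		fn = ec_encode_data_update_sse;
	if (__builtin_cpu_supports("avx"))
		fn = ec_encode_data_update_avx;
	if (__builtin_cpu_supports("avx2"))
		fn = ec_encode_data_update_avx2;

	return fn;   /* the asm caches this in ec_encode_data_update_dispatched */
}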
diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_2vect_dot_prod_avx.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_2vect_dot_prod_avx.asm.s
index db8064a3971..1bd839cb66d 100644
--- a/src/erasure-code/isa/isa-l/erasure_code/gf_2vect_dot_prod_avx.asm.s
+++ b/src/erasure-code/isa/isa-l/erasure_code/gf_2vect_dot_prod_avx.asm.s
@@ -1,5 +1,5 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-; Copyright(c) 2011-2014 Intel Corporation All rights reserved.
+; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
@@ -30,8 +30,6 @@
;;;
;;; gf_2vect_dot_prod_avx(len, vec, *g_tbls, **buffs, **dests);
;;;
-;;; Author: Gregory Tucker
-
%ifidn __OUTPUT_FORMAT__, elf64
%define arg0 rdi
@@ -46,6 +44,9 @@
%define tmp3 r9
%define tmp4 r12 ; must be saved and restored
%define return rax
+ %macro SLDR 2
+ %endmacro
+ %define SSTR SLDR
%define PS 8
%define LOG_PS 3
@@ -70,6 +71,9 @@
%define tmp3 r13 ; must be saved and restored
%define tmp4 r14 ; must be saved and restored
%define return rax
+ %macro SLDR 2
+ %endmacro
+ %define SSTR SLDR
%define PS 8
%define LOG_PS 3
%define stack_size 3*16 + 3*8 ; must be an odd multiple of 8
@@ -99,17 +103,92 @@
%endmacro
%endif
+%ifidn __OUTPUT_FORMAT__, elf32
+
+;;;================== High Address;
+;;; arg4
+;;; arg3
+;;; arg2
+;;; arg1
+;;; arg0
+;;; return
+;;;<================= esp of caller
+;;; ebp
+;;;<================= ebp = esp
+;;; var0
+;;; esi
+;;; edi
+;;; ebx
+;;;<================= esp of callee
+;;;
+;;;================== Low Address;
+
+ %define PS 4
+ %define LOG_PS 2
+ %define func(x) x:
+ %define arg(x) [ebp + PS*2 + PS*x]
+ %define var(x) [ebp - PS - PS*x]
+
+ %define trans ecx
+ %define trans2 esi
+ %define arg0 trans ;trans and trans2 are for the variables in stack
+ %define arg0_m arg(0)
+ %define arg1 ebx
+ %define arg2 arg2_m
+ %define arg2_m arg(2)
+ %define arg3 trans
+ %define arg3_m arg(3)
+ %define arg4 trans
+ %define arg4_m arg(4)
+ %define tmp edx
+ %define tmp2 edi
+ %define tmp3 trans2
+ %define tmp4 trans2
+ %define tmp4_m var(0)
+ %define return eax
+ %macro SLDR 2 ;; stack load/restore
+ mov %1, %2
+ %endmacro
+ %define SSTR SLDR
+
+ %macro FUNC_SAVE 0
+ push ebp
+ mov ebp, esp
+ sub esp, PS*1 ;1 local variable
+ push esi
+ push edi
+ push ebx
+ mov arg1, arg(1)
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ pop ebx
+ pop edi
+ pop esi
+ add esp, PS*1 ;1 local variable
+ pop ebp
+ %endmacro
+
+%endif ; output formats
+
%define len arg0
%define vec arg1
%define mul_array arg2
%define src arg3
-%define dest1 arg4
+%define dest1 arg4
%define vec_i tmp2
%define ptr tmp3
%define dest2 tmp4
%define pos return
+ %ifidn PS,4 ;32-bit code
+ %define len_m arg0_m
+ %define src_m arg3_m
+ %define dest1_m arg4_m
+ %define dest2_m tmp4_m
+ %endif
+
%ifndef EC_ALIGNED_ADDR
;;; Use Un-aligned load/store
%define XLDR vmovdqu
@@ -125,35 +204,54 @@
%endif
%endif
+%ifidn PS,8 ; 64-bit code
+ default rel
+ [bits 64]
+%endif
-default rel
-
-[bits 64]
section .text
-%define xmask0f xmm8
-%define xgft1_lo xmm7
-%define xgft1_hi xmm6
-%define xgft2_lo xmm5
-%define xgft2_hi xmm4
-
-%define x0 xmm0
-%define xtmpa xmm1
-%define xp1 xmm2
-%define xp2 xmm3
+%ifidn PS,8 ;64-bit code
+ %define xmask0f xmm8
+ %define xgft1_lo xmm7
+ %define xgft1_hi xmm6
+ %define xgft2_lo xmm5
+ %define xgft2_hi xmm4
+
+ %define x0 xmm0
+ %define xtmpa xmm1
+ %define xp1 xmm2
+ %define xp2 xmm3
+%else ;32-bit code
+ %define xmask0f xmm4
+ %define xgft1_lo xmm7
+ %define xgft1_hi xmm6
+ %define xgft2_lo xgft1_lo
+ %define xgft2_hi xgft1_hi
+
+ %define x0 xmm0
+ %define xtmpa xmm1
+ %define xp1 xmm2
+ %define xp2 xmm3
+%endif
align 16
global gf_2vect_dot_prod_avx:function
func(gf_2vect_dot_prod_avx)
FUNC_SAVE
+ SLDR len, len_m
sub len, 16
+ SSTR len_m, len
jl .return_fail
xor pos, pos
vmovdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte
sal vec, LOG_PS ;vec *= PS. Make vec_i count by PS
+ SLDR dest1, dest1_m
mov dest2, [dest1+PS]
+ SSTR dest2_m, dest2
mov dest1, [dest1]
+ SSTR dest1_m, dest1
.loop16
vpxor xp1, xp1
@@ -162,16 +260,18 @@ func(gf_2vect_dot_prod_avx)
xor vec_i, vec_i
.next_vect
+ SLDR src, src_m
mov ptr, [src+vec_i]
vmovdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, ..., Ax{0f}
vmovdqu xgft1_hi, [tmp+16] ; " Ax{00}, Ax{10}, ..., Ax{f0}
+ %ifidn PS,8 ; 64-bit code
vmovdqu xgft2_lo, [tmp+vec*(32/PS)] ;Load array Bx{00}, Bx{01}, ..., Bx{0f}
vmovdqu xgft2_hi, [tmp+vec*(32/PS)+16] ; " Bx{00}, Bx{10}, ..., Bx{f0}
-
- XLDR x0, [ptr+pos] ;Get next source vector
add tmp, 32
add vec_i, PS
+ %endif
+ XLDR x0, [ptr+pos] ;Get next source vector
vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
@@ -182,6 +282,12 @@ func(gf_2vect_dot_prod_avx)
vpxor xgft1_hi, xgft1_lo ;GF add high and low partials
vpxor xp1, xgft1_hi ;xp1 += partial
+ %ifidn PS,4 ; 32-bit code
+ vmovdqu xgft2_lo, [tmp+vec*(32/PS)] ;Load array Bx{00}, Bx{01}, ..., Bx{0f}
+ vmovdqu xgft2_hi, [tmp+vec*(32/PS)+16] ; " Bx{00}, Bx{10}, ..., Bx{f0}
+ add tmp, 32
+ add vec_i, PS
+ %endif
vpshufb xgft2_hi, x0 ;Lookup mul table of high nibble
vpshufb xgft2_lo, xtmpa ;Lookup mul table of low nibble
vpxor xgft2_hi, xgft2_lo ;GF add high and low partials
@@ -190,9 +296,12 @@ func(gf_2vect_dot_prod_avx)
cmp vec_i, vec
jl .next_vect
+ SLDR dest1, dest1_m
+ SLDR dest2, dest2_m
XSTR [dest1+pos], xp1
XSTR [dest2+pos], xp2
+ SLDR len, len_m
add pos, 16 ;Loop on 16 bytes at a time
cmp pos, len
jle .loop16
@@ -231,6 +340,4 @@ global %1_slver
db 0x%3, 0x%2
%endmacro
;;; func core, ver, snum
-slversion gf_2vect_dot_prod_avx, 02, 03, 0191
-; inform linker that this doesn't require executable stack
-section .note.GNU-stack noalloc noexec nowrite progbits
+slversion gf_2vect_dot_prod_avx, 02, 04, 0191
diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_2vect_dot_prod_avx2.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_2vect_dot_prod_avx2.asm.s
index 5d75d810905..ada013bd628 100644
--- a/src/erasure-code/isa/isa-l/erasure_code/gf_2vect_dot_prod_avx2.asm.s
+++ b/src/erasure-code/isa/isa-l/erasure_code/gf_2vect_dot_prod_avx2.asm.s
@@ -1,5 +1,5 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-; Copyright(c) 2011-2014 Intel Corporation All rights reserved.
+; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
@@ -30,8 +30,6 @@
;;;
;;; gf_2vect_dot_prod_avx2(len, vec, *g_tbls, **buffs, **dests);
;;;
-;;; Author: Gregory Tucker
-
%ifidn __OUTPUT_FORMAT__, elf64
%define arg0 rdi
@@ -48,7 +46,10 @@
%define tmp3 r9
%define tmp4 r12 ; must be saved and restored
%define return rax
- %define PS 8
+ %macro SLDR 2
+ %endmacro
+ %define SSTR SLDR
+ %define PS 8
%define LOG_PS 3
%define func(x) x:
@@ -74,6 +75,9 @@
%define tmp3 r13 ; must be saved and restored
%define tmp4 r14 ; must be saved and restored
%define return rax
+ %macro SLDR 2
+ %endmacro
+ %define SSTR SLDR
%define PS 8
%define LOG_PS 3
%define stack_size 3*16 + 3*8 ; must be an odd multiple of 8
@@ -103,6 +107,76 @@
%endmacro
%endif
+%ifidn __OUTPUT_FORMAT__, elf32
+
+;;;================== High Address;
+;;; arg4
+;;; arg3
+;;; arg2
+;;; arg1
+;;; arg0
+;;; return
+;;;<================= esp of caller
+;;; ebp
+;;;<================= ebp = esp
+;;; var0
+;;; esi
+;;; edi
+;;; ebx
+;;;<================= esp of callee
+;;;
+;;;================== Low Address;
+
+ %define PS 4
+ %define LOG_PS 2
+ %define func(x) x:
+ %define arg(x) [ebp + PS*2 + PS*x]
+ %define var(x) [ebp - PS - PS*x]
+
+ %define trans ecx
+ %define trans2 esi
+ %define arg0 trans ;trans and trans2 are for the variables in stack
+ %define arg0_m arg(0)
+ %define arg1 ebx
+ %define arg2 arg2_m
+ %define arg2_m arg(2)
+ %define arg3 trans
+ %define arg3_m arg(3)
+ %define arg4 trans
+ %define arg4_m arg(4)
+ %define tmp edx
+ %define tmp.w edx
+ %define tmp.b dl
+ %define tmp2 edi
+ %define tmp3 trans2
+ %define tmp4 trans2
+ %define tmp4_m var(0)
+ %define return eax
+ %macro SLDR 2 ;stack load/restore
+ mov %1, %2
+ %endmacro
+ %define SSTR SLDR
+
+ %macro FUNC_SAVE 0
+ push ebp
+ mov ebp, esp
+ sub esp, PS*1 ;1 local variable
+ push esi
+ push edi
+ push ebx
+ mov arg1, arg(1)
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ pop ebx
+ pop edi
+ pop esi
+ add esp, PS*1 ;1 local variable
+ pop ebp
+ %endmacro
+
+%endif ; output formats
+
%define len arg0
%define vec arg1
%define mul_array arg2
@@ -114,6 +188,13 @@
%define dest2 tmp4
%define pos return
+%ifidn PS,4 ;32-bit code
+ %define len_m arg0_m
+ %define src_m arg3_m
+ %define dest1_m arg4_m
+ %define dest2_m tmp4_m
+%endif
+
%ifndef EC_ALIGNED_ADDR
;;; Use Un-aligned load/store
%define XLDR vmovdqu
@@ -130,30 +211,48 @@
%endif
%endif
+%ifidn PS,8 ;64-bit code
+ default rel
+ [bits 64]
+%endif
-default rel
-
-[bits 64]
section .text
-%define xmask0f ymm8
-%define xmask0fx xmm8
-%define xgft1_lo ymm7
-%define xgft1_hi ymm6
-%define xgft2_lo ymm5
-%define xgft2_hi ymm4
+%ifidn PS,8 ;64-bit code
+ %define xmask0f ymm8
+ %define xmask0fx xmm8
+ %define xgft1_lo ymm7
+ %define xgft1_hi ymm6
+ %define xgft2_lo ymm5
+ %define xgft2_hi ymm4
+
+ %define x0 ymm0
+ %define xtmpa ymm1
+ %define xp1 ymm2
+ %define xp2 ymm3
+%else ;32-bit code
+ %define xmask0f ymm7
+ %define xmask0fx xmm7
+ %define xgft1_lo ymm5
+ %define xgft1_hi ymm4
+ %define xgft2_lo xgft1_lo
+ %define xgft2_hi xgft1_hi
+
+ %define x0 ymm0
+ %define xtmpa ymm1
+ %define xp1 ymm2
+ %define xp2 ymm3
-%define x0 ymm0
-%define xtmpa ymm1
-%define xp1 ymm2
-%define xp2 ymm3
+%endif
align 16
global gf_2vect_dot_prod_avx2:function
func(gf_2vect_dot_prod_avx2)
FUNC_SAVE
+ SLDR len, len_m
sub len, 32
+ SSTR len_m, len
jl .return_fail
xor pos, pos
mov tmp.b, 0x0f
@@ -161,8 +260,11 @@ func(gf_2vect_dot_prod_avx2)
vpbroadcastb xmask0f, xmask0fx ;Construct mask 0x0f0f0f...
sal vec, LOG_PS ;vec *= PS. Make vec_i count by PS
+ SLDR dest1, dest1_m
mov dest2, [dest1+PS]
+ SSTR dest2_m, dest2
mov dest1, [dest1]
+ SSTR dest1_m, dest1
.loop32
vpxor xp1, xp1
@@ -171,22 +273,25 @@ func(gf_2vect_dot_prod_avx2)
xor vec_i, vec_i
.next_vect
+ SLDR src, src_m
mov ptr, [src+vec_i]
vmovdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, ..., Ax{0f}
; " Ax{00}, Ax{10}, ..., Ax{f0}
vperm2i128 xgft1_hi, xgft1_lo, xgft1_lo, 0x11 ; swapped to hi | hi
vperm2i128 xgft1_lo, xgft1_lo, xgft1_lo, 0x00 ; swapped to lo | lo
-
+ %ifidn PS,8 ; 64-bit code
vmovdqu xgft2_lo, [tmp+vec*(32/PS)] ;Load array Bx{00}, Bx{01}, ..., Bx{0f}
; " Bx{00}, Bx{10}, ..., Bx{f0}
vperm2i128 xgft2_hi, xgft2_lo, xgft2_lo, 0x11 ; swapped to hi | hi
vperm2i128 xgft2_lo, xgft2_lo, xgft2_lo, 0x00 ; swapped to lo | lo
-
XLDR x0, [ptr+pos] ;Get next source vector
add tmp, 32
add vec_i, PS
+ %else
+ XLDR x0, [ptr+pos] ;Get next source vector
+ %endif
vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
@@ -197,6 +302,14 @@ func(gf_2vect_dot_prod_avx2)
vpxor xgft1_hi, xgft1_lo ;GF add high and low partials
vpxor xp1, xgft1_hi ;xp1 += partial
+ %ifidn PS,4 ; 32-bit code
+ vmovdqu xgft2_lo, [tmp+vec*(32/PS)] ;Load array Bx{00}, Bx{01}, ..., Bx{0f}
+ ; " Bx{00}, Bx{10}, ..., Bx{f0}
+ vperm2i128 xgft2_hi, xgft2_lo, xgft2_lo, 0x11 ; swapped to hi | hi
+ vperm2i128 xgft2_lo, xgft2_lo, xgft2_lo, 0x00 ; swapped to lo | lo
+ add tmp, 32
+ add vec_i, PS
+ %endif
vpshufb xgft2_hi, x0 ;Lookup mul table of high nibble
vpshufb xgft2_lo, xtmpa ;Lookup mul table of low nibble
vpxor xgft2_hi, xgft2_lo ;GF add high and low partials
@@ -205,9 +318,12 @@ func(gf_2vect_dot_prod_avx2)
cmp vec_i, vec
jl .next_vect
+ SLDR dest1, dest1_m
+ SLDR dest2, dest2_m
XSTR [dest1+pos], xp1
XSTR [dest2+pos], xp2
+ SLDR len, len_m
add pos, 32 ;Loop on 32 bytes at a time
cmp pos, len
jle .loop32
@@ -243,6 +359,4 @@ global %1_slver
db 0x%3, 0x%2
%endmacro
;;; func core, ver, snum
-slversion gf_2vect_dot_prod_avx2, 04, 03, 0196
-; inform linker that this doesn't require executable stack
-section .note.GNU-stack noalloc noexec nowrite progbits
+slversion gf_2vect_dot_prod_avx2, 04, 04, 0196
diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_2vect_dot_prod_sse.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_2vect_dot_prod_sse.asm.s
index 4f324aecc43..e180830c1fc 100644
--- a/src/erasure-code/isa/isa-l/erasure_code/gf_2vect_dot_prod_sse.asm.s
+++ b/src/erasure-code/isa/isa-l/erasure_code/gf_2vect_dot_prod_sse.asm.s
@@ -1,5 +1,5 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-; Copyright(c) 2011-2014 Intel Corporation All rights reserved.
+; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
@@ -30,8 +30,6 @@
;;;
;;; gf_2vect_dot_prod_sse(len, vec, *g_tbls, **buffs, **dests);
;;;
-;;; Author: Gregory Tucker
-
%ifidn __OUTPUT_FORMAT__, elf64
%define arg0 rdi
@@ -46,6 +44,9 @@
%define tmp3 r9
%define tmp4 r12 ; must be saved and restored
%define return rax
+ %macro SLDR 2
+ %endmacro
+ %define SSTR SLDR
%define PS 8
%define LOG_PS 3
@@ -70,6 +71,9 @@
%define tmp3 r13 ; must be saved and restored
%define tmp4 r14 ; must be saved and restored
%define return rax
+ %macro SLDR 2
+ %endmacro
+ %define SSTR SLDR
%define PS 8
%define LOG_PS 3
%define stack_size 3*16 + 3*8 ; must be an odd multiple of 8
@@ -99,23 +103,97 @@
%endmacro
%endif
+%ifidn __OUTPUT_FORMAT__, elf32
+
+;;;================== High Address;
+;;; arg4
+;;; arg3
+;;; arg2
+;;; arg1
+;;; arg0
+;;; return
+;;;<================= esp of caller
+;;; ebp
+;;;<================= ebp = esp
+;;; var0
+;;; esi
+;;; edi
+;;; ebx
+;;;<================= esp of callee
+;;;
+;;;================== Low Address;
+
+ %define PS 4
+ %define LOG_PS 2
+ %define func(x) x:
+ %define arg(x) [ebp + PS*2 + PS*x]
+ %define var(x) [ebp - PS - PS*x]
+
+ %define trans ecx
+ %define trans2 esi
+ %define arg0 trans ;trans and trans2 are for the variables in stack
+ %define arg0_m arg(0)
+ %define arg1 ebx
+ %define arg2 arg2_m
+ %define arg2_m arg(2)
+ %define arg3 trans
+ %define arg3_m arg(3)
+ %define arg4 trans
+ %define arg4_m arg(4)
+ %define tmp edx
+ %define tmp2 edi
+ %define tmp3 trans2
+ %define tmp4 trans2
+ %define tmp4_m var(0)
+ %define return eax
+ %macro SLDR 2 ;; stack load/restore
+ mov %1, %2
+ %endmacro
+ %define SSTR SLDR
+
+ %macro FUNC_SAVE 0
+ push ebp
+ mov ebp, esp
+ sub esp, PS*1 ;1 local variable
+ push esi
+ push edi
+ push ebx
+ mov arg1, arg(1)
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ pop ebx
+ pop edi
+ pop esi
+ add esp, PS*1 ;1 local variable
+ pop ebp
+ %endmacro
+
+%endif ; output formats
+
%define len arg0
%define vec arg1
%define mul_array arg2
%define src arg3
-%define dest1 arg4
+%define dest1 arg4
%define vec_i tmp2
%define ptr tmp3
%define dest2 tmp4
%define pos return
+ %ifidn PS,4 ;32-bit code
+ %define len_m arg0_m
+ %define src_m arg3_m
+ %define dest1_m arg4_m
+ %define dest2_m tmp4_m
+ %endif
+
%ifndef EC_ALIGNED_ADDR
;;; Use Un-aligned load/store
%define XLDR movdqu
%define XSTR movdqu
%else
-
;;; Use Non-temporal load/stor
%ifdef NO_NT_LDST
%define XLDR movdqa
@@ -126,35 +204,54 @@
%endif
%endif
+%ifidn PS,8 ;64-bit code
+ default rel
+ [bits 64]
+%endif
-default rel
-
-[bits 64]
section .text
-%define xmask0f xmm8
-%define xgft1_lo xmm7
-%define xgft1_hi xmm6
-%define xgft2_lo xmm5
-%define xgft2_hi xmm4
-
-%define x0 xmm0
-%define xtmpa xmm1
-%define xp1 xmm2
-%define xp2 xmm3
+%ifidn PS,8 ;64-bit code
+ %define xmask0f xmm8
+ %define xgft1_lo xmm7
+ %define xgft1_hi xmm6
+ %define xgft2_lo xmm5
+ %define xgft2_hi xmm4
+
+ %define x0 xmm0
+ %define xtmpa xmm1
+ %define xp1 xmm2
+ %define xp2 xmm3
+%else ;32-bit code
+ %define xmask0f xmm4
+ %define xgft1_lo xmm7
+ %define xgft1_hi xmm6
+ %define xgft2_lo xgft1_lo
+ %define xgft2_hi xgft1_hi
+
+ %define x0 xmm0
+ %define xtmpa xmm1
+ %define xp1 xmm2
+ %define xp2 xmm3
+%endif
align 16
global gf_2vect_dot_prod_sse:function
func(gf_2vect_dot_prod_sse)
FUNC_SAVE
+ SLDR len, len_m
sub len, 16
+ SSTR len_m, len
jl .return_fail
xor pos, pos
movdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte
sal vec, LOG_PS ;vec *= PS. Make vec_i count by PS
+ SLDR dest1, dest1_m
mov dest2, [dest1+PS]
+ SSTR dest2_m, dest2
mov dest1, [dest1]
+ SSTR dest1_m, dest1
.loop16
pxor xp1, xp1
@@ -163,16 +260,18 @@ func(gf_2vect_dot_prod_sse)
xor vec_i, vec_i
.next_vect
+ SLDR src, src_m
mov ptr, [src+vec_i]
movdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, ..., Ax{0f}
movdqu xgft1_hi, [tmp+16] ; " Ax{00}, Ax{10}, ..., Ax{f0}
+ %ifidn PS,8 ;64-bit code
movdqu xgft2_lo, [tmp+vec*(32/PS)] ;Load array Bx{00}, Bx{01}, ..., Bx{0f}
movdqu xgft2_hi, [tmp+vec*(32/PS)+16] ; " Bx{00}, Bx{10}, ..., Bx{f0}
-
- XLDR x0, [ptr+pos] ;Get next source vector
add tmp, 32
add vec_i, PS
+ %endif
+ XLDR x0, [ptr+pos] ;Get next source vector
movdqa xtmpa, x0 ;Keep unshifted copy of src
psraw x0, 4 ;Shift to put high nibble into bits 4-0
@@ -184,6 +283,13 @@ func(gf_2vect_dot_prod_sse)
pxor xgft1_hi, xgft1_lo ;GF add high and low partials
pxor xp1, xgft1_hi ;xp1 += partial
+ %ifidn PS,4 ;32-bit code
+ movdqu xgft2_lo, [tmp+vec*(32/PS)] ;Load array Bx{00}, Bx{01}, ..., Bx{0f}
+ movdqu xgft2_hi, [tmp+vec*(32/PS)+16] ; " Bx{00}, Bx{10}, ..., Bx{f0}
+
+ add tmp, 32
+ add vec_i, PS
+ %endif
pshufb xgft2_hi, x0 ;Lookup mul table of high nibble
pshufb xgft2_lo, xtmpa ;Lookup mul table of low nibble
pxor xgft2_hi, xgft2_lo ;GF add high and low partials
@@ -192,9 +298,12 @@ func(gf_2vect_dot_prod_sse)
cmp vec_i, vec
jl .next_vect
+ SLDR dest1, dest1_m
+ SLDR dest2, dest2_m
XSTR [dest1+pos], xp1
XSTR [dest2+pos], xp2
+ SLDR len, len_m
add pos, 16 ;Loop on 16 bytes at a time
cmp pos, len
jle .loop16
@@ -233,6 +342,4 @@ global %1_slver
db 0x%3, 0x%2
%endmacro
;;; func core, ver, snum
-slversion gf_2vect_dot_prod_sse, 00, 02, 0062
-; inform linker that this doesn't require executable stack
-section .note.GNU-stack noalloc noexec nowrite progbits
+slversion gf_2vect_dot_prod_sse, 00, 03, 0062
diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_2vect_mad_avx.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_2vect_mad_avx.asm.s
new file mode 100644
index 00000000000..021133eb031
--- /dev/null
+++ b/src/erasure-code/isa/isa-l/erasure_code/gf_2vect_mad_avx.asm.s
@@ -0,0 +1,242 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;
+;;; gf_2vect_mad_avx(len, vec, vec_i, mul_array, src, dest);
+;;;
+
+%define PS 8
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg0 rcx
+ %define arg0.w ecx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+ %define arg4 r12
+ %define arg5 r15
+ %define tmp r11
+ %define tmp2 r10
+ %define return rax
+ %define return.w eax
+ %define stack_size 16*9 + 3*8
+ %define arg(x) [rsp + stack_size + PS + PS*x]
+ %define func(x) proc_frame x
+
+%macro FUNC_SAVE 0
+ sub rsp, stack_size
+ movdqa [rsp+16*0],xmm6
+ movdqa [rsp+16*1],xmm7
+ movdqa [rsp+16*2],xmm8
+ movdqa [rsp+16*3],xmm9
+ movdqa [rsp+16*4],xmm10
+ movdqa [rsp+16*5],xmm11
+ movdqa [rsp+16*6],xmm12
+ movdqa [rsp+16*7],xmm13
+ movdqa [rsp+16*8],xmm14
+ save_reg r12, 9*16 + 0*8
+ save_reg r15, 9*16 + 1*8
+ end_prolog
+ mov arg4, arg(4)
+ mov arg5, arg(5)
+%endmacro
+
+%macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp+16*0]
+ movdqa xmm7, [rsp+16*1]
+ movdqa xmm8, [rsp+16*2]
+ movdqa xmm9, [rsp+16*3]
+ movdqa xmm10, [rsp+16*4]
+ movdqa xmm11, [rsp+16*5]
+ movdqa xmm12, [rsp+16*6]
+ movdqa xmm13, [rsp+16*7]
+ movdqa xmm14, [rsp+16*8]
+ mov r12, [rsp + 9*16 + 0*8]
+ mov r15, [rsp + 9*16 + 1*8]
+ add rsp, stack_size
+%endmacro
+
+%elifidn __OUTPUT_FORMAT__, elf64
+ %define arg0 rdi
+ %define arg0.w edi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+ %define arg4 r8
+ %define arg5 r9
+ %define tmp r11
+ %define tmp2 r10
+ %define return rax
+ %define return.w eax
+
+ %define func(x) x:
+ %define FUNC_SAVE
+ %define FUNC_RESTORE
+%endif
+
+;;; gf_2vect_mad_avx(len, vec, vec_i, mul_array, src, dest)
+%define len arg0
+%define len.w arg0.w
+%define vec arg1
+%define vec_i arg2
+%define mul_array arg3
+%define src arg4
+%define dest1 arg5
+%define pos return
+%define pos.w return.w
+
+%define dest2 tmp2
+
+%ifndef EC_ALIGNED_ADDR
+;;; Use Un-aligned load/store
+ %define XLDR vmovdqu
+ %define XSTR vmovdqu
+%else
+;;; Use Non-temporal load/stor
+ %ifdef NO_NT_LDST
+ %define XLDR vmovdqa
+ %define XSTR vmovdqa
+ %else
+ %define XLDR vmovntdqa
+ %define XSTR vmovntdq
+ %endif
+%endif
+
+
+default rel
+
+[bits 64]
+section .text
+
+%define xmask0f xmm14
+%define xgft1_lo xmm13
+%define xgft1_hi xmm12
+%define xgft2_lo xmm11
+%define xgft2_hi xmm10
+
+%define x0 xmm0
+%define xtmpa xmm1
+%define xtmph1 xmm2
+%define xtmpl1 xmm3
+%define xtmph2 xmm4
+%define xtmpl2 xmm5
+%define xd1 xmm6
+%define xd2 xmm7
+%define xtmpd1 xmm8
+%define xtmpd2 xmm9
+
+
+align 16
+global gf_2vect_mad_avx:function
+
+func(gf_2vect_mad_avx)
+ FUNC_SAVE
+ sub len, 16
+ jl .return_fail
+
+ xor pos, pos
+ vmovdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte
+ sal vec_i, 5 ;Multiply by 32
+ sal vec, 5
+ lea tmp, [mul_array + vec_i]
+ vmovdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, Ax{02}, ...
+ vmovdqu xgft1_hi, [tmp+16] ; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
+ vmovdqu xgft2_lo, [tmp+vec] ;Load array Bx{00}, Bx{01}, Bx{02}, ...
+ vmovdqu xgft2_hi, [tmp+vec+16] ; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
+
+ mov dest2, [dest1+PS]
+ mov dest1, [dest1]
+
+ XLDR xtmpd1, [dest1+len] ;backup the last 16 bytes in dest
+ XLDR xtmpd2, [dest2+len] ;backup the last 16 bytes in dest
+
+.loop16
+ XLDR xd1, [dest1+pos] ;Get next dest vector
+ XLDR xd2, [dest2+pos] ;Get next dest vector
+.loop16_overlap:
+ XLDR x0, [src+pos] ;Get next source vector
+
+ vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0
+ vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
+ vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
+
+ vpshufb xtmph1, xgft1_hi, x0 ;Lookup mul table of high nibble
+ vpshufb xtmpl1, xgft1_lo, xtmpa ;Lookup mul table of low nibble
+ vpxor xtmph1, xtmph1, xtmpl1 ;GF add high and low partials
+ vpxor xd1, xd1, xtmph1 ;xd1 += partial
+
+ vpshufb xtmph2, xgft2_hi, x0 ;Lookup mul table of high nibble
+ vpshufb xtmpl2, xgft2_lo, xtmpa ;Lookup mul table of low nibble
+ vpxor xtmph2, xtmph2, xtmpl2 ;GF add high and low partials
+ vpxor xd2, xd2, xtmph2 ;xd2 += partial
+
+ XSTR [dest1+pos], xd1
+ XSTR [dest2+pos], xd2
+
+ add pos, 16 ;Loop on 16 bytes at a time
+ cmp pos, len
+ jle .loop16
+
+ lea tmp, [len + 16]
+ cmp pos, tmp
+ je .return_pass
+
+ ;; Tail len
+ mov pos, len ;Overlapped offset length-16
+ vmovdqa xd1, xtmpd1 ;Restore xd1
+ vmovdqa xd2, xtmpd2 ;Restore xd2
+ jmp .loop16_overlap ;Do one more overlap pass
+
+.return_pass:
+ mov return, 0
+ FUNC_RESTORE
+ ret
+
+.return_fail:
+ mov return, 1
+ FUNC_RESTORE
+ ret
+
+endproc_frame
+
+section .data
+
+align 16
+mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
+
+%macro slversion 4
+global %1_slver_%2%3%4
+global %1_slver
+%1_slver:
+%1_slver_%2%3%4:
+ dw 0x%4
+ db 0x%3, 0x%2
+%endmacro
+;;; func core, ver, snum
+slversion gf_2vect_mad_avx, 02, 00, 0204
diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_2vect_mad_avx2.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_2vect_mad_avx2.asm.s
new file mode 100644
index 00000000000..e8442aba5e2
--- /dev/null
+++ b/src/erasure-code/isa/isa-l/erasure_code/gf_2vect_mad_avx2.asm.s
@@ -0,0 +1,253 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;
+;;; gf_2vect_mad_avx2(len, vec, vec_i, mul_array, src, dest);
+;;;
+
+%define PS 8
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg0 rcx
+ %define arg0.w ecx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+ %define arg4 r12
+ %define arg5 r15
+
+ %define tmp r11
+ %define tmp.w r11d
+ %define tmp.b r11b
+ %define tmp2 r10
+ %define return rax
+ %define return.w eax
+ %define stack_size 16*9 + 3*8 ; must be an odd multiple of 8
+ %define arg(x) [rsp + stack_size + PS + PS*x]
+
+ %define func(x) proc_frame x
+ %macro FUNC_SAVE 0
+ sub rsp, stack_size
+ vmovdqa [rsp+16*0],xmm6
+ vmovdqa [rsp+16*1],xmm7
+ vmovdqa [rsp+16*2],xmm8
+ vmovdqa [rsp+16*3],xmm9
+ vmovdqa [rsp+16*4],xmm10
+ vmovdqa [rsp+16*5],xmm11
+ vmovdqa [rsp+16*6],xmm12
+ vmovdqa [rsp+16*7],xmm13
+ vmovdqa [rsp+16*8],xmm14
+ save_reg r12, 9*16 + 0*8
+ save_reg r15, 9*16 + 1*8
+ end_prolog
+ mov arg4, arg(4)
+ mov arg5, arg(5)
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ vmovdqa xmm6, [rsp+16*0]
+ vmovdqa xmm7, [rsp+16*1]
+ vmovdqa xmm8, [rsp+16*2]
+ vmovdqa xmm9, [rsp+16*3]
+ vmovdqa xmm10, [rsp+16*4]
+ vmovdqa xmm11, [rsp+16*5]
+ vmovdqa xmm12, [rsp+16*6]
+ vmovdqa xmm13, [rsp+16*7]
+ vmovdqa xmm14, [rsp+16*8]
+ mov r12, [rsp + 9*16 + 0*8]
+ mov r15, [rsp + 9*16 + 1*8]
+ add rsp, stack_size
+ %endmacro
+%endif
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define arg0 rdi
+ %define arg0.w edi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+ %define arg4 r8
+ %define arg5 r9
+
+ %define tmp r11
+ %define tmp.w r11d
+ %define tmp.b r11b
+ %define tmp2 r10
+ %define return rax
+ %define return.w eax
+
+ %define func(x) x:
+ %define FUNC_SAVE
+ %define FUNC_RESTORE
+%endif
+
+;;; gf_2vect_mad_avx2(len, vec, vec_i, mul_array, src, dest)
+%define len arg0
+%define len.w arg0.w
+%define vec arg1
+%define vec_i arg2
+%define mul_array arg3
+%define src arg4
+%define dest1 arg5
+%define pos return
+%define pos.w return.w
+
+%define dest2 tmp2
+
+%ifndef EC_ALIGNED_ADDR
+;;; Use Un-aligned load/store
+ %define XLDR vmovdqu
+ %define XSTR vmovdqu
+%else
+
+;;; Use Non-temporal load/stor
+ %ifdef NO_NT_LDST
+ %define XLDR vmovdqa
+ %define XSTR vmovdqa
+ %else
+ %define XLDR vmovntdqa
+ %define XSTR vmovntdq
+ %endif
+%endif
+
+
+default rel
+
+[bits 64]
+section .text
+
+%define xmask0f ymm14
+%define xmask0fx xmm14
+%define xgft1_lo ymm13
+%define xgft1_hi ymm12
+%define xgft2_lo ymm11
+%define xgft2_hi ymm10
+
+%define x0 ymm0
+%define xtmpa ymm1
+%define xtmph1 ymm2
+%define xtmpl1 ymm3
+%define xtmph2 ymm4
+%define xtmpl2 ymm5
+%define xd1 ymm6
+%define xd2 ymm7
+%define xtmpd1 ymm8
+%define xtmpd2 ymm9
+
+align 16
+global gf_2vect_mad_avx2:function
+
+func(gf_2vect_mad_avx2)
+ FUNC_SAVE
+ sub len, 32
+ jl .return_fail
+ xor pos, pos
+ mov tmp.b, 0x0f
+ vpinsrb xmask0fx, xmask0fx, tmp.w, 0
+ vpbroadcastb xmask0f, xmask0fx ;Construct mask 0x0f0f0f...
+
+ sal vec_i, 5 ;Multiply by 32
+ sal vec, 5
+ lea tmp, [mul_array + vec_i]
+ vmovdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, ..., Ax{0f}
+ ; " Ax{00}, Ax{10}, ..., Ax{f0}
+ vmovdqu xgft2_lo, [tmp+vec] ;Load array Bx{00}, Bx{01}, ..., Bx{0f}
+ ; " Bx{00}, Bx{10}, ..., Bx{f0}
+
+ vperm2i128 xgft1_hi, xgft1_lo, xgft1_lo, 0x11 ; swapped to hi | hi
+ vperm2i128 xgft1_lo, xgft1_lo, xgft1_lo, 0x00 ; swapped to lo | lo
+ vperm2i128 xgft2_hi, xgft2_lo, xgft2_lo, 0x11 ; swapped to hi | hi
+ vperm2i128 xgft2_lo, xgft2_lo, xgft2_lo, 0x00 ; swapped to lo | lo
+ mov dest2, [dest1+PS] ; reuse mul_array
+ mov dest1, [dest1]
+
+ XLDR xtmpd1, [dest1+len] ;backup the last 16 bytes in dest
+ XLDR xtmpd2, [dest2+len] ;backup the last 16 bytes in dest
+
+.loop32
+ XLDR xd1, [dest1+pos] ;Get next dest vector
+ XLDR xd2, [dest2+pos] ;Get next dest vector
+.loop32_overlap:
+ XLDR x0, [src+pos] ;Get next source vector
+
+ vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0
+ vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
+ vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
+
+ vpshufb xtmph1, xgft1_hi, x0 ;Lookup mul table of high nibble
+ vpshufb xtmpl1, xgft1_lo, xtmpa ;Lookup mul table of low nibble
+ vpxor xtmph1, xtmph1, xtmpl1 ;GF add high and low partials
+ vpxor xd1, xd1, xtmph1 ;xd1 += partial
+
+ vpshufb xtmph2, xgft2_hi, x0 ;Lookup mul table of high nibble
+ vpshufb xtmpl2, xgft2_lo, xtmpa ;Lookup mul table of low nibble
+ vpxor xtmph2, xtmph2, xtmpl2 ;GF add high and low partials
+ vpxor xd2, xd2, xtmph2 ;xd2 += partial
+
+ XSTR [dest1+pos], xd1
+ XSTR [dest2+pos], xd2
+
+ add pos, 32 ;Loop on 32 bytes at a time
+ cmp pos, len
+ jle .loop32
+
+ lea tmp, [len + 32]
+ cmp pos, tmp
+ je .return_pass
+
+ ;; Tail len
+ mov pos, len ;Overlapped offset length-32
+ vmovdqa xd1, xtmpd1 ;Restore xd1
+ vmovdqa xd2, xtmpd2 ;Restore xd2
+ jmp .loop32_overlap ;Do one more overlap pass
+
+.return_pass:
+ mov return, 0
+ FUNC_RESTORE
+ ret
+
+.return_fail:
+ mov return, 1
+ FUNC_RESTORE
+ ret
+
+endproc_frame
+
+section .data
+
+%macro slversion 4
+global %1_slver_%2%3%4
+global %1_slver
+%1_slver:
+%1_slver_%2%3%4:
+ dw 0x%4
+ db 0x%3, 0x%2
+%endmacro
+;;; func core, ver, snum
+slversion gf_2vect_mad_avx2, 04, 00, 0205
diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_2vect_mad_sse.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_2vect_mad_sse.asm.s
new file mode 100644
index 00000000000..a569a6ed268
--- /dev/null
+++ b/src/erasure-code/isa/isa-l/erasure_code/gf_2vect_mad_sse.asm.s
@@ -0,0 +1,245 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;
+;;; gf_2vect_mad_sse(len, vec, vec_i, mul_array, src, dest);
+;;;
+
+%define PS 8
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg0 rcx
+ %define arg0.w ecx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+ %define arg4 r12
+ %define arg5 r15
+ %define tmp r11
+ %define tmp2 r10
+ %define return rax
+ %define return.w eax
+ %define stack_size 16*9 + 3*8
+ %define arg(x) [rsp + stack_size + PS + PS*x]
+ %define func(x) proc_frame x
+
+%macro FUNC_SAVE 0
+ sub rsp, stack_size
+ movdqa [rsp+16*0],xmm6
+ movdqa [rsp+16*1],xmm7
+ movdqa [rsp+16*2],xmm8
+ movdqa [rsp+16*3],xmm9
+ movdqa [rsp+16*4],xmm10
+ movdqa [rsp+16*5],xmm11
+ movdqa [rsp+16*6],xmm12
+ movdqa [rsp+16*7],xmm13
+ movdqa [rsp+16*8],xmm14
+ save_reg r12, 9*16 + 0*8
+ save_reg r15, 9*16 + 1*8
+ end_prolog
+ mov arg4, arg(4)
+ mov arg5, arg(5)
+%endmacro
+
+%macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp+16*0]
+ movdqa xmm7, [rsp+16*1]
+ movdqa xmm8, [rsp+16*2]
+ movdqa xmm9, [rsp+16*3]
+ movdqa xmm10, [rsp+16*4]
+ movdqa xmm11, [rsp+16*5]
+ movdqa xmm12, [rsp+16*6]
+ movdqa xmm13, [rsp+16*7]
+ movdqa xmm14, [rsp+16*8]
+ mov r12, [rsp + 9*16 + 0*8]
+ mov r15, [rsp + 9*16 + 1*8]
+ add rsp, stack_size
+%endmacro
+
+%elifidn __OUTPUT_FORMAT__, elf64
+ %define arg0 rdi
+ %define arg0.w edi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+ %define arg4 r8
+ %define arg5 r9
+ %define tmp r11
+ %define tmp2 r10
+ %define return rax
+ %define return.w eax
+
+ %define func(x) x:
+ %define FUNC_SAVE
+ %define FUNC_RESTORE
+%endif
+
+;;; gf_2vect_mad_sse(len, vec, vec_i, mul_array, src, dest)
+%define len arg0
+%define len.w arg0.w
+%define vec arg1
+%define vec_i arg2
+%define mul_array arg3
+%define src arg4
+%define dest1 arg5
+%define pos return
+%define pos.w return.w
+
+%define dest2 tmp2
+
+%ifndef EC_ALIGNED_ADDR
+;;; Use Un-aligned load/store
+ %define XLDR movdqu
+ %define XSTR movdqu
+%else
+;;; Use Non-temporal load/store
+ %ifdef NO_NT_LDST
+ %define XLDR movdqa
+ %define XSTR movdqa
+ %else
+ %define XLDR movntdqa
+ %define XSTR movntdq
+ %endif
+%endif
+
+default rel
+
+[bits 64]
+section .text
+
+%define xmask0f xmm14
+%define xgft1_lo xmm13
+%define xgft1_hi xmm12
+%define xgft2_lo xmm11
+%define xgft2_hi xmm10
+
+%define x0 xmm0
+%define xtmpa xmm1
+%define xtmph1 xmm2
+%define xtmpl1 xmm3
+%define xtmph2 xmm4
+%define xtmpl2 xmm5
+%define xd1 xmm6
+%define xd2 xmm7
+%define xtmpd1 xmm8
+%define xtmpd2 xmm9
+
+
+align 16
+global gf_2vect_mad_sse:function
+func(gf_2vect_mad_sse)
+ FUNC_SAVE
+ sub len, 16
+ jl .return_fail
+
+ xor pos, pos
+ movdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte
+ sal vec_i, 5 ;Multiply by 32
+ sal vec, 5
+ lea tmp, [mul_array + vec_i]
+ movdqu xgft1_lo,[tmp] ;Load array Ax{00}, Ax{01}, Ax{02}, ...
+ movdqu xgft1_hi, [tmp+16] ; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
+ movdqu xgft2_lo, [tmp+vec] ;Load array Bx{00}, Bx{01}, Bx{02}, ...
+ movdqu xgft2_hi, [tmp+vec+16] ; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
+ mov dest2, [dest1+PS]
+ mov dest1, [dest1]
+
+ XLDR xtmpd1, [dest1+len] ;backup the last 16 bytes in dest
+ XLDR xtmpd2, [dest2+len] ;backup the last 16 bytes in dest
+
+.loop16:
+ XLDR xd1, [dest1+pos] ;Get next dest vector
+ XLDR xd2, [dest2+pos] ;Get next dest vector
+.loop16_overlap:
+ XLDR x0, [src+pos] ;Get next source vector
+ movdqa xtmph1, xgft1_hi ;Reload const array registers
+ movdqa xtmpl1, xgft1_lo
+ movdqa xtmph2, xgft2_hi ;Reload const array registers
+ movdqa xtmpl2, xgft2_lo
+ movdqa xtmpa, x0 ;Keep unshifted copy of src
+ psraw x0, 4 ;Shift to put high nibble into bits 4-0
+ pand x0, xmask0f ;Mask high src nibble in bits 4-0
+ pand xtmpa, xmask0f ;Mask low src nibble in bits 4-0
+
+ pshufb xtmph1, x0 ;Lookup mul table of high nibble
+ pshufb xtmpl1, xtmpa ;Lookup mul table of low nibble
+ pxor xtmph1, xtmpl1 ;GF add high and low partials
+ pxor xd1, xtmph1
+
+ pshufb xtmph2, x0 ;Lookup mul table of high nibble
+ pshufb xtmpl2, xtmpa ;Lookup mul table of low nibble
+ pxor xtmph2, xtmpl2 ;GF add high and low partials
+ pxor xd2, xtmph2
+
+ XSTR [dest1+pos], xd1 ;Store result
+ XSTR [dest2+pos], xd2 ;Store result
+
+ add pos, 16 ;Loop on 16 bytes at a time
+ cmp pos, len
+ jle .loop16
+
+ lea tmp, [len + 16]
+ cmp pos, tmp
+ je .return_pass
+
+ ;; Tail len
+ mov pos, len ;Overlapped offset length-16
+ movdqa xd1, xtmpd1 ;Restore xd1
+ movdqa xd2, xtmpd2 ;Restore xd2
+ jmp .loop16_overlap ;Do one more overlap pass
+
+.return_pass:
+ FUNC_RESTORE
+ mov return, 0
+ ret
+
+.return_fail:
+ FUNC_RESTORE
+ mov return, 1
+ ret
+
+endproc_frame
+
+section .data
+
+align 16
+
+mask0f:
+ ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
+
+%macro slversion 4
+global %1_slver_%2%3%4
+global %1_slver
+%1_slver:
+%1_slver_%2%3%4:
+ dw 0x%4
+ db 0x%3, 0x%2
+%endmacro
+;;; func core, ver, snum
+slversion gf_2vect_mad_sse, 00, 00, 0203
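
One detail shared by the two 2-vect mad kernels above deserves a note: when len is not a multiple of the vector width, the tail is handled by re-running one full-width pass at offset len-16 (len-32 for the AVX2 variant) after restoring the copy of the destination tail saved on entry, so bytes that overlap the last full block receive their XOR contribution exactly once (a second XOR would cancel the first). A rough scalar sketch, with update_block() standing in for the SIMD loop body:

#include <stddef.h>
#include <string.h>

/* Stand-in for the SIMD loop body: XOR-accumulate one 16-byte block. */
static void update_block(unsigned char *dst, const unsigned char *src)
{
        for (int i = 0; i < 16; i++)
                dst[i] ^= src[i];
}

static int mad_with_overlap_tail(size_t len, const unsigned char *src, unsigned char *dst)
{
        unsigned char saved_tail[16];
        size_t pos;

        if (len < 16)
                return 1;                                /* mirrors .return_fail */
        memcpy(saved_tail, dst + len - 16, 16);          /* mirrors "XLDR xtmpd1, [dest1+len]" */

        for (pos = 0; pos + 16 <= len; pos += 16)
                update_block(dst + pos, src + pos);

        if (pos != len) {                                /* partial tail remains */
                memcpy(dst + len - 16, saved_tail, 16);          /* undo the overlap bytes */
                update_block(dst + len - 16, src + len - 16);    /* one overlapping pass */
        }
        return 0;
}
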
diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_3vect_dot_prod_avx.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_3vect_dot_prod_avx.asm.s
index 6935cb19347..14097e06d63 100644
--- a/src/erasure-code/isa/isa-l/erasure_code/gf_3vect_dot_prod_avx.asm.s
+++ b/src/erasure-code/isa/isa-l/erasure_code/gf_3vect_dot_prod_avx.asm.s
@@ -1,5 +1,5 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-; Copyright(c) 2011-2014 Intel Corporation All rights reserved.
+; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
@@ -30,8 +30,6 @@
;;;
;;; gf_3vect_dot_prod_avx(len, vec, *g_tbls, **buffs, **dests);
;;;
-;;; Author: Gregory Tucker
-
%ifidn __OUTPUT_FORMAT__, elf64
%define arg0 rdi
@@ -46,6 +44,9 @@
%define tmp3 r13 ; must be saved and restored
%define tmp4 r12 ; must be saved and restored
%define return rax
+ %macro SLDR 2
+ %endmacro
+ %define SSTR SLDR
%define PS 8
%define LOG_PS 3
@@ -73,6 +74,9 @@
%define tmp3 r13 ; must be saved and restored
%define tmp4 r14 ; must be saved and restored
%define return rax
+ %macro SLDR 2
+ %endmacro
+ %define SSTR SLDR
%define PS 8
%define LOG_PS 3
%define stack_size 6*16 + 5*8 ; must be an odd multiple of 8
@@ -110,17 +114,97 @@
%endmacro
%endif
+%ifidn __OUTPUT_FORMAT__, elf32
+
+;;;================== High Address;
+;;; arg4
+;;; arg3
+;;; arg2
+;;; arg1
+;;; arg0
+;;; return
+;;;<================= esp of caller
+;;; ebp
+;;;<================= ebp = esp
+;;; var0
+;;; var1
+;;; esi
+;;; edi
+;;; ebx
+;;;<================= esp of callee
+;;;
+;;;================== Low Address;
+
+ %define PS 4
+ %define LOG_PS 2
+ %define func(x) x:
+ %define arg(x) [ebp + PS*2 + PS*x]
+ %define var(x) [ebp - PS - PS*x]
+
+ %define trans ecx
+ %define trans2 esi
+ %define arg0 trans ;trans and trans2 are for the variables in stack
+ %define arg0_m arg(0)
+ %define arg1 ebx
+ %define arg2 arg2_m
+ %define arg2_m arg(2)
+ %define arg3 trans
+ %define arg3_m arg(3)
+ %define arg4 trans
+ %define arg4_m arg(4)
+ %define arg5 trans2
+ %define tmp edx
+ %define tmp2 edi
+ %define tmp3 trans2
+ %define tmp3_m var(0)
+ %define tmp4 trans2
+ %define tmp4_m var(1)
+ %define return eax
+ %macro SLDR 2 ;; stack load/restore
+ mov %1, %2
+ %endmacro
+ %define SSTR SLDR
+
+ %macro FUNC_SAVE 0
+ push ebp
+ mov ebp, esp
+ sub esp, PS*2 ;2 local variables
+ push esi
+ push edi
+ push ebx
+ mov arg1, arg(1)
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ pop ebx
+ pop edi
+ pop esi
+ add esp, PS*2 ;2 local variables
+ pop ebp
+ %endmacro
+
+%endif ; output formats
+
%define len arg0
%define vec arg1
%define mul_array arg2
%define src arg3
-%define dest1 arg4
+%define dest1 arg4
%define ptr arg5
+
%define vec_i tmp2
%define dest2 tmp3
%define dest3 tmp4
%define pos return
+ %ifidn PS,4 ;32-bit code
+ %define len_m arg0_m
+ %define src_m arg3_m
+ %define dest1_m arg4_m
+ %define dest2_m tmp3_m
+ %define dest3_m tmp4_m
+ %endif
+
%ifndef EC_ALIGNED_ADDR
;;; Use Un-aligned load/store
%define XLDR vmovdqu
@@ -136,39 +220,62 @@
%endif
%endif
+%ifidn PS,8 ; 64-bit code
+ default rel
+ [bits 64]
+%endif
-default rel
-[bits 64]
section .text
-%define xmask0f xmm11
-%define xgft1_lo xmm10
-%define xgft1_hi xmm9
-%define xgft2_lo xmm8
-%define xgft2_hi xmm7
-%define xgft3_lo xmm6
-%define xgft3_hi xmm5
-
-%define x0 xmm0
-%define xtmpa xmm1
-%define xp1 xmm2
-%define xp2 xmm3
-%define xp3 xmm4
+%ifidn PS,8 ;64-bit code
+ %define xmask0f xmm11
+ %define xgft1_lo xmm10
+ %define xgft1_hi xmm9
+ %define xgft2_lo xmm8
+ %define xgft2_hi xmm7
+ %define xgft3_lo xmm6
+ %define xgft3_hi xmm5
+
+ %define x0 xmm0
+ %define xtmpa xmm1
+ %define xp1 xmm2
+ %define xp2 xmm3
+ %define xp3 xmm4
+%else
+ %define xmask0f xmm7
+ %define xgft1_lo xmm6
+ %define xgft1_hi xmm5
+ %define xgft2_lo xgft1_lo
+ %define xgft2_hi xgft1_hi
+ %define xgft3_lo xgft1_lo
+ %define xgft3_hi xgft1_hi
+
+ %define x0 xmm0
+ %define xtmpa xmm1
+ %define xp1 xmm2
+ %define xp2 xmm3
+ %define xp3 xmm4
+%endif
align 16
global gf_3vect_dot_prod_avx:function
func(gf_3vect_dot_prod_avx)
FUNC_SAVE
+ SLDR len, len_m
sub len, 16
+ SSTR len_m, len
jl .return_fail
xor pos, pos
vmovdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte
sal vec, LOG_PS ;vec *= PS. Make vec_i count by PS
+ SLDR dest1, dest1_m
mov dest2, [dest1+PS]
+ SSTR dest2_m, dest2
mov dest3, [dest1+2*PS]
+ SSTR dest3_m, dest3
mov dest1, [dest1]
-
+ SSTR dest1_m, dest1
.loop16:
vpxor xp1, xp1
@@ -178,17 +285,19 @@ func(gf_3vect_dot_prod_avx)
xor vec_i, vec_i
.next_vect:
+ SLDR src, src_m
mov ptr, [src+vec_i]
vmovdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, ..., Ax{0f}
vmovdqu xgft1_hi, [tmp+16] ; " Ax{00}, Ax{10}, ..., Ax{f0}
+ %ifidn PS,8 ; 64-bit code
vmovdqu xgft2_lo, [tmp+vec*(32/PS)] ;Load array Bx{00}, Bx{01}, ..., Bx{0f}
vmovdqu xgft2_hi, [tmp+vec*(32/PS)+16] ; " Bx{00}, Bx{10}, ..., Bx{f0}
vmovdqu xgft3_lo, [tmp+vec*(64/PS)] ;Load array Cx{00}, Cx{01}, ..., Cx{0f}
vmovdqu xgft3_hi, [tmp+vec*(64/PS)+16] ; " Cx{00}, Cx{10}, ..., Cx{f0}
-
add tmp, 32
add vec_i, PS
+ %endif
XLDR x0, [ptr+pos] ;Get next source vector
vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0
@@ -200,11 +309,23 @@ func(gf_3vect_dot_prod_avx)
vpxor xgft1_hi, xgft1_lo ;GF add high and low partials
vpxor xp1, xgft1_hi ;xp1 += partial
+ %ifidn PS,4 ; 32-bit code
+ vmovdqu xgft2_lo, [tmp+vec*(32/PS)] ;Load array Bx{00}, Bx{01}, ..., Bx{0f}
+ vmovdqu xgft2_hi, [tmp+vec*(32/PS)+16] ; " Bx{00}, Bx{10}, ..., Bx{f0}
+ %endif
vpshufb xgft2_hi, x0 ;Lookup mul table of high nibble
vpshufb xgft2_lo, xtmpa ;Lookup mul table of low nibble
vpxor xgft2_hi, xgft2_lo ;GF add high and low partials
vpxor xp2, xgft2_hi ;xp2 += partial
+ %ifidn PS,4 ; 32-bit code
+ sal vec, 1
+ vmovdqu xgft3_lo, [tmp+vec*(32/PS)] ;Load array Cx{00}, Cx{01}, ..., Cx{0f}
+ vmovdqu xgft3_hi, [tmp+vec*(32/PS)+16] ; " Cx{00}, Cx{10}, ..., Cx{f0}
+ sar vec, 1
+ add tmp, 32
+ add vec_i, PS
+ %endif
vpshufb xgft3_hi, x0 ;Lookup mul table of high nibble
vpshufb xgft3_lo, xtmpa ;Lookup mul table of low nibble
vpxor xgft3_hi, xgft3_lo ;GF add high and low partials
@@ -213,10 +334,14 @@ func(gf_3vect_dot_prod_avx)
cmp vec_i, vec
jl .next_vect
+ SLDR dest1, dest1_m
+ SLDR dest2, dest2_m
XSTR [dest1+pos], xp1
XSTR [dest2+pos], xp2
+ SLDR dest3, dest3_m
XSTR [dest3+pos], xp3
+ SLDR len, len_m
add pos, 16 ;Loop on 16 bytes at a time
cmp pos, len
jle .loop16
@@ -255,6 +380,4 @@ global %1_slver
db 0x%3, 0x%2
%endmacro
;;; func core, ver, snum
-; inform linker that this doesn't require executable stack
-section .note.GNU-stack noalloc noexec nowrite progbits
-slversion gf_3vect_dot_prod_avx, 02, 03, 0192
+slversion gf_3vect_dot_prod_avx, 02, 04, 0192
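
For reference, the operation the dot-product kernels implement, written out in scalar C. The layout assumed for gtbls -- one 32-byte table per (output, source) pair, low-nibble products followed by high-nibble products, grouped by output row -- is inferred from the strides the code uses (tmp, tmp+vec*(32/PS), tmp+vec*(64/PS)) and should be read as illustrative:

#include <stddef.h>

/* Split-nibble lookup: product of one byte with the coefficient encoded
 * by a 32-byte table (16 low-nibble products, 16 high-nibble products). */
static unsigned char gf_mul_tbl(const unsigned char *tbl32, unsigned char b)
{
        return tbl32[b & 0x0f] ^ tbl32[16 + (b >> 4)];
}

/* Three parity outputs, each the GF(2^8) sum over k source fragments. */
static void gf_3vect_dot_prod_ref(size_t len, int k, const unsigned char *gtbls,
                                  unsigned char **src, unsigned char **dest)
{
        for (size_t i = 0; i < len; i++) {
                for (int p = 0; p < 3; p++) {
                        unsigned char acc = 0;
                        for (int j = 0; j < k; j++)
                                acc ^= gf_mul_tbl(gtbls + (p * k + j) * 32, src[j][i]);
                        dest[p][i] = acc;
                }
        }
}

The 32-bit (elf32) path added here computes the same thing; it merely spills some of these loop variables to the stack via the SLDR/SSTR macros because ia32 has too few general-purpose registers to keep them all live.
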
diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_3vect_dot_prod_avx2.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_3vect_dot_prod_avx2.asm.s
index 4ad01531712..d762104ba3d 100644
--- a/src/erasure-code/isa/isa-l/erasure_code/gf_3vect_dot_prod_avx2.asm.s
+++ b/src/erasure-code/isa/isa-l/erasure_code/gf_3vect_dot_prod_avx2.asm.s
@@ -1,5 +1,5 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-; Copyright(c) 2011-2014 Intel Corporation All rights reserved.
+; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
@@ -30,8 +30,6 @@
;;;
;;; gf_3vect_dot_prod_avx2(len, vec, *g_tbls, **buffs, **dests);
;;;
-;;; Author: Gregory Tucker
-
%ifidn __OUTPUT_FORMAT__, elf64
%define arg0 rdi
@@ -48,7 +46,10 @@
%define tmp3 r13 ; must be saved and restored
%define tmp4 r12 ; must be saved and restored
%define return rax
- %define PS 8
+ %macro SLDR 2
+ %endmacro
+ %define SSTR SLDR
+ %define PS 8
%define LOG_PS 3
%define func(x) x:
@@ -77,6 +78,9 @@
%define tmp3 r13 ; must be saved and restored
%define tmp4 r14 ; must be saved and restored
%define return rax
+ %macro SLDR 2
+ %endmacro
+ %define SSTR SLDR
%define PS 8
%define LOG_PS 3
%define stack_size 6*16 + 5*8 ; must be an odd multiple of 8
@@ -114,17 +118,99 @@
%endmacro
%endif
+%ifidn __OUTPUT_FORMAT__, elf32
+
+;;;================== High Address;
+;;; arg4
+;;; arg3
+;;; arg2
+;;; arg1
+;;; arg0
+;;; return
+;;;<================= esp of caller
+;;; ebp
+;;;<================= ebp = esp
+;;; var0
+;;; var1
+;;; esi
+;;; edi
+;;; ebx
+;;;<================= esp of callee
+;;;
+;;;================== Low Address;
+
+ %define PS 4
+ %define LOG_PS 2
+ %define func(x) x:
+ %define arg(x) [ebp + PS*2 + PS*x]
+ %define var(x) [ebp - PS - PS*x]
+
+ %define trans ecx
+ %define trans2 esi
+ %define arg0 trans ;trans and trans2 are for the variables in stack
+ %define arg0_m arg(0)
+ %define arg1 ebx
+ %define arg2 arg2_m
+ %define arg2_m arg(2)
+ %define arg3 trans
+ %define arg3_m arg(3)
+ %define arg4 trans
+ %define arg4_m arg(4)
+ %define arg5 trans2
+ %define tmp edx
+ %define tmp.w edx
+ %define tmp.b dl
+ %define tmp2 edi
+ %define tmp3 trans2
+ %define tmp3_m var(0)
+ %define tmp4 trans2
+ %define tmp4_m var(1)
+ %define return eax
+ %macro SLDR 2 ;stack load/restore
+ mov %1, %2
+ %endmacro
+ %define SSTR SLDR
+
+ %macro FUNC_SAVE 0
+ push ebp
+ mov ebp, esp
+ sub esp, PS*2 ;2 local variables
+ push esi
+ push edi
+ push ebx
+ mov arg1, arg(1)
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ pop ebx
+ pop edi
+ pop esi
+ add esp, PS*2 ;2 local variables
+ pop ebp
+ %endmacro
+
+%endif ; output formats
+
%define len arg0
%define vec arg1
%define mul_array arg2
%define src arg3
%define dest1 arg4
%define ptr arg5
+
%define vec_i tmp2
%define dest2 tmp3
%define dest3 tmp4
%define pos return
+%ifidn PS,4 ;32-bit code
+ %define len_m arg0_m
+ %define src_m arg3_m
+ %define dest1_m arg4_m
+ %define dest2_m tmp3_m
+ %define dest3_m tmp4_m
+%endif
+
%ifndef EC_ALIGNED_ADDR
;;; Use Un-aligned load/store
%define XLDR vmovdqu
@@ -140,32 +226,53 @@
%endif
%endif
+%ifidn PS,8 ;64-bit code
+ default rel
+ [bits 64]
+%endif
-default rel
-
-[bits 64]
section .text
-%define xmask0f ymm11
-%define xmask0fx xmm11
-%define xgft1_lo ymm10
-%define xgft1_hi ymm9
-%define xgft2_lo ymm8
-%define xgft2_hi ymm7
-%define xgft3_lo ymm6
-%define xgft3_hi ymm5
-
-%define x0 ymm0
-%define xtmpa ymm1
-%define xp1 ymm2
-%define xp2 ymm3
-%define xp3 ymm4
+%ifidn PS,8 ;64-bit code
+ %define xmask0f ymm11
+ %define xmask0fx xmm11
+ %define xgft1_lo ymm10
+ %define xgft1_hi ymm9
+ %define xgft2_lo ymm8
+ %define xgft2_hi ymm7
+ %define xgft3_lo ymm6
+ %define xgft3_hi ymm5
+
+ %define x0 ymm0
+ %define xtmpa ymm1
+ %define xp1 ymm2
+ %define xp2 ymm3
+ %define xp3 ymm4
+%else
+ %define xmask0f ymm7
+ %define xmask0fx xmm7
+ %define xgft1_lo ymm6
+ %define xgft1_hi ymm5
+ %define xgft2_lo xgft1_lo
+ %define xgft2_hi xgft1_hi
+ %define xgft3_lo xgft1_lo
+ %define xgft3_hi xgft1_hi
+
+ %define x0 ymm0
+ %define xtmpa ymm1
+ %define xp1 ymm2
+ %define xp2 ymm3
+ %define xp3 ymm4
+
+%endif
align 16
global gf_3vect_dot_prod_avx2:function
func(gf_3vect_dot_prod_avx2)
FUNC_SAVE
+ SLDR len, len_m
sub len, 32
+ SSTR len_m, len
jl .return_fail
xor pos, pos
mov tmp.b, 0x0f
@@ -173,10 +280,13 @@ func(gf_3vect_dot_prod_avx2)
vpbroadcastb xmask0f, xmask0fx ;Construct mask 0x0f0f0f...
sal vec, LOG_PS ;vec *= PS. Make vec_i count by PS
+ SLDR dest1, dest1_m
mov dest2, [dest1+PS]
+ SSTR dest2_m, dest2
mov dest3, [dest1+2*PS]
+ SSTR dest3_m, dest3
mov dest1, [dest1]
-
+ SSTR dest1_m, dest1
.loop32:
vpxor xp1, xp1
@@ -186,25 +296,27 @@ func(gf_3vect_dot_prod_avx2)
xor vec_i, vec_i
.next_vect:
+ SLDR src, src_m
mov ptr, [src+vec_i]
vmovdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, ..., Ax{0f}
; " Ax{00}, Ax{10}, ..., Ax{f0}
vperm2i128 xgft1_hi, xgft1_lo, xgft1_lo, 0x11 ; swapped to hi | hi
vperm2i128 xgft1_lo, xgft1_lo, xgft1_lo, 0x00 ; swapped to lo | lo
-
- vmovdqu xgft2_lo, [tmp+vec*(32/PS)] ;Load array Bx{00}, Bx{01}, ..., Bx{0f}
+ %ifidn PS,8 ; 64-bit code
+ vmovdqu xgft2_lo, [tmp+vec*(32/PS)] ;Load array Bx{00}, Bx{01}, ..., Bx{0f}
; " Bx{00}, Bx{10}, ..., Bx{f0}
vperm2i128 xgft2_hi, xgft2_lo, xgft2_lo, 0x11 ; swapped to hi | hi
vperm2i128 xgft2_lo, xgft2_lo, xgft2_lo, 0x00 ; swapped to lo | lo
- vmovdqu xgft3_lo, [tmp+vec*(64/PS)] ;Load array Cx{00}, Cx{01}, ..., Cx{0f}
+ vmovdqu xgft3_lo, [tmp+vec*(64/PS)] ;Load array Cx{00}, Cx{01}, ..., Cx{0f}
; " Cx{00}, Cx{10}, ..., Cx{f0}
vperm2i128 xgft3_hi, xgft3_lo, xgft3_lo, 0x11 ; swapped to hi | hi
vperm2i128 xgft3_lo, xgft3_lo, xgft3_lo, 0x00 ; swapped to lo | lo
add tmp, 32
add vec_i, PS
+ %endif
XLDR x0, [ptr+pos] ;Get next source vector
vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0
@@ -216,11 +328,27 @@ func(gf_3vect_dot_prod_avx2)
vpxor xgft1_hi, xgft1_lo ;GF add high and low partials
vpxor xp1, xgft1_hi ;xp1 += partial
- vpshufb xgft2_hi, x0 ;Lookup mul table of high nibble
- vpshufb xgft2_lo, xtmpa ;Lookup mul table of low nibble
- vpxor xgft2_hi, xgft2_lo ;GF add high and low partials
- vpxor xp2, xgft2_hi ;xp2 += partial
-
+ %ifidn PS,4 ; 32-bit code
+ vmovdqu xgft2_lo, [tmp+vec*(32/PS)] ;Load array Bx{00}, Bx{01}, ..., Bx{0f}
+ ; " Bx{00}, Bx{10}, ..., Bx{f0}
+ vperm2i128 xgft2_hi, xgft2_lo, xgft2_lo, 0x11 ; swapped to hi | hi
+ vperm2i128 xgft2_lo, xgft2_lo, xgft2_lo, 0x00 ; swapped to lo | lo
+ %endif
+ vpshufb xgft2_hi, x0 ;Lookup mul table of high nibble
+ vpshufb xgft2_lo, xtmpa ;Lookup mul table of low nibble
+ vpxor xgft2_hi, xgft2_lo ;GF add high and low partials
+ vpxor xp2, xgft2_hi ;xp2 += partial
+
+ %ifidn PS,4 ; 32-bit code
+ sal vec, 1
+ vmovdqu xgft3_lo, [tmp+vec*(32/PS)] ;Load array Cx{00}, Cx{01}, ..., Cx{0f}
+ ; " Cx{00}, Cx{10}, ..., Cx{f0}
+ vperm2i128 xgft3_hi, xgft3_lo, xgft3_lo, 0x11 ; swapped to hi | hi
+ vperm2i128 xgft3_lo, xgft3_lo, xgft3_lo, 0x00 ; swapped to lo | lo
+ sar vec, 1
+ add tmp, 32
+ add vec_i, PS
+ %endif
vpshufb xgft3_hi, x0 ;Lookup mul table of high nibble
vpshufb xgft3_lo, xtmpa ;Lookup mul table of low nibble
vpxor xgft3_hi, xgft3_lo ;GF add high and low partials
@@ -229,10 +357,14 @@ func(gf_3vect_dot_prod_avx2)
cmp vec_i, vec
jl .next_vect
+ SLDR dest1, dest1_m
+ SLDR dest2, dest2_m
XSTR [dest1+pos], xp1
XSTR [dest2+pos], xp2
+ SLDR dest3, dest3_m
XSTR [dest3+pos], xp3
+ SLDR len, len_m
add pos, 32 ;Loop on 32 bytes at a time
cmp pos, len
jle .loop32
@@ -268,6 +400,4 @@ global %1_slver
db 0x%3, 0x%2
%endmacro
;;; func core, ver, snum
-slversion gf_3vect_dot_prod_avx2, 04, 03, 0197
-; inform linker that this doesn't require executable stack
-section .note.GNU-stack noalloc noexec nowrite progbits
+slversion gf_3vect_dot_prod_avx2, 04, 04, 0197
diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_3vect_dot_prod_sse.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_3vect_dot_prod_sse.asm.s
index 925fd3414d6..bfaf2178223 100644
--- a/src/erasure-code/isa/isa-l/erasure_code/gf_3vect_dot_prod_sse.asm.s
+++ b/src/erasure-code/isa/isa-l/erasure_code/gf_3vect_dot_prod_sse.asm.s
@@ -1,5 +1,5 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-; Copyright(c) 2011-2014 Intel Corporation All rights reserved.
+; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
@@ -30,8 +30,6 @@
;;;
;;; gf_3vect_dot_prod_sse(len, vec, *g_tbls, **buffs, **dests);
;;;
-;;; Author: Gregory Tucker
-
%ifidn __OUTPUT_FORMAT__, elf64
%define arg0 rdi
@@ -46,6 +44,9 @@
%define tmp3 r13 ; must be saved and restored
%define tmp4 r12 ; must be saved and restored
%define return rax
+ %macro SLDR 2
+ %endmacro
+ %define SSTR SLDR
%define PS 8
%define LOG_PS 3
@@ -73,6 +74,9 @@
%define tmp3 r13 ; must be saved and restored
%define tmp4 r14 ; must be saved and restored
%define return rax
+ %macro SLDR 2
+ %endmacro
+ %define SSTR SLDR
%define PS 8
%define LOG_PS 3
%define stack_size 6*16 + 5*8 ; must be an odd multiple of 8
@@ -110,17 +114,97 @@
%endmacro
%endif
+%ifidn __OUTPUT_FORMAT__, elf32
+
+;;;================== High Address;
+;;; arg4
+;;; arg3
+;;; arg2
+;;; arg1
+;;; arg0
+;;; return
+;;;<================= esp of caller
+;;; ebp
+;;;<================= ebp = esp
+;;; var0
+;;; var1
+;;; esi
+;;; edi
+;;; ebx
+;;;<================= esp of callee
+;;;
+;;;================== Low Address;
+
+ %define PS 4
+ %define LOG_PS 2
+ %define func(x) x:
+ %define arg(x) [ebp + PS*2 + PS*x]
+ %define var(x) [ebp - PS - PS*x]
+
+ %define trans ecx
+ %define trans2 esi
+ %define arg0 trans ;trans and trans2 are for the variables in stack
+ %define arg0_m arg(0)
+ %define arg1 ebx
+ %define arg2 arg2_m
+ %define arg2_m arg(2)
+ %define arg3 trans
+ %define arg3_m arg(3)
+ %define arg4 trans
+ %define arg4_m arg(4)
+ %define arg5 trans2
+ %define tmp edx
+ %define tmp2 edi
+ %define tmp3 trans2
+ %define tmp3_m var(0)
+ %define tmp4 trans2
+ %define tmp4_m var(1)
+ %define return eax
+ %macro SLDR 2 ;; stack load/restore
+ mov %1, %2
+ %endmacro
+ %define SSTR SLDR
+
+ %macro FUNC_SAVE 0
+ push ebp
+ mov ebp, esp
+ sub esp, PS*2 ;2 local variables
+ push esi
+ push edi
+ push ebx
+ mov arg1, arg(1)
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ pop ebx
+ pop edi
+ pop esi
+ add esp, PS*2 ;2 local variables
+ pop ebp
+ %endmacro
+
+%endif ; output formats
+
%define len arg0
%define vec arg1
%define mul_array arg2
%define src arg3
-%define dest1 arg4
+%define dest1 arg4
%define ptr arg5
+
%define vec_i tmp2
%define dest2 tmp3
%define dest3 tmp4
%define pos return
+ %ifidn PS,4 ;32-bit code
+ %define len_m arg0_m
+ %define src_m arg3_m
+ %define dest1_m arg4_m
+ %define dest2_m tmp3_m
+ %define dest3_m tmp4_m
+ %endif
+
%ifndef EC_ALIGNED_ADDR
;;; Use Un-aligned load/store
%define XLDR movdqu
@@ -136,39 +220,62 @@
%endif
%endif
+%ifidn PS,8 ; 64-bit code
+ default rel
+ [bits 64]
+%endif
-default rel
-[bits 64]
section .text
-%define xmask0f xmm11
-%define xgft1_lo xmm10
-%define xgft1_hi xmm9
-%define xgft2_lo xmm8
-%define xgft2_hi xmm7
-%define xgft3_lo xmm6
-%define xgft3_hi xmm5
-
-%define x0 xmm0
-%define xtmpa xmm1
-%define xp1 xmm2
-%define xp2 xmm3
-%define xp3 xmm4
+%ifidn PS,8 ;64-bit code
+ %define xmask0f xmm11
+ %define xgft1_lo xmm2
+ %define xgft1_hi xmm3
+ %define xgft2_lo xmm4
+ %define xgft2_hi xmm7
+ %define xgft3_lo xmm6
+ %define xgft3_hi xmm5
+
+ %define x0 xmm0
+ %define xtmpa xmm1
+ %define xp1 xmm10
+ %define xp2 xmm9
+ %define xp3 xmm8
+%else
+ %define xmask0f xmm7
+ %define xgft1_lo xmm6
+ %define xgft1_hi xmm5
+ %define xgft2_lo xgft1_lo
+ %define xgft2_hi xgft1_hi
+ %define xgft3_lo xgft1_lo
+ %define xgft3_hi xgft1_hi
+
+ %define x0 xmm0
+ %define xtmpa xmm1
+ %define xp1 xmm2
+ %define xp2 xmm3
+ %define xp3 xmm4
+%endif
align 16
global gf_3vect_dot_prod_sse:function
func(gf_3vect_dot_prod_sse)
FUNC_SAVE
+ SLDR len, len_m
sub len, 16
+ SSTR len_m, len
jl .return_fail
xor pos, pos
movdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte
sal vec, LOG_PS ;vec *= PS. Make vec_i count by PS
+ SLDR dest1, dest1_m
mov dest2, [dest1+PS]
+ SSTR dest2_m, dest2
mov dest3, [dest1+2*PS]
+ SSTR dest3_m, dest3
mov dest1, [dest1]
-
+ SSTR dest1_m, dest1
.loop16:
pxor xp1, xp1
@@ -178,17 +285,19 @@ func(gf_3vect_dot_prod_sse)
xor vec_i, vec_i
.next_vect:
+ SLDR src, src_m
mov ptr, [src+vec_i]
movdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, ..., Ax{0f}
movdqu xgft1_hi, [tmp+16] ; " Ax{00}, Ax{10}, ..., Ax{f0}
+ %ifidn PS,8 ;64-bit code
movdqu xgft2_lo, [tmp+vec*(32/PS)] ;Load array Bx{00}, Bx{01}, ..., Bx{0f}
movdqu xgft2_hi, [tmp+vec*(32/PS)+16] ; " Bx{00}, Bx{10}, ..., Bx{f0}
movdqu xgft3_lo, [tmp+vec*(64/PS)] ;Load array Cx{00}, Cx{01}, ..., Cx{0f}
movdqu xgft3_hi, [tmp+vec*(64/PS)+16] ; " Cx{00}, Cx{10}, ..., Cx{f0}
-
add tmp, 32
add vec_i, PS
+ %endif
XLDR x0, [ptr+pos] ;Get next source vector
movdqa xtmpa, x0 ;Keep unshifted copy of src
@@ -201,11 +310,23 @@ func(gf_3vect_dot_prod_sse)
pxor xgft1_hi, xgft1_lo ;GF add high and low partials
pxor xp1, xgft1_hi ;xp1 += partial
+ %ifidn PS,4 ;32-bit code
+ movdqu xgft2_lo, [tmp+vec*(32/PS)] ;Load array Bx{00}, Bx{01}, ..., Bx{0f}
+ movdqu xgft2_hi, [tmp+vec*(32/PS)+16] ; " Bx{00}, Bx{10}, ..., Bx{f0}
+ %endif
pshufb xgft2_hi, x0 ;Lookup mul table of high nibble
pshufb xgft2_lo, xtmpa ;Lookup mul table of low nibble
pxor xgft2_hi, xgft2_lo ;GF add high and low partials
pxor xp2, xgft2_hi ;xp2 += partial
+ %ifidn PS,4 ;32-bit code
+ sal vec, 1
+ movdqu xgft3_lo, [tmp+vec*(32/PS)] ;Load array Cx{00}, Cx{01}, ..., Cx{0f}
+ movdqu xgft3_hi, [tmp+vec*(32/PS)+16] ; " Cx{00}, Cx{10}, ..., Cx{f0}
+ sar vec, 1
+ add tmp, 32
+ add vec_i, PS
+ %endif
pshufb xgft3_hi, x0 ;Lookup mul table of high nibble
pshufb xgft3_lo, xtmpa ;Lookup mul table of low nibble
pxor xgft3_hi, xgft3_lo ;GF add high and low partials
@@ -214,10 +335,14 @@ func(gf_3vect_dot_prod_sse)
cmp vec_i, vec
jl .next_vect
+ SLDR dest1, dest1_m
+ SLDR dest2, dest2_m
XSTR [dest1+pos], xp1
XSTR [dest2+pos], xp2
+ SLDR dest3, dest3_m
XSTR [dest3+pos], xp3
+ SLDR len, len_m
add pos, 16 ;Loop on 16 bytes at a time
cmp pos, len
jle .loop16
@@ -256,6 +381,4 @@ global %1_slver
db 0x%3, 0x%2
%endmacro
;;; func core, ver, snum
-slversion gf_3vect_dot_prod_sse, 00, 03, 0063
-; inform linker that this doesn't require executable stack
-section .note.GNU-stack noalloc noexec nowrite progbits
+slversion gf_3vect_dot_prod_sse, 00, 05, 0063
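
The pshufb-based multiply that all of these kernels share works because GF(2^8) multiplication is linear over XOR: c*b = c*(b_hi << 4) ^ c*b_lo, so two 16-entry lookups (one per nibble) plus an XOR reproduce a full byte multiply. A hedged sketch of how one such 32-byte table could be built -- the real tables arrive pre-computed through g_tbls, and gf_mul_byte(), the 0x1d term, and the _ref function name are assumptions, not part of this patch:

#include <stddef.h>

/* Assumed bitwise GF(2^8) multiply (0x1d reduction term), as in the
 * earlier sketch; stands in for whatever generates g_tbls upstream. */
static unsigned char gf_mul_byte(unsigned char a, unsigned char b)
{
        unsigned char p = 0;
        for (int i = 0; i < 8; i++) {
                if (b & 1)
                        p ^= a;
                unsigned char carry = a & 0x80;
                a <<= 1;
                if (carry)
                        a ^= 0x1d;
                b >>= 1;
        }
        return p;
}

/* Illustrative helper (not the ISA-L API): 32-byte table for coefficient c,
 * holding products of c with the 16 possible low nibbles, then with the 16
 * possible high nibbles.  Because multiplication distributes over XOR,
 * tbl[b & 0xf] ^ tbl[16 + (b >> 4)] == c*b, which is exactly what the
 * pshufb/pshufb/pxor sequence computes 16 (or 32) bytes at a time. */
static void gf_vect_mul_init_ref(unsigned char c, unsigned char tbl[32])
{
        for (int n = 0; n < 16; n++) {
                tbl[n]      = gf_mul_byte(c, (unsigned char)n);
                tbl[16 + n] = gf_mul_byte(c, (unsigned char)(n << 4));
        }
}
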
diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_3vect_mad_avx.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_3vect_mad_avx.asm.s
new file mode 100644
index 00000000000..5adbcccc6ad
--- /dev/null
+++ b/src/erasure-code/isa/isa-l/erasure_code/gf_3vect_mad_avx.asm.s
@@ -0,0 +1,294 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;
+;;; gf_3vect_mad_avx(len, vec, vec_i, mul_array, src, dest);
+;;;
+
+%define PS 8
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg0 rcx
+ %define arg0.w ecx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+ %define arg4 r12
+ %define arg5 r15
+ %define tmp r11
+ %define return rax
+ %define return.w eax
+ %define stack_size 16*10 + 3*8
+ %define arg(x) [rsp + stack_size + PS + PS*x]
+ %define func(x) proc_frame x
+
+%macro FUNC_SAVE 0
+ sub rsp, stack_size
+ vmovdqa [rsp+16*0],xmm6
+ vmovdqa [rsp+16*1],xmm7
+ vmovdqa [rsp+16*2],xmm8
+ vmovdqa [rsp+16*3],xmm9
+ vmovdqa [rsp+16*4],xmm10
+ vmovdqa [rsp+16*5],xmm11
+ vmovdqa [rsp+16*6],xmm12
+ vmovdqa [rsp+16*7],xmm13
+ vmovdqa [rsp+16*8],xmm14
+ vmovdqa [rsp+16*9],xmm15
+ save_reg r12, 10*16 + 0*8
+ save_reg r15, 10*16 + 1*8
+ end_prolog
+ mov arg4, arg(4)
+ mov arg5, arg(5)
+%endmacro
+
+%macro FUNC_RESTORE 0
+ vmovdqa xmm6, [rsp+16*0]
+ vmovdqa xmm7, [rsp+16*1]
+ vmovdqa xmm8, [rsp+16*2]
+ vmovdqa xmm9, [rsp+16*3]
+ vmovdqa xmm10, [rsp+16*4]
+ vmovdqa xmm11, [rsp+16*5]
+ vmovdqa xmm12, [rsp+16*6]
+ vmovdqa xmm13, [rsp+16*7]
+ vmovdqa xmm14, [rsp+16*8]
+ vmovdqa xmm15, [rsp+16*9]
+ mov r12, [rsp + 10*16 + 0*8]
+ mov r15, [rsp + 10*16 + 1*8]
+ add rsp, stack_size
+%endmacro
+
+%elifidn __OUTPUT_FORMAT__, elf64
+ %define arg0 rdi
+ %define arg0.w edi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+ %define arg4 r8
+ %define arg5 r9
+ %define tmp r11
+ %define return rax
+ %define return.w eax
+
+ %define func(x) x:
+ %define FUNC_SAVE
+ %define FUNC_RESTORE
+%endif
+
+;;; gf_3vect_mad_avx(len, vec, vec_i, mul_array, src, dest)
+%define len arg0
+%define len.w arg0.w
+%define vec arg1
+%define vec_i arg2
+%define mul_array arg3
+%define src arg4
+%define dest1 arg5
+%define pos return
+%define pos.w return.w
+
+%define dest2 mul_array
+%define dest3 vec_i
+
+%ifndef EC_ALIGNED_ADDR
+;;; Use Un-aligned load/store
+ %define XLDR vmovdqu
+ %define XSTR vmovdqu
+%else
+;;; Use Non-temporal load/store
+ %ifdef NO_NT_LDST
+ %define XLDR vmovdqa
+ %define XSTR vmovdqa
+ %else
+ %define XLDR vmovntdqa
+ %define XSTR vmovntdq
+ %endif
+%endif
+
+
+default rel
+
+[bits 64]
+section .text
+
+%define xmask0f xmm15
+%define xgft1_lo xmm14
+%define xgft1_hi xmm13
+%define xgft2_lo xmm12
+%define xgft2_hi xmm11
+%define xgft3_lo xmm10
+%define xgft3_hi xmm9
+
+%define x0 xmm0
+%define xtmpa xmm1
+%define xtmph1 xmm2
+%define xtmpl1 xmm3
+%define xtmph2 xmm4
+%define xtmpl2 xmm5
+%define xtmph3 xmm6
+%define xtmpl3 xmm7
+%define xd1 xmm8
+%define xd2 xtmpl1
+%define xd3 xtmph1
+
+align 16
+global gf_3vect_mad_avx:function
+func(gf_3vect_mad_avx)
+ FUNC_SAVE
+ sub len, 16
+ jl .return_fail
+ xor pos, pos
+ vmovdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte
+
+ sal vec_i, 5 ;Multiply by 32
+ sal vec, 5
+ lea tmp, [mul_array + vec_i]
+ vmovdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, Ax{02}, ...
+ vmovdqu xgft1_hi, [tmp+16] ; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
+ vmovdqu xgft2_lo, [tmp+vec] ;Load array Bx{00}, Bx{01}, Bx{02}, ...
+ vmovdqu xgft2_hi, [tmp+vec+16] ; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
+ vmovdqu xgft3_lo, [tmp+2*vec] ;Load array Cx{00}, Cx{01}, Cx{02}, ...
+ vmovdqu xgft3_hi, [tmp+2*vec+16]; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
+ mov dest2, [dest1+PS] ; reuse mul_array
+ mov dest3, [dest1+2*PS] ; reuse vec_i
+ mov dest1, [dest1]
+
+.loop16:
+ XLDR x0, [src+pos] ;Get next source vector
+ XLDR xd1, [dest1+pos] ;Get next dest vector
+
+ vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0
+ vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
+ vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
+
+ ; dest1
+ vpshufb xtmph1, xgft1_hi, x0 ;Lookup mul table of high nibble
+ vpshufb xtmpl1, xgft1_lo, xtmpa ;Lookup mul table of low nibble
+ vpxor xtmph1, xtmph1, xtmpl1 ;GF add high and low partials
+ vpxor xd1, xd1, xtmph1 ;xd1 += partial
+
+ XLDR xd2, [dest2+pos] ;reuse xtmpl1. Get next dest vector
+ XLDR xd3, [dest3+pos] ;reuse xtmph1. Get next dest vector
+
+ ; dest2
+ vpshufb xtmph2, xgft2_hi, x0 ;Lookup mul table of high nibble
+ vpshufb xtmpl2, xgft2_lo, xtmpa ;Lookup mul table of low nibble
+ vpxor xtmph2, xtmph2, xtmpl2 ;GF add high and low partials
+ vpxor xd2, xd2, xtmph2 ;xd2 += partial
+
+ ; dest3
+ vpshufb xtmph3, xgft3_hi, x0 ;Lookup mul table of high nibble
+ vpshufb xtmpl3, xgft3_lo, xtmpa ;Lookup mul table of low nibble
+ vpxor xtmph3, xtmph3, xtmpl3 ;GF add high and low partials
+ vpxor xd3, xd3, xtmph3 ;xd3 += partial
+
+ XSTR [dest1+pos], xd1
+ XSTR [dest2+pos], xd2
+ XSTR [dest3+pos], xd3
+
+ add pos, 16 ;Loop on 16 bytes at a time
+ cmp pos, len
+ jle .loop16
+
+ lea tmp, [len + 16]
+ cmp pos, tmp
+ je .return_pass
+
+.lessthan16:
+ ;; Tail len
+ ;; Do one more overlap pass
+ mov tmp, len ;Overlapped offset length-16
+ XLDR x0, [src+tmp] ;Get next source vector
+ XLDR xd1, [dest1+tmp] ;Get next dest vector
+ XLDR xd2, [dest2+tmp] ;reuse xtmpl1. Get next dest vector
+ XLDR xd3, [dest3+tmp] ;reuse xtmph1. Get next dest vector
+
+ sub len, pos
+
+ movdqa xtmph3, [constip16] ;Load const of i + 16
+ vpinsrb xtmpl3, xtmpl3, len.w, 15
+ vpshufb xtmpl3, xtmpl3, xmask0f ;Broadcast len to all bytes
+ vpcmpgtb xtmpl3, xtmpl3, xtmph3
+
+ vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0
+ vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
+ vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
+
+ ; dest1
+ vpshufb xgft1_hi, xgft1_hi, x0 ;Lookup mul table of high nibble
+ vpshufb xgft1_lo, xgft1_lo, xtmpa ;Lookup mul table of low nibble
+ vpxor xgft1_hi, xgft1_hi, xgft1_lo ;GF add high and low partials
+ vpand xgft1_hi, xgft1_hi, xtmpl3
+ vpxor xd1, xd1, xgft1_hi
+
+ ; dest2
+ vpshufb xgft2_hi, xgft2_hi, x0 ;Lookup mul table of high nibble
+ vpshufb xgft2_lo, xgft2_lo, xtmpa ;Lookup mul table of low nibble
+ vpxor xgft2_hi, xgft2_hi, xgft2_lo ;GF add high and low partials
+ vpand xgft2_hi, xgft2_hi, xtmpl3
+ vpxor xd2, xd2, xgft2_hi
+
+ ; dest3
+ vpshufb xgft3_hi, xgft3_hi, x0 ;Lookup mul table of high nibble
+ vpshufb xgft3_lo, xgft3_lo, xtmpa ;Lookup mul table of low nibble
+ vpxor xgft3_hi, xgft3_hi, xgft3_lo ;GF add high and low partials
+ vpand xgft3_hi, xgft3_hi, xtmpl3
+ vpxor xd3, xd3, xgft3_hi
+
+ XSTR [dest1+tmp], xd1
+ XSTR [dest2+tmp], xd2
+ XSTR [dest3+tmp], xd3
+
+.return_pass:
+ mov return, 0
+ FUNC_RESTORE
+ ret
+
+.return_fail:
+ mov return, 1
+ FUNC_RESTORE
+ ret
+
+endproc_frame
+
+section .data
+
+align 16
+mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
+constip16:
+ ddq 0xf0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
+
+%macro slversion 4
+global %1_slver_%2%3%4
+global %1_slver
+%1_slver:
+%1_slver_%2%3%4:
+ dw 0x%4
+ db 0x%3, 0x%2
+%endmacro
+;;; func core, ver, snum
+slversion gf_3vect_mad_avx, 02, 00, 0207
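
The 3-vect mad kernels take a different approach to the partial tail than the 2-vect ones: instead of restoring a saved destination tail, they run one overlapping full-width pass and AND the GF partials with a byte mask, so only the not-yet-processed lanes are XORed into dest. A scalar model of how the SSE/AVX variants derive that mask from constip16 and the signed byte compare (the AVX2 variant does the same over 32 lanes with constip32); names and the parameterization here are illustrative:

#include <stddef.h>

/* remaining = number of tail bytes (1..15) left after the last full
 * 16-byte block.  The kernel broadcasts (remaining - 16) to every lane and
 * compares it, signed, against constip16, whose little-endian bytes are
 * -1, -2, ..., -16.  Lane i passes exactly when i >= 16 - remaining, i.e.
 * the mask keeps only the trailing `remaining` bytes of the overlapped block. */
static void build_tail_mask(unsigned char mask[16], int remaining)
{
        signed char thresh = (signed char)(remaining - 16);
        for (int i = 0; i < 16; i++) {
                signed char lane = (signed char)(-(i + 1));   /* byte i of constip16 */
                mask[i] = (thresh > lane) ? 0xff : 0x00;
        }
}

/* The overlapping pass then does dest[i] ^= partial[i] & mask[i], so bytes
 * already covered by the previous full block receive a zero XOR. */
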
diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_3vect_mad_avx2.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_3vect_mad_avx2.asm.s
new file mode 100644
index 00000000000..077285c3768
--- /dev/null
+++ b/src/erasure-code/isa/isa-l/erasure_code/gf_3vect_mad_avx2.asm.s
@@ -0,0 +1,323 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;
+;;; gf_3vect_mad_avx2(len, vec, vec_i, mul_array, src, dest);
+;;;
+
+%define PS 8
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg0 rcx
+ %define arg0.w ecx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+ %define arg4 r12 ; must be saved, loaded and restored
+ %define arg5 r15 ; must be saved and restored
+
+ %define tmp r11
+ %define tmp.w r11d
+ %define tmp.b r11b
+ %define return rax
+ %define return.w eax
+ %define stack_size 16*10 + 3*8
+ %define arg(x) [rsp + stack_size + PS + PS*x]
+ %define func(x) proc_frame x
+
+ %macro FUNC_SAVE 0
+ sub rsp, stack_size
+ vmovdqa [rsp+16*0],xmm6
+ vmovdqa [rsp+16*1],xmm7
+ vmovdqa [rsp+16*2],xmm8
+ vmovdqa [rsp+16*3],xmm9
+ vmovdqa [rsp+16*4],xmm10
+ vmovdqa [rsp+16*5],xmm11
+ vmovdqa [rsp+16*6],xmm12
+ vmovdqa [rsp+16*7],xmm13
+ vmovdqa [rsp+16*8],xmm14
+ vmovdqa [rsp+16*9],xmm15
+ save_reg r12, 10*16 + 0*8
+ save_reg r15, 10*16 + 1*8
+ end_prolog
+ mov arg4, arg(4)
+ mov arg5, arg(5)
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ vmovdqa xmm6, [rsp+16*0]
+ vmovdqa xmm7, [rsp+16*1]
+ vmovdqa xmm8, [rsp+16*2]
+ vmovdqa xmm9, [rsp+16*3]
+ vmovdqa xmm10, [rsp+16*4]
+ vmovdqa xmm11, [rsp+16*5]
+ vmovdqa xmm12, [rsp+16*6]
+ vmovdqa xmm13, [rsp+16*7]
+ vmovdqa xmm14, [rsp+16*8]
+ vmovdqa xmm15, [rsp+16*9]
+ mov r12, [rsp + 10*16 + 0*8]
+ mov r15, [rsp + 10*16 + 1*8]
+ add rsp, stack_size
+ %endmacro
+
+%elifidn __OUTPUT_FORMAT__, elf64
+ %define arg0 rdi
+ %define arg0.w edi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+ %define arg4 r8
+ %define arg5 r9
+
+ %define tmp r11
+ %define tmp.w r11d
+ %define tmp.b r11b
+ %define return rax
+ %define return.w eax
+
+ %define func(x) x:
+ %define FUNC_SAVE
+ %define FUNC_RESTORE
+%endif
+
+;;; gf_3vect_mad_avx2(len, vec, vec_i, mul_array, src, dest)
+%define len arg0
+%define len.w arg0.w
+%define vec arg1
+%define vec_i arg2
+%define mul_array arg3
+%define src arg4
+%define dest1 arg5
+%define pos return
+%define pos.w return.w
+
+%define dest2 mul_array
+%define dest3 vec_i
+
+%ifndef EC_ALIGNED_ADDR
+;;; Use Un-aligned load/store
+ %define XLDR vmovdqu
+ %define XSTR vmovdqu
+%else
+;;; Use Non-temporal load/store
+ %ifdef NO_NT_LDST
+ %define XLDR vmovdqa
+ %define XSTR vmovdqa
+ %else
+ %define XLDR vmovntdqa
+ %define XSTR vmovntdq
+ %endif
+%endif
+
+
+default rel
+
+[bits 64]
+section .text
+
+%define xmask0f ymm15
+%define xmask0fx xmm15
+%define xgft1_lo ymm14
+%define xgft1_hi ymm13
+%define xgft2_lo ymm12
+%define xgft3_lo ymm11
+
+%define x0 ymm0
+%define xtmpa ymm1
+%define xtmph1 ymm2
+%define xtmpl1 ymm3
+%define xtmph2 ymm4
+%define xtmpl2 ymm5
+%define xtmpl2x xmm5
+%define xtmph3 ymm6
+%define xtmpl3 ymm7
+%define xtmpl3x xmm7
+%define xd1 ymm8
+%define xd2 ymm9
+%define xd3 ymm10
+
+align 16
+global gf_3vect_mad_avx2:function
+func(gf_3vect_mad_avx2)
+ FUNC_SAVE
+ sub len, 32
+ jl .return_fail
+ xor pos, pos
+ mov tmp.b, 0x0f
+ vpinsrb xmask0fx, xmask0fx, tmp.w, 0
+ vpbroadcastb xmask0f, xmask0fx ;Construct mask 0x0f0f0f...
+
+ sal vec_i, 5 ;Multiply by 32
+ sal vec, 5
+ lea tmp, [mul_array + vec_i]
+
+ vmovdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, ..., Ax{0f}
+ ; " Ax{00}, Ax{10}, ..., Ax{f0}
+ vperm2i128 xgft1_hi, xgft1_lo, xgft1_lo, 0x11 ; swapped to hi | hi
+ vperm2i128 xgft1_lo, xgft1_lo, xgft1_lo, 0x00 ; swapped to lo | lo
+
+ vmovdqu xgft2_lo, [tmp+vec] ;Load array Bx{00}, Bx{01}, Bx{02}, ...
+ ; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
+ vmovdqu xgft3_lo, [tmp+2*vec] ;Load array Cx{00}, Cx{01}, Cx{02}, ...
+ ; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
+ mov dest2, [dest1+PS] ; reuse mul_array
+ mov dest3, [dest1+2*PS] ; reuse vec_i
+ mov dest1, [dest1]
+
+.loop32:
+ XLDR x0, [src+pos] ;Get next source vector
+ XLDR xd1, [dest1+pos] ;Get next dest vector
+ XLDR xd2, [dest2+pos] ;Get next dest vector
+ XLDR xd3, [dest3+pos] ;Get next dest vector
+ vperm2i128 xtmph2, xgft2_lo, xgft2_lo, 0x11 ; swapped to hi | hi
+ vperm2i128 xtmpl2, xgft2_lo, xgft2_lo, 0x00 ; swapped to lo | lo
+
+ vperm2i128 xtmph3, xgft3_lo, xgft3_lo, 0x11 ; swapped to hi | hi
+ vperm2i128 xtmpl3, xgft3_lo, xgft3_lo, 0x00 ; swapped to lo | lo
+
+ vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0
+ vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
+ vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
+
+ ; dest1
+ vpshufb xtmph1, xgft1_hi, x0 ;Lookup mul table of high nibble
+ vpshufb xtmpl1, xgft1_lo, xtmpa ;Lookup mul table of low nibble
+ vpxor xtmph1, xtmph1, xtmpl1 ;GF add high and low partials
+ vpxor xd1, xd1, xtmph1 ;xd1 += partial
+
+ ; dest2
+ vpshufb xtmph2, x0 ;Lookup mul table of high nibble
+ vpshufb xtmpl2, xtmpa ;Lookup mul table of low nibble
+ vpxor xtmph2, xtmpl2 ;GF add high and low partials
+ vpxor xd2, xtmph2 ;xd2 += partial
+
+ ; dest3
+ vpshufb xtmph3, x0 ;Lookup mul table of high nibble
+ vpshufb xtmpl3, xtmpa ;Lookup mul table of low nibble
+ vpxor xtmph3, xtmpl3 ;GF add high and low partials
+ vpxor xd3, xtmph3 ;xd3 += partial
+
+ XSTR [dest1+pos], xd1
+ XSTR [dest2+pos], xd2
+ XSTR [dest3+pos], xd3
+
+ add pos, 32 ;Loop on 32 bytes at a time
+ cmp pos, len
+ jle .loop32
+
+ lea tmp, [len + 32]
+ cmp pos, tmp
+ je .return_pass
+
+.lessthan32:
+ ;; Tail len
+ ;; Do one more overlap pass
+ mov tmp.b, 0x1f
+ vpinsrb xtmpl2x, xtmpl2x, tmp.w, 0
+ vpbroadcastb xtmpl2, xtmpl2x ;Construct mask 0x1f1f1f...
+
+ mov tmp, len ;Overlapped offset length-32
+
+ XLDR x0, [src+tmp] ;Get next source vector
+ XLDR xd1, [dest1+tmp] ;Get next dest vector
+ XLDR xd2, [dest2+tmp] ;Get next dest vector
+ XLDR xd3, [dest3+tmp] ;Get next dest vector
+
+ sub len, pos
+
+ vmovdqa xtmph3, [constip32] ;Load const of i + 32
+ vpinsrb xtmpl3x, xtmpl3x, len.w, 15
+ vinserti128 xtmpl3, xtmpl3, xtmpl3x, 1 ;swapped to xtmpl3x | xtmpl3x
+ vpshufb xtmpl3, xtmpl3, xtmpl2 ;Broadcast len to all bytes. xtmpl2=0x1f1f1f...
+ vpcmpgtb xtmpl3, xtmpl3, xtmph3
+
+ vperm2i128 xtmph2, xgft2_lo, xgft2_lo, 0x11 ; swapped to hi | hi
+ vperm2i128 xgft2_lo, xgft2_lo, xgft2_lo, 0x00 ; swapped to lo | lo
+
+ vperm2i128 xtmph3, xgft3_lo, xgft3_lo, 0x11 ; swapped to hi | hi
+ vperm2i128 xgft3_lo, xgft3_lo, xgft3_lo, 0x00 ; swapped to lo | lo
+
+ vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0
+ vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
+ vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
+
+ ; dest1
+ vpshufb xtmph1, xgft1_hi, x0 ;Lookup mul table of high nibble
+ vpshufb xtmpl1, xgft1_lo, xtmpa ;Lookup mul table of low nibble
+ vpxor xtmph1, xtmph1, xtmpl1 ;GF add high and low partials
+ vpand xtmph1, xtmph1, xtmpl3
+ vpxor xd1, xd1, xtmph1 ;xd1 += partial
+
+ ; dest2
+ vpshufb xtmph2, xtmph2, x0 ;Lookup mul table of high nibble
+ vpshufb xgft2_lo, xgft2_lo, xtmpa ;Lookup mul table of low nibble
+ vpxor xtmph2, xtmph2, xgft2_lo ;GF add high and low partials
+ vpand xtmph2, xtmph2, xtmpl3
+ vpxor xd2, xd2, xtmph2 ;xd2 += partial
+
+ ; dest3
+ vpshufb xtmph3, xtmph3, x0 ;Lookup mul table of high nibble
+ vpshufb xgft3_lo, xgft3_lo, xtmpa ;Lookup mul table of low nibble
+ vpxor xtmph3, xtmph3, xgft3_lo ;GF add high and low partials
+ vpand xtmph3, xtmph3, xtmpl3
+ vpxor xd3, xd3, xtmph3 ;xd3 += partial
+
+ XSTR [dest1+tmp], xd1
+ XSTR [dest2+tmp], xd2
+ XSTR [dest3+tmp], xd3
+
+.return_pass:
+ mov return, 0
+ FUNC_RESTORE
+ ret
+
+.return_fail:
+ mov return, 1
+ FUNC_RESTORE
+ ret
+
+endproc_frame
+
+section .data
+
+align 32
+constip32:
+ ddq 0xf0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
+ ddq 0xe0e1e2e3e4e5e6e7e8e9eaebecedeeef
+
+%macro slversion 4
+global %1_slver_%2%3%4
+global %1_slver
+%1_slver:
+%1_slver_%2%3%4:
+ dw 0x%4
+ db 0x%3, 0x%2
+%endmacro
+;;; func core, ver, snum
+slversion gf_3vect_mad_avx2, 04, 00, 0208
diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_3vect_mad_sse.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_3vect_mad_sse.asm.s
new file mode 100644
index 00000000000..55ead69f2a9
--- /dev/null
+++ b/src/erasure-code/isa/isa-l/erasure_code/gf_3vect_mad_sse.asm.s
@@ -0,0 +1,304 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;
+;;; gf_3vect_mad_sse(len, vec, vec_i, mul_array, src, dest);
+;;;
+
+%define PS 8
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg0 rcx
+ %define arg0.w ecx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+ %define arg4 r12
+ %define arg5 r15
+ %define tmp r11
+ %define return rax
+ %define return.w eax
+ %define stack_size 16*10 + 3*8
+ %define arg(x) [rsp + stack_size + PS + PS*x]
+ %define func(x) proc_frame x
+
+%macro FUNC_SAVE 0
+ sub rsp, stack_size
+ movdqa [rsp+16*0],xmm6
+ movdqa [rsp+16*1],xmm7
+ movdqa [rsp+16*2],xmm8
+ movdqa [rsp+16*3],xmm9
+ movdqa [rsp+16*4],xmm10
+ movdqa [rsp+16*5],xmm11
+ movdqa [rsp+16*6],xmm12
+ movdqa [rsp+16*7],xmm13
+ movdqa [rsp+16*8],xmm14
+ movdqa [rsp+16*9],xmm15
+ save_reg r12, 10*16 + 0*8
+ save_reg r15, 10*16 + 1*8
+ end_prolog
+ mov arg4, arg(4)
+ mov arg5, arg(5)
+%endmacro
+
+%macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp+16*0]
+ movdqa xmm7, [rsp+16*1]
+ movdqa xmm8, [rsp+16*2]
+ movdqa xmm9, [rsp+16*3]
+ movdqa xmm10, [rsp+16*4]
+ movdqa xmm11, [rsp+16*5]
+ movdqa xmm12, [rsp+16*6]
+ movdqa xmm13, [rsp+16*7]
+ movdqa xmm14, [rsp+16*8]
+ movdqa xmm15, [rsp+16*9]
+ mov r12, [rsp + 10*16 + 0*8]
+ mov r15, [rsp + 10*16 + 1*8]
+ add rsp, stack_size
+%endmacro
+
+%elifidn __OUTPUT_FORMAT__, elf64
+ %define arg0 rdi
+ %define arg0.w edi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+ %define arg4 r8
+ %define arg5 r9
+ %define tmp r11
+ %define return rax
+ %define return.w eax
+
+ %define func(x) x:
+ %define FUNC_SAVE
+ %define FUNC_RESTORE
+%endif
+
+;;; gf_3vect_mad_sse(len, vec, vec_i, mul_array, src, dest)
+%define len arg0
+%define len.w arg0.w
+%define vec arg1
+%define vec_i arg2
+%define mul_array arg3
+%define src arg4
+%define dest1 arg5
+%define pos return
+%define pos.w return.w
+
+%define dest2 mul_array
+%define dest3 vec_i
+
+%ifndef EC_ALIGNED_ADDR
+;;; Use Un-aligned load/store
+ %define XLDR movdqu
+ %define XSTR movdqu
+%else
+;;; Use Non-temporal load/store
+ %ifdef NO_NT_LDST
+ %define XLDR movdqa
+ %define XSTR movdqa
+ %else
+ %define XLDR movntdqa
+ %define XSTR movntdq
+ %endif
+%endif
+
+default rel
+
+[bits 64]
+section .text
+
+%define xmask0f xmm15
+%define xgft1_lo xmm14
+%define xgft1_hi xmm13
+%define xgft2_lo xmm12
+%define xgft2_hi xmm11
+%define xgft3_lo xmm10
+%define xgft3_hi xmm9
+
+%define x0 xmm0
+%define xtmpa xmm1
+%define xtmph1 xmm2
+%define xtmpl1 xmm3
+%define xtmph2 xmm4
+%define xtmpl2 xmm5
+%define xtmph3 xmm6
+%define xtmpl3 xmm7
+%define xd1 xmm8
+%define xd2 xtmpl1
+%define xd3 xtmph1
+
+align 16
+global gf_3vect_mad_sse:function
+func(gf_3vect_mad_sse)
+ FUNC_SAVE
+ sub len, 16
+ jl .return_fail
+ xor pos, pos
+ movdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte
+ sal vec_i, 5 ;Multiply by 32
+ sal vec, 5
+ lea tmp, [mul_array + vec_i]
+
+ movdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, Ax{02}, ...
+ movdqu xgft1_hi, [tmp+16] ; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
+ movdqu xgft2_lo, [tmp+vec] ;Load array Bx{00}, Bx{01}, Bx{02}, ...
+ movdqu xgft2_hi, [tmp+vec+16] ; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
+ movdqu xgft3_lo, [tmp+2*vec] ;Load array Cx{00}, Cx{01}, Cx{02}, ...
+ movdqu xgft3_hi, [tmp+2*vec+16] ; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
+ mov dest2, [dest1+PS] ; reuse mul_array
+ mov dest3, [dest1+2*PS] ; reuse vec_i
+ mov dest1, [dest1]
+
+.loop16:
+ XLDR x0, [src+pos] ;Get next source vector
+ movdqa xtmph1, xgft1_hi ;Reload const array registers
+ movdqa xtmpl1, xgft1_lo
+ movdqa xtmph2, xgft2_hi ;Reload const array registers
+ movdqa xtmpl2, xgft2_lo
+ movdqa xtmph3, xgft3_hi ;Reload const array registers
+ movdqa xtmpl3, xgft3_lo
+
+ XLDR xd1, [dest1+pos] ;Get next dest vector
+
+ movdqa xtmpa, x0 ;Keep unshifted copy of src
+ psraw x0, 4 ;Shift to put high nibble into bits 4-0
+ pand x0, xmask0f ;Mask high src nibble in bits 4-0
+ pand xtmpa, xmask0f ;Mask low src nibble in bits 4-0
+
+ ; dest1
+ pshufb xtmph1, x0 ;Lookup mul table of high nibble
+ pshufb xtmpl1, xtmpa ;Lookup mul table of low nibble
+ pxor xtmph1, xtmpl1 ;GF add high and low partials
+ pxor xd1, xtmph1
+
+ XLDR xd2, [dest2+pos] ;reuse xtmpl1. Get next dest vector
+ XLDR xd3, [dest3+pos] ;reuse xtmph1. Get next dest vector
+
+ ; dest2
+ pshufb xtmph2, x0 ;Lookup mul table of high nibble
+ pshufb xtmpl2, xtmpa ;Lookup mul table of low nibble
+ pxor xtmph2, xtmpl2 ;GF add high and low partials
+ pxor xd2, xtmph2
+
+ ; dest3
+ pshufb xtmph3, x0 ;Lookup mul table of high nibble
+ pshufb xtmpl3, xtmpa ;Lookup mul table of low nibble
+ pxor xtmph3, xtmpl3 ;GF add high and low partials
+ pxor xd3, xtmph3
+
+ XSTR [dest1+pos], xd1 ;Store result
+ XSTR [dest2+pos], xd2 ;Store result
+ XSTR [dest3+pos], xd3 ;Store result
+
+ add pos, 16 ;Loop on 16 bytes at a time
+ cmp pos, len
+ jle .loop16
+
+ lea tmp, [len + 16]
+ cmp pos, tmp
+ je .return_pass
+
+.lessthan16:
+ ;; Tail len
+ ;; Do one more overlap pass
+ mov tmp, len ;Overlapped offset length-16
+
+ XLDR x0, [src+tmp] ;Get next source vector
+ XLDR xd1, [dest1+tmp] ;Get next dest vector
+ XLDR xd2, [dest2+tmp] ;reuse xtmpl1. Get next dest vector
+ XLDR xd3, [dest3+tmp] ;reuse xtmph1. Get next dest vector
+
+ sub len, pos
+
+ movdqa xtmph3, [constip16] ;Load const of i + 16
+ pinsrb xtmpl3, len.w, 15
+ pshufb xtmpl3, xmask0f ;Broadcast len to all bytes
+ pcmpgtb xtmpl3, xtmph3
+
+ movdqa xtmpa, x0 ;Keep unshifted copy of src
+ psraw x0, 4 ;Shift to put high nibble into bits 4-0
+ pand x0, xmask0f ;Mask high src nibble in bits 4-0
+ pand xtmpa, xmask0f ;Mask low src nibble in bits 4-0
+
+ ; dest1
+ pshufb xgft1_hi, x0 ;Lookup mul table of high nibble
+ pshufb xgft1_lo, xtmpa ;Lookup mul table of low nibble
+ pxor xgft1_hi, xgft1_lo ;GF add high and low partials
+ pand xgft1_hi, xtmpl3
+ pxor xd1, xgft1_hi
+
+ ; dest2
+ pshufb xgft2_hi, x0 ;Lookup mul table of high nibble
+ pshufb xgft2_lo, xtmpa ;Lookup mul table of low nibble
+ pxor xgft2_hi, xgft2_lo ;GF add high and low partials
+ pand xgft2_hi, xtmpl3
+ pxor xd2, xgft2_hi
+
+ ; dest3
+ pshufb xgft3_hi, x0 ;Lookup mul table of high nibble
+ pshufb xgft3_lo, xtmpa ;Lookup mul table of low nibble
+ pxor xgft3_hi, xgft3_lo ;GF add high and low partials
+ pand xgft3_hi, xtmpl3
+ pxor xd3, xgft3_hi
+
+ XSTR [dest1+tmp], xd1 ;Store result
+ XSTR [dest2+tmp], xd2 ;Store result
+ XSTR [dest3+tmp], xd3 ;Store result
+
+.return_pass:
+ FUNC_RESTORE
+ mov return, 0
+ ret
+
+.return_fail:
+ FUNC_RESTORE
+ mov return, 1
+ ret
+
+endproc_frame
+
+section .data
+
+align 16
+
+mask0f:
+ ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
+constip16:
+ ddq 0xf0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
+
+%macro slversion 4
+global %1_slver_%2%3%4
+global %1_slver
+%1_slver:
+%1_slver_%2%3%4:
+ dw 0x%4
+ db 0x%3, 0x%2
+%endmacro
+;;; func core, ver, snum
+slversion gf_3vect_mad_sse, 00, 00, 0206
diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_4vect_dot_prod_avx.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_4vect_dot_prod_avx.asm.s
index 6197f017007..5649bc69fb1 100644
--- a/src/erasure-code/isa/isa-l/erasure_code/gf_4vect_dot_prod_avx.asm.s
+++ b/src/erasure-code/isa/isa-l/erasure_code/gf_4vect_dot_prod_avx.asm.s
@@ -1,5 +1,5 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-; Copyright(c) 2011-2014 Intel Corporation All rights reserved.
+; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
@@ -30,8 +30,6 @@
;;;
;;; gf_4vect_dot_prod_avx(len, vec, *g_tbls, **buffs, **dests);
;;;
-;;; Author: Gregory Tucker
-
%ifidn __OUTPUT_FORMAT__, elf64
%define arg0 rdi
@@ -48,7 +46,10 @@
%define tmp5 r14 ; must be saved and restored
%define tmp6 r15 ; must be saved and restored
%define return rax
- %define PS 8
+ %macro SLDR 2
+ %endmacro
+ %define SSTR SLDR
+ %define PS 8
%define LOG_PS 3
%define func(x) x:
@@ -81,6 +82,9 @@
%define tmp5 rdi ; must be saved and restored
%define tmp6 rsi ; must be saved and restored
%define return rax
+ %macro SLDR 2
+ %endmacro
+ %define SSTR SLDR
%define PS 8
%define LOG_PS 3
%define stack_size 9*16 + 7*8 ; must be an odd multiple of 8
@@ -128,6 +132,82 @@
%endmacro
%endif
+%ifidn __OUTPUT_FORMAT__, elf32
+
+;;;================== High Address;
+;;; arg4
+;;; arg3
+;;; arg2
+;;; arg1
+;;; arg0
+;;; return
+;;;<================= esp of caller
+;;; ebp
+;;;<================= ebp = esp
+;;; var0
+;;; var1
+;;; var2
+;;; var3
+;;; esi
+;;; edi
+;;; ebx
+;;;<================= esp of callee
+;;;
+;;;================== Low Address;
+
+ %define PS 4
+ %define LOG_PS 2
+ %define func(x) x:
+ %define arg(x) [ebp + PS*2 + PS*x]
+ %define var(x) [ebp - PS - PS*x]
+
+ %define trans ecx
+ %define trans2 esi
+ %define arg0 trans ;trans and trans2 are for the variables in stack
+ %define arg0_m arg(0)
+ %define arg1 ebx
+ %define arg2 arg2_m
+ %define arg2_m arg(2)
+ %define arg3 trans
+ %define arg3_m arg(3)
+ %define arg4 trans
+ %define arg4_m arg(4)
+ %define arg5 trans2
+ %define tmp edx
+ %define tmp2 edi
+ %define tmp3 trans2
+ %define tmp3_m var(0)
+ %define tmp4 trans2
+ %define tmp4_m var(1)
+ %define tmp5 trans2
+ %define tmp5_m var(2)
+ %define tmp6 trans2
+ %define tmp6_m var(3)
+ %define return eax
+ %macro SLDR 2 ;stack load/restore
+ mov %1, %2
+ %endmacro
+ %define SSTR SLDR
+
+ %macro FUNC_SAVE 0
+ push ebp
+ mov ebp, esp
+ sub esp, PS*4 ;4 local variables
+ push esi
+ push edi
+ push ebx
+ mov arg1, arg(1)
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ pop ebx
+ pop edi
+ pop esi
+ add esp, PS*4 ;4 local variables
+ pop ebp
+ %endmacro
+
+%endif ; output formats
%define len arg0
%define vec arg1
@@ -140,7 +220,17 @@
%define dest3 tmp4
%define dest4 tmp5
%define vskip3 tmp6
-%define pos return
+%define pos return
+
+ %ifidn PS,4 ;32-bit code
+ %define len_m arg0_m
+ %define src_m arg3_m
+ %define dest1_m arg4_m
+ %define dest2_m tmp3_m
+ %define dest3_m tmp4_m
+ %define dest4_m tmp5_m
+ %define vskip3_m tmp6_m
+ %endif
%ifndef EC_ALIGNED_ADDR
;;; Use Un-aligned load/store
@@ -157,46 +247,73 @@
%endif
%endif
+%ifidn PS,8 ; 64-bit code
+ default rel
+ [bits 64]
+%endif
-default rel
-[bits 64]
section .text
-%define xmask0f xmm14
-%define xgft1_lo xmm13
-%define xgft1_hi xmm12
-%define xgft2_lo xmm11
-%define xgft2_hi xmm10
-%define xgft3_lo xmm9
-%define xgft3_hi xmm8
-%define xgft4_lo xmm7
-%define xgft4_hi xmm6
-
-
-%define x0 xmm0
-%define xtmpa xmm1
-%define xp1 xmm2
-%define xp2 xmm3
-%define xp3 xmm4
-%define xp4 xmm5
-
+%ifidn PS,8 ;64-bit code
+ %define xmask0f xmm14
+ %define xgft1_lo xmm13
+ %define xgft1_hi xmm12
+ %define xgft2_lo xmm11
+ %define xgft2_hi xmm10
+ %define xgft3_lo xmm9
+ %define xgft3_hi xmm8
+ %define xgft4_lo xmm7
+ %define xgft4_hi xmm6
+
+ %define x0 xmm0
+ %define xtmpa xmm1
+ %define xp1 xmm2
+ %define xp2 xmm3
+ %define xp3 xmm4
+ %define xp4 xmm5
+%else
+ %define xmm_trans xmm7 ;reuse xmask0f and xgft1_lo
+ %define xmask0f xmm_trans
+ %define xgft1_lo xmm_trans
+ %define xgft1_hi xmm6
+ %define xgft2_lo xgft1_lo
+ %define xgft2_hi xgft1_hi
+ %define xgft3_lo xgft1_lo
+ %define xgft3_hi xgft1_hi
+ %define xgft4_lo xgft1_lo
+ %define xgft4_hi xgft1_hi
+
+ %define x0 xmm0
+ %define xtmpa xmm1
+ %define xp1 xmm2
+ %define xp2 xmm3
+ %define xp3 xmm4
+ %define xp4 xmm5
+%endif
align 16
global gf_4vect_dot_prod_avx:function
func(gf_4vect_dot_prod_avx)
FUNC_SAVE
+ SLDR len, len_m
sub len, 16
+ SSTR len_m, len
jl .return_fail
xor pos, pos
vmovdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte
mov vskip3, vec
imul vskip3, 96
+ SSTR vskip3_m, vskip3
sal vec, LOG_PS ;vec *= PS. Make vec_i count by PS
+ SLDR dest1, dest1_m
mov dest2, [dest1+PS]
+ SSTR dest2_m, dest2
mov dest3, [dest1+2*PS]
+ SSTR dest3_m, dest3
mov dest4, [dest1+3*PS]
+ SSTR dest4_m, dest4
mov dest1, [dest1]
-
+ SSTR dest1_m, dest1
.loop16:
vpxor xp1, xp1
@@ -207,41 +324,70 @@ func(gf_4vect_dot_prod_avx)
xor vec_i, vec_i
.next_vect:
+ SLDR src, src_m
mov ptr, [src+vec_i]
+ %ifidn PS,8 ;64-bit code
vmovdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, ..., Ax{0f}
vmovdqu xgft1_hi, [tmp+16] ; " Ax{00}, Ax{10}, ..., Ax{f0}
vmovdqu xgft2_lo, [tmp+vec*(32/PS)] ;Load array Bx{00}, Bx{01}, ..., Bx{0f}
vmovdqu xgft2_hi, [tmp+vec*(32/PS)+16] ; " Bx{00}, Bx{10}, ..., Bx{f0}
vmovdqu xgft3_lo, [tmp+vec*(64/PS)] ;Load array Cx{00}, Cx{01}, ..., Cx{0f}
vmovdqu xgft3_hi, [tmp+vec*(64/PS)+16] ; " Cx{00}, Cx{10}, ..., Cx{f0}
- vmovdqu xgft4_lo, [tmp+vskip3] ;Load array Cx{00}, Cx{01}, ..., Cx{0f}
- vmovdqu xgft4_hi, [tmp+vskip3+16] ; " Cx{00}, Cx{10}, ..., Cx{f0}
+ vmovdqu xgft4_lo, [tmp+vskip3] ;Load array Dx{00}, Dx{01}, ..., Dx{0f}
+ vmovdqu xgft4_hi, [tmp+vskip3+16] ; " Dx{00}, Dx{10}, ..., Dx{f0}
- XLDR x0, [ptr+pos] ;Get next source vector
+ XLDR x0, [ptr+pos] ;Get next source vector
add tmp, 32
add vec_i, PS
vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
+ %else ;32-bit code
+ XLDR x0, [ptr+pos] ;Get next source vector
+ vmovdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte
+
+ vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0
+ vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
+ vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
+ vmovdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, ..., Ax{0f}
+ vmovdqu xgft1_hi, [tmp+16] ; " Ax{00}, Ax{10}, ..., Ax{f0}
+ %endif
vpshufb xgft1_hi, x0 ;Lookup mul table of high nibble
vpshufb xgft1_lo, xtmpa ;Lookup mul table of low nibble
vpxor xgft1_hi, xgft1_lo ;GF add high and low partials
vpxor xp1, xgft1_hi ;xp1 += partial
+ %ifidn PS,4 ;32-bit code
+ vmovdqu xgft2_lo, [tmp+vec*(32/PS)] ;Load array Bx{00}, Bx{01}, ..., Bx{0f}
+ vmovdqu xgft2_hi, [tmp+vec*(32/PS)+16] ; " Bx{00}, Bx{10}, ..., Bx{f0}
+ %endif
vpshufb xgft2_hi, x0 ;Lookup mul table of high nibble
vpshufb xgft2_lo, xtmpa ;Lookup mul table of low nibble
vpxor xgft2_hi, xgft2_lo ;GF add high and low partials
vpxor xp2, xgft2_hi ;xp2 += partial
+ %ifidn PS,4 ;32-bit code
+ sal vec, 1
+ vmovdqu xgft3_lo, [tmp+vec*(32/PS)] ;Load array Cx{00}, Cx{01}, ..., Cx{0f}
+ vmovdqu xgft3_hi, [tmp+vec*(32/PS)+16] ; " Cx{00}, Cx{10}, ..., Cx{f0}
+ sar vec, 1
+ %endif
vpshufb xgft3_hi, x0 ;Lookup mul table of high nibble
vpshufb xgft3_lo, xtmpa ;Lookup mul table of low nibble
vpxor xgft3_hi, xgft3_lo ;GF add high and low partials
vpxor xp3, xgft3_hi ;xp3 += partial
+ %ifidn PS,4 ;32-bit code
+ SLDR vskip3, vskip3_m
+ vmovdqu xgft4_lo, [tmp+vskip3] ;Load array Dx{00}, Dx{01}, ..., Dx{0f}
+ vmovdqu xgft4_hi, [tmp+vskip3+16] ; " Dx{00}, Dx{10}, ..., Dx{f0}
+ add tmp, 32
+ add vec_i, PS
+ %endif
vpshufb xgft4_hi, x0 ;Lookup mul table of high nibble
vpshufb xgft4_lo, xtmpa ;Lookup mul table of low nibble
vpxor xgft4_hi, xgft4_lo ;GF add high and low partials
@@ -250,11 +396,16 @@ func(gf_4vect_dot_prod_avx)
cmp vec_i, vec
jl .next_vect
+ SLDR dest1, dest1_m
+ SLDR dest2, dest2_m
XSTR [dest1+pos], xp1
XSTR [dest2+pos], xp2
+ SLDR dest3, dest3_m
XSTR [dest3+pos], xp3
+ SLDR dest4, dest4_m
XSTR [dest4+pos], xp4
+ SLDR len, len_m
add pos, 16 ;Loop on 16 bytes at a time
cmp pos, len
jle .loop16
@@ -293,6 +444,4 @@ global %1_slver
db 0x%3, 0x%2
%endmacro
;;; func core, ver, snum
-slversion gf_4vect_dot_prod_avx, 00, 02, 0064
-; inform linker that this doesn't require executable stack
-section .note.GNU-stack noalloc noexec nowrite progbits
+slversion gf_4vect_dot_prod_avx, 02, 04, 0193
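For readers following the hunks above, a scalar sketch of what the gf_4vect_dot_prod_* kernels compute may be useful. This is an illustration only, not code from this tree; the 32-byte-per-coefficient table layout (16 low-nibble products followed by 16 high-nibble products) is an assumption inferred from how the kernels index g_tbls.

    #include <stddef.h>
    #include <stdint.h>

    /* Scalar sketch of gf_4vect_dot_prod (assumed semantics, not ISA-L source).
     * gftbls holds 32 bytes per (dest, src) pair: the coefficient's products
     * with the 16 low nibbles, then with the 16 high nibbles. */
    static void gf_4vect_dot_prod_ref(size_t len, int vec, const uint8_t *gftbls,
                                      uint8_t **src, uint8_t **dest)
    {
        for (int d = 0; d < 4; d++) {                 /* four parity outputs */
            for (size_t i = 0; i < len; i++) {
                uint8_t s = 0;
                for (int j = 0; j < vec; j++) {       /* GF(2^8) dot product */
                    const uint8_t *tbl = &gftbls[(d * vec + j) * 32];
                    uint8_t b = src[j][i];
                    /* two 16-entry lookups plus XOR replace one GF multiply */
                    s ^= tbl[b & 0x0f] ^ tbl[16 + (b >> 4)];
                }
                dest[d][i] = s;
            }
        }
    }

The assembly performs the same arithmetic 16 bytes (SSE/AVX) or 32 bytes (AVX2) at a time, with vpshufb doing the 16-entry lookups across a whole register in one instruction.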
diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_4vect_dot_prod_avx2.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_4vect_dot_prod_avx2.asm.s
index e4267e201f2..dcd46f39cbf 100644
--- a/src/erasure-code/isa/isa-l/erasure_code/gf_4vect_dot_prod_avx2.asm.s
+++ b/src/erasure-code/isa/isa-l/erasure_code/gf_4vect_dot_prod_avx2.asm.s
@@ -1,5 +1,5 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-; Copyright(c) 2011-2014 Intel Corporation All rights reserved.
+; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
@@ -30,8 +30,6 @@
;;;
;;; gf_4vect_dot_prod_avx2(len, vec, *g_tbls, **buffs, **dests);
;;;
-;;; Author: Gregory Tucker
-
%ifidn __OUTPUT_FORMAT__, elf64
%define arg0 rdi
@@ -50,7 +48,10 @@
%define tmp5 r14 ; must be saved and restored
%define tmp6 r15 ; must be saved and restored
%define return rax
- %define PS 8
+ %macro SLDR 2
+ %endmacro
+ %define SSTR SLDR
+ %define PS 8
%define LOG_PS 3
%define func(x) x:
@@ -85,6 +86,9 @@
%define tmp5 rdi ; must be saved and restored
%define tmp6 rsi ; must be saved and restored
%define return rax
+ %macro SLDR 2
+ %endmacro
+ %define SSTR SLDR
%define PS 8
%define LOG_PS 3
%define stack_size 9*16 + 7*8 ; must be an odd multiple of 8
@@ -132,6 +136,84 @@
%endmacro
%endif
+%ifidn __OUTPUT_FORMAT__, elf32
+
+;;;================== High Address;
+;;; arg4
+;;; arg3
+;;; arg2
+;;; arg1
+;;; arg0
+;;; return
+;;;<================= esp of caller
+;;; ebp
+;;;<================= ebp = esp
+;;; var0
+;;; var1
+;;; var2
+;;; var3
+;;; esi
+;;; edi
+;;; ebx
+;;;<================= esp of callee
+;;;
+;;;================== Low Address;
+
+ %define PS 4
+ %define LOG_PS 2
+ %define func(x) x:
+ %define arg(x) [ebp + PS*2 + PS*x]
+ %define var(x) [ebp - PS - PS*x]
+
+ %define trans ecx
+ %define trans2 esi
+ %define arg0 trans ;trans and trans2 are for the variables in stack
+ %define arg0_m arg(0)
+ %define arg1 ebx
+ %define arg2 arg2_m
+ %define arg2_m arg(2)
+ %define arg3 trans
+ %define arg3_m arg(3)
+ %define arg4 trans
+ %define arg4_m arg(4)
+ %define arg5 trans2
+ %define tmp edx
+ %define tmp.w edx
+ %define tmp.b dl
+ %define tmp2 edi
+ %define tmp3 trans2
+ %define tmp3_m var(0)
+ %define tmp4 trans2
+ %define tmp4_m var(1)
+ %define tmp5 trans2
+ %define tmp5_m var(2)
+ %define tmp6 trans2
+ %define tmp6_m var(3)
+ %define return eax
+ %macro SLDR 2 ;stack load/restore
+ mov %1, %2
+ %endmacro
+ %define SSTR SLDR
+
+ %macro FUNC_SAVE 0
+ push ebp
+ mov ebp, esp
+ sub esp, PS*4 ;4 local variables
+ push esi
+ push edi
+ push ebx
+ mov arg1, arg(1)
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ pop ebx
+ pop edi
+ pop esi
+ add esp, PS*4 ;4 local variables
+ pop ebp
+ %endmacro
+
+%endif ; output formats
%define len arg0
%define vec arg1
@@ -144,7 +226,17 @@
%define dest3 tmp4
%define dest4 tmp5
%define vskip3 tmp6
-%define pos return
+%define pos return
+
+ %ifidn PS,4 ;32-bit code
+ %define len_m arg0_m
+ %define src_m arg3_m
+ %define dest1_m arg4_m
+ %define dest2_m tmp3_m
+ %define dest3_m tmp4_m
+ %define dest4_m tmp5_m
+ %define vskip3_m tmp6_m
+ %endif
%ifndef EC_ALIGNED_ADDR
;;; Use Un-aligned load/store
@@ -161,36 +253,59 @@
%endif
%endif
+%ifidn PS,8 ;64-bit code
+ default rel
+ [bits 64]
+%endif
-default rel
-[bits 64]
section .text
-%define xmask0f ymm14
-%define xmask0fx xmm14
-%define xgft1_lo ymm13
-%define xgft1_hi ymm12
-%define xgft2_lo ymm11
-%define xgft2_hi ymm10
-%define xgft3_lo ymm9
-%define xgft3_hi ymm8
-%define xgft4_lo ymm7
-%define xgft4_hi ymm6
-
-
-%define x0 ymm0
-%define xtmpa ymm1
-%define xp1 ymm2
-%define xp2 ymm3
-%define xp3 ymm4
-%define xp4 ymm5
-
+%ifidn PS,8 ;64-bit code
+ %define xmask0f ymm14
+ %define xmask0fx xmm14
+ %define xgft1_lo ymm13
+ %define xgft1_hi ymm12
+ %define xgft2_lo ymm11
+ %define xgft2_hi ymm10
+ %define xgft3_lo ymm9
+ %define xgft3_hi ymm8
+ %define xgft4_lo ymm7
+ %define xgft4_hi ymm6
+
+ %define x0 ymm0
+ %define xtmpa ymm1
+ %define xp1 ymm2
+ %define xp2 ymm3
+ %define xp3 ymm4
+ %define xp4 ymm5
+%else
+ %define ymm_trans ymm7 ;reuse xmask0f and xgft1_hi
+ %define xmask0f ymm_trans
+ %define xmask0fx xmm7
+ %define xgft1_lo ymm6
+ %define xgft1_hi ymm_trans
+ %define xgft2_lo xgft1_lo
+ %define xgft2_hi xgft1_hi
+ %define xgft3_lo xgft1_lo
+ %define xgft3_hi xgft1_hi
+ %define xgft4_lo xgft1_lo
+ %define xgft4_hi xgft1_hi
+
+ %define x0 ymm0
+ %define xtmpa ymm1
+ %define xp1 ymm2
+ %define xp2 ymm3
+ %define xp3 ymm4
+ %define xp4 ymm5
+%endif
align 16
global gf_4vect_dot_prod_avx2:function
func(gf_4vect_dot_prod_avx2)
FUNC_SAVE
+ SLDR len, len_m
sub len, 32
+ SSTR len_m, len
jl .return_fail
xor pos, pos
mov tmp.b, 0x0f
@@ -198,12 +313,17 @@ func(gf_4vect_dot_prod_avx2)
vpbroadcastb xmask0f, xmask0fx ;Construct mask 0x0f0f0f...
mov vskip3, vec
imul vskip3, 96
+ SSTR vskip3_m, vskip3
sal vec, LOG_PS ;vec *= PS. Make vec_i count by PS
+ SLDR dest1, dest1_m
mov dest2, [dest1+PS]
+ SSTR dest2_m, dest2
mov dest3, [dest1+2*PS]
+ SSTR dest3_m, dest3
mov dest4, [dest1+3*PS]
+ SSTR dest4_m, dest4
mov dest1, [dest1]
-
+ SSTR dest1_m, dest1
.loop32:
vpxor xp1, xp1
@@ -214,10 +334,12 @@ func(gf_4vect_dot_prod_avx2)
xor vec_i, vec_i
.next_vect:
+ SLDR src, src_m
mov ptr, [src+vec_i]
XLDR x0, [ptr+pos] ;Get next source vector
- add vec_i, PS
+ add vec_i, PS
+ %ifidn PS,8 ;64-bit code
vpand xgft4_lo, x0, xmask0f ;Mask low src nibble in bits 4-0
vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
@@ -230,30 +352,64 @@ func(gf_4vect_dot_prod_avx2)
; " Bx{00}, Bx{10}, ..., Bx{f0}
vmovdqu xgft3_lo, [tmp+vec*(64/PS)] ;Load array Cx{00}, Cx{01}, ..., Cx{0f}
; " Cx{00}, Cx{10}, ..., Cx{f0}
- vmovdqu xgft4_lo, [tmp+vskip3] ;Load array Cx{00}, Cx{01}, ..., Cx{0f}
- ; " Cx{00}, Cx{10}, ..., Cx{f0}
+ vmovdqu xgft4_lo, [tmp+vskip3] ;Load array Dx{00}, Dx{01}, ..., Dx{0f}
+ ; " Dx{00}, Dx{10}, ..., Dx{f0}
vperm2i128 xgft1_hi, xgft1_lo, xgft1_lo, 0x01 ; swapped to hi | lo
vperm2i128 xgft2_hi, xgft2_lo, xgft2_lo, 0x01 ; swapped to hi | lo
vperm2i128 xgft3_hi, xgft3_lo, xgft3_lo, 0x01 ; swapped to hi | lo
vperm2i128 xgft4_hi, xgft4_lo, xgft4_lo, 0x01 ; swapped to hi | lo
add tmp, 32
+ %else ;32-bit code
+ mov cl, 0x0f ;use ecx as a temp variable
+ vpinsrb xmask0fx, xmask0fx, ecx, 0
+ vpbroadcastb xmask0f, xmask0fx ;Construct mask 0x0f0f0f...
+
+ vpand xgft4_lo, x0, xmask0f ;Mask low src nibble in bits 4-0
+ vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
+ vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
+ vperm2i128 xtmpa, xgft4_lo, x0, 0x30 ;swap xtmpa from 1lo|2lo to 1lo|2hi
+ vperm2i128 x0, xgft4_lo, x0, 0x12 ;swap x0 from 1hi|2hi to 1hi|2lo
+
+ vmovdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, ..., Ax{0f}
+ ; " Ax{00}, Ax{10}, ..., Ax{f0}
+ vperm2i128 xgft1_hi, xgft1_lo, xgft1_lo, 0x01 ; swapped to hi | lo
+ %endif
vpshufb xgft1_hi, x0 ;Lookup mul table of high nibble
vpshufb xgft1_lo, xtmpa ;Lookup mul table of low nibble
vpxor xgft1_hi, xgft1_lo ;GF add high and low partials
vpxor xp1, xgft1_hi ;xp1 += partial
+ %ifidn PS,4 ; 32-bit code
+ vmovdqu xgft2_lo, [tmp+vec*(32/PS)] ;Load array Bx{00}, Bx{01}, ..., Bx{0f}
+ ; " Bx{00}, Bx{10}, ..., Bx{f0}
+ vperm2i128 xgft2_hi, xgft2_lo, xgft2_lo, 0x01 ; swapped to hi | lo
+ %endif
vpshufb xgft2_hi, x0 ;Lookup mul table of high nibble
vpshufb xgft2_lo, xtmpa ;Lookup mul table of low nibble
vpxor xgft2_hi, xgft2_lo ;GF add high and low partials
vpxor xp2, xgft2_hi ;xp2 += partial
+ %ifidn PS,4 ; 32-bit code
+ sal vec, 1
+ vmovdqu xgft3_lo, [tmp+vec*(32/PS)] ;Load array Cx{00}, Cx{01}, ..., Cx{0f}
+ ; " Cx{00}, Cx{10}, ..., Cx{f0}
+ vperm2i128 xgft3_hi, xgft3_lo, xgft3_lo, 0x01 ; swapped to hi | lo
+ sar vec, 1
+ %endif
vpshufb xgft3_hi, x0 ;Lookup mul table of high nibble
vpshufb xgft3_lo, xtmpa ;Lookup mul table of low nibble
vpxor xgft3_hi, xgft3_lo ;GF add high and low partials
vpxor xp3, xgft3_hi ;xp3 += partial
+ %ifidn PS,4 ; 32-bit code
+ SLDR vskip3, vskip3_m
+ vmovdqu xgft4_lo, [tmp+vskip3] ;Load array Dx{00}, Dx{01}, ..., Dx{0f}
+						; "	 Dx{00}, Dx{10}, ..., Dx{f0}
+ vperm2i128 xgft4_hi, xgft4_lo, xgft4_lo, 0x01 ; swapped to hi | lo
+ add tmp, 32
+ %endif
vpshufb xgft4_hi, x0 ;Lookup mul table of high nibble
vpshufb xgft4_lo, xtmpa ;Lookup mul table of low nibble
vpxor xgft4_hi, xgft4_lo ;GF add high and low partials
@@ -262,11 +418,16 @@ func(gf_4vect_dot_prod_avx2)
cmp vec_i, vec
jl .next_vect
+ SLDR dest1, dest1_m
+ SLDR dest2, dest2_m
XSTR [dest1+pos], xp1
XSTR [dest2+pos], xp2
+ SLDR dest3, dest3_m
XSTR [dest3+pos], xp3
+ SLDR dest4, dest4_m
XSTR [dest4+pos], xp4
+ SLDR len, len_m
add pos, 32 ;Loop on 32 bytes at a time
cmp pos, len
jle .loop32
@@ -302,6 +463,4 @@ global %1_slver
db 0x%3, 0x%2
%endmacro
;;; func core, ver, snum
-slversion gf_4vect_dot_prod_avx2, 04, 03, 0064
-; inform linker that this doesn't require executable stack
-section .note.GNU-stack noalloc noexec nowrite progbits
+slversion gf_4vect_dot_prod_avx2, 04, 04, 0198
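The vpshufb trick relies on the 32-byte per-coefficient tables that the caller prepares up front (gf_vect_mul_init in ISA-L). A minimal sketch of that construction, assuming the 0x11d field polynomial behind the erasure-code tables, is:

    #include <stdint.h>

    /* Plain GF(2^8) multiply, reducing by 0x11d (assumption: the polynomial
     * used by the erasure-code tables; compare against ec_base.h). */
    static uint8_t gf_mul_ref(uint8_t a, uint8_t b)
    {
        uint8_t p = 0;
        while (b) {
            if (b & 1)
                p ^= a;
            uint8_t hi = a & 0x80;
            a <<= 1;
            if (hi)
                a ^= 0x1d;      /* low byte of 0x11d */
            b >>= 1;
        }
        return p;
    }

    /* 32-byte table for one coefficient c: products with the 16 low nibbles,
     * then with the 16 high-nibble values.  Since c*b == c*(b & 0x0f) ^
     * c*(b & 0xf0) in GF(2^8), a full multiply becomes two 16-entry lookups,
     * which is exactly what each vpshufb pair performs per lane. */
    static void gf_vect_mul_table_ref(uint8_t c, uint8_t tbl[32])
    {
        for (int n = 0; n < 16; n++) {
            tbl[n]      = gf_mul_ref(c, (uint8_t)n);
            tbl[16 + n] = gf_mul_ref(c, (uint8_t)(n << 4));
        }
    }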
diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_4vect_dot_prod_sse.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_4vect_dot_prod_sse.asm.s
index 920a8da71c8..4d716ef585b 100644
--- a/src/erasure-code/isa/isa-l/erasure_code/gf_4vect_dot_prod_sse.asm.s
+++ b/src/erasure-code/isa/isa-l/erasure_code/gf_4vect_dot_prod_sse.asm.s
@@ -1,5 +1,5 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-; Copyright(c) 2011-2014 Intel Corporation All rights reserved.
+; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
@@ -30,8 +30,6 @@
;;;
;;; gf_4vect_dot_prod_sse(len, vec, *g_tbls, **buffs, **dests);
;;;
-;;; Author: Gregory Tucker
-
%ifidn __OUTPUT_FORMAT__, elf64
%define arg0 rdi
@@ -48,7 +46,10 @@
%define tmp5 r14 ; must be saved and restored
%define tmp6 r15 ; must be saved and restored
%define return rax
- %define PS 8
+ %macro SLDR 2
+ %endmacro
+ %define SSTR SLDR
+ %define PS 8
%define LOG_PS 3
%define func(x) x:
@@ -81,6 +82,9 @@
%define tmp5 rdi ; must be saved and restored
%define tmp6 rsi ; must be saved and restored
%define return rax
+ %macro SLDR 2
+ %endmacro
+ %define SSTR SLDR
%define PS 8
%define LOG_PS 3
%define stack_size 9*16 + 7*8 ; must be an odd multiple of 8
@@ -128,6 +132,82 @@
%endmacro
%endif
+%ifidn __OUTPUT_FORMAT__, elf32
+
+;;;================== High Address;
+;;; arg4
+;;; arg3
+;;; arg2
+;;; arg1
+;;; arg0
+;;; return
+;;;<================= esp of caller
+;;; ebp
+;;;<================= ebp = esp
+;;; var0
+;;; var1
+;;; var2
+;;; var3
+;;; esi
+;;; edi
+;;; ebx
+;;;<================= esp of callee
+;;;
+;;;================== Low Address;
+
+ %define PS 4
+ %define LOG_PS 2
+ %define func(x) x:
+ %define arg(x) [ebp + PS*2 + PS*x]
+ %define var(x) [ebp - PS - PS*x]
+
+ %define trans ecx
+ %define trans2 esi
+ %define arg0 trans ;trans and trans2 are for the variables in stack
+ %define arg0_m arg(0)
+ %define arg1 ebx
+ %define arg2 arg2_m
+ %define arg2_m arg(2)
+ %define arg3 trans
+ %define arg3_m arg(3)
+ %define arg4 trans
+ %define arg4_m arg(4)
+ %define arg5 trans2
+ %define tmp edx
+ %define tmp2 edi
+ %define tmp3 trans2
+ %define tmp3_m var(0)
+ %define tmp4 trans2
+ %define tmp4_m var(1)
+ %define tmp5 trans2
+ %define tmp5_m var(2)
+ %define tmp6 trans2
+ %define tmp6_m var(3)
+ %define return eax
+ %macro SLDR 2 ;stack load/restore
+ mov %1, %2
+ %endmacro
+ %define SSTR SLDR
+
+ %macro FUNC_SAVE 0
+ push ebp
+ mov ebp, esp
+ sub esp, PS*4 ;4 local variables
+ push esi
+ push edi
+ push ebx
+ mov arg1, arg(1)
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ pop ebx
+ pop edi
+ pop esi
+ add esp, PS*4 ;4 local variables
+ pop ebp
+ %endmacro
+
+%endif ; output formats
%define len arg0
%define vec arg1
@@ -140,7 +220,17 @@
%define dest3 tmp4
%define dest4 tmp5
%define vskip3 tmp6
-%define pos return
+%define pos return
+
+ %ifidn PS,4 ;32-bit code
+ %define len_m arg0_m
+ %define src_m arg3_m
+ %define dest1_m arg4_m
+ %define dest2_m tmp3_m
+ %define dest3_m tmp4_m
+ %define dest4_m tmp5_m
+ %define vskip3_m tmp6_m
+ %endif
%ifndef EC_ALIGNED_ADDR
;;; Use Un-aligned load/store
@@ -157,46 +247,73 @@
%endif
%endif
+%ifidn PS,8 ; 64-bit code
+ default rel
+ [bits 64]
+%endif
-default rel
-[bits 64]
section .text
-%define xmask0f xmm14
-%define xgft1_lo xmm13
-%define xgft1_hi xmm12
-%define xgft2_lo xmm11
-%define xgft2_hi xmm10
-%define xgft3_lo xmm9
-%define xgft3_hi xmm8
-%define xgft4_lo xmm7
-%define xgft4_hi xmm6
-
-
-%define x0 xmm0
-%define xtmpa xmm1
-%define xp1 xmm2
-%define xp2 xmm3
-%define xp3 xmm4
-%define xp4 xmm5
-
+%ifidn PS,8 ;64-bit code
+ %define xmask0f xmm14
+ %define xgft1_lo xmm2
+ %define xgft1_hi xmm3
+ %define xgft2_lo xmm11
+ %define xgft2_hi xmm4
+ %define xgft3_lo xmm9
+ %define xgft3_hi xmm5
+ %define xgft4_lo xmm7
+ %define xgft4_hi xmm6
+
+ %define x0 xmm0
+ %define xtmpa xmm1
+ %define xp1 xmm8
+ %define xp2 xmm10
+ %define xp3 xmm12
+ %define xp4 xmm13
+%else
+ %define xmm_trans xmm7 ;reuse xmask0f and xgft1_lo
+ %define xmask0f xmm_trans
+ %define xgft1_lo xmm_trans
+ %define xgft1_hi xmm6
+ %define xgft2_lo xgft1_lo
+ %define xgft2_hi xgft1_hi
+ %define xgft3_lo xgft1_lo
+ %define xgft3_hi xgft1_hi
+ %define xgft4_lo xgft1_lo
+ %define xgft4_hi xgft1_hi
+
+ %define x0 xmm0
+ %define xtmpa xmm1
+ %define xp1 xmm2
+ %define xp2 xmm3
+ %define xp3 xmm4
+ %define xp4 xmm5
+%endif
align 16
global gf_4vect_dot_prod_sse:function
func(gf_4vect_dot_prod_sse)
FUNC_SAVE
+ SLDR len, len_m
sub len, 16
+ SSTR len_m, len
jl .return_fail
xor pos, pos
movdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte
- mov vskip3, vec
- imul vskip3, 96
- sal vec, LOG_PS ;vec *= PS. Make vec_i count by PS
- mov dest2, [dest1+PS]
- mov dest3, [dest1+2*PS]
- mov dest4, [dest1+3*PS]
- mov dest1, [dest1]
-
+ mov vskip3, vec
+ imul vskip3, 96
+ SSTR vskip3_m, vskip3
+ sal vec, LOG_PS ;vec *= PS. Make vec_i count by PS
+ SLDR dest1, dest1_m
+ mov dest2, [dest1+PS]
+ SSTR dest2_m, dest2
+ mov dest3, [dest1+2*PS]
+ SSTR dest3_m, dest3
+ mov dest4, [dest1+3*PS]
+ SSTR dest4_m, dest4
+ mov dest1, [dest1]
+ SSTR dest1_m, dest1
.loop16:
pxor xp1, xp1
@@ -207,41 +324,72 @@ func(gf_4vect_dot_prod_sse)
xor vec_i, vec_i
.next_vect:
+ SLDR src, src_m
mov ptr, [src+vec_i]
+ %ifidn PS,8 ;64-bit code
movdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, ..., Ax{0f}
movdqu xgft1_hi, [tmp+16] ; " Ax{00}, Ax{10}, ..., Ax{f0}
movdqu xgft2_lo, [tmp+vec*(32/PS)] ;Load array Bx{00}, Bx{01}, ..., Bx{0f}
movdqu xgft2_hi, [tmp+vec*(32/PS)+16] ; " Bx{00}, Bx{10}, ..., Bx{f0}
movdqu xgft3_lo, [tmp+vec*(64/PS)] ;Load array Cx{00}, Cx{01}, ..., Cx{0f}
movdqu xgft3_hi, [tmp+vec*(64/PS)+16] ; " Cx{00}, Cx{10}, ..., Cx{f0}
- movdqu xgft4_lo, [tmp+vskip3] ;Load array Cx{00}, Cx{01}, ..., Cx{0f}
- movdqu xgft4_hi, [tmp+vskip3+16] ; " Cx{00}, Cx{10}, ..., Cx{f0}
+ movdqu xgft4_lo, [tmp+vskip3] ;Load array Dx{00}, Dx{01}, ..., Dx{0f}
+ movdqu xgft4_hi, [tmp+vskip3+16] ; " Dx{00}, Dx{10}, ..., Dx{f0}
- XLDR x0, [ptr+pos] ;Get next source vector
- add tmp, 32
- add vec_i, PS
+ XLDR x0, [ptr+pos] ;Get next source vector
+ add tmp, 32
+ add vec_i, PS
movdqa xtmpa, x0 ;Keep unshifted copy of src
psraw x0, 4 ;Shift to put high nibble into bits 4-0
pand x0, xmask0f ;Mask high src nibble in bits 4-0
+ pand xtmpa, xmask0f ;Mask low src nibble in bits 4-0
+ %else ;32-bit code
+ XLDR x0, [ptr+pos] ;Get next source vector
+ movdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte
+
+ movdqa xtmpa, x0 ;Keep unshifted copy of src
+ psraw x0, 4 ;Shift to put high nibble into bits 4-0
+ pand x0, xmask0f ;Mask high src nibble in bits 4-0
pand xtmpa, xmask0f ;Mask low src nibble in bits 4-0
+ movdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, ..., Ax{0f}
+ movdqu xgft1_hi, [tmp+16] ; " Ax{00}, Ax{10}, ..., Ax{f0}
+ %endif
+
pshufb xgft1_hi, x0 ;Lookup mul table of high nibble
pshufb xgft1_lo, xtmpa ;Lookup mul table of low nibble
pxor xgft1_hi, xgft1_lo ;GF add high and low partials
pxor xp1, xgft1_hi ;xp1 += partial
+ %ifidn PS,4 ;32-bit code
+ movdqu xgft2_lo, [tmp+vec*(32/PS)] ;Load array Bx{00}, Bx{01}, ..., Bx{0f}
+ movdqu xgft2_hi, [tmp+vec*(32/PS)+16] ; " Bx{00}, Bx{10}, ..., Bx{f0}
+ %endif
pshufb xgft2_hi, x0 ;Lookup mul table of high nibble
pshufb xgft2_lo, xtmpa ;Lookup mul table of low nibble
pxor xgft2_hi, xgft2_lo ;GF add high and low partials
pxor xp2, xgft2_hi ;xp2 += partial
+ %ifidn PS,4 ;32-bit code
+ sal vec, 1
+ movdqu xgft3_lo, [tmp+vec*(32/PS)] ;Load array Cx{00}, Cx{01}, ..., Cx{0f}
+ movdqu xgft3_hi, [tmp+vec*(32/PS)+16] ; " Cx{00}, Cx{10}, ..., Cx{f0}
+ sar vec, 1
+ %endif
pshufb xgft3_hi, x0 ;Lookup mul table of high nibble
pshufb xgft3_lo, xtmpa ;Lookup mul table of low nibble
pxor xgft3_hi, xgft3_lo ;GF add high and low partials
pxor xp3, xgft3_hi ;xp3 += partial
+ %ifidn PS,4 ;32-bit code
+ SLDR vskip3, vskip3_m
+ movdqu xgft4_lo, [tmp+vskip3] ;Load array Dx{00}, Dx{01}, ..., Dx{0f}
+ movdqu xgft4_hi, [tmp+vskip3+16] ; " Dx{00}, Dx{10}, ..., Dx{f0}
+ add tmp, 32
+ add vec_i, PS
+ %endif
pshufb xgft4_hi, x0 ;Lookup mul table of high nibble
pshufb xgft4_lo, xtmpa ;Lookup mul table of low nibble
pxor xgft4_hi, xgft4_lo ;GF add high and low partials
@@ -250,11 +398,16 @@ func(gf_4vect_dot_prod_sse)
cmp vec_i, vec
jl .next_vect
+ SLDR dest1, dest1_m
+ SLDR dest2, dest2_m
XSTR [dest1+pos], xp1
XSTR [dest2+pos], xp2
+ SLDR dest3, dest3_m
XSTR [dest3+pos], xp3
+ SLDR dest4, dest4_m
XSTR [dest4+pos], xp4
+ SLDR len, len_m
add pos, 16 ;Loop on 16 bytes at a time
cmp pos, len
jle .loop16
@@ -293,6 +446,4 @@ global %1_slver
db 0x%3, 0x%2
%endmacro
;;; func core, ver, snum
-slversion gf_4vect_dot_prod_sse, 00, 03, 0064
-; inform linker that this doesn't require executable stack
-section .note.GNU-stack noalloc noexec nowrite progbits
+slversion gf_4vect_dot_prod_sse, 00, 05, 0064
diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_4vect_mad_avx.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_4vect_mad_avx.asm.s
new file mode 100644
index 00000000000..605e42a901f
--- /dev/null
+++ b/src/erasure-code/isa/isa-l/erasure_code/gf_4vect_mad_avx.asm.s
@@ -0,0 +1,343 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;
+;;; gf_4vect_mad_avx(len, vec, vec_i, mul_array, src, dest);
+;;;
+
+%define PS 8
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg0 rcx
+ %define arg0.w ecx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+ %define arg4 r12
+ %define arg5 r15
+ %define tmp r11
+ %define tmp2 r10
+ %define tmp3 r13
+ %define return rax
+ %define return.w eax
+ %define stack_size 16*10 + 3*8
+ %define arg(x) [rsp + stack_size + PS + PS*x]
+ %define func(x) proc_frame x
+
+%macro FUNC_SAVE 0
+ sub rsp, stack_size
+ movdqa [rsp+16*0],xmm6
+ movdqa [rsp+16*1],xmm7
+ movdqa [rsp+16*2],xmm8
+ movdqa [rsp+16*3],xmm9
+ movdqa [rsp+16*4],xmm10
+ movdqa [rsp+16*5],xmm11
+ movdqa [rsp+16*6],xmm12
+ movdqa [rsp+16*7],xmm13
+ movdqa [rsp+16*8],xmm14
+ movdqa [rsp+16*9],xmm15
+ save_reg r12, 10*16 + 0*8
+ save_reg r13, 10*16 + 1*8
+ save_reg r15, 10*16 + 2*8
+ end_prolog
+ mov arg4, arg(4)
+ mov arg5, arg(5)
+%endmacro
+
+%macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp+16*0]
+ movdqa xmm7, [rsp+16*1]
+ movdqa xmm8, [rsp+16*2]
+ movdqa xmm9, [rsp+16*3]
+ movdqa xmm10, [rsp+16*4]
+ movdqa xmm11, [rsp+16*5]
+ movdqa xmm12, [rsp+16*6]
+ movdqa xmm13, [rsp+16*7]
+ movdqa xmm14, [rsp+16*8]
+ movdqa xmm15, [rsp+16*9]
+ mov r12, [rsp + 10*16 + 0*8]
+ mov r13, [rsp + 10*16 + 1*8]
+ mov r15, [rsp + 10*16 + 2*8]
+ add rsp, stack_size
+%endmacro
+
+%elifidn __OUTPUT_FORMAT__, elf64
+ %define arg0 rdi
+ %define arg0.w edi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+ %define arg4 r8
+ %define arg5 r9
+ %define tmp r11
+ %define tmp2 r10
+ %define tmp3 r12
+ %define return rax
+ %define return.w eax
+
+ %define func(x) x:
+ %macro FUNC_SAVE 0
+ push r12
+ %endmacro
+ %macro FUNC_RESTORE 0
+ pop r12
+ %endmacro
+%endif
+
+;;; gf_4vect_mad_avx(len, vec, vec_i, mul_array, src, dest)
+%define len arg0
+%define len.w arg0.w
+%define vec arg1
+%define vec_i arg2
+%define mul_array arg3
+%define src arg4
+%define dest1 arg5
+%define pos return
+%define pos.w return.w
+
+%define dest2 mul_array
+%define dest3 tmp2
+%define dest4 vec_i
+
+%ifndef EC_ALIGNED_ADDR
+;;; Use Un-aligned load/store
+ %define XLDR vmovdqu
+ %define XSTR vmovdqu
+%else
+;;; Use Non-temporal load/store
+ %ifdef NO_NT_LDST
+ %define XLDR vmovdqa
+ %define XSTR vmovdqa
+ %else
+ %define XLDR vmovntdqa
+ %define XSTR vmovntdq
+ %endif
+%endif
+
+
+default rel
+
+[bits 64]
+section .text
+
+%define xmask0f xmm15
+%define xgft3_hi xmm14
+%define xgft4_hi xmm13
+%define xgft4_lo xmm12
+
+%define x0 xmm0
+%define xtmpa xmm1
+%define xtmph1 xmm2
+%define xtmpl1 xmm3
+%define xtmph2 xmm4
+%define xtmpl2 xmm5
+%define xtmph3 xmm6
+%define xtmpl3 xmm7
+%define xtmph4 xmm8
+%define xtmpl4 xmm9
+%define xd1 xmm10
+%define xd2 xmm11
+%define xd3 xtmph1
+%define xd4 xtmpl1
+
+align 16
+global gf_4vect_mad_avx:function
+func(gf_4vect_mad_avx)
+ FUNC_SAVE
+ sub len, 16
+ jl .return_fail
+ xor pos, pos
+ vmovdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte
+
+ mov tmp, vec
+
+ sal vec_i, 5 ;Multiply by 32
+ lea tmp3, [mul_array + vec_i]
+
+ sal tmp, 6 ;Multiply by 64
+ vmovdqu xgft3_hi, [tmp3+tmp+16] ; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
+ sal vec, 5 ;Multiply by 32
+ add tmp, vec
+ vmovdqu xgft4_lo, [tmp3+tmp] ;Load array Dx{00}, Dx{01}, Dx{02}, ...
+ vmovdqu xgft4_hi, [tmp3+tmp+16] ; " Dx{00}, Dx{10}, Dx{20}, ... , Dx{f0}
+
+ mov dest2, [dest1+PS] ; reuse mul_array
+ mov dest3, [dest1+2*PS]
+ mov dest4, [dest1+3*PS] ; reuse vec_i
+ mov dest1, [dest1]
+
+.loop16:
+ XLDR x0, [src+pos] ;Get next source vector
+ vmovdqu xtmph1, [tmp3+16] ; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
+ vmovdqu xtmpl1, [tmp3] ;Load array Ax{00}, Ax{01}, Ax{02}, ...
+ vmovdqu xtmph2, [tmp3+vec+16] ; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
+ vmovdqu xtmpl2, [tmp3+vec] ;Load array Bx{00}, Bx{01}, Bx{02}, ...
+ vmovdqu xtmpl3, [tmp3+2*vec] ;Load array Cx{00}, Cx{01}, Cx{02}, ...
+
+ XLDR xd1, [dest1+pos] ;Get next dest vector
+ XLDR xd2, [dest2+pos] ;Get next dest vector
+
+ vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0
+ vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
+ vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
+
+ ; dest1
+ vpshufb xtmph1, xtmph1, x0 ;Lookup mul table of high nibble
+ vpshufb xtmpl1, xtmpl1, xtmpa ;Lookup mul table of low nibble
+ vpxor xtmph1, xtmph1, xtmpl1 ;GF add high and low partials
+ vpxor xd1, xd1, xtmph1
+
+ XLDR xd3, [dest3+pos] ;Reuse xtmph1, Get next dest vector
+ XLDR xd4, [dest4+pos] ;Reuse xtmpl1, Get next dest vector
+
+ ; dest2
+ vpshufb xtmph2, xtmph2, x0 ;Lookup mul table of high nibble
+ vpshufb xtmpl2, xtmpl2, xtmpa ;Lookup mul table of low nibble
+ vpxor xtmph2, xtmph2, xtmpl2 ;GF add high and low partials
+ vpxor xd2, xd2, xtmph2
+
+ ; dest3
+ vpshufb xtmph3, xgft3_hi, x0 ;Lookup mul table of high nibble
+ vpshufb xtmpl3, xtmpl3, xtmpa ;Lookup mul table of low nibble
+ vpxor xtmph3, xtmph3, xtmpl3 ;GF add high and low partials
+ vpxor xd3, xd3, xtmph3
+
+ ; dest4
+ vpshufb xtmph4, xgft4_hi, x0 ;Lookup mul table of high nibble
+ vpshufb xtmpl4, xgft4_lo, xtmpa ;Lookup mul table of low nibble
+ vpxor xtmph4, xtmph4, xtmpl4 ;GF add high and low partials
+ vpxor xd4, xd4, xtmph4
+
+ XSTR [dest1+pos], xd1 ;Store result
+ XSTR [dest2+pos], xd2 ;Store result
+ XSTR [dest3+pos], xd3 ;Store result
+ XSTR [dest4+pos], xd4 ;Store result
+
+ add pos, 16 ;Loop on 16 bytes at a time
+ cmp pos, len
+ jle .loop16
+
+ lea tmp, [len + 16]
+ cmp pos, tmp
+ je .return_pass
+
+.lessthan16:
+ ;; Tail len
+ ;; Do one more overlap pass
+
+ mov tmp, len ;Overlapped offset length-16
+
+ XLDR x0, [src+tmp] ;Get next source vector
+
+ vmovdqu xtmph1, [tmp3+16] ; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
+ vmovdqu xtmpl1, [tmp3] ;Load array Ax{00}, Ax{01}, Ax{02}, ...
+ vmovdqu xtmph2, [tmp3+vec+16] ; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
+ vmovdqu xtmpl2, [tmp3+vec] ;Load array Bx{00}, Bx{01}, Bx{02}, ...
+ vmovdqu xtmpl3, [tmp3+2*vec] ;Load array Cx{00}, Cx{01}, Cx{02}, ...
+
+ XLDR xd1, [dest1+tmp] ;Get next dest vector
+ XLDR xd2, [dest2+tmp] ;Get next dest vector
+ XLDR xtmph4, [dest3+tmp] ;Get next dest vector
+
+ sub len, pos
+
+ vmovdqa xtmpl4, [constip16] ;Load const of i + 16
+ vpinsrb xtmph3, xtmph3, len.w, 15
+ vpshufb xtmph3, xtmph3, xmask0f ;Broadcast len to all bytes
+ vpcmpgtb xtmph3, xtmph3, xtmpl4
+
+ XLDR xtmpl4, [dest4+tmp] ;Get next dest vector
+
+ vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0
+ vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
+ vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
+
+ ; dest1
+ vpshufb xtmph1, xtmph1, x0 ;Lookup mul table of high nibble
+ vpshufb xtmpl1, xtmpl1, xtmpa ;Lookup mul table of low nibble
+ vpxor xtmph1, xtmph1, xtmpl1 ;GF add high and low partials
+ vpand xtmph1, xtmph1, xtmph3
+ vpxor xd1, xd1, xtmph1
+
+ ; dest2
+ vpshufb xtmph2, xtmph2, x0 ;Lookup mul table of high nibble
+ vpshufb xtmpl2, xtmpl2, xtmpa ;Lookup mul table of low nibble
+ vpxor xtmph2, xtmph2, xtmpl2 ;GF add high and low partials
+ vpand xtmph2, xtmph2, xtmph3
+ vpxor xd2, xd2, xtmph2
+
+ ; dest3
+ vpshufb xgft3_hi, xgft3_hi, x0 ;Lookup mul table of high nibble
+ vpshufb xtmpl3, xtmpl3, xtmpa ;Lookup mul table of low nibble
+ vpxor xgft3_hi, xgft3_hi, xtmpl3 ;GF add high and low partials
+ vpand xgft3_hi, xgft3_hi, xtmph3
+ vpxor xtmph4, xtmph4, xgft3_hi
+
+ ; dest4
+ vpshufb xgft4_hi, xgft4_hi, x0 ;Lookup mul table of high nibble
+ vpshufb xgft4_lo, xgft4_lo, xtmpa ;Lookup mul table of low nibble
+ vpxor xgft4_hi, xgft4_hi, xgft4_lo ;GF add high and low partials
+ vpand xgft4_hi, xgft4_hi, xtmph3
+ vpxor xtmpl4, xtmpl4, xgft4_hi
+
+ XSTR [dest1+tmp], xd1 ;Store result
+ XSTR [dest2+tmp], xd2 ;Store result
+ XSTR [dest3+tmp], xtmph4 ;Store result
+ XSTR [dest4+tmp], xtmpl4 ;Store result
+
+.return_pass:
+ mov return, 0
+ FUNC_RESTORE
+ ret
+
+.return_fail:
+ mov return, 1
+ FUNC_RESTORE
+ ret
+
+endproc_frame
+
+section .data
+
+align 16
+mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
+constip16:
+ ddq 0xf0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
+
+
+%macro slversion 4
+global %1_slver_%2%3%4
+global %1_slver
+%1_slver:
+%1_slver_%2%3%4:
+ dw 0x%4
+ db 0x%3, 0x%2
+%endmacro
+;;; func core, ver, snum
+slversion gf_4vect_mad_avx, 02, 00, 020a
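The newly added *_mad files are multiply-accumulate kernels: rather than recomputing a dot product over every source, they fold one additional source buffer into four existing destination buffers. A scalar sketch under the same table-layout assumption as above (an illustration, not the ISA-L reference code):

    #include <stddef.h>
    #include <stdint.h>

    /* dest[d] ^= C[d][vec_i] * src for d = 0..3, where vec is the total
     * number of sources and vec_i the index of the one being folded in. */
    static void gf_4vect_mad_ref(size_t len, int vec, int vec_i,
                                 const uint8_t *gftbls,
                                 const uint8_t *src, uint8_t **dest)
    {
        for (int d = 0; d < 4; d++) {
            const uint8_t *tbl = &gftbls[(d * vec + vec_i) * 32];
            for (size_t i = 0; i < len; i++) {
                uint8_t b = src[i];
                dest[d][i] ^= tbl[b & 0x0f] ^ tbl[16 + (b >> 4)];
            }
        }
    }

This is what enables the incremental encode/decode mentioned in the commit message: parity can be updated as each data fragment arrives instead of after all fragments are buffered.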
diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_4vect_mad_avx2.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_4vect_mad_avx2.asm.s
new file mode 100644
index 00000000000..ad3eafa4db6
--- /dev/null
+++ b/src/erasure-code/isa/isa-l/erasure_code/gf_4vect_mad_avx2.asm.s
@@ -0,0 +1,348 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;
+;;; gf_4vect_mad_avx2(len, vec, vec_i, mul_array, src, dest);
+;;;
+
+%define PS 8
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg0 rcx
+ %define arg0.w ecx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+ %define arg4 r12
+ %define arg5 r15
+ %define tmp r11
+ %define tmp.w r11d
+ %define tmp.b r11b
+ %define return rax
+ %define return.w eax
+ %define stack_size 16*10 + 3*8
+ %define arg(x) [rsp + stack_size + PS + PS*x]
+ %define func(x) proc_frame x
+
+%macro FUNC_SAVE 0
+ sub rsp, stack_size
+ movdqa [rsp+16*0],xmm6
+ movdqa [rsp+16*1],xmm7
+ movdqa [rsp+16*2],xmm8
+ movdqa [rsp+16*3],xmm9
+ movdqa [rsp+16*4],xmm10
+ movdqa [rsp+16*5],xmm11
+ movdqa [rsp+16*6],xmm12
+ movdqa [rsp+16*7],xmm13
+ movdqa [rsp+16*8],xmm14
+ movdqa [rsp+16*9],xmm15
+ save_reg r12, 10*16 + 0*8
+ save_reg r15, 10*16 + 1*8
+ end_prolog
+ mov arg4, arg(4)
+ mov arg5, arg(5)
+%endmacro
+
+%macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp+16*0]
+ movdqa xmm7, [rsp+16*1]
+ movdqa xmm8, [rsp+16*2]
+ movdqa xmm9, [rsp+16*3]
+ movdqa xmm10, [rsp+16*4]
+ movdqa xmm11, [rsp+16*5]
+ movdqa xmm12, [rsp+16*6]
+ movdqa xmm13, [rsp+16*7]
+ movdqa xmm14, [rsp+16*8]
+ movdqa xmm15, [rsp+16*9]
+ mov r12, [rsp + 10*16 + 0*8]
+ mov r15, [rsp + 10*16 + 1*8]
+ add rsp, stack_size
+%endmacro
+
+%elifidn __OUTPUT_FORMAT__, elf64
+ %define arg0 rdi
+ %define arg0.w edi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+ %define arg4 r8
+ %define arg5 r9
+ %define tmp r11
+ %define tmp.w r11d
+ %define tmp.b r11b
+ %define return rax
+ %define return.w eax
+
+ %define func(x) x:
+ %define FUNC_SAVE
+ %define FUNC_RESTORE
+%endif
+
+
+;;; gf_4vect_mad_avx2(len, vec, vec_i, mul_array, src, dest)
+%define len arg0
+%define len.w arg0.w
+%define vec arg1
+%define vec_i arg2
+%define mul_array arg3
+%define src arg4
+%define dest1 arg5
+%define pos return
+%define pos.w return.w
+
+%define dest2 mul_array
+%define dest3 vec
+%define dest4 vec_i
+
+%ifndef EC_ALIGNED_ADDR
+;;; Use Un-aligned load/store
+ %define XLDR vmovdqu
+ %define XSTR vmovdqu
+%else
+;;; Use Non-temporal load/store
+ %ifdef NO_NT_LDST
+ %define XLDR vmovdqa
+ %define XSTR vmovdqa
+ %else
+ %define XLDR vmovntdqa
+ %define XSTR vmovntdq
+ %endif
+%endif
+
+
+default rel
+
+[bits 64]
+section .text
+
+%define xmask0f ymm15
+%define xmask0fx xmm15
+%define xgft1_lo ymm14
+%define xgft2_lo ymm13
+%define xgft3_lo ymm12
+%define xgft4_lo ymm11
+
+%define x0 ymm0
+%define xtmpa ymm1
+%define xtmpl ymm2
+%define xtmplx xmm2
+%define xtmph1 ymm3
+%define xtmph1x xmm3
+%define xtmph2 ymm4
+%define xtmph3 ymm5
+%define xtmph4 ymm6
+%define xd1 ymm7
+%define xd2 ymm8
+%define xd3 ymm9
+%define xd4 ymm10
+
+align 16
+global gf_4vect_mad_avx2:function
+func(gf_4vect_mad_avx2)
+ FUNC_SAVE
+ sub len, 32
+ jl .return_fail
+ xor pos, pos
+ mov tmp.b, 0x0f
+ vpinsrb xmask0fx, xmask0fx, tmp.w, 0
+ vpbroadcastb xmask0f, xmask0fx ;Construct mask 0x0f0f0f...
+
+ sal vec_i, 5 ;Multiply by 32
+ sal vec, 5 ;Multiply by 32
+ lea tmp, [mul_array + vec_i]
+
+ vmovdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, Ax{02}, ...
+ ; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
+ vmovdqu xgft2_lo, [tmp+vec] ;Load array Bx{00}, Bx{01}, Bx{02}, ...
+ ; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
+ vmovdqu xgft3_lo, [tmp+2*vec] ;Load array Cx{00}, Cx{01}, Cx{02}, ...
+ ; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
+ add tmp, vec
+ vmovdqu xgft4_lo, [tmp+2*vec] ;Load array Dx{00}, Dx{01}, Dx{02}, ...
+ ; " Dx{00}, Dx{10}, Dx{20}, ... , Dx{f0}
+
+ mov dest2, [dest1+PS] ; reuse mul_array
+ mov dest3, [dest1+2*PS] ; reuse vec
+ mov dest4, [dest1+3*PS] ; reuse vec_i
+ mov dest1, [dest1]
+
+.loop32:
+ XLDR x0, [src+pos] ;Get next source vector
+
+ XLDR xd1, [dest1+pos] ;Get next dest vector
+ XLDR xd2, [dest2+pos] ;Get next dest vector
+ XLDR xd3, [dest3+pos] ;Get next dest vector
+	XLDR	xd4, [dest4+pos]	;Get next dest vector
+
+ vpand xtmpl, x0, xmask0f ;Mask low src nibble in bits 4-0
+ vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
+ vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
+
+ vperm2i128 xtmpa, xtmpl, x0, 0x30 ;swap xtmpa from 1lo|2lo to 1lo|2hi
+ vperm2i128 x0, xtmpl, x0, 0x12 ;swap x0 from 1hi|2hi to 1hi|2lo
+
+ vperm2i128 xtmph1, xgft1_lo, xgft1_lo, 0x01 ; swapped to hi | lo
+ vperm2i128 xtmph2, xgft2_lo, xgft2_lo, 0x01 ; swapped to hi | lo
+ vperm2i128 xtmph3, xgft3_lo, xgft3_lo, 0x01 ; swapped to hi | lo
+ vperm2i128 xtmph4, xgft4_lo, xgft4_lo, 0x01 ; swapped to hi | lo
+
+ ; dest1
+ vpshufb xtmph1, xtmph1, x0 ;Lookup mul table of high nibble
+ vpshufb xtmpl, xgft1_lo, xtmpa ;Lookup mul table of low nibble
+ vpxor xtmph1, xtmph1, xtmpl ;GF add high and low partials
+ vpxor xd1, xd1, xtmph1 ;xd1 += partial
+
+ ; dest2
+ vpshufb xtmph2, xtmph2, x0 ;Lookup mul table of high nibble
+ vpshufb xtmpl, xgft2_lo, xtmpa ;Lookup mul table of low nibble
+ vpxor xtmph2, xtmph2, xtmpl ;GF add high and low partials
+ vpxor xd2, xd2, xtmph2 ;xd2 += partial
+
+ ; dest3
+ vpshufb xtmph3, xtmph3, x0 ;Lookup mul table of high nibble
+ vpshufb xtmpl, xgft3_lo, xtmpa ;Lookup mul table of low nibble
+ vpxor xtmph3, xtmph3, xtmpl ;GF add high and low partials
+ vpxor xd3, xd3, xtmph3 ;xd3 += partial
+
+ ; dest4
+ vpshufb xtmph4, xtmph4, x0 ;Lookup mul table of high nibble
+ vpshufb xtmpl, xgft4_lo, xtmpa ;Lookup mul table of low nibble
+ vpxor xtmph4, xtmph4, xtmpl ;GF add high and low partials
+ vpxor xd4, xd4, xtmph4 ;xd4 += partial
+
+ XSTR [dest1+pos], xd1
+ XSTR [dest2+pos], xd2
+ XSTR [dest3+pos], xd3
+ XSTR [dest4+pos], xd4
+
+ add pos, 32 ;Loop on 32 bytes at a time
+ cmp pos, len
+ jle .loop32
+
+ lea tmp, [len + 32]
+ cmp pos, tmp
+ je .return_pass
+
+.lessthan32:
+ ;; Tail len
+ ;; Do one more overlap pass
+ mov tmp.b, 0x1f
+ vpinsrb xtmph1x, xtmph1x, tmp.w, 0
+ vpbroadcastb xtmph1, xtmph1x ;Construct mask 0x1f1f1f...
+
+ mov tmp, len ;Overlapped offset length-32
+
+ XLDR x0, [src+tmp] ;Get next source vector
+
+ XLDR xd1, [dest1+tmp] ;Get next dest vector
+ XLDR xd2, [dest2+tmp] ;Get next dest vector
+ XLDR xd3, [dest3+tmp] ;Get next dest vector
+ XLDR xd4, [dest4+tmp] ;Get next dest vector
+
+ sub len, pos
+
+ vmovdqa xtmph2, [constip32] ;Load const of i + 32
+ vpinsrb xtmplx, xtmplx, len.w, 15
+ vinserti128 xtmpl, xtmpl, xtmplx, 1 ;swapped to xtmplx | xtmplx
+ vpshufb xtmpl, xtmpl, xtmph1 ;Broadcast len to all bytes. xtmph1=0x1f1f1f...
+ vpcmpgtb xtmpl, xtmpl, xtmph2
+
+ vpand xtmph1, x0, xmask0f ;Mask low src nibble in bits 4-0
+ vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
+ vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
+
+ vperm2i128 xtmpa, xtmph1, x0, 0x30 ;swap xtmpa from 1lo|2lo to 1lo|2hi
+ vperm2i128 x0, xtmph1, x0, 0x12 ;swap x0 from 1hi|2hi to 1hi|2lo
+
+ vperm2i128 xtmph1, xgft1_lo, xgft1_lo, 0x01 ; swapped to hi | lo
+ vperm2i128 xtmph2, xgft2_lo, xgft2_lo, 0x01 ; swapped to hi | lo
+ vperm2i128 xtmph3, xgft3_lo, xgft3_lo, 0x01 ; swapped to hi | lo
+ vperm2i128 xtmph4, xgft4_lo, xgft4_lo, 0x01 ; swapped to hi | lo
+
+ ; dest1
+ vpshufb xtmph1, xtmph1, x0 ;Lookup mul table of high nibble
+ vpshufb xgft1_lo, xgft1_lo, xtmpa ;Lookup mul table of low nibble
+ vpxor xtmph1, xtmph1, xgft1_lo ;GF add high and low partials
+ vpand xtmph1, xtmph1, xtmpl
+ vpxor xd1, xd1, xtmph1 ;xd1 += partial
+
+ ; dest2
+ vpshufb xtmph2, xtmph2, x0 ;Lookup mul table of high nibble
+ vpshufb xgft2_lo, xgft2_lo, xtmpa ;Lookup mul table of low nibble
+ vpxor xtmph2, xtmph2, xgft2_lo ;GF add high and low partials
+ vpand xtmph2, xtmph2, xtmpl
+ vpxor xd2, xd2, xtmph2 ;xd2 += partial
+
+ ; dest3
+ vpshufb xtmph3, xtmph3, x0 ;Lookup mul table of high nibble
+ vpshufb xgft3_lo, xgft3_lo, xtmpa ;Lookup mul table of low nibble
+ vpxor xtmph3, xtmph3, xgft3_lo ;GF add high and low partials
+ vpand xtmph3, xtmph3, xtmpl
+ vpxor xd3, xd3, xtmph3 ;xd3 += partial
+
+ ; dest4
+ vpshufb xtmph4, xtmph4, x0 ;Lookup mul table of high nibble
+ vpshufb xgft4_lo, xgft4_lo, xtmpa ;Lookup mul table of low nibble
+ vpxor xtmph4, xtmph4, xgft4_lo ;GF add high and low partials
+ vpand xtmph4, xtmph4, xtmpl
+ vpxor xd4, xd4, xtmph4 ;xd4 += partial
+
+ XSTR [dest1+tmp], xd1
+ XSTR [dest2+tmp], xd2
+ XSTR [dest3+tmp], xd3
+ XSTR [dest4+tmp], xd4
+
+.return_pass:
+ mov return, 0
+ FUNC_RESTORE
+ ret
+
+.return_fail:
+ mov return, 1
+ FUNC_RESTORE
+ ret
+
+endproc_frame
+
+section .data
+align 32
+constip32:
+ ddq 0xf0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
+ ddq 0xe0e1e2e3e4e5e6e7e8e9eaebecedeeef
+
+%macro slversion 4
+global %1_slver_%2%3%4
+global %1_slver
+%1_slver:
+%1_slver_%2%3%4:
+ dw 0x%4
+ db 0x%3, 0x%2
+%endmacro
+;;; func core, ver, snum
+slversion gf_4vect_mad_avx2, 04, 00, 020b
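The .lessthan32 path in the hunk above handles a tail shorter than one vector width without a scalar loop: it reruns the final full 32-byte window at offset len and masks off the lanes the main loop already produced. A sketch of the keep-mask that the vpcmpgtb against constip32 yields (an illustration of the arithmetic, not library code):

    #include <stdint.h>

    /* Keep-mask for a tail of `remaining` bytes (1..31) in the last 32-byte
     * window.  constip32 holds -1, -2, ..., -32 as signed bytes, so lane j
     * is kept when (remaining - 32) > -(j + 1), i.e. only for the last
     * `remaining` lanes -- the bytes the main loop has not yet written. */
    static void tail_mask32_ref(int remaining, uint8_t mask[32])
    {
        for (int j = 0; j < 32; j++)
            mask[j] = ((int8_t)(remaining - 32) > (int8_t)-(j + 1)) ? 0xff : 0x00;
    }

Because the masked partial result is XORed into the reloaded destination, lanes with a zero mask are stored back unchanged, so the overlap with the main loop is harmless.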
diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_4vect_mad_sse.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_4vect_mad_sse.asm.s
new file mode 100644
index 00000000000..038f926de4f
--- /dev/null
+++ b/src/erasure-code/isa/isa-l/erasure_code/gf_4vect_mad_sse.asm.s
@@ -0,0 +1,348 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;
+;;; gf_4vect_mad_sse(len, vec, vec_i, mul_array, src, dest);
+;;;
+
+%define PS 8
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg0 rcx
+ %define arg0.w ecx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+ %define arg4 r12
+ %define arg5 r15
+ %define tmp r11
+ %define tmp2 r10
+ %define tmp3 r13
+ %define return rax
+ %define return.w eax
+ %define stack_size 16*10 + 3*8
+ %define arg(x) [rsp + stack_size + PS + PS*x]
+ %define func(x) proc_frame x
+
+%macro FUNC_SAVE 0
+ sub rsp, stack_size
+ movdqa [rsp+16*0],xmm6
+ movdqa [rsp+16*1],xmm7
+ movdqa [rsp+16*2],xmm8
+ movdqa [rsp+16*3],xmm9
+ movdqa [rsp+16*4],xmm10
+ movdqa [rsp+16*5],xmm11
+ movdqa [rsp+16*6],xmm12
+ movdqa [rsp+16*7],xmm13
+ movdqa [rsp+16*8],xmm14
+ movdqa [rsp+16*9],xmm15
+ save_reg r12, 10*16 + 0*8
+ save_reg r13, 10*16 + 1*8
+ save_reg r15, 10*16 + 2*8
+ end_prolog
+ mov arg4, arg(4)
+ mov arg5, arg(5)
+%endmacro
+
+%macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp+16*0]
+ movdqa xmm7, [rsp+16*1]
+ movdqa xmm8, [rsp+16*2]
+ movdqa xmm9, [rsp+16*3]
+ movdqa xmm10, [rsp+16*4]
+ movdqa xmm11, [rsp+16*5]
+ movdqa xmm12, [rsp+16*6]
+ movdqa xmm13, [rsp+16*7]
+ movdqa xmm14, [rsp+16*8]
+ movdqa xmm15, [rsp+16*9]
+ mov r12, [rsp + 10*16 + 0*8]
+ mov r13, [rsp + 10*16 + 1*8]
+ mov r15, [rsp + 10*16 + 2*8]
+ add rsp, stack_size
+%endmacro
+
+%elifidn __OUTPUT_FORMAT__, elf64
+ %define arg0 rdi
+ %define arg0.w edi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+ %define arg4 r8
+ %define arg5 r9
+ %define tmp r11
+ %define tmp2 r10
+ %define tmp3 r12
+ %define return rax
+ %define return.w eax
+
+ %define func(x) x:
+ %macro FUNC_SAVE 0
+ push r12
+ %endmacro
+ %macro FUNC_RESTORE 0
+ pop r12
+ %endmacro
+%endif
+
+;;; gf_4vect_mad_sse(len, vec, vec_i, mul_array, src, dest)
+%define len arg0
+%define len.w arg0.w
+%define vec arg1
+%define vec_i arg2
+%define mul_array arg3
+%define src arg4
+%define dest1 arg5
+%define pos return
+%define pos.w return.w
+
+%define dest2 mul_array
+%define dest3 tmp2
+%define dest4 vec_i
+
+%ifndef EC_ALIGNED_ADDR
+;;; Use Un-aligned load/store
+ %define XLDR movdqu
+ %define XSTR movdqu
+%else
+;;; Use Non-temporal load/store
+ %ifdef NO_NT_LDST
+ %define XLDR movdqa
+ %define XSTR movdqa
+ %else
+ %define XLDR movntdqa
+ %define XSTR movntdq
+ %endif
+%endif
+
+default rel
+
+[bits 64]
+section .text
+
+%define xmask0f xmm15
+%define xgft3_hi xmm14
+%define xgft4_hi xmm13
+%define xgft4_lo xmm12
+
+%define x0 xmm0
+%define xtmpa xmm1
+%define xtmph1 xmm2
+%define xtmpl1 xmm3
+%define xtmph2 xmm4
+%define xtmpl2 xmm5
+%define xtmph3 xmm6
+%define xtmpl3 xmm7
+%define xtmph4 xmm8
+%define xtmpl4 xmm9
+%define xd1 xmm10
+%define xd2 xmm11
+%define xd3 xtmph1
+%define xd4 xtmpl1
+
+align 16
+global gf_4vect_mad_sse:function
+func(gf_4vect_mad_sse)
+ FUNC_SAVE
+ sub len, 16
+ jl .return_fail
+ xor pos, pos
+ movdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte
+ mov tmp, vec
+
+ sal vec_i, 5 ;Multiply by 32
+ lea tmp3, [mul_array + vec_i]
+
+ sal tmp, 6 ;Multiply by 64
+
+ movdqu xgft3_hi, [tmp3+tmp+16] ; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
+ sal vec, 5 ;Multiply by 32
+ add tmp, vec
+ movdqu xgft4_lo, [tmp3+tmp] ;Load array Dx{00}, Dx{01}, Dx{02}, ...
+ movdqu xgft4_hi, [tmp3+tmp+16] ; " Dx{00}, Dx{10}, Dx{20}, ... , Dx{f0}
+
+ mov dest2, [dest1+PS] ; reuse mul_array
+ mov dest3, [dest1+2*PS]
+ mov dest4, [dest1+3*PS] ; reuse vec_i
+ mov dest1, [dest1]
+
+.loop16:
+ XLDR x0, [src+pos] ;Get next source vector
+ movdqu xtmph1, [tmp3+16] ; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
+ movdqu xtmpl1, [tmp3] ;Load array Ax{00}, Ax{01}, Ax{02}, ...
+ movdqu xtmph2, [tmp3+vec+16] ; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
+ movdqu xtmpl2, [tmp3+vec] ;Load array Bx{00}, Bx{01}, Bx{02}, ...
+ movdqu xtmpl3, [tmp3+2*vec] ;Load array Cx{00}, Cx{01}, Cx{02}, ...
+
+ movdqa xtmph3, xgft3_hi
+ movdqa xtmpl4, xgft4_lo
+ movdqa xtmph4, xgft4_hi
+
+ XLDR xd1, [dest1+pos] ;Get next dest vector
+ XLDR xd2, [dest2+pos] ;Get next dest vector
+
+ movdqa xtmpa, x0 ;Keep unshifted copy of src
+ psraw x0, 4 ;Shift to put high nibble into bits 4-0
+ pand x0, xmask0f ;Mask high src nibble in bits 4-0
+ pand xtmpa, xmask0f ;Mask low src nibble in bits 4-0
+
+ ; dest1
+ pshufb xtmph1, x0 ;Lookup mul table of high nibble
+ pshufb xtmpl1, xtmpa ;Lookup mul table of low nibble
+ pxor xtmph1, xtmpl1 ;GF add high and low partials
+ pxor xd1, xtmph1
+
+ XLDR xd3, [dest3+pos] ;Reuse xtmph1, Get next dest vector
+ XLDR xd4, [dest4+pos] ;Reuse xtmpl1, Get next dest vector
+
+ ; dest2
+ pshufb xtmph2, x0 ;Lookup mul table of high nibble
+ pshufb xtmpl2, xtmpa ;Lookup mul table of low nibble
+ pxor xtmph2, xtmpl2 ;GF add high and low partials
+ pxor xd2, xtmph2
+
+ ; dest3
+ pshufb xtmph3, x0 ;Lookup mul table of high nibble
+ pshufb xtmpl3, xtmpa ;Lookup mul table of low nibble
+ pxor xtmph3, xtmpl3 ;GF add high and low partials
+ pxor xd3, xtmph3
+
+ ; dest4
+ pshufb xtmph4, x0 ;Lookup mul table of high nibble
+ pshufb xtmpl4, xtmpa ;Lookup mul table of low nibble
+ pxor xtmph4, xtmpl4 ;GF add high and low partials
+ pxor xd4, xtmph4
+
+ XSTR [dest1+pos], xd1 ;Store result
+ XSTR [dest2+pos], xd2 ;Store result
+ XSTR [dest3+pos], xd3 ;Store result
+ XSTR [dest4+pos], xd4 ;Store result
+
+ add pos, 16 ;Loop on 16 bytes at a time
+ cmp pos, len
+ jle .loop16
+
+ lea tmp, [len + 16]
+ cmp pos, tmp
+ je .return_pass
+
+.lessthan16:
+ ;; Tail len
+ ;; Do one more overlap pass
+ mov tmp, len ;Overlapped offset length-16
+
+ XLDR x0, [src+tmp] ;Get next source vector
+
+ movdqu xtmph1, [tmp3+16] ; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
+ movdqu xtmpl1, [tmp3] ;Load array Ax{00}, Ax{01}, Ax{02}, ...
+ movdqu xtmph2, [tmp3+vec+16] ; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
+ movdqu xtmpl2, [tmp3+vec] ;Load array Bx{00}, Bx{01}, Bx{02}, ...
+ movdqu xtmpl3, [tmp3+2*vec] ;Load array Cx{00}, Cx{01}, Cx{02}, ...
+
+ XLDR xd1, [dest1+tmp] ;Get next dest vector
+ XLDR xd2, [dest2+tmp] ;Get next dest vector
+ XLDR xtmph4, [dest3+tmp] ;Reuse xtmph1. Get next dest vector
+
+ sub len, pos
+
+ movdqa xtmpl4, [constip16] ;Load const of i + 16
+ pinsrb xtmph3, len.w, 15
+ pshufb xtmph3, xmask0f ;Broadcast len to all bytes
+ pcmpgtb xtmph3, xtmpl4
+
+ XLDR xtmpl4, [dest4+tmp] ;Get next dest vector
+
+ movdqa xtmpa, x0 ;Keep unshifted copy of src
+ psraw x0, 4 ;Shift to put high nibble into bits 4-0
+ pand x0, xmask0f ;Mask high src nibble in bits 4-0
+ pand xtmpa, xmask0f ;Mask low src nibble in bits 4-0
+
+ ; dest1
+ pshufb xtmph1, x0 ;Lookup mul table of high nibble
+ pshufb xtmpl1, xtmpa ;Lookup mul table of low nibble
+ pxor xtmph1, xtmpl1 ;GF add high and low partials
+ pand xtmph1, xtmph3
+ pxor xd1, xtmph1
+
+ ; dest2
+ pshufb xtmph2, x0 ;Lookup mul table of high nibble
+ pshufb xtmpl2, xtmpa ;Lookup mul table of low nibble
+ pxor xtmph2, xtmpl2 ;GF add high and low partials
+ pand xtmph2, xtmph3
+ pxor xd2, xtmph2
+
+ ; dest3
+ pshufb xgft3_hi, x0 ;Lookup mul table of high nibble
+ pshufb xtmpl3, xtmpa ;Lookup mul table of low nibble
+ pxor xgft3_hi, xtmpl3 ;GF add high and low partials
+ pand xgft3_hi, xtmph3
+ pxor xtmph4, xgft3_hi
+
+ ; dest4
+ pshufb xgft4_hi, x0 ;Lookup mul table of high nibble
+ pshufb xgft4_lo, xtmpa ;Lookup mul table of low nibble
+ pxor xgft4_hi, xgft4_lo ;GF add high and low partials
+ pand xgft4_hi, xtmph3
+ pxor xtmpl4, xgft4_hi
+
+ XSTR [dest1+tmp], xd1 ;Store result
+ XSTR [dest2+tmp], xd2 ;Store result
+ XSTR [dest3+tmp], xtmph4 ;Store result
+ XSTR [dest4+tmp], xtmpl4 ;Store result
+
+.return_pass:
+ FUNC_RESTORE
+ mov return, 0
+ ret
+
+.return_fail:
+ FUNC_RESTORE
+ mov return, 1
+ ret
+
+endproc_frame
+
+section .data
+
+align 16
+
+mask0f:
+ ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
+constip16:
+ ddq 0xf0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
+
+%macro slversion 4
+global %1_slver_%2%3%4
+global %1_slver
+%1_slver:
+%1_slver_%2%3%4:
+ dw 0x%4
+ db 0x%3, 0x%2
+%endmacro
+;;; func core, ver, snum
+slversion gf_4vect_mad_sse, 00, 00, 0209
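As the commit message notes, these mad kernels back the incremental ec_encode_data_update() API introduced in ISA-L 2.13. The usage sketch below is hedged: the prototypes of ec_init_tables() and ec_encode_data_update() are written from memory of the public header and should be checked against erasure_code.h, and encode_one_fragment() is a hypothetical helper, not part of the library.

    #include "erasure_code.h"   /* assumption: ISA-L public header name */

    /* Hypothetical helper: fold data fragment frag_idx into all p parity
     * buffers as it arrives.  ec_encode_data_update() is expected to dispatch
     * to the gf_Nvect_mad_{sse,avx,avx2} kernels added by this update. */
    static void encode_one_fragment(int len, int k, int p,
                                    unsigned char *encode_matrix, /* (k+p) x k */
                                    unsigned char *g_tbls,        /* 32*k*p    */
                                    int frag_idx, unsigned char *frag,
                                    unsigned char **parity)
    {
        /* In real use the table expansion would be done once up front. */
        ec_init_tables(k, p, &encode_matrix[k * k], g_tbls);
        ec_encode_data_update(len, k, p, frag_idx, g_tbls, frag, parity);
    }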
diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_5vect_dot_prod_avx.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_5vect_dot_prod_avx.asm.s
index a5625659713..1ef451f2006 100644
--- a/src/erasure-code/isa/isa-l/erasure_code/gf_5vect_dot_prod_avx.asm.s
+++ b/src/erasure-code/isa/isa-l/erasure_code/gf_5vect_dot_prod_avx.asm.s
@@ -1,5 +1,5 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-; Copyright(c) 2011-2014 Intel Corporation All rights reserved.
+; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
@@ -30,8 +30,6 @@
;;;
;;; gf_5vect_dot_prod_avx(len, vec, *g_tbls, **buffs, **dests);
;;;
-;;; Author: Gregory Tucker
-
%ifidn __OUTPUT_FORMAT__, elf64
%define arg0 rdi
@@ -309,5 +307,3 @@ global %1_slver
%endmacro
;;; func core, ver, snum
slversion gf_5vect_dot_prod_avx, 02, 03, 0194
-; inform linker that this doesn't require executable stack
-section .note.GNU-stack noalloc noexec nowrite progbits
diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_5vect_dot_prod_avx2.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_5vect_dot_prod_avx2.asm.s
index 7f25c1622bb..a7a41c2d568 100644
--- a/src/erasure-code/isa/isa-l/erasure_code/gf_5vect_dot_prod_avx2.asm.s
+++ b/src/erasure-code/isa/isa-l/erasure_code/gf_5vect_dot_prod_avx2.asm.s
@@ -1,5 +1,5 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-; Copyright(c) 2011-2014 Intel Corporation All rights reserved.
+; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
@@ -30,8 +30,6 @@
;;;
;;; gf_5vect_dot_prod_avx2(len, vec, *g_tbls, **buffs, **dests);
;;;
-;;; Author: Gregory Tucker
-
%ifidn __OUTPUT_FORMAT__, elf64
%define arg0 rdi
@@ -321,5 +319,3 @@ global %1_slver
%endmacro
;;; func core, ver, snum
slversion gf_5vect_dot_prod_avx2, 04, 03, 0199
-; inform linker that this doesn't require executable stack
-section .note.GNU-stack noalloc noexec nowrite progbits
diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_5vect_dot_prod_sse.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_5vect_dot_prod_sse.asm.s
index 003ad261424..6264db60d0c 100644
--- a/src/erasure-code/isa/isa-l/erasure_code/gf_5vect_dot_prod_sse.asm.s
+++ b/src/erasure-code/isa/isa-l/erasure_code/gf_5vect_dot_prod_sse.asm.s
@@ -1,5 +1,5 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-; Copyright(c) 2011-2014 Intel Corporation All rights reserved.
+; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
@@ -30,8 +30,6 @@
;;;
;;; gf_5vect_dot_prod_sse(len, vec, *g_tbls, **buffs, **dests);
;;;
-;;; Author: Gregory Tucker
-
%ifidn __OUTPUT_FORMAT__, elf64
%define arg0 rdi
@@ -165,23 +163,23 @@ default rel
section .text
%define xmask0f xmm15
-%define xgft1_lo xmm14
-%define xgft1_hi xmm13
-%define xgft2_lo xmm12
-%define xgft2_hi xmm11
+%define xgft1_lo xmm2
+%define xgft1_hi xmm3
+%define xgft2_lo xmm4
+%define xgft2_hi xmm5
%define xgft3_lo xmm10
-%define xgft3_hi xmm9
+%define xgft3_hi xmm6
%define xgft4_lo xmm8
%define xgft4_hi xmm7
%define x0 xmm0
%define xtmpa xmm1
-%define xp1 xmm2
-%define xp2 xmm3
-%define xp3 xmm4
-%define xp4 xmm5
-%define xp5 xmm6
+%define xp1 xmm9
+%define xp2 xmm11
+%define xp3 xmm12
+%define xp4 xmm13
+%define xp5 xmm14
align 16
global gf_5vect_dot_prod_sse:function
@@ -309,6 +307,4 @@ global %1_slver
db 0x%3, 0x%2
%endmacro
;;; func core, ver, snum
-; inform linker that this doesn't require executable stack
-section .note.GNU-stack noalloc noexec nowrite progbits
-slversion gf_5vect_dot_prod_sse, 00, 03, 0065
+slversion gf_5vect_dot_prod_sse, 00, 04, 0065
diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_5vect_mad_avx.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_5vect_mad_avx.asm.s
new file mode 100644
index 00000000000..4660a352278
--- /dev/null
+++ b/src/erasure-code/isa/isa-l/erasure_code/gf_5vect_mad_avx.asm.s
@@ -0,0 +1,371 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;
+;;; gf_5vect_mad_avx(len, vec, vec_i, mul_array, src, dest);
+;;;
+
+%define PS 8
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg0 rcx
+ %define arg0.w ecx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+ %define arg4 r12
+ %define arg5 r15
+ %define tmp r11
+ %define tmp2 r10
+ %define tmp3 r13
+ %define tmp4 r14
+ %define return rax
+ %define return.w eax
+ %define stack_size 16*10 + 5*8
+ %define arg(x) [rsp + stack_size + PS + PS*x]
+ %define func(x) proc_frame x
+
+%macro FUNC_SAVE 0
+ sub rsp, stack_size
+ movdqa [rsp+16*0],xmm6
+ movdqa [rsp+16*1],xmm7
+ movdqa [rsp+16*2],xmm8
+ movdqa [rsp+16*3],xmm9
+ movdqa [rsp+16*4],xmm10
+ movdqa [rsp+16*5],xmm11
+ movdqa [rsp+16*6],xmm12
+ movdqa [rsp+16*7],xmm13
+ movdqa [rsp+16*8],xmm14
+ movdqa [rsp+16*9],xmm15
+ save_reg r12, 10*16 + 0*8
+ save_reg r13, 10*16 + 1*8
+ save_reg r14, 10*16 + 2*8
+ save_reg r15, 10*16 + 3*8
+ end_prolog
+ mov arg4, arg(4)
+ mov arg5, arg(5)
+%endmacro
+
+%macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp+16*0]
+ movdqa xmm7, [rsp+16*1]
+ movdqa xmm8, [rsp+16*2]
+ movdqa xmm9, [rsp+16*3]
+ movdqa xmm10, [rsp+16*4]
+ movdqa xmm11, [rsp+16*5]
+ movdqa xmm12, [rsp+16*6]
+ movdqa xmm13, [rsp+16*7]
+ movdqa xmm14, [rsp+16*8]
+ movdqa xmm15, [rsp+16*9]
+ mov r12, [rsp + 10*16 + 0*8]
+ mov r13, [rsp + 10*16 + 1*8]
+ mov r14, [rsp + 10*16 + 2*8]
+ mov r15, [rsp + 10*16 + 3*8]
+ add rsp, stack_size
+%endmacro
+
+%elifidn __OUTPUT_FORMAT__, elf64
+ %define arg0 rdi
+ %define arg0.w edi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+ %define arg4 r8
+ %define arg5 r9
+ %define tmp r11
+ %define tmp2 r10
+ %define tmp3 r12
+ %define tmp4 r13
+ %define return rax
+ %define return.w eax
+
+ %define func(x) x:
+ %macro FUNC_SAVE 0
+ push r12
+ push r13
+ %endmacro
+ %macro FUNC_RESTORE 0
+ pop r13
+ pop r12
+ %endmacro
+%endif
+
+;;; gf_5vect_mad_avx(len, vec, vec_i, mul_array, src, dest)
+%define len arg0
+%define len.w arg0.w
+%define vec arg1
+%define vec_i arg2
+%define mul_array arg3
+%define src arg4
+%define dest1 arg5
+%define pos return
+%define pos.w return.w
+
+%define dest2 tmp4
+%define dest3 mul_array
+%define dest4 tmp2
+%define dest5 vec_i
+
+
+%ifndef EC_ALIGNED_ADDR
+;;; Use Un-aligned load/store
+ %define XLDR vmovdqu
+ %define XSTR vmovdqu
+%else
+;;; Use Non-temporal load/store
+ %ifdef NO_NT_LDST
+ %define XLDR vmovdqa
+ %define XSTR vmovdqa
+ %else
+ %define XLDR vmovntdqa
+ %define XSTR vmovntdq
+ %endif
+%endif
+
+default rel
+
+[bits 64]
+section .text
+
+%define xmask0f xmm15
+%define xgft5_hi xmm14
+%define xgft4_lo xmm13
+%define xgft4_hi xmm12
+
+%define x0 xmm0
+%define xtmpa xmm1
+%define xtmph1 xmm2
+%define xtmpl1 xmm3
+%define xtmph2 xmm4
+%define xtmpl2 xmm5
+%define xtmph3 xmm6
+%define xtmpl3 xmm7
+%define xtmph5 xmm8
+%define xtmpl5 xmm9
+%define xd1 xmm10
+%define xd2 xmm11
+%define xd3 xtmpl1
+%define xd4 xtmph1
+%define xd5 xtmpl2
+
+
+align 16
+global gf_5vect_mad_avx:function
+func(gf_5vect_mad_avx)
+ FUNC_SAVE
+ sub len, 16
+ jl .return_fail
+ xor pos, pos
+ vmovdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte
+ mov tmp, vec
+ sal vec_i, 5 ;Multiply by 32
+ lea tmp3, [mul_array + vec_i]
+ sal tmp, 6 ;Multiply by 64
+ vmovdqu xgft5_hi, [tmp3+2*tmp+16] ; " Ex{00}, Ex{10}, ..., Ex{f0}
+ sal vec, 5 ;Multiply by 32
+ add tmp, vec
+ vmovdqu xgft4_hi, [tmp3+tmp+16] ; " Dx{00}, Dx{10}, Dx{20}, ... , Dx{f0}
+ vmovdqu xgft4_lo, [tmp3+tmp] ;Load array Dx{00}, Dx{01}, Dx{02}, ...
+
+ mov dest3, [dest1+2*PS] ; reuse mul_array
+ mov dest4, [dest1+3*PS]
+ mov dest5, [dest1+4*PS] ; reuse vec_i
+ mov dest2, [dest1+PS]
+ mov dest1, [dest1]
+
+.loop16:
+ XLDR x0, [src+pos] ;Get next source vector
+
+ vmovdqu xtmph1, [tmp3+16] ; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
+ vmovdqu xtmpl1, [tmp3] ;Load array Ax{00}, Ax{01}, Ax{02}, ...
+ vmovdqu xtmph2, [tmp3+vec+16] ; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
+ vmovdqu xtmpl2, [tmp3+vec] ;Load array Bx{00}, Bx{01}, Bx{02}, ...
+ vmovdqu xtmph3, [tmp3+2*vec+16] ; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
+ vmovdqu xtmpl3, [tmp3+2*vec] ;Load array Cx{00}, Cx{01}, Cx{02}, ...
+ vmovdqu xtmpl5, [tmp3+4*vec] ;Load array Ex{00}, Ex{01}, ..., Ex{0f}
+
+ XLDR xd1, [dest1+pos] ;Get next dest vector
+ XLDR xd2, [dest2+pos] ;Get next dest vector
+
+ vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0
+ vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
+ vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
+
+ ; dest1
+ vpshufb xtmph1, xtmph1, x0 ;Lookup mul table of high nibble
+ vpshufb xtmpl1, xtmpl1, xtmpa ;Lookup mul table of low nibble
+ vpxor xtmph1, xtmph1, xtmpl1 ;GF add high and low partials
+ vpxor xd1, xd1, xtmph1
+
+ XLDR xd3, [dest3+pos] ;Reuse xtmpl1, Get next dest vector
+ XLDR xd4, [dest4+pos] ;Reuse xtmph1, Get next dest vector
+
+ ; dest2
+ vpshufb xtmph2, xtmph2, x0 ;Lookup mul table of high nibble
+ vpshufb xtmpl2, xtmpl2, xtmpa ;Lookup mul table of low nibble
+ vpxor xtmph2, xtmph2, xtmpl2 ;GF add high and low partials
+ vpxor xd2, xd2, xtmph2
+
+ XLDR xd5, [dest5+pos] ;Reuse xtmpl2. Get next dest vector
+
+ ; dest3
+ vpshufb xtmph3, xtmph3, x0 ;Lookup mul table of high nibble
+ vpshufb xtmpl3, xtmpl3, xtmpa ;Lookup mul table of low nibble
+ vpxor xtmph3, xtmph3, xtmpl3 ;GF add high and low partials
+ vpxor xd3, xd3, xtmph3
+
+ ; dest4
+ vpshufb xtmph2, xgft4_hi, x0 ;Lookup mul table of high nibble
+ vpshufb xtmpl3, xgft4_lo, xtmpa ;Lookup mul table of low nibble
+ vpxor xtmph2, xtmph2, xtmpl3 ;GF add high and low partials
+ vpxor xd4, xd4, xtmph2
+
+ ; dest5
+ vpshufb xtmph5, xgft5_hi, x0 ;Lookup mul table of high nibble
+ vpshufb xtmpl5, xtmpl5, xtmpa ;Lookup mul table of low nibble
+ vpxor xtmph5, xtmph5, xtmpl5 ;GF add high and low partials
+ vpxor xd5, xd5, xtmph5
+
+ XSTR [dest1+pos], xd1 ;Store result into dest1
+ XSTR [dest2+pos], xd2 ;Store result into dest2
+ XSTR [dest3+pos], xd3 ;Store result into dest3
+ XSTR [dest4+pos], xd4 ;Store result into dest4
+ XSTR [dest5+pos], xd5 ;Store result into dest5
+
+ add pos, 16 ;Loop on 16 bytes at a time
+ cmp pos, len
+ jle .loop16
+
+ lea tmp, [len + 16]
+ cmp pos, tmp
+ je .return_pass
+
+.lessthan16:
+ ;; Tail len
+ ;; Do one more overlap pass
+ mov tmp, len ;Overlapped offset length-16
+ XLDR x0, [src+tmp] ;Get next source vector
+
+ sub len, pos
+
+ vmovdqa xtmph1, [constip16] ;Load const of i + 16
+ vpinsrb xtmph5, len.w, 15
+ vpshufb xtmph5, xmask0f ;Broadcast len to all bytes
+ vpcmpgtb xtmph5, xtmph5, xtmph1
+
+ vmovdqu xtmph1, [tmp3+16] ; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
+ vmovdqu xtmpl1, [tmp3] ;Load array Ax{00}, Ax{01}, Ax{02}, ...
+ vmovdqu xtmph2, [tmp3+vec+16] ; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
+ vmovdqu xtmpl2, [tmp3+vec] ;Load array Bx{00}, Bx{01}, Bx{02}, ...
+ vmovdqu xtmph3, [tmp3+2*vec+16] ; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
+ vmovdqu xtmpl3, [tmp3+2*vec] ;Load array Cx{00}, Cx{01}, Cx{02}, ...
+ vmovdqu xtmpl5, [tmp3+4*vec] ;Load array Ex{00}, Ex{01}, ..., Ex{0f}
+
+ XLDR xd1, [dest1+tmp] ;Get next dest vector
+ XLDR xd2, [dest2+tmp] ;Get next dest vector
+
+ vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0
+ vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
+ vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
+
+ ; dest1
+ vpshufb xtmph1, xtmph1, x0 ;Lookup mul table of high nibble
+ vpshufb xtmpl1, xtmpl1, xtmpa ;Lookup mul table of low nibble
+ vpxor xtmph1, xtmph1, xtmpl1 ;GF add high and low partials
+ vpand xtmph1, xtmph1, xtmph5
+ vpxor xd1, xd1, xtmph1
+
+ XLDR xd3, [dest3+tmp] ;Reuse xtmpl1, Get next dest vector
+ XLDR xd4, [dest4+tmp] ;Reuse xtmph1, Get next dest vector
+
+ ; dest2
+ vpshufb xtmph2, xtmph2, x0 ;Lookup mul table of high nibble
+ vpshufb xtmpl2, xtmpl2, xtmpa ;Lookup mul table of low nibble
+ vpxor xtmph2, xtmph2, xtmpl2 ;GF add high and low partials
+ vpand xtmph2, xtmph2, xtmph5
+ vpxor xd2, xd2, xtmph2
+
+ XLDR xd5, [dest5+tmp] ;Reuse xtmpl2. Get next dest vector
+
+ ; dest3
+ vpshufb xtmph3, xtmph3, x0 ;Lookup mul table of high nibble
+ vpshufb xtmpl3, xtmpl3, xtmpa ;Lookup mul table of low nibble
+ vpxor xtmph3, xtmph3, xtmpl3 ;GF add high and low partials
+ vpand xtmph3, xtmph3, xtmph5
+ vpxor xd3, xd3, xtmph3
+
+ ; dest4
+ vpshufb xgft4_hi, xgft4_hi, x0 ;Lookup mul table of high nibble
+ vpshufb xgft4_lo, xgft4_lo, xtmpa ;Lookup mul table of low nibble
+ vpxor xgft4_hi, xgft4_hi, xgft4_lo ;GF add high and low partials
+ vpand xgft4_hi, xgft4_hi, xtmph5
+ vpxor xd4, xd4, xgft4_hi
+
+ ; dest5
+ vpshufb xgft5_hi, xgft5_hi, x0 ;Lookup mul table of high nibble
+ vpshufb xtmpl5, xtmpl5, xtmpa ;Lookup mul table of low nibble
+ vpxor xgft5_hi, xgft5_hi, xtmpl5 ;GF add high and low partials
+ vpand xgft5_hi, xgft5_hi, xtmph5
+ vpxor xd5, xd5, xgft5_hi
+
+ XSTR [dest1+tmp], xd1 ;Store result into dest1
+ XSTR [dest2+tmp], xd2 ;Store result into dest2
+ XSTR [dest3+tmp], xd3 ;Store result into dest3
+ XSTR [dest4+tmp], xd4 ;Store result into dest4
+ XSTR [dest5+tmp], xd5 ;Store result into dest5
+
+.return_pass:
+ FUNC_RESTORE
+ mov return, 0
+ ret
+
+.return_fail:
+ FUNC_RESTORE
+ mov return, 1
+ ret
+
+endproc_frame
+
+section .data
+
+align 16
+mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
+constip16:
+ ddq 0xf0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
+
+%macro slversion 4
+global %1_slver_%2%3%4
+global %1_slver
+%1_slver:
+%1_slver_%2%3%4:
+ dw 0x%4
+ db 0x%3, 0x%2
+%endmacro
+;;; func core, ver, snum
+slversion gf_5vect_mad_avx, 02, 00, 020d
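
The new gf_5vect_mad_* kernels fold a single source block into five parity blocks in one pass: the low source nibble and the shifted high nibble each index a 16-byte product table (the two vpshufb lookups above), and the XOR of the two partial products is accumulated into each destination. Below is a minimal scalar sketch of that per-byte operation, assuming the usual 32-byte low/high-nibble table layout; the function and table names are illustrative, not part of ISA-L.

    #include <stddef.h>
    #include <stdint.h>

    /* One coefficient's 32-byte table: bytes 0..15 hold the products of the
     * low nibble, bytes 16..31 the products of the high nibble. */
    static void gf_vect_mad_ref(size_t len, const uint8_t gftbl[32],
                                const uint8_t *src, uint8_t *dest)
    {
        for (size_t i = 0; i < len; i++) {
            uint8_t lo = gftbl[src[i] & 0x0f];      /* product of low nibble  */
            uint8_t hi = gftbl[16 + (src[i] >> 4)]; /* product of high nibble */
            dest[i] ^= (uint8_t)(lo ^ hi);          /* GF(2^8) add into dest  */
        }
    }

The five-destination kernel runs this same step with five different 32-byte tables against the same src, which is why xd1..xd5 are kept live across the loop body.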
diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_5vect_mad_avx2.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_5vect_mad_avx2.asm.s
new file mode 100644
index 00000000000..db84189af89
--- /dev/null
+++ b/src/erasure-code/isa/isa-l/erasure_code/gf_5vect_mad_avx2.asm.s
@@ -0,0 +1,369 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;
+;;; gf_5vect_mad_avx2(len, vec, vec_i, mul_array, src, dest);
+;;;
+
+%define PS 8
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg0 rcx
+ %define arg0.w ecx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+ %define arg4 r12
+ %define arg5 r15
+ %define tmp r11
+ %define tmp.w r11d
+ %define tmp.b r11b
+ %define tmp2 r10
+ %define return rax
+ %define return.w eax
+ %define stack_size 16*10 + 3*8
+ %define arg(x) [rsp + stack_size + PS + PS*x]
+ %define func(x) proc_frame x
+
+%macro FUNC_SAVE 0
+ sub rsp, stack_size
+ movdqa [rsp+16*0],xmm6
+ movdqa [rsp+16*1],xmm7
+ movdqa [rsp+16*2],xmm8
+ movdqa [rsp+16*3],xmm9
+ movdqa [rsp+16*4],xmm10
+ movdqa [rsp+16*5],xmm11
+ movdqa [rsp+16*6],xmm12
+ movdqa [rsp+16*7],xmm13
+ movdqa [rsp+16*8],xmm14
+ movdqa [rsp+16*9],xmm15
+ save_reg r12, 10*16 + 0*8
+ save_reg r15, 10*16 + 1*8
+ end_prolog
+ mov arg4, arg(4)
+ mov arg5, arg(5)
+%endmacro
+
+%macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp+16*0]
+ movdqa xmm7, [rsp+16*1]
+ movdqa xmm8, [rsp+16*2]
+ movdqa xmm9, [rsp+16*3]
+ movdqa xmm10, [rsp+16*4]
+ movdqa xmm11, [rsp+16*5]
+ movdqa xmm12, [rsp+16*6]
+ movdqa xmm13, [rsp+16*7]
+ movdqa xmm14, [rsp+16*8]
+ movdqa xmm15, [rsp+16*9]
+ mov r12, [rsp + 10*16 + 0*8]
+ mov r15, [rsp + 10*16 + 1*8]
+ add rsp, stack_size
+%endmacro
+
+%elifidn __OUTPUT_FORMAT__, elf64
+ %define arg0 rdi
+ %define arg0.w edi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+ %define arg4 r8
+ %define arg5 r9
+ %define tmp r11
+ %define tmp.w r11d
+ %define tmp.b r11b
+ %define tmp2 r10
+ %define return rax
+ %define return.w eax
+
+ %define func(x) x:
+ %define FUNC_SAVE
+ %define FUNC_RESTORE
+%endif
+
+;;; gf_5vect_mad_avx2(len, vec, vec_i, mul_array, src, dest)
+%define len arg0
+%define len.w arg0.w
+%define vec arg1
+%define vec_i arg2
+%define mul_array arg3
+%define src arg4
+%define dest1 arg5
+%define pos return
+%define pos.w return.w
+
+%define dest2 tmp2
+%define dest3 mul_array
+%define dest4 vec
+%define dest5 vec_i
+
+%ifndef EC_ALIGNED_ADDR
+;;; Use Un-aligned load/store
+ %define XLDR vmovdqu
+ %define XSTR vmovdqu
+%else
+;;; Use Non-temporal load/store
+ %ifdef NO_NT_LDST
+ %define XLDR vmovdqa
+ %define XSTR vmovdqa
+ %else
+ %define XLDR vmovntdqa
+ %define XSTR vmovntdq
+ %endif
+%endif
+
+default rel
+
+[bits 64]
+section .text
+
+%define xmask0f ymm15
+%define xmask0fx xmm15
+%define xgft1_lo ymm14
+%define xgft2_lo ymm13
+%define xgft3_lo ymm12
+%define xgft4_lo ymm11
+%define xgft5_lo ymm10
+
+%define x0 ymm0
+%define xtmpa ymm1
+%define xtmpl ymm2
+%define xtmplx xmm2
+%define xtmph1 ymm3
+%define xtmph1x xmm3
+%define xtmph2 ymm4
+%define xd1 ymm5
+%define xd2 ymm6
+%define xd3 ymm7
+%define xd4 ymm8
+%define xd5 ymm9
+
+align 16
+global gf_5vect_mad_avx2:function
+func(gf_5vect_mad_avx2)
+ FUNC_SAVE
+ sub len, 32
+ jl .return_fail
+ xor pos, pos
+ mov tmp.b, 0x0f
+ vpinsrb xmask0fx, xmask0fx, tmp.w, 0
+ vpbroadcastb xmask0f, xmask0fx ;Construct mask 0x0f0f0f...
+
+ sal vec_i, 5 ;Multiply by 32
+ sal vec, 5 ;Multiply by 32
+ lea tmp, [mul_array + vec_i]
+
+ vmovdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, ..., Ax{0f}
+ ; " Ax{00}, Ax{10}, ..., Ax{f0}
+ vmovdqu xgft2_lo, [tmp+vec] ;Load array Bx{00}, Bx{01}, ..., Bx{0f}
+ ; " Bx{00}, Bx{10}, ..., Bx{f0}
+ vmovdqu xgft3_lo, [tmp+2*vec] ;Load array Cx{00}, Cx{01}, ..., Cx{0f}
+ ; " Cx{00}, Cx{10}, ..., Cx{f0}
+ vmovdqu xgft5_lo, [tmp+4*vec] ;Load array Ex{00}, Ex{01}, ..., Ex{0f}
+ ; " Ex{00}, Ex{10}, ..., Ex{f0}
+ add tmp, vec
+ vmovdqu xgft4_lo, [tmp+2*vec] ;Load array Dx{00}, Dx{01}, ..., Dx{0f}
+ ; " Dx{00}, Dx{10}, ..., Dx{f0}
+
+ mov dest3, [dest1+2*PS] ; reuse mul_array
+ mov dest4, [dest1+3*PS] ; reuse vec
+ mov dest5, [dest1+4*PS] ; reuse vec_i
+ mov dest2, [dest1+PS]
+ mov dest1, [dest1]
+
+.loop32:
+ XLDR x0, [src+pos] ;Get next source vector
+
+ XLDR xd1, [dest1+pos] ;Get next dest vector
+ XLDR xd2, [dest2+pos] ;Get next dest vector
+ XLDR xd3, [dest3+pos] ;Get next dest vector
+ XLDR xd4, [dest4+pos] ;Get next dest vector
+ XLDR xd5, [dest5+pos] ;Get next dest vector
+
+ vpand xtmpl, x0, xmask0f ;Mask low src nibble in bits 4-0
+ vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
+ vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
+ vperm2i128 xtmpa, xtmpl, x0, 0x30 ;swap xtmpa from 1lo|2lo to 1lo|2hi
+ vperm2i128 x0, xtmpl, x0, 0x12 ;swap x0 from 1hi|2hi to 1hi|2lo
+
+ vperm2i128 xtmph1, xgft1_lo, xgft1_lo, 0x01 ; swapped to hi | lo
+ vperm2i128 xtmph2, xgft2_lo, xgft2_lo, 0x01 ; swapped to hi | lo
+
+ ; dest1
+ vpshufb xtmph1, xtmph1, x0 ;Lookup mul table of high nibble
+ vpshufb xtmpl, xgft1_lo, xtmpa ;Lookup mul table of low nibble
+ vpxor xtmph1, xtmph1, xtmpl ;GF add high and low partials
+ vpxor xd1, xd1, xtmph1 ;xd1 += partial
+
+ vperm2i128 xtmph1, xgft3_lo, xgft3_lo, 0x01 ; swapped to hi | lo
+ ; dest2
+ vpshufb xtmph2, xtmph2, x0 ;Lookup mul table of high nibble
+ vpshufb xtmpl, xgft2_lo, xtmpa ;Lookup mul table of low nibble
+ vpxor xtmph2, xtmph2, xtmpl ;GF add high and low partials
+ vpxor xd2, xd2, xtmph2 ;xd2 += partial
+
+ vperm2i128 xtmph2, xgft4_lo, xgft4_lo, 0x01 ; swapped to hi | lo
+ ; dest3
+ vpshufb xtmph1, xtmph1, x0 ;Lookup mul table of high nibble
+ vpshufb xtmpl, xgft3_lo, xtmpa ;Lookup mul table of low nibble
+ vpxor xtmph1, xtmph1, xtmpl ;GF add high and low partials
+ vpxor xd3, xd3, xtmph1 ;xd3 += partial
+
+ vperm2i128 xtmph1, xgft5_lo, xgft5_lo, 0x01 ; swapped to hi | lo
+ ; dest4
+ vpshufb xtmph2, xtmph2, x0 ;Lookup mul table of high nibble
+ vpshufb xtmpl, xgft4_lo, xtmpa ;Lookup mul table of low nibble
+ vpxor xtmph2, xtmph2, xtmpl ;GF add high and low partials
+ vpxor xd4, xd4, xtmph2 ;xd4 += partial
+
+ ; dest5
+ vpshufb xtmph1, xtmph1, x0 ;Lookup mul table of high nibble
+ vpshufb xtmpl, xgft5_lo, xtmpa ;Lookup mul table of low nibble
+ vpxor xtmph1, xtmph1, xtmpl ;GF add high and low partials
+ vpxor xd5, xd5, xtmph1 ;xd5 += partial
+
+ XSTR [dest1+pos], xd1
+ XSTR [dest2+pos], xd2
+ XSTR [dest3+pos], xd3
+ XSTR [dest4+pos], xd4
+ XSTR [dest5+pos], xd5
+
+ add pos, 32 ;Loop on 32 bytes at a time
+ cmp pos, len
+ jle .loop32
+
+ lea tmp, [len + 32]
+ cmp pos, tmp
+ je .return_pass
+
+.lessthan32:
+ ;; Tail len
+ ;; Do one more overlap pass
+ mov tmp.b, 0x1f
+ vpinsrb xtmph1x, xtmph1x, tmp.w, 0
+ vpbroadcastb xtmph1, xtmph1x ;Construct mask 0x1f1f1f...
+
+ mov tmp, len ;Overlapped offset length-32
+
+ XLDR x0, [src+tmp] ;Get next source vector
+
+ XLDR xd1, [dest1+tmp] ;Get next dest vector
+ XLDR xd2, [dest2+tmp] ;Get next dest vector
+ XLDR xd3, [dest3+tmp] ;Get next dest vector
+ XLDR xd4, [dest4+tmp] ;Get next dest vector
+ XLDR xd5, [dest5+tmp] ;Get next dest vector
+
+ sub len, pos
+
+ vmovdqa xtmph2, [constip32] ;Load const of i + 32
+ vpinsrb xtmplx, xtmplx, len.w, 15
+ vinserti128 xtmpl, xtmpl, xtmplx, 1 ;swapped to xtmplx | xtmplx
+ vpshufb xtmpl, xtmpl, xtmph1 ;Broadcast len to all bytes. xtmph1=0x1f1f1f...
+ vpcmpgtb xtmpl, xtmpl, xtmph2
+
+ vpand xtmph1, x0, xmask0f ;Mask low src nibble in bits 4-0
+ vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
+ vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
+ vperm2i128 xtmpa, xtmph1, x0, 0x30 ;swap xtmpa from 1lo|2lo to 1lo|2hi
+ vperm2i128 x0, xtmph1, x0, 0x12 ;swap x0 from 1hi|2hi to 1hi|2lo
+
+ vperm2i128 xtmph1, xgft1_lo, xgft1_lo, 0x01 ; swapped to hi | lo
+ vperm2i128 xtmph2, xgft2_lo, xgft2_lo, 0x01 ; swapped to hi | lo
+
+ ; dest1
+ vpshufb xtmph1, xtmph1, x0 ;Lookup mul table of high nibble
+ vpshufb xgft1_lo, xgft1_lo, xtmpa ;Lookup mul table of low nibble
+ vpxor xtmph1, xtmph1, xgft1_lo ;GF add high and low partials
+ vpand xtmph1, xtmph1, xtmpl
+ vpxor xd1, xd1, xtmph1 ;xd1 += partial
+
+ vperm2i128 xtmph1, xgft3_lo, xgft3_lo, 0x01 ; swapped to hi | lo
+ ; dest2
+ vpshufb xtmph2, xtmph2, x0 ;Lookup mul table of high nibble
+ vpshufb xgft2_lo, xgft2_lo, xtmpa ;Lookup mul table of low nibble
+ vpxor xtmph2, xtmph2, xgft2_lo ;GF add high and low partials
+ vpand xtmph2, xtmph2, xtmpl
+ vpxor xd2, xd2, xtmph2 ;xd2 += partial
+
+ vperm2i128 xtmph2, xgft4_lo, xgft4_lo, 0x01 ; swapped to hi | lo
+ ; dest3
+ vpshufb xtmph1, xtmph1, x0 ;Lookup mul table of high nibble
+ vpshufb xgft3_lo, xgft3_lo, xtmpa ;Lookup mul table of low nibble
+ vpxor xtmph1, xtmph1, xgft3_lo ;GF add high and low partials
+ vpand xtmph1, xtmph1, xtmpl
+ vpxor xd3, xd3, xtmph1 ;xd3 += partial
+
+ vperm2i128 xtmph1, xgft5_lo, xgft5_lo, 0x01 ; swapped to hi | lo
+ ; dest4
+ vpshufb xtmph2, xtmph2, x0 ;Lookup mul table of high nibble
+ vpshufb xgft4_lo, xgft4_lo, xtmpa ;Lookup mul table of low nibble
+ vpxor xtmph2, xtmph2, xgft4_lo ;GF add high and low partials
+ vpand xtmph2, xtmph2, xtmpl
+ vpxor xd4, xd4, xtmph2 ;xd4 += partial
+
+ ; dest5
+ vpshufb xtmph1, xtmph1, x0 ;Lookup mul table of high nibble
+ vpshufb xgft5_lo, xgft5_lo, xtmpa ;Lookup mul table of low nibble
+ vpxor xtmph1, xtmph1, xgft5_lo ;GF add high and low partials
+ vpand xtmph1, xtmph1, xtmpl
+ vpxor xd5, xd5, xtmph1 ;xd5 += partial
+
+ XSTR [dest1+tmp], xd1
+ XSTR [dest2+tmp], xd2
+ XSTR [dest3+tmp], xd3
+ XSTR [dest4+tmp], xd4
+ XSTR [dest5+tmp], xd5
+
+.return_pass:
+ FUNC_RESTORE
+ mov return, 0
+ ret
+
+.return_fail:
+ FUNC_RESTORE
+ mov return, 1
+ ret
+
+endproc_frame
+
+section .data
+align 32
+constip32:
+ ddq 0xf0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
+ ddq 0xe0e1e2e3e4e5e6e7e8e9eaebecedeeef
+
+%macro slversion 4
+global %1_slver_%2%3%4
+global %1_slver
+%1_slver:
+%1_slver_%2%3%4:
+ dw 0x%4
+ db 0x%3, 0x%2
+%endmacro
+;;; func core, ver, snum
+slversion gf_5vect_mad_avx2, 04, 00, 020e
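
The .lessthan32 path above shows how these kernels handle a length that is not a multiple of the vector width: instead of a scalar remainder loop, they process one more full-width vector that ends exactly at the end of the buffer and AND the computed partial with a byte mask (built from constip32 and the remaining length) so that bytes already covered by the main loop are not XORed a second time. A rough scalar equivalent follows, with the width and helper names as illustrative assumptions.

    #include <stddef.h>
    #include <stdint.h>

    #define VEC 32   /* 16 for the SSE/AVX kernels, 32 for AVX2 */

    /* 'len' is the full buffer length, 'pos' how many bytes the main loop
     * already processed; the tail re-reads the last VEC bytes and skips the
     * overlap, which is what the vpcmpgtb/vpand mask achieves above. */
    static void gf_mad_tail_ref(size_t len, size_t pos, const uint8_t gftbl[32],
                                const uint8_t *src, uint8_t *dest)
    {
        size_t start = len - VEC;                 /* overlapped offset */
        for (size_t i = 0; i < VEC; i++) {
            if (start + i < pos)                  /* already accumulated */
                continue;
            uint8_t b = src[start + i];
            dest[start + i] ^= (uint8_t)(gftbl[b & 0x0f] ^ gftbl[16 + (b >> 4)]);
        }
    }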
diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_5vect_mad_sse.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_5vect_mad_sse.asm.s
new file mode 100644
index 00000000000..615a7f769e0
--- /dev/null
+++ b/src/erasure-code/isa/isa-l/erasure_code/gf_5vect_mad_sse.asm.s
@@ -0,0 +1,379 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;
+;;; gf_5vect_mad_sse(len, vec, vec_i, mul_array, src, dest);
+;;;
+
+%define PS 8
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg0 rcx
+ %define arg0.w ecx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+ %define arg4 r12
+ %define arg5 r15
+ %define tmp r11
+ %define tmp2 r10
+ %define tmp3 r13
+ %define tmp4 r14
+ %define return rax
+ %define return.w eax
+ %define stack_size 16*10 + 5*8
+ %define arg(x) [rsp + stack_size + PS + PS*x]
+ %define func(x) proc_frame x
+
+%macro FUNC_SAVE 0
+ sub rsp, stack_size
+ movdqa [rsp+16*0],xmm6
+ movdqa [rsp+16*1],xmm7
+ movdqa [rsp+16*2],xmm8
+ movdqa [rsp+16*3],xmm9
+ movdqa [rsp+16*4],xmm10
+ movdqa [rsp+16*5],xmm11
+ movdqa [rsp+16*6],xmm12
+ movdqa [rsp+16*7],xmm13
+ movdqa [rsp+16*8],xmm14
+ movdqa [rsp+16*9],xmm15
+ save_reg r12, 10*16 + 0*8
+ save_reg r13, 10*16 + 1*8
+ save_reg r14, 10*16 + 2*8
+ save_reg r15, 10*16 + 3*8
+ end_prolog
+ mov arg4, arg(4)
+ mov arg5, arg(5)
+%endmacro
+
+%macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp+16*0]
+ movdqa xmm7, [rsp+16*1]
+ movdqa xmm8, [rsp+16*2]
+ movdqa xmm9, [rsp+16*3]
+ movdqa xmm10, [rsp+16*4]
+ movdqa xmm11, [rsp+16*5]
+ movdqa xmm12, [rsp+16*6]
+ movdqa xmm13, [rsp+16*7]
+ movdqa xmm14, [rsp+16*8]
+ movdqa xmm15, [rsp+16*9]
+ mov r12, [rsp + 10*16 + 0*8]
+ mov r13, [rsp + 10*16 + 1*8]
+ mov r14, [rsp + 10*16 + 2*8]
+ mov r15, [rsp + 10*16 + 3*8]
+ add rsp, stack_size
+%endmacro
+
+%elifidn __OUTPUT_FORMAT__, elf64
+ %define arg0 rdi
+ %define arg0.w edi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+ %define arg4 r8
+ %define arg5 r9
+ %define tmp r11
+ %define tmp2 r10
+ %define tmp3 r12
+ %define tmp4 r13
+ %define return rax
+ %define return.w eax
+
+ %define func(x) x:
+ %macro FUNC_SAVE 0
+ push r12
+ push r13
+ %endmacro
+ %macro FUNC_RESTORE 0
+ pop r13
+ pop r12
+ %endmacro
+%endif
+
+;;; gf_5vect_mad_sse(len, vec, vec_i, mul_array, src, dest)
+%define len arg0
+%define len.w arg0.w
+%define vec arg1
+%define vec_i arg2
+%define mul_array arg3
+%define src arg4
+%define dest1 arg5
+%define pos return
+%define pos.w return.w
+
+%define dest2 tmp4
+%define dest3 mul_array
+%define dest4 tmp2
+%define dest5 vec_i
+
+%ifndef EC_ALIGNED_ADDR
+;;; Use Un-aligned load/store
+ %define XLDR movdqu
+ %define XSTR movdqu
+%else
+;;; Use Non-temporal load/store
+ %ifdef NO_NT_LDST
+ %define XLDR movdqa
+ %define XSTR movdqa
+ %else
+ %define XLDR movntdqa
+ %define XSTR movntdq
+ %endif
+%endif
+
+default rel
+
+[bits 64]
+section .text
+
+%define xmask0f xmm15
+%define xgft5_hi xmm14
+%define xgft4_lo xmm13
+%define xgft4_hi xmm12
+
+%define x0 xmm0
+%define xtmpa xmm1
+%define xtmph1 xmm2
+%define xtmpl1 xmm3
+%define xtmph2 xmm4
+%define xtmpl2 xmm5
+%define xtmph3 xmm6
+%define xtmpl3 xmm7
+%define xtmph5 xmm8
+%define xtmpl5 xmm9
+%define xd1 xmm10
+%define xd2 xmm11
+%define xd3 xtmpl1
+%define xd4 xtmph1
+%define xd5 xtmpl2
+
+
+align 16
+global gf_5vect_mad_sse:function
+func(gf_5vect_mad_sse)
+ FUNC_SAVE
+ sub len, 16
+ jl .return_fail
+ xor pos, pos
+
+ movdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte
+ mov tmp, vec
+ sal vec_i, 5 ;Multiply by 32
+ lea tmp3, [mul_array + vec_i]
+ sal tmp, 6 ;Multiply by 64
+ movdqu xgft5_hi, [tmp3+2*tmp+16] ; " Ex{00}, Ex{10}, ..., Ex{f0}
+ sal vec, 5 ;Multiply by 32
+ add tmp, vec
+ movdqu xgft4_hi, [tmp3+tmp+16] ; " Dx{00}, Dx{10}, Dx{20}, ... , Dx{f0}
+ movdqu xgft4_lo, [tmp3+tmp] ;Load array Dx{00}, Dx{01}, Dx{02}, ...
+
+ mov dest3, [dest1+2*PS] ; reuse mul_array
+ mov dest4, [dest1+3*PS]
+ mov dest5, [dest1+4*PS] ; reuse vec_i
+ mov dest2, [dest1+PS]
+ mov dest1, [dest1]
+
+.loop16:
+ XLDR x0, [src+pos] ;Get next source vector
+
+ movdqu xtmph1, [tmp3+16] ; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
+ movdqu xtmpl1, [tmp3] ;Load array Ax{00}, Ax{01}, Ax{02}, ...
+ movdqu xtmph2, [tmp3+vec+16] ; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
+ movdqu xtmpl2, [tmp3+vec] ;Load array Bx{00}, Bx{01}, Bx{02}, ...
+ movdqu xtmph3, [tmp3+2*vec+16] ; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
+ movdqu xtmpl3, [tmp3+2*vec] ;Load array Cx{00}, Cx{01}, Cx{02}, ...
+ movdqu xtmpl5, [tmp3+4*vec] ;Load array Ex{00}, Ex{01}, ..., Ex{0f}
+ movdqa xtmph5, xgft5_hi ;Reload const array registers
+
+ XLDR xd1, [dest1+pos] ;Get next dest vector
+ XLDR xd2, [dest2+pos] ;Get next dest vector
+
+ movdqa xtmpa, x0 ;Keep unshifted copy of src
+ psraw x0, 4 ;Shift to put high nibble into bits 4-0
+ pand x0, xmask0f ;Mask high src nibble in bits 4-0
+ pand xtmpa, xmask0f ;Mask low src nibble in bits 4-0
+
+ ; dest1
+ pshufb xtmph1, x0 ;Lookup mul table of high nibble
+ pshufb xtmpl1, xtmpa ;Lookup mul table of low nibble
+ pxor xtmph1, xtmpl1 ;GF add high and low partials
+ pxor xd1, xtmph1
+
+ XLDR xd3, [dest3+pos] ;Reuse xtmpl1, Get next dest vector
+ XLDR xd4, [dest4+pos] ;Reuse xtmph1. Get next dest vector
+
+ ; dest2
+ pshufb xtmph2, x0 ;Lookup mul table of high nibble
+ pshufb xtmpl2, xtmpa ;Lookup mul table of low nibble
+ pxor xtmph2, xtmpl2 ;GF add high and low partials
+ pxor xd2, xtmph2
+
+ XLDR xd5, [dest5+pos] ;Reuse xtmpl2. Get next dest vector
+
+ ; dest3
+ pshufb xtmph3, x0 ;Lookup mul table of high nibble
+ pshufb xtmpl3, xtmpa ;Lookup mul table of low nibble
+ pxor xtmph3, xtmpl3 ;GF add high and low partials
+ pxor xd3, xtmph3
+
+ movdqa xtmph2, xgft4_hi ;Reload const array registers
+ movdqa xtmpl3, xgft4_lo ;Reload const array registers
+
+ ; dest5
+ pshufb xtmph5, x0 ;Lookup mul table of high nibble
+ pshufb xtmpl5, xtmpa ;Lookup mul table of low nibble
+ pxor xtmph5, xtmpl5 ;GF add high and low partials
+ pxor xd5, xtmph5
+
+ ; dest4
+ pshufb xtmph2, x0 ;Lookup mul table of high nibble
+ pshufb xtmpl3, xtmpa ;Lookup mul table of low nibble
+ pxor xtmph2, xtmpl3 ;GF add high and low partials
+ pxor xd4, xtmph2
+
+ XSTR [dest1+pos], xd1 ;Store result into dest1
+ XSTR [dest2+pos], xd2 ;Store result into dest2
+ XSTR [dest3+pos], xd3 ;Store result into dest3
+ XSTR [dest4+pos], xd4 ;Store result into dest4
+ XSTR [dest5+pos], xd5 ;Store result into dest5
+
+ add pos, 16 ;Loop on 16 bytes at a time
+ cmp pos, len
+ jle .loop16
+
+ lea tmp, [len + 16]
+ cmp pos, tmp
+ je .return_pass
+
+.lessthan16:
+ ;; Tail len
+ ;; Do one more overlap pass
+ mov tmp, len ;Overlapped offset length-16
+ XLDR x0, [src+tmp] ;Get next source vector
+
+ sub len, pos
+
+ movdqa xtmpl1, [constip16] ;Load const of i + 16
+ pinsrb xtmph5, len.w, 15
+ pshufb xtmph5, xmask0f ;Broadcast len to all bytes
+ pcmpgtb xtmph5, xtmpl1
+
+ movdqu xtmph1, [tmp3+16] ; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
+ movdqu xtmpl1, [tmp3] ;Load array Ax{00}, Ax{01}, Ax{02}, ...
+ movdqu xtmph2, [tmp3+vec+16] ; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
+ movdqu xtmpl2, [tmp3+vec] ;Load array Bx{00}, Bx{01}, Bx{02}, ...
+ movdqu xtmph3, [tmp3+2*vec+16] ; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
+ movdqu xtmpl3, [tmp3+2*vec] ;Load array Cx{00}, Cx{01}, Cx{02}, ...
+ movdqu xtmpl5, [tmp3+4*vec] ;Load array Ex{00}, Ex{01}, ..., Ex{0f}
+
+ XLDR xd1, [dest1+tmp] ;Get next dest vector
+ XLDR xd2, [dest2+tmp] ;Get next dest vector
+
+ movdqa xtmpa, x0 ;Keep unshifted copy of src
+ psraw x0, 4 ;Shift to put high nibble into bits 4-0
+ pand x0, xmask0f ;Mask high src nibble in bits 4-0
+ pand xtmpa, xmask0f ;Mask low src nibble in bits 4-0
+
+ ; dest1
+ pshufb xtmph1, x0 ;Lookup mul table of high nibble
+ pshufb xtmpl1, xtmpa ;Lookup mul table of low nibble
+ pxor xtmph1, xtmpl1 ;GF add high and low partials
+ pand xtmph1, xtmph5
+ pxor xd1, xtmph1
+
+ XLDR xd3, [dest3+tmp] ;Reuse xtmpl1, Get next dest vector
+ XLDR xd4, [dest4+tmp] ;Reuse xtmph1. Get next dest vector
+
+ ; dest2
+ pshufb xtmph2, x0 ;Lookup mul table of high nibble
+ pshufb xtmpl2, xtmpa ;Lookup mul table of low nibble
+ pxor xtmph2, xtmpl2 ;GF add high and low partials
+ pand xtmph2, xtmph5
+ pxor xd2, xtmph2
+
+ XLDR xd5, [dest5+tmp] ;Reuse xtmpl2. Get next dest vector
+
+ ; dest3
+ pshufb xtmph3, x0 ;Lookup mul table of high nibble
+ pshufb xtmpl3, xtmpa ;Lookup mul table of low nibble
+ pxor xtmph3, xtmpl3 ;GF add high and low partials
+ pand xtmph3, xtmph5
+ pxor xd3, xtmph3
+
+ ; dest4
+ pshufb xgft4_hi, x0 ;Lookup mul table of high nibble
+ pshufb xgft4_lo, xtmpa ;Lookup mul table of low nibble
+ pxor xgft4_hi, xgft4_lo ;GF add high and low partials
+ pand xgft4_hi, xtmph5
+ pxor xd4, xgft4_hi
+
+ ; dest5
+ pshufb xgft5_hi, x0 ;Lookup mul table of high nibble
+ pshufb xtmpl5, xtmpa ;Lookup mul table of low nibble
+ pxor xgft5_hi, xtmpl5 ;GF add high and low partials
+ pand xgft5_hi, xtmph5
+ pxor xd5, xgft5_hi
+
+ XSTR [dest1+tmp], xd1 ;Store result into dest1
+ XSTR [dest2+tmp], xd2 ;Store result into dest2
+ XSTR [dest3+tmp], xd3 ;Store result into dest3
+ XSTR [dest4+tmp], xd4 ;Store result into dest4
+ XSTR [dest5+tmp], xd5 ;Store result into dest5
+
+.return_pass:
+ FUNC_RESTORE
+ mov return, 0
+ ret
+
+.return_fail:
+ FUNC_RESTORE
+ mov return, 1
+ ret
+
+endproc_frame
+
+section .data
+
+align 16
+
+mask0f:
+ ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
+constip16:
+ ddq 0xf0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
+
+%macro slversion 4
+global %1_slver_%2%3%4
+global %1_slver
+%1_slver:
+%1_slver_%2%3%4:
+ dw 0x%4
+ db 0x%3, 0x%2
+%endmacro
+;;; func core, ver, snum
+slversion gf_5vect_mad_sse, 00, 00, 020c
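
The prologue arithmetic in these listings (sal vec_i, 5, sal tmp, 6, lea tmp3, [mul_array + vec_i], then offsets of vec, 2*vec, ... from tmp3) is indexing a flat array of 32-byte tables, one per (parity row, source) pair, with a row's tables contiguous across the sources. A small helper expressing that layout, under the assumption that mul_array points at the expanded coefficient tables; the helper itself is illustrative.

    #include <stddef.h>
    #include <stdint.h>

    /* Table for (parity row 'row', source 'vec_i') with 'vec' sources in
     * total; each entry is the 32-byte low/high-nibble product table. */
    static inline const uint8_t *gf_tbl(const uint8_t *mul_array,
                                        int vec, int vec_i, int row)
    {
        return mul_array + 32u * ((size_t)row * (size_t)vec + (size_t)vec_i);
    }

Under that layout, tmp3 in the kernels corresponds to gf_tbl(mul_array, vec, vec_i, 0), and the per-row loads step through rows 1..4 for the same source.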
diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_6vect_dot_prod_avx.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_6vect_dot_prod_avx.asm.s
index 28ca861357a..f439fbbbc16 100644
--- a/src/erasure-code/isa/isa-l/erasure_code/gf_6vect_dot_prod_avx.asm.s
+++ b/src/erasure-code/isa/isa-l/erasure_code/gf_6vect_dot_prod_avx.asm.s
@@ -1,5 +1,5 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-; Copyright(c) 2011-2014 Intel Corporation All rights reserved.
+; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
@@ -30,8 +30,6 @@
;;;
;;; gf_6vect_dot_prod_avx(len, vec, *g_tbls, **buffs, **dests);
;;;
-;;; Author: Gregory Tucker
-
%ifidn __OUTPUT_FORMAT__, elf64
%define arg0 rdi
@@ -321,5 +319,3 @@ global %1_slver
%endmacro
;;; func core, ver, snum
slversion gf_6vect_dot_prod_avx, 02, 03, 0195
-; inform linker that this doesn't require executable stack
-section .note.GNU-stack noalloc noexec nowrite progbits
diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_6vect_dot_prod_avx2.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_6vect_dot_prod_avx2.asm.s
index a957c9ecc0c..fac63022ac0 100644
--- a/src/erasure-code/isa/isa-l/erasure_code/gf_6vect_dot_prod_avx2.asm.s
+++ b/src/erasure-code/isa/isa-l/erasure_code/gf_6vect_dot_prod_avx2.asm.s
@@ -1,5 +1,5 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-; Copyright(c) 2011-2014 Intel Corporation All rights reserved.
+; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
@@ -30,8 +30,6 @@
;;;
;;; gf_6vect_dot_prod_avx2(len, vec, *g_tbls, **buffs, **dests);
;;;
-;;; Author: Gregory Tucker
-
%ifidn __OUTPUT_FORMAT__, elf64
%define arg0 rdi
@@ -332,5 +330,3 @@ global %1_slver
%endmacro
;;; func core, ver, snum
slversion gf_6vect_dot_prod_avx2, 04, 03, 019a
-; inform linker that this doesn't require executable stack
-section .note.GNU-stack noalloc noexec nowrite progbits
diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_6vect_dot_prod_sse.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_6vect_dot_prod_sse.asm.s
index 4910ddd703a..c3cfa14b821 100644
--- a/src/erasure-code/isa/isa-l/erasure_code/gf_6vect_dot_prod_sse.asm.s
+++ b/src/erasure-code/isa/isa-l/erasure_code/gf_6vect_dot_prod_sse.asm.s
@@ -1,5 +1,5 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-; Copyright(c) 2011-2014 Intel Corporation All rights reserved.
+; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
@@ -30,8 +30,6 @@
;;;
;;; gf_6vect_dot_prod_sse(len, vec, *g_tbls, **buffs, **dests);
;;;
-;;; Author: Gregory Tucker
-
%ifidn __OUTPUT_FORMAT__, elf64
%define arg0 rdi
@@ -166,20 +164,20 @@ default rel
section .text
%define xmask0f xmm15
-%define xgft1_lo xmm14
-%define xgft1_hi xmm13
-%define xgft2_lo xmm12
-%define xgft2_hi xmm11
-%define xgft3_lo xmm10
-%define xgft3_hi xmm9
+%define xgft1_lo xmm2
+%define xgft1_hi xmm3
+%define xgft2_lo xmm4
+%define xgft2_hi xmm5
+%define xgft3_lo xmm6
+%define xgft3_hi xmm7
%define x0 xmm0
%define xtmpa xmm1
-%define xp1 xmm2
-%define xp2 xmm3
-%define xp3 xmm4
-%define xp4 xmm5
-%define xp5 xmm6
-%define xp6 xmm7
+%define xp1 xmm8
+%define xp2 xmm9
+%define xp3 xmm10
+%define xp4 xmm11
+%define xp5 xmm12
+%define xp6 xmm13
align 16
global gf_6vect_dot_prod_sse:function
@@ -320,6 +318,4 @@ global %1_slver
db 0x%3, 0x%2
%endmacro
;;; func core, ver, snum
-slversion gf_6vect_dot_prod_sse, 00, 03, 0066
-; inform linker that this doesn't require executable stack
-section .note.GNU-stack noalloc noexec nowrite progbits
+slversion gf_6vect_dot_prod_sse, 00, 04, 0066
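
The dot-product kernels touched above serve full encodes, while the new *_mad kernels serve incremental updates: a dot product rebuilds each destination from all vec sources, whereas a mad call folds one extra source into destinations that already hold partial parity. A scalar reference for the dot-product side, using the same 32-byte-per-table layout as an assumption and illustrative names:

    #include <stddef.h>
    #include <stdint.h>

    /* Rebuild one destination from all 'vec' sources; gftbls holds vec
     * consecutive 32-byte tables, one per source, for this destination. */
    static void gf_vect_dot_prod_ref(size_t len, int vec, const uint8_t *gftbls,
                                     uint8_t * const *src, uint8_t *dest)
    {
        for (size_t i = 0; i < len; i++) {
            uint8_t acc = 0;
            for (int j = 0; j < vec; j++) {
                uint8_t b = src[j][i];
                acc ^= (uint8_t)(gftbls[j * 32 + (b & 0x0f)] ^
                                 gftbls[j * 32 + 16 + (b >> 4)]);
            }
            dest[i] = acc;    /* overwrite: no prior parity is assumed */
        }
    }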
diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_6vect_mad_avx.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_6vect_mad_avx.asm.s
new file mode 100644
index 00000000000..84b2eca5de6
--- /dev/null
+++ b/src/erasure-code/isa/isa-l/erasure_code/gf_6vect_mad_avx.asm.s
@@ -0,0 +1,400 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;
+;;; gf_6vect_mad_avx(len, vec, vec_i, mul_array, src, dest);
+;;;
+
+%define PS 8
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg0 rcx
+ %define arg0.w ecx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+ %define arg4 r12
+ %define arg5 r15
+ %define tmp r11
+ %define tmp2 r10
+ %define tmp3 r13
+ %define tmp4 r14
+ %define tmp5 rdi
+ %define return rax
+ %define return.w eax
+ %define stack_size 16*10 + 5*8
+ %define arg(x) [rsp + stack_size + PS + PS*x]
+ %define func(x) proc_frame x
+
+%macro FUNC_SAVE 0
+ sub rsp, stack_size
+ movdqa [rsp+16*0],xmm6
+ movdqa [rsp+16*1],xmm7
+ movdqa [rsp+16*2],xmm8
+ movdqa [rsp+16*3],xmm9
+ movdqa [rsp+16*4],xmm10
+ movdqa [rsp+16*5],xmm11
+ movdqa [rsp+16*6],xmm12
+ movdqa [rsp+16*7],xmm13
+ movdqa [rsp+16*8],xmm14
+ movdqa [rsp+16*9],xmm15
+ save_reg r12, 10*16 + 0*8
+ save_reg r13, 10*16 + 1*8
+ save_reg r14, 10*16 + 2*8
+ save_reg r15, 10*16 + 3*8
+ save_reg rdi, 10*16 + 4*8
+ end_prolog
+ mov arg4, arg(4)
+ mov arg5, arg(5)
+%endmacro
+
+%macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp+16*0]
+ movdqa xmm7, [rsp+16*1]
+ movdqa xmm8, [rsp+16*2]
+ movdqa xmm9, [rsp+16*3]
+ movdqa xmm10, [rsp+16*4]
+ movdqa xmm11, [rsp+16*5]
+ movdqa xmm12, [rsp+16*6]
+ movdqa xmm13, [rsp+16*7]
+ movdqa xmm14, [rsp+16*8]
+ movdqa xmm15, [rsp+16*9]
+ mov r12, [rsp + 10*16 + 0*8]
+ mov r13, [rsp + 10*16 + 1*8]
+ mov r14, [rsp + 10*16 + 2*8]
+ mov r15, [rsp + 10*16 + 3*8]
+ mov rdi, [rsp + 10*16 + 4*8]
+ add rsp, stack_size
+%endmacro
+
+%elifidn __OUTPUT_FORMAT__, elf64
+ %define arg0 rdi
+ %define arg0.w edi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+ %define arg4 r8
+ %define arg5 r9
+ %define tmp r11
+ %define tmp2 r10
+ %define tmp3 r12
+ %define tmp4 r13
+ %define tmp5 r14
+ %define return rax
+ %define return.w eax
+
+ %define func(x) x:
+ %macro FUNC_SAVE 0
+ push r12
+ push r13
+ push r14
+ %endmacro
+ %macro FUNC_RESTORE 0
+ pop r14
+ pop r13
+ pop r12
+ %endmacro
+%endif
+
+;;; gf_6vect_mad_avx(len, vec, vec_i, mul_array, src, dest)
+%define len arg0
+%define len.w arg0.w
+%define vec arg1
+%define vec_i arg2
+%define mul_array arg3
+%define src arg4
+%define dest1 arg5
+%define pos return
+%define pos.w return.w
+
+%define dest2 tmp4
+%define dest3 tmp2
+%define dest4 mul_array
+%define dest5 tmp5
+%define dest6 vec_i
+
+%ifndef EC_ALIGNED_ADDR
+;;; Use Un-aligned load/store
+ %define XLDR vmovdqu
+ %define XSTR vmovdqu
+%else
+;;; Use Non-temporal load/store
+ %ifdef NO_NT_LDST
+ %define XLDR vmovdqa
+ %define XSTR vmovdqa
+ %else
+ %define XLDR vmovntdqa
+ %define XSTR vmovntdq
+ %endif
+%endif
+
+
+default rel
+
+[bits 64]
+section .text
+
+%define xmask0f xmm15
+%define xgft4_lo xmm14
+%define xgft4_hi xmm13
+%define xgft5_lo xmm12
+%define xgft5_hi xmm11
+%define xgft6_lo xmm10
+%define xgft6_hi xmm9
+
+%define x0 xmm0
+%define xtmpa xmm1
+%define xtmph1 xmm2
+%define xtmpl1 xmm3
+%define xtmph2 xmm4
+%define xtmpl2 xmm5
+%define xtmph3 xmm6
+%define xtmpl3 xmm7
+%define xd1 xmm8
+%define xd2 xtmpl1
+%define xd3 xtmph1
+
+
+align 16
+global gf_6vect_mad_avx:function
+func(gf_6vect_mad_avx)
+ FUNC_SAVE
+ sub len, 16
+ jl .return_fail
+ xor pos, pos
+ vmovdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte
+ mov tmp, vec
+ sal vec_i, 5 ;Multiply by 32
+ lea tmp3, [mul_array + vec_i]
+ sal tmp, 6 ;Multiply by 64
+
+ sal vec, 5 ;Multiply by 32
+ lea vec_i, [tmp + vec] ;vec_i = vec*96
+ lea mul_array, [tmp + vec_i] ;mul_array = vec*160
+
+ vmovdqu xgft5_lo, [tmp3+2*tmp] ;Load array Ex{00}, Ex{01}, ..., Ex{0f}
+ vmovdqu xgft5_hi, [tmp3+2*tmp+16] ; " Ex{00}, Ex{10}, ..., Ex{f0}
+ vmovdqu xgft4_lo, [tmp3+vec_i] ;Load array Dx{00}, Dx{01}, Dx{02}, ...
+ vmovdqu xgft4_hi, [tmp3+vec_i+16] ; " Dx{00}, Dx{10}, Dx{20}, ... , Dx{f0}
+ vmovdqu xgft6_lo, [tmp3+mul_array] ;Load array Fx{00}, Fx{01}, ..., Fx{0f}
+ vmovdqu xgft6_hi, [tmp3+mul_array+16] ; " Fx{00}, Fx{10}, ..., Fx{f0}
+
+ mov dest2, [dest1+PS]
+ mov dest3, [dest1+2*PS]
+ mov dest4, [dest1+3*PS] ; reuse mul_array
+ mov dest5, [dest1+4*PS]
+ mov dest6, [dest1+5*PS] ; reuse vec_i
+ mov dest1, [dest1]
+
+.loop16:
+ XLDR x0, [src+pos] ;Get next source vector
+
+ vmovdqu xtmpl1, [tmp3] ;Load array Ax{00}, Ax{01}, Ax{02}, ...
+ vmovdqu xtmph1, [tmp3+16] ; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
+ vmovdqu xtmpl2, [tmp3+vec] ;Load array Bx{00}, Bx{01}, Bx{02}, ...
+ vmovdqu xtmph2, [tmp3+vec+16] ; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
+ vmovdqu xtmpl3, [tmp3+2*vec] ;Load array Cx{00}, Cx{01}, Cx{02}, ...
+ vmovdqu xtmph3, [tmp3+2*vec+16] ; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
+ XLDR xd1, [dest1+pos] ;Get next dest vector
+
+ vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0
+ vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
+ vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
+
+
+ ;dest1
+ vpshufb xtmph1, x0 ;Lookup mul table of high nibble
+ vpshufb xtmpl1, xtmpa ;Lookup mul table of low nibble
+ vpxor xtmph1, xtmpl1 ;GF add high and low partials
+ vpxor xd1, xtmph1
+
+ XLDR xd2, [dest2+pos] ;reuse xtmpl1. Get next dest vector
+ XLDR xd3, [dest3+pos] ;reuse xtmph1. Get next dest vector
+
+ ;dest2
+ vpshufb xtmph2, x0 ;Lookup mul table of high nibble
+ vpshufb xtmpl2, xtmpa ;Lookup mul table of low nibble
+ vpxor xtmph2, xtmpl2 ;GF add high and low partials
+ vpxor xd2, xtmph2
+
+ ;dest3
+ vpshufb xtmph3, x0 ;Lookup mul table of high nibble
+ vpshufb xtmpl3, xtmpa ;Lookup mul table of low nibble
+ vpxor xtmph3, xtmpl3 ;GF add high and low partials
+ vpxor xd3, xtmph3
+
+ XSTR [dest1+pos], xd1 ;Store result into dest1
+ XSTR [dest2+pos], xd2 ;Store result into dest2
+ XSTR [dest3+pos], xd3 ;Store result into dest3
+
+ ;dest4
+ XLDR xd1, [dest4+pos] ;Get next dest vector
+ vpshufb xtmph1, xgft4_hi, x0 ;Lookup mul table of high nibble
+ vpshufb xtmpl1, xgft4_lo, xtmpa ;Lookup mul table of low nibble
+ vpxor xtmph1, xtmph1, xtmpl1 ;GF add high and low partials
+ vpxor xd1, xd1, xtmph1
+
+ XLDR xd2, [dest5+pos] ;reuse xtmpl1. Get next dest vector
+ XLDR xd3, [dest6+pos] ;reuse xtmph1. Get next dest vector
+
+ ;dest5
+ vpshufb xtmph2, xgft5_hi, x0 ;Lookup mul table of high nibble
+ vpshufb xtmpl2, xgft5_lo, xtmpa ;Lookup mul table of low nibble
+ vpxor xtmph2, xtmph2, xtmpl2 ;GF add high and low partials
+ vpxor xd2, xd2, xtmph2
+
+ ;dest6
+ vpshufb xtmph3, xgft6_hi, x0 ;Lookup mul table of high nibble
+ vpshufb xtmpl3, xgft6_lo, xtmpa ;Lookup mul table of low nibble
+ vpxor xtmph3, xtmph3, xtmpl3 ;GF add high and low partials
+ vpxor xd3, xd3, xtmph3
+
+ XSTR [dest4+pos], xd1 ;Store result into dest4
+ XSTR [dest5+pos], xd2 ;Store result into dest5
+ XSTR [dest6+pos], xd3 ;Store result into dest6
+
+ add pos, 16 ;Loop on 16 bytes at a time
+ cmp pos, len
+ jle .loop16
+
+ lea tmp, [len + 16]
+ cmp pos, tmp
+ je .return_pass
+
+.lessthan16:
+ ;; Tail len
+ ;; Do one more overlap pass
+ ;; Overlapped offset length-16
+ mov tmp, len ;Backup len as len=rdi
+
+ XLDR x0, [src+tmp] ;Get next source vector
+ XLDR xd1, [dest4+tmp] ;Get next dest vector
+ XLDR xd2, [dest5+tmp] ;reuse xtmpl1. Get next dest vector
+ XLDR xd3, [dest6+tmp] ;reuse xtmph1. Get next dest vector
+
+ sub len, pos
+
+ vmovdqa xtmph3, [constip16] ;Load const of i + 16
+ vpinsrb xtmpl3, len.w, 15
+ vpshufb xtmpl3, xmask0f ;Broadcast len to all bytes
+ vpcmpgtb xtmpl3, xtmpl3, xtmph3
+
+ vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0
+ vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
+ vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
+
+ ;dest4
+ vpshufb xgft4_hi, xgft4_hi, x0 ;Lookup mul table of high nibble
+ vpshufb xgft4_lo, xgft4_lo, xtmpa ;Lookup mul table of low nibble
+ vpxor xgft4_hi, xgft4_hi, xgft4_lo ;GF add high and low partials
+ vpand xgft4_hi, xgft4_hi, xtmpl3
+ vpxor xd1, xd1, xgft4_hi
+
+ ;dest5
+ vpshufb xgft5_hi, xgft5_hi, x0 ;Lookup mul table of high nibble
+ vpshufb xgft5_lo, xgft5_lo, xtmpa ;Lookup mul table of low nibble
+ vpxor xgft5_hi, xgft5_hi, xgft5_lo ;GF add high and low partials
+ vpand xgft5_hi, xgft5_hi, xtmpl3
+ vpxor xd2, xd2, xgft5_hi
+
+ ;dest6
+ vpshufb xgft6_hi, xgft6_hi, x0 ;Lookup mul table of high nibble
+ vpshufb xgft6_lo, xgft6_lo, xtmpa ;Lookup mul table of low nibble
+ vpxor xgft6_hi, xgft6_hi, xgft6_lo ;GF add high and low partials
+ vpand xgft6_hi, xgft6_hi, xtmpl3
+ vpxor xd3, xd3, xgft6_hi
+
+ XSTR [dest4+tmp], xd1 ;Store result into dest4
+ XSTR [dest5+tmp], xd2 ;Store result into dest5
+ XSTR [dest6+tmp], xd3 ;Store result into dest6
+
+ vmovdqu xgft4_lo, [tmp3] ;Load array Ax{00}, Ax{01}, Ax{02}, ...
+ vmovdqu xgft4_hi, [tmp3+16] ; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
+ vmovdqu xgft5_lo, [tmp3+vec] ;Load array Bx{00}, Bx{01}, Bx{02}, ...
+ vmovdqu xgft5_hi, [tmp3+vec+16] ; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
+ vmovdqu xgft6_lo, [tmp3+2*vec] ;Load array Cx{00}, Cx{01}, Cx{02}, ...
+ vmovdqu xgft6_hi, [tmp3+2*vec+16] ; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
+ XLDR xd1, [dest1+tmp] ;Get next dest vector
+ XLDR xd2, [dest2+tmp] ;reuse xtmpl1. Get next dest vector
+ XLDR xd3, [dest3+tmp] ;reuse xtmph1. Get next dest3 vector
+
+ ;dest1
+ vpshufb xgft4_hi, xgft4_hi, x0 ;Lookup mul table of high nibble
+ vpshufb xgft4_lo, xgft4_lo, xtmpa ;Lookup mul table of low nibble
+ vpxor xgft4_hi, xgft4_hi, xgft4_lo ;GF add high and low partials
+ vpand xgft4_hi, xgft4_hi, xtmpl3
+ vpxor xd1, xd1, xgft4_hi
+
+ ;dest2
+ vpshufb xgft5_hi, xgft5_hi, x0 ;Lookup mul table of high nibble
+ vpshufb xgft5_lo, xgft5_lo, xtmpa ;Lookup mul table of low nibble
+ vpxor xgft5_hi, xgft5_hi, xgft5_lo ;GF add high and low partials
+ vpand xgft5_hi, xgft5_hi, xtmpl3
+ vpxor xd2, xd2, xgft5_hi
+
+ ;dest3
+ vpshufb xgft6_hi, xgft6_hi, x0 ;Lookup mul table of high nibble
+ vpshufb xgft6_lo, xgft6_lo, xtmpa ;Lookup mul table of low nibble
+ vpxor xgft6_hi, xgft6_hi, xgft6_lo ;GF add high and low partials
+ vpand xgft6_hi, xgft6_hi, xtmpl3
+ vpxor xd3, xd3, xgft6_hi
+
+ XSTR [dest1+tmp], xd1 ;Store result into dest1
+ XSTR [dest2+tmp], xd2 ;Store result into dest2
+ XSTR [dest3+tmp], xd3 ;Store result into dest3
+
+.return_pass:
+ FUNC_RESTORE
+ mov return, 0
+ ret
+
+.return_fail:
+ FUNC_RESTORE
+ mov return, 1
+ ret
+
+endproc_frame
+
+section .data
+
+align 16
+mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
+constip16:
+ ddq 0xf0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
+
+%macro slversion 4
+global %1_slver_%2%3%4
+global %1_slver
+%1_slver:
+%1_slver_%2%3%4:
+ dw 0x%4
+ db 0x%3, 0x%2
+%endmacro
+;;; func core, ver, snum
+slversion gf_6vect_mad_avx, 02, 00, 0210
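
Each mad kernel handles a fixed number of destinations, the six-wide variant above being the widest, so covering an arbitrary number of parity rows means peeling off six rows at a time and finishing with one narrower call. A sketch of that dispatch follows; the kernel table, its indexing, and the stride of 32 bytes per (row, source) table are stated as assumptions rather than as ISA-L's actual high-level code.

    /* Prototype shape taken from the ';;; gf_Nvect_mad_*(len, vec, vec_i,
     * mul_array, src, dest)' headers above; everything else is illustrative. */
    typedef void (*gf_mad_fn)(int len, int vec, int vec_i,
                              unsigned char *gftbls,
                              unsigned char *src, unsigned char **dest);

    static void encode_update_sketch(int len, int k, int rows, int vec_i,
                                     unsigned char *g_tbls, unsigned char *src,
                                     unsigned char **parity,
                                     gf_mad_fn kernels[7]) /* kernels[n]: n outputs */
    {
        while (rows > 6) {              /* peel off six parity rows at a time */
            kernels[6](len, k, vec_i, g_tbls, src, parity);
            g_tbls += 6 * k * 32;       /* 32 bytes per (row, source) table   */
            parity += 6;
            rows   -= 6;
        }
        if (rows > 0)                   /* remaining 1..6 rows in one call    */
            kernels[rows](len, k, vec_i, g_tbls, src, parity);
    }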
diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_6vect_mad_avx2.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_6vect_mad_avx2.asm.s
new file mode 100644
index 00000000000..d83847ab6c8
--- /dev/null
+++ b/src/erasure-code/isa/isa-l/erasure_code/gf_6vect_mad_avx2.asm.s
@@ -0,0 +1,407 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;
+;;; gf_6vect_mad_avx2(len, vec, vec_i, mul_array, src, dest);
+;;;
+
+
+%define PS 8
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg0 rcx
+ %define arg0.w ecx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+ %define arg4 r12
+ %define arg5 r15
+ %define tmp r11
+ %define tmp.w r11d
+ %define tmp.b r11b
+ %define tmp2 r10
+ %define tmp3 r13
+ %define return rax
+ %define return.w eax
+ %define stack_size 16*10 + 3*8
+ %define arg(x) [rsp + stack_size + PS + PS*x]
+ %define func(x) proc_frame x
+
+%macro FUNC_SAVE 0
+ sub rsp, stack_size
+ movdqa [rsp+16*0],xmm6
+ movdqa [rsp+16*1],xmm7
+ movdqa [rsp+16*2],xmm8
+ movdqa [rsp+16*3],xmm9
+ movdqa [rsp+16*4],xmm10
+ movdqa [rsp+16*5],xmm11
+ movdqa [rsp+16*6],xmm12
+ movdqa [rsp+16*7],xmm13
+ movdqa [rsp+16*8],xmm14
+ movdqa [rsp+16*9],xmm15
+ save_reg r12, 10*16 + 0*8
+ save_reg r13, 10*16 + 1*8
+ save_reg r15, 10*16 + 2*8
+ end_prolog
+ mov arg4, arg(4)
+ mov arg5, arg(5)
+%endmacro
+
+%macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp+16*0]
+ movdqa xmm7, [rsp+16*1]
+ movdqa xmm8, [rsp+16*2]
+ movdqa xmm9, [rsp+16*3]
+ movdqa xmm10, [rsp+16*4]
+ movdqa xmm11, [rsp+16*5]
+ movdqa xmm12, [rsp+16*6]
+ movdqa xmm13, [rsp+16*7]
+ movdqa xmm14, [rsp+16*8]
+ movdqa xmm15, [rsp+16*9]
+ mov r12, [rsp + 10*16 + 0*8]
+ mov r13, [rsp + 10*16 + 1*8]
+	mov	r15,  [rsp + 10*16 + 2*8]
+ add rsp, stack_size
+%endmacro
+
+%elifidn __OUTPUT_FORMAT__, elf64
+ %define arg0 rdi
+ %define arg0.w edi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+ %define arg4 r8
+ %define arg5 r9
+ %define tmp r11
+ %define tmp.w r11d
+ %define tmp.b r11b
+ %define tmp2 r10
+ %define tmp3 r12
+ %define return rax
+ %define return.w eax
+
+ %define func(x) x:
+ %macro FUNC_SAVE 0
+ push r12
+ %endmacro
+ %macro FUNC_RESTORE 0
+ pop r12
+ %endmacro
+%endif
+
+;;; gf_6vect_mad_avx2(len, vec, vec_i, mul_array, src, dest)
+%define len arg0
+%define len.w arg0.w
+%define vec arg1
+%define vec_i arg2
+%define mul_array arg3
+%define src arg4
+%define dest1 arg5
+%define pos return
+%define pos.w return.w
+
+%define dest2 tmp3
+%define dest3 tmp2
+%define dest4 mul_array
+%define dest5 vec
+%define dest6 vec_i
+
+%ifndef EC_ALIGNED_ADDR
+;;; Use Un-aligned load/store
+ %define XLDR vmovdqu
+ %define XSTR vmovdqu
+%else
+;;; Use Non-temporal load/store
+ %ifdef NO_NT_LDST
+ %define XLDR vmovdqa
+ %define XSTR vmovdqa
+ %else
+ %define XLDR vmovntdqa
+ %define XSTR vmovntdq
+ %endif
+%endif
+
+
+default rel
+
+[bits 64]
+section .text
+
+%define xmask0f ymm15
+%define xmask0fx xmm15
+%define xgft1_lo ymm14
+%define xgft2_lo ymm13
+%define xgft3_lo ymm12
+%define xgft4_lo ymm11
+%define xgft5_lo ymm10
+%define xgft6_lo ymm9
+
+%define x0 ymm0
+%define xtmpa ymm1
+%define xtmpl ymm2
+%define xtmplx xmm2
+%define xtmph ymm3
+%define xtmphx xmm3
+%define xd1 ymm4
+%define xd2 ymm5
+%define xd3 ymm6
+%define xd4 ymm7
+%define xd5 ymm8
+%define xd6 xd1
+
+align 16
+global gf_6vect_mad_avx2:function
+func(gf_6vect_mad_avx2)
+ FUNC_SAVE
+ sub len, 32
+ jl .return_fail
+ xor pos, pos
+ mov tmp.b, 0x0f
+ vpinsrb xmask0fx, xmask0fx, tmp.w, 0
+ vpbroadcastb xmask0f, xmask0fx ;Construct mask 0x0f0f0f...
+
+ sal vec_i, 5 ;Multiply by 32
+ sal vec, 5 ;Multiply by 32
+ lea tmp, [mul_array + vec_i]
+ mov vec_i, vec
+ mov mul_array, vec
+ sal vec_i, 1
+ sal mul_array, 1
+ add vec_i, vec ;vec_i=vec*96
+	add	mul_array, vec_i	;mul_array=vec*160
+
+ vmovdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, ..., Ax{0f}
+ ; " Ax{00}, Ax{10}, ..., Ax{f0}
+ vmovdqu xgft2_lo, [tmp+vec] ;Load array Bx{00}, Bx{01}, ..., Bx{0f}
+ ; " Bx{00}, Bx{10}, ..., Bx{f0}
+ vmovdqu xgft3_lo, [tmp+2*vec] ;Load array Cx{00}, Cx{01}, ..., Cx{0f}
+ ; " Cx{00}, Cx{10}, ..., Cx{f0}
+	vmovdqu	xgft4_lo, [tmp+vec_i]	;Load array Dx{00}, Dx{01}, ..., Dx{0f}
+					; " Dx{00}, Dx{10}, ..., Dx{f0}
+ vmovdqu xgft5_lo, [tmp+4*vec] ;Load array Ex{00}, Ex{01}, ..., Ex{0f}
+ ; " Ex{00}, Ex{10}, ..., Ex{f0}
+	vmovdqu	xgft6_lo, [tmp+mul_array]	;Load array Fx{00}, Fx{01}, ..., Fx{0f}
+					; " Fx{00}, Fx{10}, ..., Fx{f0}
+
+ mov dest2, [dest1+PS] ; reuse tmp3
+ mov dest3, [dest1+2*PS] ; reuse tmp2
+ mov dest4, [dest1+3*PS] ; reuse mul_array
+ mov dest5, [dest1+4*PS] ; reuse vec
+ mov dest6, [dest1+5*PS] ; reuse vec_i
+ mov dest1, [dest1]
+
+.loop32:
+ XLDR x0, [src+pos] ;Get next source vector
+ XLDR xd1, [dest1+pos] ;Get next dest vector
+ XLDR xd2, [dest2+pos] ;Get next dest vector
+ XLDR xd3, [dest3+pos] ;Get next dest vector
+ XLDR xd4, [dest4+pos] ;Get next dest vector
+ XLDR xd5, [dest5+pos] ;Get next dest vector
+
+ vpand xtmpl, x0, xmask0f ;Mask low src nibble in bits 4-0
+ vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
+ vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
+ vperm2i128 xtmpa, xtmpl, x0, 0x30 ;swap xtmpa from 1lo|2lo to 1lo|2hi
+ vperm2i128 x0, xtmpl, x0, 0x12 ;swap x0 from 1hi|2hi to 1hi|2lo
+
+ ;dest1
+ vperm2i128 xtmph, xgft1_lo, xgft1_lo, 0x01 ; swapped to hi | lo
+ vpshufb xtmph, xtmph, x0 ;Lookup mul table of high nibble
+ vpshufb xtmpl, xgft1_lo, xtmpa ;Lookup mul table of low nibble
+ vpxor xtmph, xtmph, xtmpl ;GF add high and low partials
+ vpxor xd1, xd1, xtmph ;xd1 += partial
+
+ XSTR [dest1+pos], xd1 ;Store result into dest1
+
+ ;dest2
+ vperm2i128 xtmph, xgft2_lo, xgft2_lo, 0x01 ; swapped to hi | lo
+ vpshufb xtmph, xtmph, x0 ;Lookup mul table of high nibble
+ vpshufb xtmpl, xgft2_lo, xtmpa ;Lookup mul table of low nibble
+ vpxor xtmph, xtmph, xtmpl ;GF add high and low partials
+ vpxor xd2, xd2, xtmph ;xd2 += partial
+
+ ;dest3
+ vperm2i128 xtmph, xgft3_lo, xgft3_lo, 0x01 ; swapped to hi | lo
+ vpshufb xtmph, xtmph, x0 ;Lookup mul table of high nibble
+ vpshufb xtmpl, xgft3_lo, xtmpa ;Lookup mul table of low nibble
+ vpxor xtmph, xtmph, xtmpl ;GF add high and low partials
+ vpxor xd3, xd3, xtmph ;xd3 += partial
+
+ XLDR xd6, [dest6+pos] ;reuse xd1. Get next dest vector
+
+ ;dest4
+ vperm2i128 xtmph, xgft4_lo, xgft4_lo, 0x01 ; swapped to hi | lo
+ vpshufb xtmph, xtmph, x0 ;Lookup mul table of high nibble
+ vpshufb xtmpl, xgft4_lo, xtmpa ;Lookup mul table of low nibble
+ vpxor xtmph, xtmph, xtmpl ;GF add high and low partials
+ vpxor xd4, xd4, xtmph ;xd4 += partial
+
+ ;dest5
+ vperm2i128 xtmph, xgft5_lo, xgft5_lo, 0x01 ; swapped to hi | lo
+ vpshufb xtmph, xtmph, x0 ;Lookup mul table of high nibble
+ vpshufb xtmpl, xgft5_lo, xtmpa ;Lookup mul table of low nibble
+ vpxor xtmph, xtmph, xtmpl ;GF add high and low partials
+ vpxor xd5, xd5, xtmph ;xd5 += partial
+
+ ;dest6
+ vperm2i128 xtmph, xgft6_lo, xgft6_lo, 0x01 ; swapped to hi | lo
+ vpshufb xtmph, xtmph, x0 ;Lookup mul table of high nibble
+ vpshufb xtmpl, xgft6_lo, xtmpa ;Lookup mul table of low nibble
+ vpxor xtmph, xtmph, xtmpl ;GF add high and low partials
+ vpxor xd6, xd6, xtmph ;xd6 += partial
+
+ XSTR [dest2+pos], xd2 ;Store result into dest2
+ XSTR [dest3+pos], xd3 ;Store result into dest3
+ XSTR [dest4+pos], xd4 ;Store result into dest4
+ XSTR [dest5+pos], xd5 ;Store result into dest5
+ XSTR [dest6+pos], xd6 ;Store result into dest6
+
+ add pos, 32 ;Loop on 32 bytes at a time
+ cmp pos, len
+ jle .loop32
+
+ lea tmp, [len + 32]
+ cmp pos, tmp
+ je .return_pass
+
+.lessthan32:
+ ;; Tail len
+ ;; Do one more overlap pass
+ mov tmp.b, 0x1f
+ vpinsrb xtmphx, xtmphx, tmp.w, 0
+ vpbroadcastb xtmph, xtmphx ;Construct mask 0x1f1f1f...
+
+ mov tmp, len ;Overlapped offset length-32
+
+ XLDR x0, [src+tmp] ;Get next source vector
+ XLDR xd1, [dest1+tmp] ;Get next dest vector
+ XLDR xd2, [dest2+tmp] ;Get next dest vector
+ XLDR xd3, [dest3+tmp] ;Get next dest vector
+ XLDR xd4, [dest4+tmp] ;Get next dest vector
+ XLDR xd5, [dest5+tmp] ;Get next dest vector
+
+ sub len, pos
+
+ vpinsrb xtmplx, xtmplx, len.w, 15
+ vinserti128 xtmpl, xtmpl, xtmplx, 1 ;swapped to xtmplx | xtmplx
+ vpshufb xtmpl, xtmpl, xtmph ;Broadcast len to all bytes. xtmph=0x1f1f1f...
+ vpcmpgtb xtmpl, xtmpl, [constip32]
+
+ vpand xtmph, x0, xmask0f ;Mask low src nibble in bits 4-0
+ vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
+ vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
+ vperm2i128 xtmpa, xtmph, x0, 0x30 ;swap xtmpa from 1lo|2lo to 1lo|2hi
+ vperm2i128 x0, xtmph, x0, 0x12 ;swap x0 from 1hi|2hi to 1hi|2lo
+
+ ;dest1
+ vperm2i128 xtmph, xgft1_lo, xgft1_lo, 0x01 ; swapped to hi | lo
+ vpshufb xtmph, xtmph, x0 ;Lookup mul table of high nibble
+ vpshufb xgft1_lo, xgft1_lo, xtmpa ;Lookup mul table of low nibble
+ vpxor xtmph, xtmph, xgft1_lo ;GF add high and low partials
+ vpand xtmph, xtmph, xtmpl
+ vpxor xd1, xd1, xtmph ;xd1 += partial
+
+ XSTR [dest1+tmp], xd1 ;Store result into dest1
+
+ ;dest2
+ vperm2i128 xtmph, xgft2_lo, xgft2_lo, 0x01 ; swapped to hi | lo
+ vpshufb xtmph, xtmph, x0 ;Lookup mul table of high nibble
+ vpshufb xgft2_lo, xgft2_lo, xtmpa ;Lookup mul table of low nibble
+ vpxor xtmph, xtmph, xgft2_lo ;GF add high and low partials
+ vpand xtmph, xtmph, xtmpl
+ vpxor xd2, xd2, xtmph ;xd2 += partial
+
+ ;dest3
+ vperm2i128 xtmph, xgft3_lo, xgft3_lo, 0x01 ; swapped to hi | lo
+ vpshufb xtmph, xtmph, x0 ;Lookup mul table of high nibble
+ vpshufb xgft3_lo, xgft3_lo, xtmpa ;Lookup mul table of low nibble
+ vpxor xtmph, xtmph, xgft3_lo ;GF add high and low partials
+ vpand xtmph, xtmph, xtmpl
+ vpxor xd3, xd3, xtmph ;xd3 += partial
+
+ XLDR xd6, [dest6+tmp] ;reuse xd1. Get next dest vector
+
+ ;dest4
+ vperm2i128 xtmph, xgft4_lo, xgft4_lo, 0x01 ; swapped to hi | lo
+ vpshufb xtmph, xtmph, x0 ;Lookup mul table of high nibble
+ vpshufb xgft4_lo, xgft4_lo, xtmpa ;Lookup mul table of low nibble
+ vpxor xtmph, xtmph, xgft4_lo ;GF add high and low partials
+ vpand xtmph, xtmph, xtmpl
+ vpxor xd4, xd4, xtmph ;xd4 += partial
+
+ ;dest5
+ vperm2i128 xtmph, xgft5_lo, xgft5_lo, 0x01 ; swapped to hi | lo
+ vpshufb xtmph, xtmph, x0 ;Lookup mul table of high nibble
+ vpshufb xgft5_lo, xgft5_lo, xtmpa ;Lookup mul table of low nibble
+ vpxor xtmph, xtmph, xgft5_lo ;GF add high and low partials
+ vpand xtmph, xtmph, xtmpl
+ vpxor xd5, xd5, xtmph ;xd5 += partial
+
+ ;dest6
+ vperm2i128 xtmph, xgft6_lo, xgft6_lo, 0x01 ; swapped to hi | lo
+ vpshufb xtmph, xtmph, x0 ;Lookup mul table of high nibble
+ vpshufb xgft6_lo, xgft6_lo, xtmpa ;Lookup mul table of low nibble
+ vpxor xtmph, xtmph, xgft6_lo ;GF add high and low partials
+ vpand xtmph, xtmph, xtmpl
+ vpxor xd6, xd6, xtmph ;xd6 += partial
+
+ XSTR [dest2+tmp], xd2 ;Store result into dest2
+ XSTR [dest3+tmp], xd3 ;Store result into dest3
+ XSTR [dest4+tmp], xd4 ;Store result into dest4
+ XSTR [dest5+tmp], xd5 ;Store result into dest5
+ XSTR [dest6+tmp], xd6 ;Store result into dest6
+
+.return_pass:
+ FUNC_RESTORE
+ mov return, 0
+ ret
+
+.return_fail:
+ FUNC_RESTORE
+ mov return, 1
+ ret
+
+endproc_frame
+
+section .data
+align 32
+constip32:
+ ddq 0xf0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
+ ddq 0xe0e1e2e3e4e5e6e7e8e9eaebecedeeef
+
+%macro slversion 4
+global %1_slver_%2%3%4
+global %1_slver
+%1_slver:
+%1_slver_%2%3%4:
+ dw 0x%4
+ db 0x%3, 0x%2
+%endmacro
+;;; func core, ver, snum
+slversion gf_6vect_mad_avx2, 04, 00, 0211
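Aside for readers tracing the kernels in this patch: every gf_*vect_mad_* routine performs the same per-byte GF(2^8) multiply-accumulate, split into a low-nibble and a high-nibble table lookup (the xgft*_lo/xgft*_hi loads above). The scalar C fragment below is only an illustrative rendering of that operation under the table layout the comments describe (32 bytes per coefficient: products for 0x00..0x0f, then for 0x00,0x10,...,0xf0); it is not part of ISA-L and the function name is made up for this sketch.

#include <stddef.h>
#include <stdint.h>

/* Scalar sketch of one 32-byte g_tbls entry being applied to a source
 * buffer: tbl[0..15] hold coef*{0x00..0x0f}, tbl[16..31] hold
 * coef*{0x00,0x10,...,0xf0}.  The SIMD kernels do the same lookups with
 * pshufb/vpshufb, 16 or 32 bytes per step. */
static void gf_vect_mad_scalar(size_t len, const uint8_t *tbl,
                               const uint8_t *src, uint8_t *dest)
{
	for (size_t i = 0; i < len; i++) {
		uint8_t lo = src[i] & 0x0f;             /* low nibble         */
		uint8_t hi = src[i] >> 4;               /* high nibble        */
		uint8_t prod = tbl[lo] ^ tbl[16 + hi];  /* GF add of partials */
		dest[i] ^= prod;                        /* accumulate parity  */
	}
}

The six-destination variants simply repeat this with six different 32-byte table entries against six destination buffers, so each source byte is loaded from memory only once.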
diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_6vect_mad_sse.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_6vect_mad_sse.asm.s
new file mode 100644
index 00000000000..f9b4eecd171
--- /dev/null
+++ b/src/erasure-code/isa/isa-l/erasure_code/gf_6vect_mad_sse.asm.s
@@ -0,0 +1,412 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;
+;;; gf_6vect_mad_sse(len, vec, vec_i, mul_array, src, dest);
+;;;
+
+%define PS 8
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg0 rcx
+ %define arg0.w ecx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+ %define arg4 r12
+ %define arg5 r15
+ %define tmp r11
+ %define tmp.w r11d
+ %define tmp2 r10
+ %define tmp3 r13
+ %define tmp4 r14
+ %define tmp5 rdi
+ %define return rax
+ %define return.w eax
+ %define stack_size 16*10 + 5*8
+ %define arg(x) [rsp + stack_size + PS + PS*x]
+ %define func(x) proc_frame x
+
+%macro FUNC_SAVE 0
+ sub rsp, stack_size
+ movdqa [rsp+16*0],xmm6
+ movdqa [rsp+16*1],xmm7
+ movdqa [rsp+16*2],xmm8
+ movdqa [rsp+16*3],xmm9
+ movdqa [rsp+16*4],xmm10
+ movdqa [rsp+16*5],xmm11
+ movdqa [rsp+16*6],xmm12
+ movdqa [rsp+16*7],xmm13
+ movdqa [rsp+16*8],xmm14
+ movdqa [rsp+16*9],xmm15
+ save_reg r12, 10*16 + 0*8
+ save_reg r13, 10*16 + 1*8
+ save_reg r14, 10*16 + 2*8
+ save_reg r15, 10*16 + 3*8
+ save_reg rdi, 10*16 + 4*8
+ end_prolog
+ mov arg4, arg(4)
+ mov arg5, arg(5)
+%endmacro
+
+%macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp+16*0]
+ movdqa xmm7, [rsp+16*1]
+ movdqa xmm8, [rsp+16*2]
+ movdqa xmm9, [rsp+16*3]
+ movdqa xmm10, [rsp+16*4]
+ movdqa xmm11, [rsp+16*5]
+ movdqa xmm12, [rsp+16*6]
+ movdqa xmm13, [rsp+16*7]
+ movdqa xmm14, [rsp+16*8]
+ movdqa xmm15, [rsp+16*9]
+ mov r12, [rsp + 10*16 + 0*8]
+ mov r13, [rsp + 10*16 + 1*8]
+ mov r14, [rsp + 10*16 + 2*8]
+ mov r15, [rsp + 10*16 + 3*8]
+ mov rdi, [rsp + 10*16 + 4*8]
+ add rsp, stack_size
+%endmacro
+
+%elifidn __OUTPUT_FORMAT__, elf64
+ %define arg0 rdi
+ %define arg0.w edi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+ %define arg4 r8
+ %define arg5 r9
+ %define tmp r11
+ %define tmp.w r11d
+ %define tmp2 r10
+ %define tmp3 r12
+ %define tmp4 r13
+ %define tmp5 r14
+ %define return rax
+ %define return.w eax
+
+ %define func(x) x:
+ %macro FUNC_SAVE 0
+ push r12
+ push r13
+ push r14
+ %endmacro
+ %macro FUNC_RESTORE 0
+ pop r14
+ pop r13
+ pop r12
+ %endmacro
+%endif
+
+;;; gf_6vect_mad_sse(len, vec, vec_i, mul_array, src, dest)
+%define len arg0
+%define len.w arg0.w
+%define vec arg1
+%define vec_i arg2
+%define mul_array arg3
+%define src arg4
+%define dest1 arg5
+%define pos return
+%define pos.w return.w
+
+%define dest2 mul_array
+%define dest3 tmp2
+%define dest4 tmp4
+%define dest5 tmp5
+%define dest6 vec_i
+
+%ifndef EC_ALIGNED_ADDR
+;;; Use Un-aligned load/store
+ %define XLDR movdqu
+ %define XSTR movdqu
+%else
+;;; Use Non-temporal load/store
+ %ifdef NO_NT_LDST
+ %define XLDR movdqa
+ %define XSTR movdqa
+ %else
+ %define XLDR movntdqa
+ %define XSTR movntdq
+ %endif
+%endif
+
+default rel
+
+[bits 64]
+section .text
+
+%define xmask0f xmm15
+%define xgft4_lo xmm14
+%define xgft4_hi xmm13
+%define xgft5_lo xmm12
+%define xgft5_hi xmm11
+%define xgft6_lo xmm10
+%define xgft6_hi xmm9
+
+%define x0 xmm0
+%define xtmpa xmm1
+%define xtmph1 xmm2
+%define xtmpl1 xmm3
+%define xtmph2 xmm4
+%define xtmpl2 xmm5
+%define xtmph3 xmm6
+%define xtmpl3 xmm7
+%define xd1 xmm8
+%define xd2 xtmpl1
+%define xd3 xtmph1
+
+
+align 16
+global gf_6vect_mad_sse:function
+func(gf_6vect_mad_sse)
+ FUNC_SAVE
+ sub len, 16
+ jl .return_fail
+
+ xor pos, pos
+ movdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte
+
+ mov tmp, vec
+ sal vec_i, 5 ;Multiply by 32
+ lea tmp3, [mul_array + vec_i]
+ sal tmp, 6 ;Multiply by 64
+
+ sal vec, 5 ;Multiply by 32
+	lea	vec_i, [tmp + vec]	;vec_i = vec*96
+	lea	mul_array, [tmp + vec_i]	;mul_array = vec*160
+
+ movdqu xgft5_lo, [tmp3+2*tmp] ;Load array Ex{00}, Ex{01}, ..., Ex{0f}
+ movdqu xgft5_hi, [tmp3+2*tmp+16] ; " Ex{00}, Ex{10}, ..., Ex{f0}
+ movdqu xgft4_lo, [tmp3+vec_i] ;Load array Dx{00}, Dx{01}, Dx{02}, ...
+ movdqu xgft4_hi, [tmp3+vec_i+16] ; " Dx{00}, Dx{10}, Dx{20}, ... , Dx{f0}
+ movdqu xgft6_lo, [tmp3+mul_array] ;Load array Fx{00}, Fx{01}, ..., Fx{0f}
+ movdqu xgft6_hi, [tmp3+mul_array+16] ; " Fx{00}, Fx{10}, ..., Fx{f0}
+
+ mov dest2, [dest1+PS]
+ mov dest3, [dest1+2*PS]
+ mov dest4, [dest1+3*PS] ; reuse mul_array
+ mov dest5, [dest1+4*PS]
+ mov dest6, [dest1+5*PS] ; reuse vec_i
+ mov dest1, [dest1]
+
+.loop16:
+ XLDR x0, [src+pos] ;Get next source vector
+
+ movdqu xtmpl1, [tmp3] ;Load array Ax{00}, Ax{01}, Ax{02}, ...
+ movdqu xtmph1, [tmp3+16] ; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
+ movdqu xtmpl2, [tmp3+vec] ;Load array Bx{00}, Bx{01}, Bx{02}, ...
+ movdqu xtmph2, [tmp3+vec+16] ; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
+ movdqu xtmpl3, [tmp3+2*vec] ;Load array Cx{00}, Cx{01}, Cx{02}, ...
+ movdqu xtmph3, [tmp3+2*vec+16] ; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
+ XLDR xd1, [dest1+pos] ;Get next dest vector
+
+ movdqa xtmpa, x0 ;Keep unshifted copy of src
+ psraw x0, 4 ;Shift to put high nibble into bits 4-0
+ pand x0, xmask0f ;Mask high src nibble in bits 4-0
+ pand xtmpa, xmask0f ;Mask low src nibble in bits 4-0
+
+ ;dest1
+ pshufb xtmph1, x0 ;Lookup mul table of high nibble
+ pshufb xtmpl1, xtmpa ;Lookup mul table of low nibble
+ pxor xtmph1, xtmpl1 ;GF add high and low partials
+ pxor xd1, xtmph1
+
+ XLDR xd2, [dest2+pos] ;reuse xtmpl1. Get next dest vector
+ XLDR xd3, [dest3+pos] ;reuse xtmph1. Get next dest3 vector
+
+ ;dest2
+ pshufb xtmph2, x0 ;Lookup mul table of high nibble
+ pshufb xtmpl2, xtmpa ;Lookup mul table of low nibble
+ pxor xtmph2, xtmpl2 ;GF add high and low partials
+ pxor xd2, xtmph2
+
+ ;dest3
+ pshufb xtmph3, x0 ;Lookup mul table of high nibble
+ pshufb xtmpl3, xtmpa ;Lookup mul table of low nibble
+ pxor xtmph3, xtmpl3 ;GF add high and low partials
+ pxor xd3, xtmph3
+
+ XSTR [dest1+pos], xd1 ;Store result into dest1
+ XSTR [dest2+pos], xd2 ;Store result into dest2
+ XSTR [dest3+pos], xd3 ;Store result into dest3
+
+ movdqa xtmph1, xgft4_hi ;Reload const array registers
+ movdqa xtmpl1, xgft4_lo ;Reload const array registers
+ movdqa xtmph2, xgft5_hi ;Reload const array registers
+ movdqa xtmpl2, xgft5_lo ;Reload const array registers
+ movdqa xtmph3, xgft6_hi ;Reload const array registers
+ movdqa xtmpl3, xgft6_lo ;Reload const array registers
+
+ ;dest4
+ XLDR xd1, [dest4+pos] ;Get next dest vector
+ pshufb xtmph1, x0 ;Lookup mul table of high nibble
+ pshufb xtmpl1, xtmpa ;Lookup mul table of low nibble
+ pxor xtmph1, xtmpl1 ;GF add high and low partials
+ pxor xd1, xtmph1
+
+ XLDR xd2, [dest5+pos] ;reuse xtmpl1. Get next dest vector
+ XLDR xd3, [dest6+pos] ;reuse xtmph1. Get next dest vector
+
+ ;dest5
+ pshufb xtmph2, x0 ;Lookup mul table of high nibble
+ pshufb xtmpl2, xtmpa ;Lookup mul table of low nibble
+ pxor xtmph2, xtmpl2 ;GF add high and low partials
+ pxor xd2, xtmph2
+
+ ;dest6
+ pshufb xtmph3, x0 ;Lookup mul table of high nibble
+ pshufb xtmpl3, xtmpa ;Lookup mul table of low nibble
+ pxor xtmph3, xtmpl3 ;GF add high and low partials
+ pxor xd3, xtmph3
+
+ XSTR [dest4+pos], xd1 ;Store result into dest4
+ XSTR [dest5+pos], xd2 ;Store result into dest5
+ XSTR [dest6+pos], xd3 ;Store result into dest6
+
+ add pos, 16 ;Loop on 16 bytes at a time
+ cmp pos, len
+ jle .loop16
+
+ lea tmp, [len + 16]
+ cmp pos, tmp
+ je .return_pass
+
+.lessthan16:
+ ;; Tail len
+ ;; Do one more overlap pass
+ ;; Overlapped offset length-16
+ mov tmp, len ;Backup len as len=rdi
+
+ XLDR x0, [src+tmp] ;Get next source vector
+ XLDR xd1, [dest4+tmp] ;Get next dest vector
+ XLDR xd2, [dest5+tmp] ;reuse xtmpl1. Get next dest vector
+ XLDR xd3, [dest6+tmp] ;reuse xtmph1. Get next dest vector
+
+ sub len, pos
+
+ movdqa xtmph3, [constip16] ;Load const of i + 16
+ pinsrb xtmpl3, len.w, 15
+ pshufb xtmpl3, xmask0f ;Broadcast len to all bytes
+ pcmpgtb xtmpl3, xtmph3
+
+ movdqa xtmpa, x0 ;Keep unshifted copy of src
+ psraw x0, 4 ;Shift to put high nibble into bits 4-0
+ pand x0, xmask0f ;Mask high src nibble in bits 4-0
+ pand xtmpa, xmask0f ;Mask low src nibble in bits 4-0
+
+ ;dest4
+ pshufb xgft4_hi, x0 ;Lookup mul table of high nibble
+ pshufb xgft4_lo, xtmpa ;Lookup mul table of low nibble
+ pxor xgft4_hi, xgft4_lo ;GF add high and low partials
+ pand xgft4_hi, xtmpl3
+ pxor xd1, xgft4_hi
+
+ ;dest5
+ pshufb xgft5_hi, x0 ;Lookup mul table of high nibble
+ pshufb xgft5_lo, xtmpa ;Lookup mul table of low nibble
+ pxor xgft5_hi, xgft5_lo ;GF add high and low partials
+ pand xgft5_hi, xtmpl3
+ pxor xd2, xgft5_hi
+
+ ;dest6
+ pshufb xgft6_hi, x0 ;Lookup mul table of high nibble
+ pshufb xgft6_lo, xtmpa ;Lookup mul table of low nibble
+ pxor xgft6_hi, xgft6_lo ;GF add high and low partials
+ pand xgft6_hi, xtmpl3
+ pxor xd3, xgft6_hi
+
+ XSTR [dest4+tmp], xd1 ;Store result into dest4
+ XSTR [dest5+tmp], xd2 ;Store result into dest5
+ XSTR [dest6+tmp], xd3 ;Store result into dest6
+
+ movdqu xgft4_lo, [tmp3] ;Load array Ax{00}, Ax{01}, Ax{02}, ...
+ movdqu xgft4_hi, [tmp3+16] ; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0}
+ movdqu xgft5_lo, [tmp3+vec] ;Load array Bx{00}, Bx{01}, Bx{02}, ...
+ movdqu xgft5_hi, [tmp3+vec+16] ; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0}
+ movdqu xgft6_lo, [tmp3+2*vec] ;Load array Cx{00}, Cx{01}, Cx{02}, ...
+ movdqu xgft6_hi, [tmp3+2*vec+16] ; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
+ XLDR xd1, [dest1+tmp] ;Get next dest vector
+ XLDR xd2, [dest2+tmp] ;reuse xtmpl1. Get next dest vector
+ XLDR xd3, [dest3+tmp] ;reuse xtmph1. Get next dest3 vector
+
+ ;dest1
+ pshufb xgft4_hi, x0 ;Lookup mul table of high nibble
+ pshufb xgft4_lo, xtmpa ;Lookup mul table of low nibble
+ pxor xgft4_hi, xgft4_lo ;GF add high and low partials
+ pand xgft4_hi, xtmpl3
+ pxor xd1, xgft4_hi
+
+ ;dest2
+ pshufb xgft5_hi, x0 ;Lookup mul table of high nibble
+ pshufb xgft5_lo, xtmpa ;Lookup mul table of low nibble
+ pxor xgft5_hi, xgft5_lo ;GF add high and low partials
+ pand xgft5_hi, xtmpl3
+ pxor xd2, xgft5_hi
+
+ ;dest3
+ pshufb xgft6_hi, x0 ;Lookup mul table of high nibble
+ pshufb xgft6_lo, xtmpa ;Lookup mul table of low nibble
+ pxor xgft6_hi, xgft6_lo ;GF add high and low partials
+ pand xgft6_hi, xtmpl3
+ pxor xd3, xgft6_hi
+
+ XSTR [dest1+tmp], xd1 ;Store result into dest1
+ XSTR [dest2+tmp], xd2 ;Store result into dest2
+ XSTR [dest3+tmp], xd3 ;Store result into dest3
+
+.return_pass:
+ FUNC_RESTORE
+ mov return, 0
+ ret
+
+.return_fail:
+ FUNC_RESTORE
+ mov return, 1
+ ret
+
+endproc_frame
+
+section .data
+
+align 16
+
+mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
+constip16:
+ ddq 0xf0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
+
+%macro slversion 4
+global %1_slver_%2%3%4
+global %1_slver
+%1_slver:
+%1_slver_%2%3%4:
+ dw 0x%4
+ db 0x%3, 0x%2
+%endmacro
+;;; func core, ver, snum
+slversion gf_6vect_mad_sse, 00, 00, 020f
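The two 6-vector mad files above also share a tail strategy, visible around constip32/constip16: the last, partial block is reprocessed at offset len-32 (or len-16) and a per-byte mask, built by comparing the remaining length against a descending index constant, keeps the already-processed bytes from being updated twice. A scalar sketch of that idea follows; gf_mul_one_byte() is an illustrative stand-in for the nibble lookups, not an ISA-L symbol.

#include <stdint.h>

/* Illustrative stand-in for the nibble-table lookup. */
static uint8_t gf_mul_one_byte(const uint8_t *tbl, uint8_t s)
{
	return tbl[s & 0x0f] ^ tbl[16 + (s >> 4)];
}

/* Sketch of the masked overlap pass for a 16-byte-wide kernel: the main
 * loop has covered [0, len & ~15); the window [len-16, len) is redone and
 * only indices >= 16 - tail (the bytes the loop never reached) are kept,
 * which is what the pcmpgtb against constip16 computes. */
static void mad_tail_masked(int len, const uint8_t *tbl,
                            const uint8_t *src, uint8_t *dest)
{
	int covered = len & ~15;
	int tail = len - covered;
	int off = len - 16;

	if (tail == 0 || off < 0)
		return;

	for (int i = 16 - tail; i < 16; i++)
		dest[off + i] ^= gf_mul_one_byte(tbl, src[off + i]);
}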
diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_vect_dot_prod_avx.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_vect_dot_prod_avx.asm.s
index 894783f6346..2fad367a887 100644
--- a/src/erasure-code/isa/isa-l/erasure_code/gf_vect_dot_prod_avx.asm.s
+++ b/src/erasure-code/isa/isa-l/erasure_code/gf_vect_dot_prod_avx.asm.s
@@ -1,5 +1,5 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-; Copyright(c) 2011-2014 Intel Corporation All rights reserved.
+; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
@@ -30,8 +30,6 @@
;;;
;;; gf_vect_dot_prod_avx(len, vec, *g_tbls, **buffs, *dest);
;;;
-;;; Author: Gregory Tucker
-
%ifidn __OUTPUT_FORMAT__, elf64
%define arg0 rdi
@@ -39,12 +37,14 @@
%define arg2 rdx
%define arg3 rcx
%define arg4 r8
- %define arg5 r9
%define tmp r11
%define tmp2 r10
%define tmp3 r9
%define return rax
+ %macro SLDR 2
+ %endmacro
+ %define SSTR SLDR
%define PS 8
%define func(x) x:
%define FUNC_SAVE
@@ -62,6 +62,9 @@
%define tmp2 r10
%define tmp3 rdi ; must be saved and loaded
%define return rax
+ %macro SLDR 2
+ %endmacro
+ %define SSTR SLDR
%define PS 8
%define frame_size 2*8
%define arg(x) [rsp + frame_size + PS + PS*x]
@@ -80,6 +83,67 @@
%endmacro
%endif
+%ifidn __OUTPUT_FORMAT__, elf32
+
+;;;================== High Address;
+;;; arg4
+;;; arg3
+;;; arg2
+;;; arg1
+;;; arg0
+;;; return
+;;;<================= esp of caller
+;;; ebp
+;;;<================= ebp = esp
+;;; esi
+;;; edi
+;;; ebx
+;;;<================= esp of callee
+;;;
+;;;================== Low Address;
+
+ %define PS 4
+ %define LOG_PS 2
+ %define func(x) x:
+ %define arg(x) [ebp + PS*2 + PS*x]
+
+ %define trans ecx ;trans is for the variables in stack
+ %define arg0 trans
+ %define arg0_m arg(0)
+ %define arg1 trans
+ %define arg1_m arg(1)
+ %define arg2 arg2_m
+ %define arg2_m arg(2)
+ %define arg3 ebx
+ %define arg4 trans
+ %define arg4_m arg(4)
+ %define tmp edx
+ %define tmp2 edi
+ %define tmp3 esi
+ %define return eax
+ %macro SLDR 2 ;; stack load/restore
+ mov %1, %2
+ %endmacro
+ %define SSTR SLDR
+
+ %macro FUNC_SAVE 0
+ push ebp
+ mov ebp, esp
+ push esi
+ push edi
+ push ebx
+ mov arg3, arg(3)
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ pop ebx
+ pop edi
+ pop esi
+ mov esp, ebp
+ pop ebp
+ %endmacro
+
+%endif ; output formats
%define len arg0
%define vec arg1
@@ -91,6 +155,12 @@
%define ptr tmp3
%define pos return
+ %ifidn PS,4 ;32-bit code
+ %define vec_m arg1_m
+ %define len_m arg0_m
+ %define dest_m arg4_m
+ %endif
+
%ifndef EC_ALIGNED_ADDR
;;; Use Un-aligned load/store
%define XLDR vmovdqu
@@ -106,10 +176,11 @@
%endif
%endif
+%ifidn PS,8 ; 64-bit code
+ default rel
+ [bits 64]
+%endif
-default rel
-
-[bits 64]
section .text
%define xmask0f xmm5
@@ -124,7 +195,9 @@ align 16
global gf_vect_dot_prod_avx:function
func(gf_vect_dot_prod_avx)
FUNC_SAVE
+ SLDR len, len_m
sub len, 16
+ SSTR len_m, len
jl .return_fail
xor pos, pos
vmovdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte
@@ -135,10 +208,12 @@ func(gf_vect_dot_prod_avx)
xor vec_i, vec_i
.next_vect:
+
mov ptr, [src+vec_i*PS]
vmovdqu xgft_lo, [tmp] ;Load array Cx{00}, Cx{01}, ..., Cx{0f}
vmovdqu xgft_hi, [tmp+16] ; " Cx{00}, Cx{10}, ..., Cx{f0}
XLDR x0, [ptr+pos] ;Get next source vector
+
add tmp, 32
add vec_i, 1
@@ -150,11 +225,16 @@ func(gf_vect_dot_prod_avx)
vpshufb xgft_lo, xgft_lo, xtmpa ;Lookup mul table of low nibble
vpxor xgft_hi, xgft_hi, xgft_lo ;GF add high and low partials
vpxor xp, xp, xgft_hi ;xp += partial
+
+ SLDR vec, vec_m
cmp vec_i, vec
jl .next_vect
+ SLDR dest, dest_m
XSTR [dest+pos], xp
+
add pos, 16 ;Loop on 16 bytes at a time
+ SLDR len, len_m
cmp pos, len
jle .loop16
@@ -182,7 +262,6 @@ section .data
align 16
-poly:
mask0f:
ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
@@ -195,6 +274,4 @@ global %1_slver
db 0x%3, 0x%2
%endmacro
;;; func core, ver, snum
-slversion gf_vect_dot_prod_avx, 02, 03, 0061
-; inform linker that this doesn't require executable stack
-section .note.GNU-stack noalloc noexec nowrite progbits
+slversion gf_vect_dot_prod_avx, 02, 04, 0061
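For contrast with the mad kernels, the gf_vect_dot_prod_* routines produce one output vector from all sources in a single pass: for every byte position they XOR together the table-lookup products of each source's byte. The scalar sketch below only restates that contract (the real prototypes live in include/erasure_code.h later in this patch); it is not ISA-L code.

#include <stddef.h>
#include <stdint.h>

/* Sketch of the dot-product form: dest[i] = XOR over all sources v of
 * coef_v * src[v][i] in GF(2^8), with each coefficient expanded to a
 * 32-byte nibble table by ec_init_tables(). */
static void gf_vect_dot_prod_scalar(size_t len, int vlen,
                                    const uint8_t *g_tbls,
                                    uint8_t **src, uint8_t *dest)
{
	for (size_t i = 0; i < len; i++) {
		uint8_t acc = 0;
		for (int v = 0; v < vlen; v++) {
			const uint8_t *tbl = g_tbls + 32 * v;
			uint8_t s = src[v][i];
			acc ^= tbl[s & 0x0f] ^ tbl[16 + (s >> 4)];
		}
		dest[i] = acc;
	}
}

The assembly runs the inner source loop 16 or 32 byte positions at a time; the new elf32 path additionally spills len, vec and dest into stack slots (the SLDR/SSTR macros added above) because ia32 has too few general-purpose registers to keep them all live.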
diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_vect_dot_prod_avx2.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_vect_dot_prod_avx2.asm.s
index f5f928748cd..737378058b3 100644
--- a/src/erasure-code/isa/isa-l/erasure_code/gf_vect_dot_prod_avx2.asm.s
+++ b/src/erasure-code/isa/isa-l/erasure_code/gf_vect_dot_prod_avx2.asm.s
@@ -1,5 +1,5 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-; Copyright(c) 2011-2014 Intel Corporation All rights reserved.
+; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
@@ -30,8 +30,6 @@
;;;
;;; gf_vect_dot_prod_avx2(len, vec, *g_tbls, **buffs, *dest);
;;;
-;;; Author: Gregory Tucker
-
%ifidn __OUTPUT_FORMAT__, elf64
%define arg0 rdi
@@ -47,7 +45,10 @@
%define tmp2 r10
%define tmp3 r9
%define return rax
- %define PS 8
+ %macro SLDR 2
+ %endmacro
+ %define SSTR SLDR
+ %define PS 8
%define func(x) x:
%define FUNC_SAVE
%define FUNC_RESTORE
@@ -66,7 +67,10 @@
%define tmp2 r10
%define tmp3 rdi ; must be saved and loaded
%define return rax
- %define PS 8
+ %macro SLDR 2
+ %endmacro
+ %define SSTR SLDR
+ %define PS 8
%define frame_size 2*8
%define arg(x) [rsp + frame_size + PS + PS*x]
@@ -84,6 +88,69 @@
%endmacro
%endif
+%ifidn __OUTPUT_FORMAT__, elf32
+
+;;;================== High Address;
+;;; arg4
+;;; arg3
+;;; arg2
+;;; arg1
+;;; arg0
+;;; return
+;;;<================= esp of caller
+;;; ebp
+;;;<================= ebp = esp
+;;; esi
+;;; edi
+;;; ebx
+;;;<================= esp of callee
+;;;
+;;;================== Low Address;
+
+ %define PS 4
+ %define LOG_PS 2
+ %define func(x) x:
+ %define arg(x) [ebp + PS*2 + PS*x]
+
+ %define trans ecx ;trans is for the variables in stack
+ %define arg0 trans
+ %define arg0_m arg(0)
+ %define arg1 trans
+ %define arg1_m arg(1)
+ %define arg2 arg2_m
+ %define arg2_m arg(2)
+ %define arg3 ebx
+ %define arg4 trans
+ %define arg4_m arg(4)
+ %define tmp edx
+ %define tmp.w edx
+ %define tmp.b dl
+ %define tmp2 edi
+ %define tmp3 esi
+ %define return eax
+ %macro SLDR 2 ;stack load/restore
+ mov %1, %2
+ %endmacro
+ %define SSTR SLDR
+
+ %macro FUNC_SAVE 0
+ push ebp
+ mov ebp, esp
+ push esi
+ push edi
+ push ebx
+ mov arg3, arg(3)
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ pop ebx
+ pop edi
+ pop esi
+ mov esp, ebp
+ pop ebp
+ %endmacro
+
+%endif ; output formats
%define len arg0
%define vec arg1
@@ -95,6 +162,12 @@
%define ptr tmp3
%define pos return
+%ifidn PS,4 ;32-bit code
+ %define vec_m arg1_m
+ %define len_m arg0_m
+ %define dest_m arg4_m
+%endif
+
%ifndef EC_ALIGNED_ADDR
;;; Use Un-aligned load/store
%define XLDR vmovdqu
@@ -110,10 +183,11 @@
%endif
%endif
+%ifidn PS,8 ;64-bit code
+ default rel
+ [bits 64]
+%endif
-default rel
-
-[bits 64]
section .text
%define xmask0f ymm3
@@ -129,7 +203,9 @@ align 16
global gf_vect_dot_prod_avx2:function
func(gf_vect_dot_prod_avx2)
FUNC_SAVE
+ SLDR len, len_m
sub len, 32
+ SSTR len_m, len
jl .return_fail
xor pos, pos
mov tmp.b, 0x0f
@@ -142,6 +218,7 @@ func(gf_vect_dot_prod_avx2)
xor vec_i, vec_i
.next_vect:
+
mov ptr, [src+vec_i*PS]
vmovdqu xgft_lo, [tmp] ;Load array Cx{00}, Cx{01}, Cx{02}, ...
@@ -150,6 +227,7 @@ func(gf_vect_dot_prod_avx2)
vperm2i128 xgft_lo, xgft_lo, xgft_lo, 0x00 ; swapped to lo | lo
XLDR x0, [ptr+pos] ;Get next source vector
+
add tmp, 32
add vec_i, 1
@@ -161,11 +239,16 @@ func(gf_vect_dot_prod_avx2)
vpshufb xgft_lo, xgft_lo, xtmpa ;Lookup mul table of low nibble
vpxor xgft_hi, xgft_hi, xgft_lo ;GF add high and low partials
vpxor xp, xp, xgft_hi ;xp += partial
+
+ SLDR vec, vec_m
cmp vec_i, vec
jl .next_vect
+ SLDR dest, dest_m
XSTR [dest+pos], xp
+
add pos, 32 ;Loop on 32 bytes at a time
+ SLDR len, len_m
cmp pos, len
jle .loop32
@@ -200,6 +283,4 @@ global %1_slver
db 0x%3, 0x%2
%endmacro
;;; func core, ver, snum
-slversion gf_vect_dot_prod_avx2, 04, 03, 0190
-; inform linker that this doesn't require executable stack
-section .note.GNU-stack noalloc noexec nowrite progbits
+slversion gf_vect_dot_prod_avx2, 04, 04, 0190
diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_vect_dot_prod_sse.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_vect_dot_prod_sse.asm.s
index 2e13c186673..4d2c77f438c 100644
--- a/src/erasure-code/isa/isa-l/erasure_code/gf_vect_dot_prod_sse.asm.s
+++ b/src/erasure-code/isa/isa-l/erasure_code/gf_vect_dot_prod_sse.asm.s
@@ -1,5 +1,5 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-; Copyright(c) 2011-2014 Intel Corporation All rights reserved.
+; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
@@ -30,8 +30,6 @@
;;;
;;; gf_vect_dot_prod_sse(len, vec, *g_tbls, **buffs, *dest);
;;;
-;;; Author: Gregory Tucker
-
%ifidn __OUTPUT_FORMAT__, elf64
%define arg0 rdi
@@ -44,6 +42,9 @@
%define tmp2 r10
%define tmp3 r9
%define return rax
+ %macro SLDR 2
+ %endmacro
+ %define SSTR SLDR
%define PS 8
%define func(x) x:
%define FUNC_SAVE
@@ -61,6 +62,9 @@
%define tmp2 r10
%define tmp3 rdi ; must be saved and loaded
%define return rax
+ %macro SLDR 2
+ %endmacro
+ %define SSTR SLDR
%define PS 8
%define frame_size 2*8
%define arg(x) [rsp + frame_size + PS + PS*x]
@@ -79,6 +83,67 @@
%endmacro
%endif
+%ifidn __OUTPUT_FORMAT__, elf32
+
+;;;================== High Address;
+;;; arg4
+;;; arg3
+;;; arg2
+;;; arg1
+;;; arg0
+;;; return
+;;;<================= esp of caller
+;;; ebp
+;;;<================= ebp = esp
+;;; esi
+;;; edi
+;;; ebx
+;;;<================= esp of callee
+;;;
+;;;================== Low Address;
+
+ %define PS 4
+ %define LOG_PS 2
+ %define func(x) x:
+ %define arg(x) [ebp + PS*2 + PS*x]
+
+ %define trans ecx ;trans is for the variables in stack
+ %define arg0 trans
+ %define arg0_m arg(0)
+ %define arg1 trans
+ %define arg1_m arg(1)
+ %define arg2 arg2_m
+ %define arg2_m arg(2)
+ %define arg3 ebx
+ %define arg4 trans
+ %define arg4_m arg(4)
+ %define tmp edx
+ %define tmp2 edi
+ %define tmp3 esi
+ %define return eax
+ %macro SLDR 2 ;; stack load/restore
+ mov %1, %2
+ %endmacro
+ %define SSTR SLDR
+
+ %macro FUNC_SAVE 0
+ push ebp
+ mov ebp, esp
+ push esi
+ push edi
+ push ebx
+ mov arg3, arg(3)
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ pop ebx
+ pop edi
+ pop esi
+ mov esp, ebp
+ pop ebp
+ %endmacro
+
+%endif ; output formats
%define len arg0
%define vec arg1
@@ -90,6 +155,11 @@
%define ptr tmp3
%define pos return
+ %ifidn PS,4 ;32-bit code
+ %define vec_m arg1_m
+ %define len_m arg0_m
+ %define dest_m arg4_m
+ %endif
%ifndef EC_ALIGNED_ADDR
;;; Use Un-aligned load/store
@@ -106,10 +176,11 @@
%endif
%endif
+%ifidn PS,8 ;64-bit code
+ default rel
+ [bits 64]
+%endif
-default rel
-
-[bits 64]
section .text
%define xmask0f xmm5
@@ -124,7 +195,9 @@ align 16
global gf_vect_dot_prod_sse:function
func(gf_vect_dot_prod_sse)
FUNC_SAVE
+ SLDR len, len_m
sub len, 16
+ SSTR len_m, len
jl .return_fail
xor pos, pos
movdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte
@@ -135,26 +208,34 @@ func(gf_vect_dot_prod_sse)
xor vec_i, vec_i
.next_vect:
+
mov ptr, [src+vec_i*PS]
movdqu xgft_lo, [tmp] ;Load array Cx{00}, Cx{01}, ..., Cx{0f}
movdqu xgft_hi, [tmp+16] ; " Cx{00}, Cx{10}, ..., Cx{f0}
XLDR x0, [ptr+pos] ;Get next source vector
+
add tmp, 32
add vec_i, 1
+
movdqa xtmpa, x0 ;Keep unshifted copy of src
psraw x0, 4 ;Shift to put high nibble into bits 4-0
pand x0, xmask0f ;Mask high src nibble in bits 4-0
pand xtmpa, xmask0f ;Mask low src nibble in bits 4-0
+
pshufb xgft_hi, x0 ;Lookup mul table of high nibble
pshufb xgft_lo, xtmpa ;Lookup mul table of low nibble
pxor xgft_hi, xgft_lo ;GF add high and low partials
pxor xp, xgft_hi ;xp += partial
+
+ SLDR vec, vec_m
cmp vec_i, vec
jl .next_vect
+ SLDR dest, dest_m
XSTR [dest+pos], xp
add pos, 16 ;Loop on 16 bytes at a time
+ SLDR len, len_m
cmp pos, len
jle .loop16
@@ -181,6 +262,7 @@ endproc_frame
section .data
align 16
+
mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
%macro slversion 4
@@ -192,6 +274,4 @@ global %1_slver
db 0x%3, 0x%2
%endmacro
;;; func core, ver, snum
-slversion gf_vect_dot_prod_sse, 00, 03, 0060
-; inform linker that this doesn't require executable stack
-section .note.GNU-stack noalloc noexec nowrite progbits
+slversion gf_vect_dot_prod_sse, 00, 04, 0060
diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_vect_mad_avx.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_vect_mad_avx.asm.s
new file mode 100644
index 00000000000..4874b030c96
--- /dev/null
+++ b/src/erasure-code/isa/isa-l/erasure_code/gf_vect_mad_avx.asm.s
@@ -0,0 +1,202 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;
+;;; gf_vect_mad_avx(len, vec, vec_i, mul_array, src, dest);
+;;;
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg0 rcx
+ %define arg0.w ecx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+ %define arg4 r12
+ %define arg5 r15
+ %define tmp r11
+ %define return rax
+ %define return.w eax
+ %define PS 8
+ %define stack_size 16*3 + 3*8
+ %define arg(x) [rsp + stack_size + PS + PS*x]
+ %define func(x) proc_frame x
+
+%macro FUNC_SAVE 0
+ sub rsp, stack_size
+ vmovdqa [rsp+16*0],xmm6
+ vmovdqa [rsp+16*1],xmm7
+ vmovdqa [rsp+16*2],xmm8
+ save_reg r12, 3*16 + 0*8
+ save_reg r15, 3*16 + 1*8
+ end_prolog
+ mov arg4, arg(4)
+ mov arg5, arg(5)
+%endmacro
+
+%macro FUNC_RESTORE 0
+ vmovdqa xmm6, [rsp+16*0]
+ vmovdqa xmm7, [rsp+16*1]
+ vmovdqa xmm8, [rsp+16*2]
+ mov r12, [rsp + 3*16 + 0*8]
+ mov r15, [rsp + 3*16 + 1*8]
+ add rsp, stack_size
+%endmacro
+
+%elifidn __OUTPUT_FORMAT__, elf64
+ %define arg0 rdi
+ %define arg0.w edi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+ %define arg4 r8
+ %define arg5 r9
+ %define tmp r11
+ %define return rax
+ %define return.w eax
+
+ %define func(x) x:
+ %define FUNC_SAVE
+ %define FUNC_RESTORE
+%endif
+
+;;; gf_vect_mad_avx(len, vec, vec_i, mul_array, src, dest)
+%define len arg0
+%define len.w arg0.w
+%define vec arg1
+%define vec_i arg2
+%define mul_array arg3
+%define src arg4
+%define dest arg5
+%define pos return
+%define pos.w return.w
+
+%ifndef EC_ALIGNED_ADDR
+;;; Use Un-aligned load/store
+ %define XLDR vmovdqu
+ %define XSTR vmovdqu
+%else
+;;; Use Non-temporal load/store
+ %ifdef NO_NT_LDST
+ %define XLDR vmovdqa
+ %define XSTR vmovdqa
+ %else
+ %define XLDR vmovntdqa
+ %define XSTR vmovntdq
+ %endif
+%endif
+
+
+default rel
+
+[bits 64]
+section .text
+
+%define xmask0f xmm8
+%define xgft_lo xmm7
+%define xgft_hi xmm6
+
+%define x0 xmm0
+%define xtmpa xmm1
+%define xtmph xmm2
+%define xtmpl xmm3
+%define xd xmm4
+%define xtmpd xmm5
+
+align 16
+global gf_vect_mad_avx:function
+func(gf_vect_mad_avx)
+ FUNC_SAVE
+ sub len, 16
+ jl .return_fail
+
+ xor pos, pos
+ vmovdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte
+
+ sal vec_i, 5 ;Multiply by 32
+ vmovdqu xgft_lo, [vec_i+mul_array] ;Load array Cx{00}, Cx{01}, Cx{02}, ...
+ vmovdqu xgft_hi, [vec_i+mul_array+16] ; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
+
+ XLDR xtmpd, [dest+len] ;backup the last 16 bytes in dest
+
+.loop16:
+ XLDR xd, [dest+pos] ;Get next dest vector
+.loop16_overlap:
+ XLDR x0, [src+pos] ;Get next source vector
+
+ vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0
+ vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
+ vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
+
+ vpshufb xtmph, xgft_hi, x0 ;Lookup mul table of high nibble
+ vpshufb xtmpl, xgft_lo, xtmpa ;Lookup mul table of low nibble
+ vpxor xtmph, xtmph, xtmpl ;GF add high and low partials
+ vpxor xd, xd, xtmph ;xd += partial
+
+ XSTR [dest+pos], xd
+ add pos, 16 ;Loop on 16 bytes at a time
+ cmp pos, len
+ jle .loop16
+
+ lea tmp, [len + 16]
+ cmp pos, tmp
+ je .return_pass
+
+ ;; Tail len
+ mov pos, len ;Overlapped offset length-16
+ vmovdqa xd, xtmpd ;Restore xd
+ jmp .loop16_overlap ;Do one more overlap pass
+
+.return_pass:
+ mov return, 0
+ FUNC_RESTORE
+ ret
+
+.return_fail:
+ mov return, 1
+ FUNC_RESTORE
+ ret
+
+endproc_frame
+
+section .data
+
+align 16
+
+mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
+
+%macro slversion 4
+global %1_slver_%2%3%4
+global %1_slver
+%1_slver:
+%1_slver_%2%3%4:
+ dw 0x%4
+ db 0x%3, 0x%2
+%endmacro
+;;; func core, ver, snum
+slversion gf_vect_mad_avx, 02, 00, 0201
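The single-destination mad kernels (this AVX file and the AVX2/SSE siblings that follow) handle a ragged tail differently from the 6-vector ones: instead of masking, they back up the last 16 (or 32) destination bytes into xtmpd before the main loop and, if a tail remains, restore that backup and rerun one full block at offset len-16. A scalar sketch of that trick, again with an illustrative lookup helper that is not an ISA-L symbol:

#include <stdint.h>
#include <string.h>

/* Illustrative stand-in for the nibble-table lookup. */
static uint8_t gf_mul_one_byte(const uint8_t *tbl, uint8_t s)
{
	return tbl[s & 0x0f] ^ tbl[16 + (s >> 4)];
}

/* Sketch of the backup-and-overlap tail: recomputing the final window from
 * its pre-loop contents gives the same bytes for positions the main loop
 * already updated, and fresh results for the ones it missed. */
static void mad_backup_tail(int len, const uint8_t *tbl,
                            const uint8_t *src, uint8_t *dest)
{
	if (len < 16)
		return;                         /* kernel reports failure here */

	uint8_t saved[16];
	memcpy(saved, dest + len - 16, 16);     /* like XLDR xtmpd, [dest+len] */

	int pos;
	for (pos = 0; pos + 16 <= len; pos += 16)
		for (int i = 0; i < 16; i++)
			dest[pos + i] ^= gf_mul_one_byte(tbl, src[pos + i]);

	if (pos < len)                          /* one overlapped pass         */
		for (int i = 0; i < 16; i++)
			dest[len - 16 + i] = saved[i] ^
			        gf_mul_one_byte(tbl, src[len - 16 + i]);
}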
diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_vect_mad_avx2.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_vect_mad_avx2.asm.s
new file mode 100644
index 00000000000..5ee9c2f93ca
--- /dev/null
+++ b/src/erasure-code/isa/isa-l/erasure_code/gf_vect_mad_avx2.asm.s
@@ -0,0 +1,209 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;
+;;; gf_vect_mad_avx2(len, vec, vec_i, mul_array, src, dest);
+;;;
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg0 rcx
+ %define arg0.w ecx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+ %define arg4 r12 ; must be saved and loaded
+ %define arg5 r15
+
+ %define tmp r11
+ %define tmp.w r11d
+ %define tmp.b r11b
+ %define return rax
+ %define return.w eax
+ %define PS 8
+ %define stack_size 16*3 + 3*8
+ %define arg(x) [rsp + stack_size + PS + PS*x]
+ %define func(x) proc_frame x
+
+ %macro FUNC_SAVE 0
+ sub rsp, stack_size
+ vmovdqa [rsp+16*0],xmm6
+ vmovdqa [rsp+16*1],xmm7
+ vmovdqa [rsp+16*2],xmm8
+ save_reg r12, 3*16 + 0*8
+ save_reg r15, 3*16 + 1*8
+ end_prolog
+ mov arg4, arg(4)
+ mov arg5, arg(5)
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ vmovdqa xmm6, [rsp+16*0]
+ vmovdqa xmm7, [rsp+16*1]
+ vmovdqa xmm8, [rsp+16*2]
+ mov r12, [rsp + 3*16 + 0*8]
+ mov r15, [rsp + 3*16 + 1*8]
+ add rsp, stack_size
+ %endmacro
+
+%elifidn __OUTPUT_FORMAT__, elf64
+ %define arg0 rdi
+ %define arg0.w edi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+ %define arg4 r8
+ %define arg5 r9
+
+ %define tmp r11
+ %define tmp.w r11d
+ %define tmp.b r11b
+ %define return rax
+ %define return.w eax
+
+ %define func(x) x:
+ %define FUNC_SAVE
+ %define FUNC_RESTORE
+%endif
+
+
+;;; gf_vect_mad_avx2(len, vec, vec_i, mul_array, src, dest)
+%define len arg0
+%define len.w arg0.w
+%define vec arg1
+%define vec_i arg2
+%define mul_array arg3
+%define src arg4
+%define dest arg5
+%define pos return
+%define pos.w return.w
+
+%ifndef EC_ALIGNED_ADDR
+;;; Use Un-aligned load/store
+ %define XLDR vmovdqu
+ %define XSTR vmovdqu
+%else
+;;; Use Non-temporal load/store
+ %ifdef NO_NT_LDST
+ %define XLDR vmovdqa
+ %define XSTR vmovdqa
+ %else
+ %define XLDR vmovntdqa
+ %define XSTR vmovntdq
+ %endif
+%endif
+
+
+default rel
+
+[bits 64]
+section .text
+
+%define xmask0f ymm8
+%define xmask0fx xmm8
+%define xgft_lo ymm7
+%define xgft_hi ymm6
+
+%define x0 ymm0
+%define xtmpa ymm1
+%define xtmph ymm2
+%define xtmpl ymm3
+%define xd ymm4
+%define xtmpd ymm5
+
+align 16
+global gf_vect_mad_avx2:function
+func(gf_vect_mad_avx2)
+ FUNC_SAVE
+ sub len, 32
+ jl .return_fail
+ xor pos, pos
+ mov tmp.b, 0x0f
+ vpinsrb xmask0fx, xmask0fx, tmp.w, 0
+ vpbroadcastb xmask0f, xmask0fx ;Construct mask 0x0f0f0f...
+
+ sal vec_i, 5 ;Multiply by 32
+ vmovdqu xgft_lo, [vec_i+mul_array] ;Load array Cx{00}, Cx{01}, Cx{02}, ...
+ ; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
+ vperm2i128 xgft_hi, xgft_lo, xgft_lo, 0x11 ; swapped to hi | hi
+ vperm2i128 xgft_lo, xgft_lo, xgft_lo, 0x00 ; swapped to lo | lo
+
+ XLDR xtmpd, [dest+len] ;backup the last 32 bytes in dest
+
+.loop32:
+ XLDR xd, [dest+pos] ;Get next dest vector
+.loop32_overlap:
+ XLDR x0, [src+pos] ;Get next source vector
+
+ vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0
+ vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
+ vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
+
+ vpshufb xtmph, xgft_hi, x0 ;Lookup mul table of high nibble
+ vpshufb xtmpl, xgft_lo, xtmpa ;Lookup mul table of low nibble
+ vpxor xtmph, xtmph, xtmpl ;GF add high and low partials
+ vpxor xd, xd, xtmph ;xd += partial
+
+ XSTR [dest+pos], xd
+ add pos, 32 ;Loop on 32 bytes at a time
+ cmp pos, len
+ jle .loop32
+
+ lea tmp, [len + 32]
+ cmp pos, tmp
+ je .return_pass
+
+ ;; Tail len
+ mov pos, len ;Overlapped offset length-32
+ vmovdqa xd, xtmpd ;Restore xd
+ jmp .loop32_overlap ;Do one more overlap pass
+
+.return_pass:
+ mov return, 0
+ FUNC_RESTORE
+ ret
+
+.return_fail:
+ mov return, 1
+ FUNC_RESTORE
+ ret
+
+endproc_frame
+
+section .data
+
+%macro slversion 4
+global %1_slver_%2%3%4
+global %1_slver
+%1_slver:
+%1_slver_%2%3%4:
+ dw 0x%4
+ db 0x%3, 0x%2
+%endmacro
+;;; func core, ver, snum
+slversion gf_vect_mad_avx2, 04, 00, 0202
diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_vect_mad_sse.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_vect_mad_sse.asm.s
new file mode 100644
index 00000000000..bfe8f31de82
--- /dev/null
+++ b/src/erasure-code/isa/isa-l/erasure_code/gf_vect_mad_sse.asm.s
@@ -0,0 +1,203 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;
+;;; gf_vect_mad_sse(len, vec, vec_i, mul_array, src, dest);
+;;;
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg0 rcx
+ %define arg0.w ecx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+ %define arg4 r12
+ %define arg5 r15
+ %define tmp r11
+ %define return rax
+ %define return.w eax
+ %define PS 8
+ %define stack_size 16*3 + 3*8
+ %define arg(x) [rsp + stack_size + PS + PS*x]
+ %define func(x) proc_frame x
+
+%macro FUNC_SAVE 0
+ sub rsp, stack_size
+ movdqa [rsp+16*0],xmm6
+ movdqa [rsp+16*1],xmm7
+ movdqa [rsp+16*2],xmm8
+ save_reg r12, 3*16 + 0*8
+ save_reg r15, 3*16 + 1*8
+ end_prolog
+ mov arg4, arg(4)
+ mov arg5, arg(5)
+%endmacro
+
+%macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp+16*0]
+ movdqa xmm7, [rsp+16*1]
+ movdqa xmm8, [rsp+16*2]
+ mov r12, [rsp + 3*16 + 0*8]
+ mov r15, [rsp + 3*16 + 1*8]
+ add rsp, stack_size
+%endmacro
+
+%elifidn __OUTPUT_FORMAT__, elf64
+ %define arg0 rdi
+ %define arg0.w edi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+ %define arg4 r8
+ %define arg5 r9
+ %define tmp r11
+ %define return rax
+ %define return.w eax
+
+ %define func(x) x:
+ %define FUNC_SAVE
+ %define FUNC_RESTORE
+%endif
+
+;;; gf_vect_mad_sse(len, vec, vec_i, mul_array, src, dest)
+%define len arg0
+%define len.w arg0.w
+%define vec arg1
+%define vec_i arg2
+%define mul_array arg3
+%define src arg4
+%define dest arg5
+%define pos return
+%define pos.w return.w
+
+%ifndef EC_ALIGNED_ADDR
+;;; Use Un-aligned load/store
+ %define XLDR movdqu
+ %define XSTR movdqu
+%else
+;;; Use Non-temporal load/store
+ %ifdef NO_NT_LDST
+ %define XLDR movdqa
+ %define XSTR movdqa
+ %else
+ %define XLDR movntdqa
+ %define XSTR movntdq
+ %endif
+%endif
+
+default rel
+
+[bits 64]
+section .text
+
+%define xmask0f xmm8
+%define xgft_lo xmm7
+%define xgft_hi xmm6
+
+%define x0 xmm0
+%define xtmpa xmm1
+%define xtmph xmm2
+%define xtmpl xmm3
+%define xd xmm4
+%define xtmpd xmm5
+
+
+align 16
+global gf_vect_mad_sse:function
+func(gf_vect_mad_sse)
+ FUNC_SAVE
+ sub len, 16
+ jl .return_fail
+
+ xor pos, pos
+ movdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte
+ sal vec_i, 5 ;Multiply by 32
+ movdqu xgft_lo, [vec_i+mul_array] ;Load array Cx{00}, Cx{01}, Cx{02}, ...
+ movdqu xgft_hi, [vec_i+mul_array+16] ; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
+
+ XLDR xtmpd, [dest+len] ;backup the last 16 bytes in dest
+
+.loop16:
+ XLDR xd, [dest+pos] ;Get next dest vector
+.loop16_overlap:
+ XLDR x0, [src+pos] ;Get next source vector
+ movdqa xtmph, xgft_hi ;Reload const array registers
+ movdqa xtmpl, xgft_lo
+ movdqa xtmpa, x0 ;Keep unshifted copy of src
+ psraw x0, 4 ;Shift to put high nibble into bits 4-0
+ pand x0, xmask0f ;Mask high src nibble in bits 4-0
+ pand xtmpa, xmask0f ;Mask low src nibble in bits 4-0
+ pshufb xtmph, x0 ;Lookup mul table of high nibble
+ pshufb xtmpl, xtmpa ;Lookup mul table of low nibble
+ pxor xtmph, xtmpl ;GF add high and low partials
+
+ pxor xd, xtmph
+ XSTR [dest+pos], xd ;Store result
+
+ add pos, 16 ;Loop on 16 bytes at a time
+ cmp pos, len
+ jle .loop16
+
+ lea tmp, [len + 16]
+ cmp pos, tmp
+ je .return_pass
+
+ ;; Tail len
+ mov pos, len ;Overlapped offset length-16
+ movdqa xd, xtmpd ;Restore xd
+ jmp .loop16_overlap ;Do one more overlap pass
+
+.return_pass:
+ mov return, 0
+ FUNC_RESTORE
+ ret
+
+.return_fail:
+ mov return, 1
+ FUNC_RESTORE
+ ret
+
+endproc_frame
+
+section .data
+
+align 16
+
+mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
+
+%macro slversion 4
+global %1_slver_%2%3%4
+global %1_slver
+%1_slver:
+%1_slver_%2%3%4:
+ dw 0x%4
+ db 0x%3, 0x%2
+%endmacro
+;;; func core, ver, snum
+slversion gf_vect_mad_sse, 00, 00, 0200
diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_vect_mul_avx.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_vect_mul_avx.asm.s
index 0536ed7950a..1924da71566 100644
--- a/src/erasure-code/isa/isa-l/erasure_code/gf_vect_mul_avx.asm.s
+++ b/src/erasure-code/isa/isa-l/erasure_code/gf_vect_mul_avx.asm.s
@@ -1,5 +1,5 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-; Copyright(c) 2011-2014 Intel Corporation All rights reserved.
+; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
@@ -30,8 +30,6 @@
;;;
;;; gf_vect_mul_avx(len, mul_array, src, dest)
;;;
-;;; Author: Gregory Tucker
-
%ifidn __OUTPUT_FORMAT__, elf64
%define arg0 rdi
@@ -170,5 +168,3 @@ global %1_slver
%endmacro
;;; func core, ver, snum
slversion gf_vect_mul_avx, 01, 02, 0036
-; inform linker that this doesn't require executable stack
-section .note.GNU-stack noalloc noexec nowrite progbits
diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_vect_mul_sse.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_vect_mul_sse.asm.s
index c6d7d586967..61ecaac9dd6 100644
--- a/src/erasure-code/isa/isa-l/erasure_code/gf_vect_mul_sse.asm.s
+++ b/src/erasure-code/isa/isa-l/erasure_code/gf_vect_mul_sse.asm.s
@@ -1,5 +1,5 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-; Copyright(c) 2011-2014 Intel Corporation All rights reserved.
+; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
@@ -30,8 +30,6 @@
;;;
;;; gf_vect_mul_sse(len, mul_array, src, dest)
;;;
-;;; Author: Gregory Tucker
-
%ifidn __OUTPUT_FORMAT__, elf64
%define arg0 rdi
@@ -176,5 +174,3 @@ global %1_slver
%endmacro
;;; func core, ver, snum
slversion gf_vect_mul_sse, 00, 02, 0034
-; inform linker that this doesn't require executable stack
-section .note.GNU-stack noalloc noexec nowrite progbits
diff --git a/src/erasure-code/isa/isa-l/include/erasure_code.h b/src/erasure-code/isa/isa-l/include/erasure_code.h
index 0f3b6db0825..53e480f0193 100644
--- a/src/erasure-code/isa/isa-l/include/erasure_code.h
+++ b/src/erasure-code/isa/isa-l/include/erasure_code.h
@@ -1,5 +1,5 @@
/**********************************************************************
- Copyright(c) 2011-2014 Intel Corporation All rights reserved.
+ Copyright(c) 2011-2015 Intel Corporation All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
@@ -74,73 +74,128 @@ extern "C" {
void ec_init_tables(int k, int rows, unsigned char* a, unsigned char* gftbls);
/**
- * @brief Generate or decode erasure codes on blocks of data.
+ * @brief Generate or decode erasure codes on blocks of data, runs appropriate version.
*
* Given a list of source data blocks, generate one or multiple blocks of
* encoded data as specified by a matrix of GF(2^8) coefficients. When given a
* suitable set of coefficients, this function will perform the fast generation
* or decoding of Reed-Solomon type erasure codes.
*
- * @requires SSE4.1
+ * This function determines what instruction sets are enabled and
+ * selects the appropriate version at runtime.
+ *
* @param len Length of each block of data (vector) of source or dest data.
* @param k The number of vector sources or rows in the generator matrix
* for coding.
* @param rows The number of output vectors to concurrently encode/decode.
* @param gftbls Pointer to array of input tables generated from coding
- * coefficients in ec_init_tables(). Must be of size 32*k*rows
+ * coefficients in ec_init_tables(). Must be of size 32*k*rows
* @param data Array of pointers to source input buffers.
* @param coding Array of pointers to coded output buffers.
* @returns none
*/
-void ec_encode_data_sse(int len, int k, int rows, unsigned char *gftbls, unsigned char **data, unsigned char **coding);
+void ec_encode_data(int len, int k, int rows, unsigned char *gftbls, unsigned char **data,
+ unsigned char **coding);
+/**
+ * @brief Generate or decode erasure codes on blocks of data.
+ *
+ * Arch specific version of ec_encode_data() with same parameters.
+ * @requires SSE4.1
+ */
+void ec_encode_data_sse(int len, int k, int rows, unsigned char *gftbls, unsigned char **data,
+ unsigned char **coding);
/**
- * @brief Generate or decode erasure codes on blocks of data, runs appropriate version.
+ * @brief Generate or decode erasure codes on blocks of data.
*
- * Given a list of source data blocks, generate one or multiple blocks of
- * encoded data as specified by a matrix of GF(2^8) coefficients. When given a
- * suitable set of coefficients, this function will perform the fast generation
- * or decoding of Reed-Solomon type erasure codes.
+ * Arch specific version of ec_encode_data() with same parameters.
+ * @requires AVX
+ */
+void ec_encode_data_avx(int len, int k, int rows, unsigned char *gftbls, unsigned char **data,
+ unsigned char **coding);
+
+/**
+ * @brief Generate or decode erasure codes on blocks of data.
*
- * This function determines what instruction sets are enabled and
- * selects the appropriate version at runtime.
+ * Arch specific version of ec_encode_data() with same parameters.
+ * @requires AVX2
+ */
+void ec_encode_data_avx2(int len, int k, int rows, unsigned char *gftbls, unsigned char **data,
+ unsigned char **coding);
+
+/**
+ * @brief Generate or decode erasure codes on blocks of data, runs baseline version.
+ *
+ * Baseline version of ec_encode_data() with same parameters.
+ */
+void ec_encode_data_base(int len, int srcs, int dests, unsigned char *v, unsigned char **src,
+ unsigned char **dest);
+
+/**
+ * @brief Generate update for encode or decode of erasure codes from single source, runs appropriate version.
+ *
+ * Given one source data block, update one or multiple blocks of encoded data as
+ * specified by a matrix of GF(2^8) coefficients. When given a suitable set of
+ * coefficients, this function will perform the fast generation or decoding of
+ * Reed-Solomon type erasure codes from one input source at a time.
+ *
+ * This function determines what instruction sets are enabled and selects the
+ * appropriate version at runtime.
*
* @param len Length of each block of data (vector) of source or dest data.
* @param k The number of vector sources or rows in the generator matrix
* for coding.
* @param rows The number of output vectors to concurrently encode/decode.
- * @param gftbls Pointer to array of input tables generated from coding
+ * @param vec_i The vector index corresponding to the single input source.
+ * @param g_tbls Pointer to array of input tables generated from coding
* coefficients in ec_init_tables(). Must be of size 32*k*rows
- * @param data Array of pointers to source input buffers.
+ * @param data Pointer to single input source used to update output parity.
* @param coding Array of pointers to coded output buffers.
* @returns none
*/
+void ec_encode_data_update(int len, int k, int rows, int vec_i, unsigned char *g_tbls,
+ unsigned char *data, unsigned char **coding);
-void ec_encode_data(int len, int k, int rows, unsigned char *gftbls, unsigned char **data, unsigned char **coding);
+/**
+ * @brief Generate update for encode or decode of erasure codes from single source.
+ *
+ * Arch specific version of ec_encode_data_update() with same parameters.
+ * @requires SSE4.1
+ */
+void ec_encode_data_update_sse(int len, int k, int rows, int vec_i, unsigned char *g_tbls,
+ unsigned char *data, unsigned char **coding);
/**
- * @brief Generate or decode erasure codes on blocks of data, runs baseline version.
+ * @brief Generate update for encode or decode of erasure codes from single source.
*
- * Given a list of source data blocks, generate one or multiple blocks of
- * encoded data as specified by a matrix of GF(2^8) coefficients. When given a
- * suitable set of coefficients, this function will perform the fast generation
- * or decoding of Reed-Solomon type erasure codes.
+ * Arch specific version of ec_encode_data_update() with same parameters.
+ * @requires AVX
+ */
+
+void ec_encode_data_update_avx(int len, int k, int rows, int vec_i, unsigned char *g_tbls,
+ unsigned char *data, unsigned char **coding);
+
+/**
+ * @brief Generate update for encode or decode of erasure codes from single source.
*
- * @param len Length of each block of data (vector) of source or dest data.
- * @param srcs The number of vector sources or rows in the generator matrix
- * for coding.
- * @param dests The number of output vectors to concurrently encode/decode.
- * @param v Pointer to array of input tables generated from coding
- * coefficients in ec_init_tables(). Must be of size 32*k*rows
- * @param src Array of pointers to source input buffers.
- * @param dest Array of pointers to coded output buffers.
- * @returns none
+ * Arch specific version of ec_encode_data_update() with same parameters.
+ * @requires AVX2
+ */
+
+void ec_encode_data_update_avx2(int len, int k, int rows, int vec_i, unsigned char *g_tbls,
+ unsigned char *data, unsigned char **coding);
+
+/**
+ * @brief Generate update for encode or decode of erasure codes from single source.
+ *
+ * Baseline version of ec_encode_data_update().
*/
-void ec_encode_data_base(int len, int srcs, int dests, unsigned char *v, unsigned char **src, unsigned char **dest);
+void ec_encode_data_update_base(int len, int k, int rows, int vec_i, unsigned char *v,
+ unsigned char *data, unsigned char **dest);
/**
@@ -150,8 +205,8 @@ void ec_encode_data_base(int len, int srcs, int dests, unsigned char *v, unsigne
* set of coefficients to produce each byte of the output. Can be used for
* erasure coding encode and decode. Function requires pre-calculation of a
* 32*vlen byte constant array based on the input coefficients.
- *
* @requires SSE4.1
+ *
* @param len Length of each vector in bytes. Must be >= 16.
* @param vlen Number of vector sources.
* @param gftbls Pointer to 32*vlen byte array of pre-calculated constants based
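For reference, a scalar sketch of the operation the dot-product kernels above implement. It is written against raw GF(2^8) coefficients rather than the expanded 32*vlen gftbls layout, and it assumes the library's gf_mul() byte multiply helper; it is meant only to make the math concrete, not to mirror the SIMD code.

/* Reference only: dest[i] = sum over j of coef[j] * src[j][i] in GF(2^8),
 * where addition is XOR.  The real kernels consume the expanded table. */
static void gf_vect_dot_prod_ref(int len, int vlen, const unsigned char *coef,
                                 unsigned char **src, unsigned char *dest)
{
        int i, j;

        for (i = 0; i < len; i++) {
                unsigned char s = 0;

                for (j = 0; j < vlen; j++)
                        s ^= gf_mul(coef[j], src[j][i]);
                dest[i] = s;
        }
}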
@@ -171,8 +226,8 @@ void gf_vect_dot_prod_sse(int len, int vlen, unsigned char *gftbls,
* set of coefficients to produce each byte of the output. Can be used for
* erasure coding encode and decode. Function requires pre-calculation of a
* 32*vlen byte constant array based on the input coefficients.
- *
* @requires AVX
+ *
* @param len Length of each vector in bytes. Must be >= 16.
* @param vlen Number of vector sources.
* @param gftbls Pointer to 32*vlen byte array of pre-calculated constants based
@@ -192,8 +247,8 @@ void gf_vect_dot_prod_avx(int len, int vlen, unsigned char *gftbls,
* set of coefficients to produce each byte of the output. Can be used for
* erasure coding encode and decode. Function requires pre-calculation of a
* 32*vlen byte constant array based on the input coefficients.
- *
* @requires AVX2
+ *
* @param len Length of each vector in bytes. Must be >= 32.
* @param vlen Number of vector sources.
* @param gftbls Pointer to 32*vlen byte array of pre-calculated constants based
@@ -214,8 +269,8 @@ void gf_vect_dot_prod_avx2(int len, int vlen, unsigned char *gftbls,
* sets of coefficients to produce each byte of the outputs. Can be used for
* erasure coding encode and decode. Function requires pre-calculation of a
* 2*32*vlen byte constant array based on the two sets of input coefficients.
- *
* @requires SSE4.1
+ *
* @param len Length of each vector in bytes. Must be >= 16.
* @param vlen Number of vector sources.
* @param gftbls Pointer to 2*32*vlen byte array of pre-calculated constants
@@ -236,8 +291,8 @@ void gf_2vect_dot_prod_sse(int len, int vlen, unsigned char *gftbls,
* sets of coefficients to produce each byte of the outputs. Can be used for
* erasure coding encode and decode. Function requires pre-calculation of a
* 2*32*vlen byte constant array based on the two sets of input coefficients.
- *
* @requires AVX
+ *
* @param len Length of each vector in bytes. Must be >= 16.
* @param vlen Number of vector sources.
* @param gftbls Pointer to 2*32*vlen byte array of pre-calculated constants
@@ -258,8 +313,8 @@ void gf_2vect_dot_prod_avx(int len, int vlen, unsigned char *gftbls,
* sets of coefficients to produce each byte of the outputs. Can be used for
* erasure coding encode and decode. Function requires pre-calculation of a
* 2*32*vlen byte constant array based on the two sets of input coefficients.
- *
* @requires AVX2
+ *
* @param len Length of each vector in bytes. Must be >= 32.
* @param vlen Number of vector sources.
* @param gftbls Pointer to 2*32*vlen byte array of pre-calculated constants
@@ -280,8 +335,8 @@ void gf_2vect_dot_prod_avx2(int len, int vlen, unsigned char *gftbls,
* sets of coefficients to produce each byte of the outputs. Can be used for
* erasure coding encode and decode. Function requires pre-calculation of a
* 3*32*vlen byte constant array based on the three sets of input coefficients.
- *
* @requires SSE4.1
+ *
* @param len Length of each vector in bytes. Must be >= 16.
* @param vlen Number of vector sources.
* @param gftbls Pointer to 3*32*vlen byte array of pre-calculated constants
@@ -302,8 +357,8 @@ void gf_3vect_dot_prod_sse(int len, int vlen, unsigned char *gftbls,
* sets of coefficients to produce each byte of the outputs. Can be used for
* erasure coding encode and decode. Function requires pre-calculation of a
* 3*32*vlen byte constant array based on the three sets of input coefficients.
- *
* @requires AVX
+ *
* @param len Length of each vector in bytes. Must be >= 16.
* @param vlen Number of vector sources.
* @param gftbls Pointer to 3*32*vlen byte array of pre-calculated constants
@@ -324,8 +379,8 @@ void gf_3vect_dot_prod_avx(int len, int vlen, unsigned char *gftbls,
* sets of coefficients to produce each byte of the outputs. Can be used for
* erasure coding encode and decode. Function requires pre-calculation of a
* 3*32*vlen byte constant array based on the three sets of input coefficients.
- *
* @requires AVX2
+ *
* @param len Length of each vector in bytes. Must be >= 32.
* @param vlen Number of vector sources.
* @param gftbls Pointer to 3*32*vlen byte array of pre-calculated constants
@@ -346,8 +401,8 @@ void gf_3vect_dot_prod_avx2(int len, int vlen, unsigned char *gftbls,
* sets of coefficients to produce each byte of the outputs. Can be used for
* erasure coding encode and decode. Function requires pre-calculation of a
* 4*32*vlen byte constant array based on the four sets of input coefficients.
- *
* @requires SSE4.1
+ *
* @param len Length of each vector in bytes. Must be >= 16.
* @param vlen Number of vector sources.
* @param gftbls Pointer to 4*32*vlen byte array of pre-calculated constants
@@ -368,8 +423,8 @@ void gf_4vect_dot_prod_sse(int len, int vlen, unsigned char *gftbls,
* sets of coefficients to produce each byte of the outputs. Can be used for
* erasure coding encode and decode. Function requires pre-calculation of a
* 4*32*vlen byte constant array based on the four sets of input coefficients.
- *
* @requires AVX
+ *
* @param len Length of each vector in bytes. Must be >= 16.
* @param vlen Number of vector sources.
* @param gftbls Pointer to 4*32*vlen byte array of pre-calculated constants
@@ -390,8 +445,8 @@ void gf_4vect_dot_prod_avx(int len, int vlen, unsigned char *gftbls,
* sets of coefficients to produce each byte of the outputs. Can be used for
* erasure coding encode and decode. Function requires pre-calculation of a
* 4*32*vlen byte constant array based on the four sets of input coefficients.
- *
* @requires AVX2
+ *
* @param len Length of each vector in bytes. Must be >= 32.
* @param vlen Number of vector sources.
* @param gftbls Pointer to 4*32*vlen byte array of pre-calculated constants
@@ -412,8 +467,8 @@ void gf_4vect_dot_prod_avx2(int len, int vlen, unsigned char *gftbls,
* sets of coefficients to produce each byte of the outputs. Can be used for
* erasure coding encode and decode. Function requires pre-calculation of a
* 5*32*vlen byte constant array based on the five sets of input coefficients.
- *
* @requires SSE4.1
+ *
 * @param len Length of each vector in bytes. Must be >= 16.
* @param vlen Number of vector sources.
* @param gftbls Pointer to 5*32*vlen byte array of pre-calculated constants
@@ -434,8 +489,8 @@ void gf_5vect_dot_prod_sse(int len, int vlen, unsigned char *gftbls,
* sets of coefficients to produce each byte of the outputs. Can be used for
* erasure coding encode and decode. Function requires pre-calculation of a
* 5*32*vlen byte constant array based on the five sets of input coefficients.
- *
* @requires AVX
+ *
 * @param len Length of each vector in bytes. Must be >= 16.
* @param vlen Number of vector sources.
* @param gftbls Pointer to 5*32*vlen byte array of pre-calculated constants
@@ -456,8 +511,8 @@ void gf_5vect_dot_prod_avx(int len, int vlen, unsigned char *gftbls,
* sets of coefficients to produce each byte of the outputs. Can be used for
* erasure coding encode and decode. Function requires pre-calculation of a
* 5*32*vlen byte constant array based on the five sets of input coefficients.
- *
* @requires AVX2
+ *
 * @param len Length of each vector in bytes. Must be >= 32.
* @param vlen Number of vector sources.
* @param gftbls Pointer to 5*32*vlen byte array of pre-calculated constants
@@ -478,8 +533,8 @@ void gf_5vect_dot_prod_avx2(int len, int vlen, unsigned char *gftbls,
* sets of coefficients to produce each byte of the outputs. Can be used for
* erasure coding encode and decode. Function requires pre-calculation of a
* 6*32*vlen byte constant array based on the six sets of input coefficients.
- *
* @requires SSE4.1
+ *
* @param len Length of each vector in bytes. Must be >= 16.
* @param vlen Number of vector sources.
* @param gftbls Pointer to 6*32*vlen byte array of pre-calculated constants
@@ -500,8 +555,8 @@ void gf_6vect_dot_prod_sse(int len, int vlen, unsigned char *gftbls,
* sets of coefficients to produce each byte of the outputs. Can be used for
* erasure coding encode and decode. Function requires pre-calculation of a
* 6*32*vlen byte constant array based on the six sets of input coefficients.
- *
* @requires AVX
+ *
* @param len Length of each vector in bytes. Must be >= 16.
* @param vlen Number of vector sources.
* @param gftbls Pointer to 6*32*vlen byte array of pre-calculated constants
@@ -522,8 +577,8 @@ void gf_6vect_dot_prod_avx(int len, int vlen, unsigned char *gftbls,
* sets of coefficients to produce each byte of the outputs. Can be used for
* erasure coding encode and decode. Function requires pre-calculation of a
* 6*32*vlen byte constant array based on the six sets of input coefficients.
- *
* @requires AVX2
+ *
* @param len Length of each vector in bytes. Must be >= 32.
* @param vlen Number of vector sources.
* @param gftbls Pointer to 6*32*vlen byte array of pre-calculated constants
@@ -582,6 +637,224 @@ void gf_vect_dot_prod_base(int len, int vlen, unsigned char *gftbls,
void gf_vect_dot_prod(int len, int vlen, unsigned char *gftbls,
unsigned char **src, unsigned char *dest);
+
+/**
+ * @brief GF(2^8) vector multiply accumulate, runs appropriate version.
+ *
+ * Does a GF(2^8) multiply across each byte of input source with expanded
+ * constant and add to destination array. Can be used for erasure coding encode
+ * and decode update when only one source is available at a time. Function
+ * requires pre-calculation of a 32*vec byte constant array based on the input
+ * coefficients.
+ *
+ * This function determines what instruction sets are enabled and selects the
+ * appropriate version at runtime.
+ *
+ * @param len Length of each vector in bytes. Must be >= 32.
+ * @param vec The number of vector sources or rows in the generator matrix
+ * for coding.
+ * @param vec_i The vector index corresponding to the single input source.
+ * @param gftbls Pointer to array of input tables generated from coding
+ * coefficients in ec_init_tables(). Must be of size 32*vec.
+ * @param src Pointer to source input array.
+ * @param dest Pointer to destination data array.
+ * @returns none
+ */
+
+void gf_vect_mad(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+ unsigned char *dest);
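The accumulate step itself is small; below is a scalar sketch of what gf_vect_mad() does for one source, using a raw coefficient c in place of the vec_i lookup into the expanded table, with gf_mul() assumed to be the library's GF(2^8) multiply helper.

/* Reference only: dest[i] ^= c * src[i] in GF(2^8).  The real kernels select
 * c for source vec_i from the 32*vec expanded table. */
static void gf_vect_mad_ref(int len, unsigned char c,
                            const unsigned char *src, unsigned char *dest)
{
        int i;

        for (i = 0; i < len; i++)
                dest[i] ^= gf_mul(c, src[i]);
}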
+
+/**
+ * @brief GF(2^8) vector multiply accumulate, arch specific version.
+ *
+ * Arch specific version of gf_vect_mad() with same parameters.
+ * @requires SSE4.1
+ */
+
+void gf_vect_mad_sse(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+ unsigned char *dest);
+/**
+ * @brief GF(2^8) vector multiply accumulate, arch specific version.
+ *
+ * Arch specific version of gf_vect_mad() with same parameters.
+ * @requires AVX
+ */
+
+void gf_vect_mad_avx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+ unsigned char *dest);
+
+/**
+ * @brief GF(2^8) vector multiply accumulate, arch specific version.
+ *
+ * Arch specific version of gf_vect_mad() with same parameters.
+ * @requires AVX2
+ */
+
+void gf_vect_mad_avx2(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+ unsigned char *dest);
+
+/**
+ * @brief GF(2^8) vector multiply accumulate, baseline version.
+ *
+ * Baseline version of gf_vect_mad() with same parameters.
+ */
+
+void gf_vect_mad_base(int len, int vec, int vec_i, unsigned char *v, unsigned char *src,
+ unsigned char *dest);
+
+/**
+ * @brief GF(2^8) vector multiply with 2 accumulate. SSE version.
+ *
+ * Does a GF(2^8) multiply across each byte of input source with expanded
+ * constants and add to destination arrays. Can be used for erasure coding
+ * encode and decode update when only one source is available at a
+ * time. Function requires pre-calculation of a 2*32*vec byte constant array based
+ * on the input coefficients.
+ * @requires SSE4.1
+ *
+ * @param len Length of each vector in bytes. Must be >= 32.
+ * @param vec The number of vector sources or rows in the generator matrix
+ * for coding.
+ * @param vec_i The vector index corresponding to the single input source.
+ * @param gftbls Pointer to array of input tables generated from coding
+ * coefficients in ec_init_tables(). Must be of size 2*32*vec.
+ * @param src Pointer to source input array.
+ * @param dest Array of pointers to destination input/outputs.
+ * @returns none
+ */
+
+void gf_2vect_mad_sse(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+ unsigned char **dest);
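Assuming the table is laid out as one consecutive 32*vec slice per output row (the same convention the multi-output dot-product functions document), the two-destination kernel is functionally a fused pair of single-destination updates. A sketch of that equivalence, handy for checking the SSE path against the generic one:

/* Functional equivalent of gf_2vect_mad_sse(), built from the single-dest
 * dispatcher; gftbls is assumed to hold two consecutive 32*vec slices. */
static void gf_2vect_mad_ref(int len, int vec, int vec_i, unsigned char *gftbls,
                             unsigned char *src, unsigned char **dest)
{
        gf_vect_mad(len, vec, vec_i, gftbls, src, dest[0]);
        gf_vect_mad(len, vec, vec_i, gftbls + 32 * vec, src, dest[1]);
}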
+
+/**
+ * @brief GF(2^8) vector multiply with 2 accumulate. AVX version of gf_2vect_mad_sse().
+ * @requires AVX
+ */
+void gf_2vect_mad_avx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+ unsigned char **dest);
+/**
+ * @brief GF(2^8) vector multiply with 2 accumulate. AVX2 version of gf_2vect_mad_sse().
+ * @requires AVX2
+ */
+void gf_2vect_mad_avx2(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+ unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector multiply with 3 accumulate. SSE version.
+ *
+ * Does a GF(2^8) multiply across each byte of input source with expanded
+ * constants and add to destination arrays. Can be used for erasure coding
+ * encode and decode update when only one source is available at a
+ * time. Function requires pre-calculation of a 3*32*vec byte constant array based
+ * on the input coefficients.
+ * @requires SSE4.1
+ *
+ * @param len Length of each vector in bytes. Must be >= 32.
+ * @param vec The number of vector sources or rows in the generator matrix
+ * for coding.
+ * @param vec_i The vector index corresponding to the single input source.
+ * @param gftbls Pointer to array of input tables generated from coding
+ * coefficients in ec_init_tables(). Must be of size 3*32*vec.
+ * @param src Pointer to source input array.
+ * @param dest Array of pointers to destination input/outputs.
+ * @returns none
+ */
+
+void gf_3vect_mad_sse(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+ unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector multiply with 3 accumulate. AVX version of gf_3vect_mad_sse().
+ * @requires AVX
+ */
+void gf_3vect_mad_avx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+ unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector multiply with 3 accumulate. AVX2 version of gf_3vect_mad_sse().
+ * @requires AVX2
+ */
+void gf_3vect_mad_avx2(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+ unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector multiply with 4 accumulate. SSE version.
+ *
+ * Does a GF(2^8) multiply across each byte of input source with expanded
+ * constants and add to destination arrays. Can be used for erasure coding
+ * encode and decode update when only one source is available at a
+ * time. Function requires pre-calculation of a 4*32*vec byte constant array based
+ * on the input coefficients.
+ * @requires SSE4.1
+ *
+ * @param len Length of each vector in bytes. Must be >= 32.
+ * @param vec The number of vector sources or rows in the generator matrix
+ * for coding.
+ * @param vec_i The vector index corresponding to the single input source.
+ * @param gftbls Pointer to array of input tables generated from coding
+ * coefficients in ec_init_tables(). Must be of size 4*32*vec.
+ * @param src Pointer to source input array.
+ * @param dest Array of pointers to destination input/outputs.
+ * @returns none
+ */
+
+void gf_4vect_mad_sse(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+ unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector multiply with 4 accumulate. AVX version of gf_4vect_mad_sse().
+ * @requires AVX
+ */
+void gf_4vect_mad_avx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+ unsigned char **dest);
+/**
+ * @brief GF(2^8) vector multiply with 4 accumulate. AVX2 version of gf_4vect_mad_sse().
+ * @requires AVX2
+ */
+void gf_4vect_mad_avx2(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+ unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector multiply with 5 accumulate. SSE version.
+ * @requires SSE4.1
+ */
+void gf_5vect_mad_sse(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+ unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector multiply with 5 accumulate. AVX version.
+ * @requires AVX
+ */
+void gf_5vect_mad_avx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+ unsigned char **dest);
+/**
+ * @brief GF(2^8) vector multiply with 5 accumulate. AVX2 version.
+ * @requires AVX2
+ */
+void gf_5vect_mad_avx2(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+ unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector multiply with 6 accumulate. SSE version.
+ * @requires SSE4.1
+ */
+void gf_6vect_mad_sse(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+ unsigned char **dest);
+/**
+ * @brief GF(2^8) vector multiply with 6 accumulate. AVX version.
+ * @requires AVX
+ */
+void gf_6vect_mad_avx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+ unsigned char **dest);
+
+/**
+ * @brief GF(2^8) vector multiply with 6 accumulate. AVX2 version.
+ * @requires AVX2
+ */
+void gf_6vect_mad_avx2(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src,
+ unsigned char **dest);
+
+
/**********************************************************************
* The remaining are lib support functions used in GF(2^8) operations.
*/
@@ -650,6 +923,7 @@ void gf_gen_cauchy1_matrix(unsigned char *a, int m, int k);
int gf_invert_matrix(unsigned char *in, unsigned char *out, const int n);
+
/*************************************************************/
#ifdef __cplusplus
diff --git a/src/erasure-code/isa/isa-l/include/gf_vect_mul.h b/src/erasure-code/isa/isa-l/include/gf_vect_mul.h
index ef19845788d..bf4fd01a60b 100644
--- a/src/erasure-code/isa/isa-l/include/gf_vect_mul.h
+++ b/src/erasure-code/isa/isa-l/include/gf_vect_mul.h
@@ -1,5 +1,5 @@
/**********************************************************************
- Copyright(c) 2011-2014 Intel Corporation All rights reserved.
+ Copyright(c) 2011-2015 Intel Corporation All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
@@ -52,8 +52,8 @@ extern "C" {
* 32-element constant array based on constant C. gftbl(C) = {C{00},
* C{01}, C{02}, ... , C{0f} }, {C{00}, C{10}, C{20}, ... , C{f0} }. Len
* and src must be aligned to 32B.
-
* @requires SSE4.1
+ *
* @param len Length of vector in bytes. Must be aligned to 32B.
* @param gftbl Pointer to 32-byte array of pre-calculated constants based on C.
* @param src Pointer to src data array. Must be aligned to 32B.
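A small sketch of driving gf_vect_mul_sse() with a table built exactly as the gftbl(C) layout above describes; gf_mul() is assumed to be the library's GF(2^8) byte multiply helper, and both buffers and len must keep the 32-byte alignment requirement stated in the comment.

#include "gf_vect_mul.h"

/* Multiply a 32B-aligned region by the constant c in GF(2^8). */
static int mul_region_by_const(unsigned char c, unsigned char *src,
                               unsigned char *dest, int len /* multiple of 32 */)
{
        unsigned char gftbl[32];
        int i;

        /* gftbl(C) = { C*{00..0f} }, { C*{00,10,..,f0} } as documented above. */
        for (i = 0; i < 16; i++) {
                gftbl[i]      = gf_mul(c, (unsigned char)i);
                gftbl[i + 16] = gf_mul(c, (unsigned char)(i << 4));
        }

        return gf_vect_mul_sse(len, gftbl, src, dest);
}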
@@ -73,8 +73,8 @@ int gf_vect_mul_sse(int len, unsigned char *gftbl, void *src, void *dest);
* 32-element constant array based on constant C. gftbl(C) = {C{00},
* C{01}, C{02}, ... , C{0f} }, {C{00}, C{10}, C{20}, ... , C{f0} }. Len
* and src must be aligned to 32B.
-
* @requires AVX
+ *
* @param len Length of vector in bytes. Must be aligned to 32B.
* @param gftbl Pointer to 32-byte array of pre-calculated constants based on C.
* @param src Pointer to src data array. Must be aligned to 32B.
diff --git a/src/erasure-code/isa/isa-l/include/reg_sizes.asm b/src/erasure-code/isa/isa-l/include/reg_sizes.asm
index ed212520bf6..219ba069ebd 100644
--- a/src/erasure-code/isa/isa-l/include/reg_sizes.asm
+++ b/src/erasure-code/isa/isa-l/include/reg_sizes.asm
@@ -1,5 +1,5 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-; Copyright(c) 2011-2014 Intel Corporation All rights reserved.
+; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
diff --git a/src/erasure-code/isa/isa-l/include/types.h b/src/erasure-code/isa/isa-l/include/types.h
index 0feed472e6a..695d94eefa7 100644
--- a/src/erasure-code/isa/isa-l/include/types.h
+++ b/src/erasure-code/isa/isa-l/include/types.h
@@ -1,5 +1,5 @@
/**********************************************************************
- Copyright(c) 2011-2014 Intel Corporation All rights reserved.
+ Copyright(c) 2011-2015 Intel Corporation All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions