diff options
author | Yuan Zhou <yuan.zhou@intel.com> | 2015-03-30 07:39:31 +0200 |
---|---|---|
committer | Yuan Zhou <yuan.zhou@intel.com> | 2015-03-30 07:39:31 +0200 |
commit | 59aa6700fa7ae63d28a8045c2a11719f8f222e17 (patch) | |
tree | 8a87c086e45e42622b6bcf19071155fdcdbe2ecd /src/erasure-code | |
parent | Merge pull request #4209 from ceph/wip-java (diff) | |
download | ceph-59aa6700fa7ae63d28a8045c2a11719f8f222e17.tar.xz ceph-59aa6700fa7ae63d28a8045c2a11719f8f222e17.zip |
erasure-code: Update ISA-L to 2.13
ISA-L 2.13 brings better performance on Avoton (20%). There's no impact on Xeon
platform. The details are in the release notes.
There's a new API ec_encode_data_update() for incremental encoding
and decoding. The other highlevel API keeps the same as in 2.10
Signed-off-by: Yuan Zhou <yuan.zhou@intel.com>
Diffstat (limited to 'src/erasure-code')
47 files changed, 8056 insertions, 464 deletions
diff --git a/src/erasure-code/isa/Makefile.am b/src/erasure-code/isa/Makefile.am index 649ddaacb07..b36b8a6daf7 100644 --- a/src/erasure-code/isa/Makefile.am +++ b/src/erasure-code/isa/Makefile.am @@ -33,6 +33,24 @@ isa_sources = \ erasure-code/isa/isa-l/erasure_code/gf_vect_dot_prod_avx2.asm.s \ erasure-code/isa/isa-l/erasure_code/gf_vect_dot_prod_avx.asm.s \ erasure-code/isa/isa-l/erasure_code/gf_vect_dot_prod_sse.asm.s \ + erasure-code/isa/isa-l/erasure_code/gf_2vect_mad_avx2.asm.s \ + erasure-code/isa/isa-l/erasure_code/gf_2vect_mad_avx.asm.s \ + erasure-code/isa/isa-l/erasure_code/gf_2vect_mad_sse.asm.s \ + erasure-code/isa/isa-l/erasure_code/gf_3vect_mad_avx2.asm.s \ + erasure-code/isa/isa-l/erasure_code/gf_3vect_mad_avx.asm.s \ + erasure-code/isa/isa-l/erasure_code/gf_3vect_mad_sse.asm.s \ + erasure-code/isa/isa-l/erasure_code/gf_4vect_mad_avx2.asm.s \ + erasure-code/isa/isa-l/erasure_code/gf_4vect_mad_avx.asm.s \ + erasure-code/isa/isa-l/erasure_code/gf_4vect_mad_sse.asm.s \ + erasure-code/isa/isa-l/erasure_code/gf_5vect_mad_avx2.asm.s \ + erasure-code/isa/isa-l/erasure_code/gf_5vect_mad_avx.asm.s \ + erasure-code/isa/isa-l/erasure_code/gf_5vect_mad_sse.asm.s \ + erasure-code/isa/isa-l/erasure_code/gf_6vect_mad_avx2.asm.s \ + erasure-code/isa/isa-l/erasure_code/gf_6vect_mad_avx.asm.s \ + erasure-code/isa/isa-l/erasure_code/gf_6vect_mad_sse.asm.s \ + erasure-code/isa/isa-l/erasure_code/gf_vect_mad_avx2.asm.s \ + erasure-code/isa/isa-l/erasure_code/gf_vect_mad_avx.asm.s \ + erasure-code/isa/isa-l/erasure_code/gf_vect_mad_sse.asm.s \ erasure-code/isa/isa-l/erasure_code/gf_vect_mul_avx.asm.s \ erasure-code/isa/isa-l/erasure_code/gf_vect_mul_sse.asm.s \ erasure-code/isa/ErasureCodeIsa.cc \ @@ -49,7 +67,7 @@ libec_isa_la_CXXFLAGS = ${AM_CXXFLAGS} -I $(srcdir)/erasure-code/isa/isa-l/inclu libec_isa_la_CCASFLAGS = ${AM_CCASFLAGS} -I $(abs_srcdir)/erasure-code/isa/isa-l/include/ libec_isa_la_LIBADD = $(LIBCRUSH) $(PTHREAD_LIBS) $(EXTRALIBS) -libec_isa_la_LDFLAGS = ${AM_LDFLAGS} -version-info 2:10:0 +libec_isa_la_LDFLAGS = ${AM_LDFLAGS} -version-info 2:13:0 if LINUX libec_isa_la_LDFLAGS += -export-symbols-regex '.*__erasure_code_.*' endif diff --git a/src/erasure-code/isa/isa-l/erasure_code/ec_base.c b/src/erasure-code/isa/isa-l/erasure_code/ec_base.c index 5e93cb6fe15..3c7e8382ca2 100644 --- a/src/erasure-code/isa/isa-l/erasure_code/ec_base.c +++ b/src/erasure-code/isa/isa-l/erasure_code/ec_base.c @@ -1,5 +1,5 @@ /********************************************************************** - Copyright(c) 2011-2014 Intel Corporation All rights reserved. + Copyright(c) 2011-2015 Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions @@ -275,6 +275,18 @@ void gf_vect_dot_prod_base(int len, int vlen, unsigned char *v, } } +void gf_vect_mad_base(int len, int vec, int vec_i, + unsigned char *v, unsigned char *src, unsigned char *dest) +{ + int i; + unsigned char s; + for (i = 0; i < len; i++) { + s = dest[i]; + s ^= gf_mul(src[i], v[vec_i * 32 + 1]); + dest[i] = s; + } +} + void ec_encode_data_base(int len, int srcs, int dests, unsigned char *v, unsigned char **src, unsigned char **dest) { @@ -292,6 +304,22 @@ void ec_encode_data_base(int len, int srcs, int dests, unsigned char *v, } } +void ec_encode_data_update_base(int len, int k, int rows, int vec_i, unsigned char *v, + unsigned char *data, unsigned char **dest) +{ + int i, l; + unsigned char s; + + for (l = 0; l < rows; l++) { + for (i = 0; i < len; i++) { + s = dest[l][i]; + s ^= gf_mul(data[i], v[vec_i * 32 + l * k * 32 + 1]); + + dest[l][i] = s; + } + } +} + void gf_vect_mul_base(int len, unsigned char *a, unsigned char *src, unsigned char *dest) { //2nd element of table array is ref value used to fill it in diff --git a/src/erasure-code/isa/isa-l/erasure_code/ec_base.h b/src/erasure-code/isa/isa-l/erasure_code/ec_base.h index 519ac7a2cac..d69a92d67bc 100644 --- a/src/erasure-code/isa/isa-l/erasure_code/ec_base.h +++ b/src/erasure-code/isa/isa-l/erasure_code/ec_base.h @@ -1,5 +1,5 @@ /********************************************************************** - Copyright(c) 2011-2014 Intel Corporation All rights reserved. + Copyright(c) 2011-2015 Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions diff --git a/src/erasure-code/isa/isa-l/erasure_code/ec_highlevel_func.c b/src/erasure-code/isa/isa-l/erasure_code/ec_highlevel_func.c index 9cea61e5d52..fe2cdc9ca99 100644 --- a/src/erasure-code/isa/isa-l/erasure_code/ec_highlevel_func.c +++ b/src/erasure-code/isa/isa-l/erasure_code/ec_highlevel_func.c @@ -1,5 +1,5 @@ /********************************************************************** - Copyright(c) 2011-2014 Intel Corporation All rights reserved. + Copyright(c) 2011-2015 Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions @@ -42,7 +42,6 @@ void ec_init_tables(int k, int rows, unsigned char *a, unsigned char *g_tbls) } } -#if __WORDSIZE == 64 || _WIN64 || __x86_64__ void ec_encode_data_sse(int len, int k, int rows, unsigned char *g_tbls, unsigned char **data, unsigned char **coding) { @@ -77,7 +76,6 @@ void ec_encode_data_sse(int len, int k, int rows, unsigned char *g_tbls, unsigne void ec_encode_data_avx(int len, int k, int rows, unsigned char *g_tbls, unsigned char **data, unsigned char **coding) { - if (len < 16) { ec_encode_data_base(len, k, rows, g_tbls, data, coding); return; @@ -136,6 +134,123 @@ void ec_encode_data_avx2(int len, int k, int rows, unsigned char *g_tbls, unsign } +#if __WORDSIZE == 64 || _WIN64 || __x86_64__ + +void ec_encode_data_update_sse(int len, int k, int rows, int vec_i, unsigned char *g_tbls, + unsigned char *data, unsigned char **coding) +{ + if (len < 16) { + ec_encode_data_update_base(len, k, rows, vec_i, g_tbls, data, coding); + return; + } + + while (rows > 6) { + gf_6vect_mad_sse(len, k, vec_i, g_tbls, data, coding); + g_tbls += 6 * k * 32; + coding += 6; + rows -= 6; + } + switch (rows) { + case 6: + gf_6vect_mad_sse(len, k, vec_i, g_tbls, data, coding); + break; + case 5: + gf_5vect_mad_sse(len, k, vec_i, g_tbls, data, coding); + break; + case 4: + gf_4vect_mad_sse(len, k, vec_i, g_tbls, data, coding); + break; + case 3: + gf_3vect_mad_sse(len, k, vec_i, g_tbls, data, coding); + break; + case 2: + gf_2vect_mad_sse(len, k, vec_i, g_tbls, data, coding); + break; + case 1: + gf_vect_mad_sse(len, k, vec_i, g_tbls, data, *coding); + break; + case 0: + break; + } + +} + +void ec_encode_data_update_avx(int len, int k, int rows, int vec_i, unsigned char *g_tbls, + unsigned char *data, unsigned char **coding) +{ + if (len < 16) { + ec_encode_data_update_base(len, k, rows, vec_i, g_tbls, data, coding); + return; + } + while (rows > 6) { + gf_6vect_mad_avx(len, k, vec_i, g_tbls, data, coding); + g_tbls += 6 * k * 32; + coding += 6; + rows -= 6; + } + switch (rows) { + case 6: + gf_6vect_mad_avx(len, k, vec_i, g_tbls, data, coding); + break; + case 5: + gf_5vect_mad_avx(len, k, vec_i, g_tbls, data, coding); + break; + case 4: + gf_4vect_mad_avx(len, k, vec_i, g_tbls, data, coding); + break; + case 3: + gf_3vect_mad_avx(len, k, vec_i, g_tbls, data, coding); + break; + case 2: + gf_2vect_mad_avx(len, k, vec_i, g_tbls, data, coding); + break; + case 1: + gf_vect_mad_avx(len, k, vec_i, g_tbls, data, *coding); + break; + case 0: + break; + } + +} + +void ec_encode_data_update_avx2(int len, int k, int rows, int vec_i, unsigned char *g_tbls, + unsigned char *data, unsigned char **coding) +{ + if (len < 32) { + ec_encode_data_update_base(len, k, rows, vec_i, g_tbls, data, coding); + return; + } + while (rows > 6) { + gf_6vect_mad_avx2(len, k, vec_i, g_tbls, data, coding); + g_tbls += 6 * k * 32; + coding += 6; + rows -= 6; + } + switch (rows) { + case 6: + gf_6vect_mad_avx2(len, k, vec_i, g_tbls, data, coding); + break; + case 5: + gf_5vect_mad_avx2(len, k, vec_i, g_tbls, data, coding); + break; + case 4: + gf_4vect_mad_avx2(len, k, vec_i, g_tbls, data, coding); + break; + case 3: + gf_3vect_mad_avx2(len, k, vec_i, g_tbls, data, coding); + break; + case 2: + gf_2vect_mad_avx2(len, k, vec_i, g_tbls, data, coding); + break; + case 1: + gf_vect_mad_avx2(len, k, vec_i, g_tbls, data, *coding); + break; + case 0: + break; + } + +} + #endif //__WORDSIZE == 64 || _WIN64 || __x86_64__ struct slver { diff --git a/src/erasure-code/isa/isa-l/erasure_code/ec_multibinary.asm.s b/src/erasure-code/isa/isa-l/erasure_code/ec_multibinary.asm.s index 54f7301181e..f23db361ca9 100644 --- a/src/erasure-code/isa/isa-l/erasure_code/ec_multibinary.asm.s +++ b/src/erasure-code/isa/isa-l/erasure_code/ec_multibinary.asm.s @@ -1,5 +1,5 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; Copyright(c) 2011-2014 Intel Corporation All rights reserved. +; Copyright(c) 2011-2015 Intel Corporation All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions @@ -28,42 +28,63 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; %ifidn __OUTPUT_FORMAT__, elf64 -%define WRT_OPT wrt ..plt + %define WRT_OPT wrt ..plt %else -%define WRT_OPT + %define WRT_OPT %endif +%include "reg_sizes.asm" + %ifidn __OUTPUT_FORMAT__, elf32 [bits 32] -%define def_wrd dd -%define wrd_sz dword -%define arg1 esi + %define def_wrd dd + %define wrd_sz dword + %define arg1 esi + %define arg2 eax + %define arg3 ebx + %define arg4 ecx + %define arg5 edx %else -%include "reg_sizes.asm" -default rel -[bits 64] + default rel + [bits 64] -%define def_wrd dq -%define wrd_sz qword -%define arg1 rsi + %define def_wrd dq + %define wrd_sz qword + %define arg1 rsi + %define arg2 rax + %define arg3 rbx + %define arg4 rcx + %define arg5 rdx -extern ec_encode_data_sse -extern ec_encode_data_avx -extern ec_encode_data_avx2 -extern gf_vect_mul_sse -extern gf_vect_mul_avx -extern gf_vect_dot_prod_sse -extern gf_vect_dot_prod_avx -extern gf_vect_dot_prod_avx2 + + extern ec_encode_data_update_sse + extern ec_encode_data_update_avx + extern ec_encode_data_update_avx2 + extern gf_vect_mul_sse + extern gf_vect_mul_avx + + extern gf_vect_mad_sse + extern gf_vect_mad_avx + extern gf_vect_mad_avx2 %endif extern gf_vect_mul_base extern ec_encode_data_base +extern ec_encode_data_update_base extern gf_vect_dot_prod_base +extern gf_vect_mad_base + +extern gf_vect_dot_prod_sse +extern gf_vect_dot_prod_avx +extern gf_vect_dot_prod_avx2 +extern ec_encode_data_sse +extern ec_encode_data_avx +extern ec_encode_data_avx2 + section .data ;;; *_mbinit are initial values for *_dispatched; is updated on first call. @@ -78,6 +99,12 @@ gf_vect_mul_dispatched: gf_vect_dot_prod_dispatched: def_wrd gf_vect_dot_prod_mbinit +ec_encode_data_update_dispatched: + def_wrd ec_encode_data_update_mbinit + +gf_vect_mad_dispatched: + def_wrd gf_vect_mad_mbinit + section .text ;;;; ; ec_encode_data multibinary function @@ -91,50 +118,45 @@ ec_encode_data: ec_encode_data_dispatch_init: push arg1 -%ifidn __OUTPUT_FORMAT__, elf32 ;; 32-bit check - lea arg1, [ec_encode_data_base] -%else - push rax - push rbx - push rcx - push rdx + push arg2 + push arg3 + push arg4 + push arg5 lea arg1, [ec_encode_data_base WRT_OPT] ; Default mov eax, 1 cpuid - lea rbx, [ec_encode_data_sse WRT_OPT] + lea arg3, [ec_encode_data_sse WRT_OPT] test ecx, FLAG_CPUID1_ECX_SSE4_1 - cmovne arg1, rbx + cmovne arg1, arg3 and ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE) cmp ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE) - lea rbx, [ec_encode_data_avx WRT_OPT] + lea arg3, [ec_encode_data_avx WRT_OPT] jne _done_ec_encode_data_init - mov rsi, rbx + mov arg1, arg3 ;; Try for AVX2 xor ecx, ecx mov eax, 7 cpuid test ebx, FLAG_CPUID1_EBX_AVX2 - lea rbx, [ec_encode_data_avx2 WRT_OPT] - cmovne rsi, rbx - + lea arg3, [ec_encode_data_avx2 WRT_OPT] + cmovne arg1, arg3 ;; Does it have xmm and ymm support xor ecx, ecx xgetbv and eax, FLAG_XGETBV_EAX_XMM_YMM cmp eax, FLAG_XGETBV_EAX_XMM_YMM je _done_ec_encode_data_init - lea rsi, [ec_encode_data_sse WRT_OPT] + lea arg1, [ec_encode_data_sse WRT_OPT] _done_ec_encode_data_init: - pop rdx - pop rcx - pop rbx - pop rax -%endif ;; END 32-bit check + pop arg5 + pop arg4 + pop arg3 + pop arg2 mov [ec_encode_data_dispatched], arg1 pop arg1 ret @@ -190,6 +212,65 @@ _done_gf_vect_mul_dispatch_init: pop arg1 ret +;;;; +; ec_encode_data_update multibinary function +;;;; +global ec_encode_data_update:function +ec_encode_data_update_mbinit: + call ec_encode_data_update_dispatch_init + +ec_encode_data_update: + jmp wrd_sz [ec_encode_data_update_dispatched] + +ec_encode_data_update_dispatch_init: + push arg1 +%ifidn __OUTPUT_FORMAT__, elf32 ;; 32-bit check + lea arg1, [ec_encode_data_update_base] +%else + push rax + push rbx + push rcx + push rdx + lea arg1, [ec_encode_data_update_base WRT_OPT] ; Default + + mov eax, 1 + cpuid + lea rbx, [ec_encode_data_update_sse WRT_OPT] + test ecx, FLAG_CPUID1_ECX_SSE4_1 + cmovne arg1, rbx + + and ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE) + cmp ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE) + lea rbx, [ec_encode_data_update_avx WRT_OPT] + + jne _done_ec_encode_data_update_init + mov rsi, rbx + + ;; Try for AVX2 + xor ecx, ecx + mov eax, 7 + cpuid + test ebx, FLAG_CPUID1_EBX_AVX2 + lea rbx, [ec_encode_data_update_avx2 WRT_OPT] + cmovne rsi, rbx + + ;; Does it have xmm and ymm support + xor ecx, ecx + xgetbv + and eax, FLAG_XGETBV_EAX_XMM_YMM + cmp eax, FLAG_XGETBV_EAX_XMM_YMM + je _done_ec_encode_data_update_init + lea rsi, [ec_encode_data_update_sse WRT_OPT] + +_done_ec_encode_data_update_init: + pop rdx + pop rcx + pop rbx + pop rax +%endif ;; END 32-bit check + mov [ec_encode_data_update_dispatched], arg1 + pop arg1 + ret ;;;; ; gf_vect_dot_prod multibinary function @@ -203,26 +284,81 @@ gf_vect_dot_prod: gf_vect_dot_prod_dispatch_init: push arg1 + push arg2 + push arg3 + push arg4 + push arg5 + lea arg1, [gf_vect_dot_prod_base WRT_OPT] ; Default + + mov eax, 1 + cpuid + lea arg3, [gf_vect_dot_prod_sse WRT_OPT] + test ecx, FLAG_CPUID1_ECX_SSE4_1 + cmovne arg1, arg3 + + and ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE) + cmp ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE) + lea arg3, [gf_vect_dot_prod_avx WRT_OPT] + + jne _done_gf_vect_dot_prod_init + mov arg1, arg3 + + ;; Try for AVX2 + xor ecx, ecx + mov eax, 7 + cpuid + test ebx, FLAG_CPUID1_EBX_AVX2 + lea arg3, [gf_vect_dot_prod_avx2 WRT_OPT] + cmovne arg1, arg3 + ;; Does it have xmm and ymm support + xor ecx, ecx + xgetbv + and eax, FLAG_XGETBV_EAX_XMM_YMM + cmp eax, FLAG_XGETBV_EAX_XMM_YMM + je _done_gf_vect_dot_prod_init + lea arg1, [gf_vect_dot_prod_sse WRT_OPT] + +_done_gf_vect_dot_prod_init: + pop arg5 + pop arg4 + pop arg3 + pop arg2 + mov [gf_vect_dot_prod_dispatched], arg1 + pop arg1 + ret + +;;;; +; gf_vect_mad multibinary function +;;;; +global gf_vect_mad:function +gf_vect_mad_mbinit: + call gf_vect_mad_dispatch_init + +gf_vect_mad: + jmp wrd_sz [gf_vect_mad_dispatched] + +gf_vect_mad_dispatch_init: + push arg1 %ifidn __OUTPUT_FORMAT__, elf32 ;; 32-bit check - lea arg1, [gf_vect_dot_prod_base] + lea arg1, [gf_vect_mad_base] %else push rax push rbx push rcx push rdx - lea arg1, [gf_vect_dot_prod_base WRT_OPT] ; Default + lea arg1, [gf_vect_mad_base WRT_OPT] ; Default mov eax, 1 cpuid - lea rbx, [gf_vect_dot_prod_sse WRT_OPT] + lea rbx, [gf_vect_mad_sse WRT_OPT] test ecx, FLAG_CPUID1_ECX_SSE4_1 cmovne arg1, rbx and ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE) cmp ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE) - lea rbx, [gf_vect_dot_prod_avx WRT_OPT] + lea rbx, [gf_vect_mad_avx WRT_OPT] - jne _done_gf_vect_dot_prod_init + jne _done_gf_vect_mad_init mov rsi, rbx ;; Try for AVX2 @@ -230,7 +366,7 @@ gf_vect_dot_prod_dispatch_init: mov eax, 7 cpuid test ebx, FLAG_CPUID1_EBX_AVX2 - lea rbx, [gf_vect_dot_prod_avx2 WRT_OPT] + lea rbx, [gf_vect_mad_avx2 WRT_OPT] cmovne rsi, rbx ;; Does it have xmm and ymm support @@ -238,16 +374,16 @@ gf_vect_dot_prod_dispatch_init: xgetbv and eax, FLAG_XGETBV_EAX_XMM_YMM cmp eax, FLAG_XGETBV_EAX_XMM_YMM - je _done_gf_vect_dot_prod_init - lea rsi, [gf_vect_dot_prod_sse WRT_OPT] + je _done_gf_vect_mad_init + lea rsi, [gf_vect_mad_sse WRT_OPT] -_done_gf_vect_dot_prod_init: +_done_gf_vect_mad_init: pop rdx pop rcx pop rbx pop rax %endif ;; END 32-bit check - mov [gf_vect_dot_prod_dispatched], arg1 + mov [gf_vect_mad_dispatched], arg1 pop arg1 ret @@ -260,9 +396,9 @@ global %1_slver db 0x%3, 0x%2 %endmacro -;;; func core, ver, snum -slversion ec_encode_data, 00, 02, 0133 -slversion gf_vect_mul, 00, 02, 0134 -slversion gf_vect_dot_prod, 00, 01, 0138 -; inform linker that this doesn't require executable stack -section .note.GNU-stack noalloc noexec nowrite progbits +;;; func core, ver, snum +slversion ec_encode_data, 00, 03, 0133 +slversion gf_vect_mul, 00, 02, 0134 +slversion ec_encode_data_update, 00, 02, 0212 +slversion gf_vect_dot_prod, 00, 02, 0138 +slversion gf_vect_mad, 00, 01, 0213 diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_2vect_dot_prod_avx.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_2vect_dot_prod_avx.asm.s index db8064a3971..1bd839cb66d 100644 --- a/src/erasure-code/isa/isa-l/erasure_code/gf_2vect_dot_prod_avx.asm.s +++ b/src/erasure-code/isa/isa-l/erasure_code/gf_2vect_dot_prod_avx.asm.s @@ -1,5 +1,5 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; Copyright(c) 2011-2014 Intel Corporation All rights reserved. +; Copyright(c) 2011-2015 Intel Corporation All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions @@ -30,8 +30,6 @@ ;;; ;;; gf_2vect_dot_prod_avx(len, vec, *g_tbls, **buffs, **dests); ;;; -;;; Author: Gregory Tucker - %ifidn __OUTPUT_FORMAT__, elf64 %define arg0 rdi @@ -46,6 +44,9 @@ %define tmp3 r9 %define tmp4 r12 ; must be saved and restored %define return rax + %macro SLDR 2 + %endmacro + %define SSTR SLDR %define PS 8 %define LOG_PS 3 @@ -70,6 +71,9 @@ %define tmp3 r13 ; must be saved and restored %define tmp4 r14 ; must be saved and restored %define return rax + %macro SLDR 2 + %endmacro + %define SSTR SLDR %define PS 8 %define LOG_PS 3 %define stack_size 3*16 + 3*8 ; must be an odd multiple of 8 @@ -99,17 +103,92 @@ %endmacro %endif +%ifidn __OUTPUT_FORMAT__, elf32 + +;;;================== High Address; +;;; arg4 +;;; arg3 +;;; arg2 +;;; arg1 +;;; arg0 +;;; return +;;;<================= esp of caller +;;; ebp +;;;<================= ebp = esp +;;; var0 +;;; esi +;;; edi +;;; ebx +;;;<================= esp of callee +;;; +;;;================== Low Address; + + %define PS 4 + %define LOG_PS 2 + %define func(x) x: + %define arg(x) [ebp + PS*2 + PS*x] + %define var(x) [ebp - PS - PS*x] + + %define trans ecx + %define trans2 esi + %define arg0 trans ;trans and trans2 are for the variables in stack + %define arg0_m arg(0) + %define arg1 ebx + %define arg2 arg2_m + %define arg2_m arg(2) + %define arg3 trans + %define arg3_m arg(3) + %define arg4 trans + %define arg4_m arg(4) + %define tmp edx + %define tmp2 edi + %define tmp3 trans2 + %define tmp4 trans2 + %define tmp4_m var(0) + %define return eax + %macro SLDR 2 ;; stack load/restore + mov %1, %2 + %endmacro + %define SSTR SLDR + + %macro FUNC_SAVE 0 + push ebp + mov ebp, esp + sub esp, PS*1 ;1 local variable + push esi + push edi + push ebx + mov arg1, arg(1) + %endmacro + + %macro FUNC_RESTORE 0 + pop ebx + pop edi + pop esi + add esp, PS*1 ;1 local variable + pop ebp + %endmacro + +%endif ; output formats + %define len arg0 %define vec arg1 %define mul_array arg2 %define src arg3 -%define dest1 arg4 +%define dest1 arg4 %define vec_i tmp2 %define ptr tmp3 %define dest2 tmp4 %define pos return + %ifidn PS,4 ;32-bit code + %define len_m arg0_m + %define src_m arg3_m + %define dest1_m arg4_m + %define dest2_m tmp4_m + %endif + %ifndef EC_ALIGNED_ADDR ;;; Use Un-aligned load/store %define XLDR vmovdqu @@ -125,35 +204,54 @@ %endif %endif +%ifidn PS,8 ; 64-bit code + default rel + [bits 64] +%endif -default rel - -[bits 64] section .text -%define xmask0f xmm8 -%define xgft1_lo xmm7 -%define xgft1_hi xmm6 -%define xgft2_lo xmm5 -%define xgft2_hi xmm4 - -%define x0 xmm0 -%define xtmpa xmm1 -%define xp1 xmm2 -%define xp2 xmm3 +%ifidn PS,8 ;64-bit code + %define xmask0f xmm8 + %define xgft1_lo xmm7 + %define xgft1_hi xmm6 + %define xgft2_lo xmm5 + %define xgft2_hi xmm4 + + %define x0 xmm0 + %define xtmpa xmm1 + %define xp1 xmm2 + %define xp2 xmm3 +%else ;32-bit code + %define xmask0f xmm4 + %define xgft1_lo xmm7 + %define xgft1_hi xmm6 + %define xgft2_lo xgft1_lo + %define xgft2_hi xgft1_hi + + %define x0 xmm0 + %define xtmpa xmm1 + %define xp1 xmm2 + %define xp2 xmm3 +%endif align 16 global gf_2vect_dot_prod_avx:function func(gf_2vect_dot_prod_avx) FUNC_SAVE + SLDR len, len_m sub len, 16 + SSTR len_m, len jl .return_fail xor pos, pos vmovdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte sal vec, LOG_PS ;vec *= PS. Make vec_i count by PS + SLDR dest1, dest1_m mov dest2, [dest1+PS] + SSTR dest2_m, dest2 mov dest1, [dest1] + SSTR dest1_m, dest1 .loop16 vpxor xp1, xp1 @@ -162,16 +260,18 @@ func(gf_2vect_dot_prod_avx) xor vec_i, vec_i .next_vect + SLDR src, src_m mov ptr, [src+vec_i] vmovdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, ..., Ax{0f} vmovdqu xgft1_hi, [tmp+16] ; " Ax{00}, Ax{10}, ..., Ax{f0} + %ifidn PS,8 ; 64-bit code vmovdqu xgft2_lo, [tmp+vec*(32/PS)] ;Load array Bx{00}, Bx{01}, ..., Bx{0f} vmovdqu xgft2_hi, [tmp+vec*(32/PS)+16] ; " Bx{00}, Bx{10}, ..., Bx{f0} - - XLDR x0, [ptr+pos] ;Get next source vector add tmp, 32 add vec_i, PS + %endif + XLDR x0, [ptr+pos] ;Get next source vector vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0 vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0 @@ -182,6 +282,12 @@ func(gf_2vect_dot_prod_avx) vpxor xgft1_hi, xgft1_lo ;GF add high and low partials vpxor xp1, xgft1_hi ;xp1 += partial + %ifidn PS,4 ; 32-bit code + vmovdqu xgft2_lo, [tmp+vec*(32/PS)] ;Load array Bx{00}, Bx{01}, ..., Bx{0f} + vmovdqu xgft2_hi, [tmp+vec*(32/PS)+16] ; " Bx{00}, Bx{10}, ..., Bx{f0} + add tmp, 32 + add vec_i, PS + %endif vpshufb xgft2_hi, x0 ;Lookup mul table of high nibble vpshufb xgft2_lo, xtmpa ;Lookup mul table of low nibble vpxor xgft2_hi, xgft2_lo ;GF add high and low partials @@ -190,9 +296,12 @@ func(gf_2vect_dot_prod_avx) cmp vec_i, vec jl .next_vect + SLDR dest1, dest1_m + SLDR dest2, dest2_m XSTR [dest1+pos], xp1 XSTR [dest2+pos], xp2 + SLDR len, len_m add pos, 16 ;Loop on 16 bytes at a time cmp pos, len jle .loop16 @@ -231,6 +340,4 @@ global %1_slver db 0x%3, 0x%2 %endmacro ;;; func core, ver, snum -slversion gf_2vect_dot_prod_avx, 02, 03, 0191 -; inform linker that this doesn't require executable stack -section .note.GNU-stack noalloc noexec nowrite progbits +slversion gf_2vect_dot_prod_avx, 02, 04, 0191 diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_2vect_dot_prod_avx2.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_2vect_dot_prod_avx2.asm.s index 5d75d810905..ada013bd628 100644 --- a/src/erasure-code/isa/isa-l/erasure_code/gf_2vect_dot_prod_avx2.asm.s +++ b/src/erasure-code/isa/isa-l/erasure_code/gf_2vect_dot_prod_avx2.asm.s @@ -1,5 +1,5 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; Copyright(c) 2011-2014 Intel Corporation All rights reserved. +; Copyright(c) 2011-2015 Intel Corporation All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions @@ -30,8 +30,6 @@ ;;; ;;; gf_2vect_dot_prod_avx2(len, vec, *g_tbls, **buffs, **dests); ;;; -;;; Author: Gregory Tucker - %ifidn __OUTPUT_FORMAT__, elf64 %define arg0 rdi @@ -48,7 +46,10 @@ %define tmp3 r9 %define tmp4 r12 ; must be saved and restored %define return rax - %define PS 8 + %macro SLDR 2 + %endmacro + %define SSTR SLDR + %define PS 8 %define LOG_PS 3 %define func(x) x: @@ -74,6 +75,9 @@ %define tmp3 r13 ; must be saved and restored %define tmp4 r14 ; must be saved and restored %define return rax + %macro SLDR 2 + %endmacro + %define SSTR SLDR %define PS 8 %define LOG_PS 3 %define stack_size 3*16 + 3*8 ; must be an odd multiple of 8 @@ -103,6 +107,76 @@ %endmacro %endif +%ifidn __OUTPUT_FORMAT__, elf32 + +;;;================== High Address; +;;; arg4 +;;; arg3 +;;; arg2 +;;; arg1 +;;; arg0 +;;; return +;;;<================= esp of caller +;;; ebp +;;;<================= ebp = esp +;;; var0 +;;; esi +;;; edi +;;; ebx +;;;<================= esp of callee +;;; +;;;================== Low Address; + + %define PS 4 + %define LOG_PS 2 + %define func(x) x: + %define arg(x) [ebp + PS*2 + PS*x] + %define var(x) [ebp - PS - PS*x] + + %define trans ecx + %define trans2 esi + %define arg0 trans ;trans and trans2 are for the variables in stack + %define arg0_m arg(0) + %define arg1 ebx + %define arg2 arg2_m + %define arg2_m arg(2) + %define arg3 trans + %define arg3_m arg(3) + %define arg4 trans + %define arg4_m arg(4) + %define tmp edx + %define tmp.w edx + %define tmp.b dl + %define tmp2 edi + %define tmp3 trans2 + %define tmp4 trans2 + %define tmp4_m var(0) + %define return eax + %macro SLDR 2 ;stack load/restore + mov %1, %2 + %endmacro + %define SSTR SLDR + + %macro FUNC_SAVE 0 + push ebp + mov ebp, esp + sub esp, PS*1 ;1 local variable + push esi + push edi + push ebx + mov arg1, arg(1) + %endmacro + + %macro FUNC_RESTORE 0 + pop ebx + pop edi + pop esi + add esp, PS*1 ;1 local variable + pop ebp + %endmacro + +%endif ; output formats + %define len arg0 %define vec arg1 %define mul_array arg2 @@ -114,6 +188,13 @@ %define dest2 tmp4 %define pos return +%ifidn PS,4 ;32-bit code + %define len_m arg0_m + %define src_m arg3_m + %define dest1_m arg4_m + %define dest2_m tmp4_m +%endif + %ifndef EC_ALIGNED_ADDR ;;; Use Un-aligned load/store %define XLDR vmovdqu @@ -130,30 +211,48 @@ %endif %endif +%ifidn PS,8 ;64-bit code + default rel + [bits 64] +%endif -default rel - -[bits 64] section .text -%define xmask0f ymm8 -%define xmask0fx xmm8 -%define xgft1_lo ymm7 -%define xgft1_hi ymm6 -%define xgft2_lo ymm5 -%define xgft2_hi ymm4 +%ifidn PS,8 ;64-bit code + %define xmask0f ymm8 + %define xmask0fx xmm8 + %define xgft1_lo ymm7 + %define xgft1_hi ymm6 + %define xgft2_lo ymm5 + %define xgft2_hi ymm4 + + %define x0 ymm0 + %define xtmpa ymm1 + %define xp1 ymm2 + %define xp2 ymm3 +%else ;32-bit code + %define xmask0f ymm7 + %define xmask0fx xmm7 + %define xgft1_lo ymm5 + %define xgft1_hi ymm4 + %define xgft2_lo xgft1_lo + %define xgft2_hi xgft1_hi + + %define x0 ymm0 + %define xtmpa ymm1 + %define xp1 ymm2 + %define xp2 ymm3 -%define x0 ymm0 -%define xtmpa ymm1 -%define xp1 ymm2 -%define xp2 ymm3 +%endif align 16 global gf_2vect_dot_prod_avx2:function func(gf_2vect_dot_prod_avx2) FUNC_SAVE + SLDR len, len_m sub len, 32 + SSTR len_m, len jl .return_fail xor pos, pos mov tmp.b, 0x0f @@ -161,8 +260,11 @@ func(gf_2vect_dot_prod_avx2) vpbroadcastb xmask0f, xmask0fx ;Construct mask 0x0f0f0f... sal vec, LOG_PS ;vec *= PS. Make vec_i count by PS + SLDR dest1, dest1_m mov dest2, [dest1+PS] + SSTR dest2_m, dest2 mov dest1, [dest1] + SSTR dest1_m, dest1 .loop32 vpxor xp1, xp1 @@ -171,22 +273,25 @@ func(gf_2vect_dot_prod_avx2) xor vec_i, vec_i .next_vect + SLDR src, src_m mov ptr, [src+vec_i] vmovdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, ..., Ax{0f} ; " Ax{00}, Ax{10}, ..., Ax{f0} vperm2i128 xgft1_hi, xgft1_lo, xgft1_lo, 0x11 ; swapped to hi | hi vperm2i128 xgft1_lo, xgft1_lo, xgft1_lo, 0x00 ; swapped to lo | lo - + %ifidn PS,8 ; 64-bit code vmovdqu xgft2_lo, [tmp+vec*(32/PS)] ;Load array Bx{00}, Bx{01}, ..., Bx{0f} ; " Bx{00}, Bx{10}, ..., Bx{f0} vperm2i128 xgft2_hi, xgft2_lo, xgft2_lo, 0x11 ; swapped to hi | hi vperm2i128 xgft2_lo, xgft2_lo, xgft2_lo, 0x00 ; swapped to lo | lo - XLDR x0, [ptr+pos] ;Get next source vector add tmp, 32 add vec_i, PS + %else + XLDR x0, [ptr+pos] ;Get next source vector + %endif vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0 vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0 @@ -197,6 +302,14 @@ func(gf_2vect_dot_prod_avx2) vpxor xgft1_hi, xgft1_lo ;GF add high and low partials vpxor xp1, xgft1_hi ;xp1 += partial + %ifidn PS,4 ; 32-bit code + vmovdqu xgft2_lo, [tmp+vec*(32/PS)] ;Load array Bx{00}, Bx{01}, ..., Bx{0f} + ; " Bx{00}, Bx{10}, ..., Bx{f0} + vperm2i128 xgft2_hi, xgft2_lo, xgft2_lo, 0x11 ; swapped to hi | hi + vperm2i128 xgft2_lo, xgft2_lo, xgft2_lo, 0x00 ; swapped to lo | lo + add tmp, 32 + add vec_i, PS + %endif vpshufb xgft2_hi, x0 ;Lookup mul table of high nibble vpshufb xgft2_lo, xtmpa ;Lookup mul table of low nibble vpxor xgft2_hi, xgft2_lo ;GF add high and low partials @@ -205,9 +318,12 @@ func(gf_2vect_dot_prod_avx2) cmp vec_i, vec jl .next_vect + SLDR dest1, dest1_m + SLDR dest2, dest2_m XSTR [dest1+pos], xp1 XSTR [dest2+pos], xp2 + SLDR len, len_m add pos, 32 ;Loop on 32 bytes at a time cmp pos, len jle .loop32 @@ -243,6 +359,4 @@ global %1_slver db 0x%3, 0x%2 %endmacro ;;; func core, ver, snum -slversion gf_2vect_dot_prod_avx2, 04, 03, 0196 -; inform linker that this doesn't require executable stack -section .note.GNU-stack noalloc noexec nowrite progbits +slversion gf_2vect_dot_prod_avx2, 04, 04, 0196 diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_2vect_dot_prod_sse.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_2vect_dot_prod_sse.asm.s index 4f324aecc43..e180830c1fc 100644 --- a/src/erasure-code/isa/isa-l/erasure_code/gf_2vect_dot_prod_sse.asm.s +++ b/src/erasure-code/isa/isa-l/erasure_code/gf_2vect_dot_prod_sse.asm.s @@ -1,5 +1,5 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; Copyright(c) 2011-2014 Intel Corporation All rights reserved. +; Copyright(c) 2011-2015 Intel Corporation All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions @@ -30,8 +30,6 @@ ;;; ;;; gf_2vect_dot_prod_sse(len, vec, *g_tbls, **buffs, **dests); ;;; -;;; Author: Gregory Tucker - %ifidn __OUTPUT_FORMAT__, elf64 %define arg0 rdi @@ -46,6 +44,9 @@ %define tmp3 r9 %define tmp4 r12 ; must be saved and restored %define return rax + %macro SLDR 2 + %endmacro + %define SSTR SLDR %define PS 8 %define LOG_PS 3 @@ -70,6 +71,9 @@ %define tmp3 r13 ; must be saved and restored %define tmp4 r14 ; must be saved and restored %define return rax + %macro SLDR 2 + %endmacro + %define SSTR SLDR %define PS 8 %define LOG_PS 3 %define stack_size 3*16 + 3*8 ; must be an odd multiple of 8 @@ -99,23 +103,97 @@ %endmacro %endif +%ifidn __OUTPUT_FORMAT__, elf32 + +;;;================== High Address; +;;; arg4 +;;; arg3 +;;; arg2 +;;; arg1 +;;; arg0 +;;; return +;;;<================= esp of caller +;;; ebp +;;;<================= ebp = esp +;;; var0 +;;; esi +;;; edi +;;; ebx +;;;<================= esp of callee +;;; +;;;================== Low Address; + + %define PS 4 + %define LOG_PS 2 + %define func(x) x: + %define arg(x) [ebp + PS*2 + PS*x] + %define var(x) [ebp - PS - PS*x] + + %define trans ecx + %define trans2 esi + %define arg0 trans ;trans and trans2 are for the variables in stack + %define arg0_m arg(0) + %define arg1 ebx + %define arg2 arg2_m + %define arg2_m arg(2) + %define arg3 trans + %define arg3_m arg(3) + %define arg4 trans + %define arg4_m arg(4) + %define tmp edx + %define tmp2 edi + %define tmp3 trans2 + %define tmp4 trans2 + %define tmp4_m var(0) + %define return eax + %macro SLDR 2 ;; stack load/restore + mov %1, %2 + %endmacro + %define SSTR SLDR + + %macro FUNC_SAVE 0 + push ebp + mov ebp, esp + sub esp, PS*1 ;1 local variable + push esi + push edi + push ebx + mov arg1, arg(1) + %endmacro + + %macro FUNC_RESTORE 0 + pop ebx + pop edi + pop esi + add esp, PS*1 ;1 local variable + pop ebp + %endmacro + +%endif ; output formats + %define len arg0 %define vec arg1 %define mul_array arg2 %define src arg3 -%define dest1 arg4 +%define dest1 arg4 %define vec_i tmp2 %define ptr tmp3 %define dest2 tmp4 %define pos return + %ifidn PS,4 ;32-bit code + %define len_m arg0_m + %define src_m arg3_m + %define dest1_m arg4_m + %define dest2_m tmp4_m + %endif + %ifndef EC_ALIGNED_ADDR ;;; Use Un-aligned load/store %define XLDR movdqu %define XSTR movdqu %else - ;;; Use Non-temporal load/stor %ifdef NO_NT_LDST %define XLDR movdqa @@ -126,35 +204,54 @@ %endif %endif +%ifidn PS,8 ;64-bit code + default rel + [bits 64] +%endif -default rel - -[bits 64] section .text -%define xmask0f xmm8 -%define xgft1_lo xmm7 -%define xgft1_hi xmm6 -%define xgft2_lo xmm5 -%define xgft2_hi xmm4 - -%define x0 xmm0 -%define xtmpa xmm1 -%define xp1 xmm2 -%define xp2 xmm3 +%ifidn PS,8 ;64-bit code + %define xmask0f xmm8 + %define xgft1_lo xmm7 + %define xgft1_hi xmm6 + %define xgft2_lo xmm5 + %define xgft2_hi xmm4 + + %define x0 xmm0 + %define xtmpa xmm1 + %define xp1 xmm2 + %define xp2 xmm3 +%else ;32-bit code + %define xmask0f xmm4 + %define xgft1_lo xmm7 + %define xgft1_hi xmm6 + %define xgft2_lo xgft1_lo + %define xgft2_hi xgft1_hi + + %define x0 xmm0 + %define xtmpa xmm1 + %define xp1 xmm2 + %define xp2 xmm3 +%endif align 16 global gf_2vect_dot_prod_sse:function func(gf_2vect_dot_prod_sse) FUNC_SAVE + SLDR len, len_m sub len, 16 + SSTR len_m, len jl .return_fail xor pos, pos movdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte sal vec, LOG_PS ;vec *= PS. Make vec_i count by PS + SLDR dest1, dest1_m mov dest2, [dest1+PS] + SSTR dest2_m, dest2 mov dest1, [dest1] + SSTR dest1_m, dest1 .loop16 pxor xp1, xp1 @@ -163,16 +260,18 @@ func(gf_2vect_dot_prod_sse) xor vec_i, vec_i .next_vect + SLDR src, src_m mov ptr, [src+vec_i] movdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, ..., Ax{0f} movdqu xgft1_hi, [tmp+16] ; " Ax{00}, Ax{10}, ..., Ax{f0} + %ifidn PS,8 ;64-bit code movdqu xgft2_lo, [tmp+vec*(32/PS)] ;Load array Bx{00}, Bx{01}, ..., Bx{0f} movdqu xgft2_hi, [tmp+vec*(32/PS)+16] ; " Bx{00}, Bx{10}, ..., Bx{f0} - - XLDR x0, [ptr+pos] ;Get next source vector add tmp, 32 add vec_i, PS + %endif + XLDR x0, [ptr+pos] ;Get next source vector movdqa xtmpa, x0 ;Keep unshifted copy of src psraw x0, 4 ;Shift to put high nibble into bits 4-0 @@ -184,6 +283,13 @@ func(gf_2vect_dot_prod_sse) pxor xgft1_hi, xgft1_lo ;GF add high and low partials pxor xp1, xgft1_hi ;xp1 += partial + %ifidn PS,4 ;32-bit code + movdqu xgft2_lo, [tmp+vec*(32/PS)] ;Load array Bx{00}, Bx{01}, ..., Bx{0f} + movdqu xgft2_hi, [tmp+vec*(32/PS)+16] ; " Bx{00}, Bx{10}, ..., Bx{f0} + + add tmp, 32 + add vec_i, PS + %endif pshufb xgft2_hi, x0 ;Lookup mul table of high nibble pshufb xgft2_lo, xtmpa ;Lookup mul table of low nibble pxor xgft2_hi, xgft2_lo ;GF add high and low partials @@ -192,9 +298,12 @@ func(gf_2vect_dot_prod_sse) cmp vec_i, vec jl .next_vect + SLDR dest1, dest1_m + SLDR dest2, dest2_m XSTR [dest1+pos], xp1 XSTR [dest2+pos], xp2 + SLDR len, len_m add pos, 16 ;Loop on 16 bytes at a time cmp pos, len jle .loop16 @@ -233,6 +342,4 @@ global %1_slver db 0x%3, 0x%2 %endmacro ;;; func core, ver, snum -slversion gf_2vect_dot_prod_sse, 00, 02, 0062 -; inform linker that this doesn't require executable stack -section .note.GNU-stack noalloc noexec nowrite progbits +slversion gf_2vect_dot_prod_sse, 00, 03, 0062 diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_2vect_mad_avx.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_2vect_mad_avx.asm.s new file mode 100644 index 00000000000..021133eb031 --- /dev/null +++ b/src/erasure-code/isa/isa-l/erasure_code/gf_2vect_mad_avx.asm.s @@ -0,0 +1,242 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2015 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;; +;;; gf_2vect_mad_avx(len, vec, vec_i, mul_array, src, dest); +;;; + +%define PS 8 + +%ifidn __OUTPUT_FORMAT__, win64 + %define arg0 rcx + %define arg0.w ecx + %define arg1 rdx + %define arg2 r8 + %define arg3 r9 + %define arg4 r12 + %define arg5 r15 + %define tmp r11 + %define tmp2 r10 + %define return rax + %define return.w eax + %define stack_size 16*9 + 3*8 + %define arg(x) [rsp + stack_size + PS + PS*x] + %define func(x) proc_frame x + +%macro FUNC_SAVE 0 + sub rsp, stack_size + movdqa [rsp+16*0],xmm6 + movdqa [rsp+16*1],xmm7 + movdqa [rsp+16*2],xmm8 + movdqa [rsp+16*3],xmm9 + movdqa [rsp+16*4],xmm10 + movdqa [rsp+16*5],xmm11 + movdqa [rsp+16*6],xmm12 + movdqa [rsp+16*7],xmm13 + movdqa [rsp+16*8],xmm14 + save_reg r12, 9*16 + 0*8 + save_reg r15, 9*16 + 1*8 + end_prolog + mov arg4, arg(4) + mov arg5, arg(5) +%endmacro + +%macro FUNC_RESTORE 0 + movdqa xmm6, [rsp+16*0] + movdqa xmm7, [rsp+16*1] + movdqa xmm8, [rsp+16*2] + movdqa xmm9, [rsp+16*3] + movdqa xmm10, [rsp+16*4] + movdqa xmm11, [rsp+16*5] + movdqa xmm12, [rsp+16*6] + movdqa xmm13, [rsp+16*7] + movdqa xmm14, [rsp+16*8] + mov r12, [rsp + 9*16 + 0*8] + mov r15, [rsp + 9*16 + 1*8] + add rsp, stack_size +%endmacro + +%elifidn __OUTPUT_FORMAT__, elf64 + %define arg0 rdi + %define arg0.w edi + %define arg1 rsi + %define arg2 rdx + %define arg3 rcx + %define arg4 r8 + %define arg5 r9 + %define tmp r11 + %define tmp2 r10 + %define return rax + %define return.w eax + + %define func(x) x: + %define FUNC_SAVE + %define FUNC_RESTORE +%endif + +;;; gf_2vect_mad_avx(len, vec, vec_i, mul_array, src, dest) +%define len arg0 +%define len.w arg0.w +%define vec arg1 +%define vec_i arg2 +%define mul_array arg3 +%define src arg4 +%define dest1 arg5 +%define pos return +%define pos.w return.w + +%define dest2 tmp2 + +%ifndef EC_ALIGNED_ADDR +;;; Use Un-aligned load/store + %define XLDR vmovdqu + %define XSTR vmovdqu +%else +;;; Use Non-temporal load/stor + %ifdef NO_NT_LDST + %define XLDR vmovdqa + %define XSTR vmovdqa + %else + %define XLDR vmovntdqa + %define XSTR vmovntdq + %endif +%endif + + +default rel + +[bits 64] +section .text + +%define xmask0f xmm14 +%define xgft1_lo xmm13 +%define xgft1_hi xmm12 +%define xgft2_lo xmm11 +%define xgft2_hi xmm10 + +%define x0 xmm0 +%define xtmpa xmm1 +%define xtmph1 xmm2 +%define xtmpl1 xmm3 +%define xtmph2 xmm4 +%define xtmpl2 xmm5 +%define xd1 xmm6 +%define xd2 xmm7 +%define xtmpd1 xmm8 +%define xtmpd2 xmm9 + + +align 16 +global gf_2vect_mad_avx:function + +func(gf_2vect_mad_avx) + FUNC_SAVE + sub len, 16 + jl .return_fail + + xor pos, pos + vmovdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte + sal vec_i, 5 ;Multiply by 32 + sal vec, 5 + lea tmp, [mul_array + vec_i] + vmovdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, Ax{02}, ... + vmovdqu xgft1_hi, [tmp+16] ; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0} + vmovdqu xgft2_lo, [tmp+vec] ;Load array Bx{00}, Bx{01}, Bx{02}, ... + vmovdqu xgft2_hi, [tmp+vec+16] ; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0} + + mov dest2, [dest1+PS] + mov dest1, [dest1] + + XLDR xtmpd1, [dest1+len] ;backup the last 16 bytes in dest + XLDR xtmpd2, [dest2+len] ;backup the last 16 bytes in dest + +.loop16 + XLDR xd1, [dest1+pos] ;Get next dest vector + XLDR xd2, [dest2+pos] ;Get next dest vector +.loop16_overlap: + XLDR x0, [src+pos] ;Get next source vector + + vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0 + vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0 + vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0 + + vpshufb xtmph1, xgft1_hi, x0 ;Lookup mul table of high nibble + vpshufb xtmpl1, xgft1_lo, xtmpa ;Lookup mul table of low nibble + vpxor xtmph1, xtmph1, xtmpl1 ;GF add high and low partials + vpxor xd1, xd1, xtmph1 ;xd1 += partial + + vpshufb xtmph2, xgft2_hi, x0 ;Lookup mul table of high nibble + vpshufb xtmpl2, xgft2_lo, xtmpa ;Lookup mul table of low nibble + vpxor xtmph2, xtmph2, xtmpl2 ;GF add high and low partials + vpxor xd2, xd2, xtmph2 ;xd2 += partial + + XSTR [dest1+pos], xd1 + XSTR [dest2+pos], xd2 + + add pos, 16 ;Loop on 16 bytes at a time + cmp pos, len + jle .loop16 + + lea tmp, [len + 16] + cmp pos, tmp + je .return_pass + + ;; Tail len + mov pos, len ;Overlapped offset length-16 + vmovdqa xd1, xtmpd1 ;Restore xd1 + vmovdqa xd2, xtmpd2 ;Restore xd2 + jmp .loop16_overlap ;Do one more overlap pass + +.return_pass: + mov return, 0 + FUNC_RESTORE + ret + +.return_fail: + mov return, 1 + FUNC_RESTORE + ret + +endproc_frame + +section .data + +align 16 +mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f + +%macro slversion 4 +global %1_slver_%2%3%4 +global %1_slver +%1_slver: +%1_slver_%2%3%4: + dw 0x%4 + db 0x%3, 0x%2 +%endmacro +;;; func core, ver, snum +slversion gf_2vect_mad_avx, 02, 00, 0204 diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_2vect_mad_avx2.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_2vect_mad_avx2.asm.s new file mode 100644 index 00000000000..e8442aba5e2 --- /dev/null +++ b/src/erasure-code/isa/isa-l/erasure_code/gf_2vect_mad_avx2.asm.s @@ -0,0 +1,253 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2015 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;; +;;; gf_2vect_mad_avx2(len, vec, vec_i, mul_array, src, dest); +;;; + +%define PS 8 + +%ifidn __OUTPUT_FORMAT__, win64 + %define arg0 rcx + %define arg0.w ecx + %define arg1 rdx + %define arg2 r8 + %define arg3 r9 + %define arg4 r12 + %define arg5 r15 + + %define tmp r11 + %define tmp.w r11d + %define tmp.b r11b + %define tmp2 r10 + %define return rax + %define return.w eax + %define stack_size 16*9 + 3*8 ; must be an odd multiple of 8 + %define arg(x) [rsp + stack_size + PS + PS*x] + + %define func(x) proc_frame x + %macro FUNC_SAVE 0 + sub rsp, stack_size + vmovdqa [rsp+16*0],xmm6 + vmovdqa [rsp+16*1],xmm7 + vmovdqa [rsp+16*2],xmm8 + vmovdqa [rsp+16*3],xmm9 + vmovdqa [rsp+16*4],xmm10 + vmovdqa [rsp+16*5],xmm11 + vmovdqa [rsp+16*6],xmm12 + vmovdqa [rsp+16*7],xmm13 + vmovdqa [rsp+16*8],xmm14 + save_reg r12, 9*16 + 0*8 + save_reg r15, 9*16 + 1*8 + end_prolog + mov arg4, arg(4) + mov arg5, arg(5) + %endmacro + + %macro FUNC_RESTORE 0 + vmovdqa xmm6, [rsp+16*0] + vmovdqa xmm7, [rsp+16*1] + vmovdqa xmm8, [rsp+16*2] + vmovdqa xmm9, [rsp+16*3] + vmovdqa xmm10, [rsp+16*4] + vmovdqa xmm11, [rsp+16*5] + vmovdqa xmm12, [rsp+16*6] + vmovdqa xmm13, [rsp+16*7] + vmovdqa xmm14, [rsp+16*8] + mov r12, [rsp + 9*16 + 0*8] + mov r15, [rsp + 9*16 + 1*8] + add rsp, stack_size + %endmacro +%endif + +%ifidn __OUTPUT_FORMAT__, elf64 + %define arg0 rdi + %define arg0.w edi + %define arg1 rsi + %define arg2 rdx + %define arg3 rcx + %define arg4 r8 + %define arg5 r9 + + %define tmp r11 + %define tmp.w r11d + %define tmp.b r11b + %define tmp2 r10 + %define return rax + %define return.w eax + + %define func(x) x: + %define FUNC_SAVE + %define FUNC_RESTORE +%endif + +;;; gf_2vect_mad_avx2(len, vec, vec_i, mul_array, src, dest) +%define len arg0 +%define len.w arg0.w +%define vec arg1 +%define vec_i arg2 +%define mul_array arg3 +%define src arg4 +%define dest1 arg5 +%define pos return +%define pos.w return.w + +%define dest2 tmp2 + +%ifndef EC_ALIGNED_ADDR +;;; Use Un-aligned load/store + %define XLDR vmovdqu + %define XSTR vmovdqu +%else + +;;; Use Non-temporal load/stor + %ifdef NO_NT_LDST + %define XLDR vmovdqa + %define XSTR vmovdqa + %else + %define XLDR vmovntdqa + %define XSTR vmovntdq + %endif +%endif + + +default rel + +[bits 64] +section .text + +%define xmask0f ymm14 +%define xmask0fx xmm14 +%define xgft1_lo ymm13 +%define xgft1_hi ymm12 +%define xgft2_lo ymm11 +%define xgft2_hi ymm10 + +%define x0 ymm0 +%define xtmpa ymm1 +%define xtmph1 ymm2 +%define xtmpl1 ymm3 +%define xtmph2 ymm4 +%define xtmpl2 ymm5 +%define xd1 ymm6 +%define xd2 ymm7 +%define xtmpd1 ymm8 +%define xtmpd2 ymm9 + +align 16 +global gf_2vect_mad_avx2:function + +func(gf_2vect_mad_avx2) + FUNC_SAVE + sub len, 32 + jl .return_fail + xor pos, pos + mov tmp.b, 0x0f + vpinsrb xmask0fx, xmask0fx, tmp.w, 0 + vpbroadcastb xmask0f, xmask0fx ;Construct mask 0x0f0f0f... + + sal vec_i, 5 ;Multiply by 32 + sal vec, 5 + lea tmp, [mul_array + vec_i] + vmovdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, ..., Ax{0f} + ; " Ax{00}, Ax{10}, ..., Ax{f0} + vmovdqu xgft2_lo, [tmp+vec] ;Load array Bx{00}, Bx{01}, ..., Bx{0f} + ; " Bx{00}, Bx{10}, ..., Bx{f0} + + vperm2i128 xgft1_hi, xgft1_lo, xgft1_lo, 0x11 ; swapped to hi | hi + vperm2i128 xgft1_lo, xgft1_lo, xgft1_lo, 0x00 ; swapped to lo | lo + vperm2i128 xgft2_hi, xgft2_lo, xgft2_lo, 0x11 ; swapped to hi | hi + vperm2i128 xgft2_lo, xgft2_lo, xgft2_lo, 0x00 ; swapped to lo | lo + mov dest2, [dest1+PS] ; reuse mul_array + mov dest1, [dest1] + + XLDR xtmpd1, [dest1+len] ;backup the last 16 bytes in dest + XLDR xtmpd2, [dest2+len] ;backup the last 16 bytes in dest + +.loop32 + XLDR xd1, [dest1+pos] ;Get next dest vector + XLDR xd2, [dest2+pos] ;Get next dest vector +.loop32_overlap: + XLDR x0, [src+pos] ;Get next source vector + + vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0 + vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0 + vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0 + + vpshufb xtmph1, xgft1_hi, x0 ;Lookup mul table of high nibble + vpshufb xtmpl1, xgft1_lo, xtmpa ;Lookup mul table of low nibble + vpxor xtmph1, xtmph1, xtmpl1 ;GF add high and low partials + vpxor xd1, xd1, xtmph1 ;xd1 += partial + + vpshufb xtmph2, xgft2_hi, x0 ;Lookup mul table of high nibble + vpshufb xtmpl2, xgft2_lo, xtmpa ;Lookup mul table of low nibble + vpxor xtmph2, xtmph2, xtmpl2 ;GF add high and low partials + vpxor xd2, xd2, xtmph2 ;xd2 += partial + + XSTR [dest1+pos], xd1 + XSTR [dest2+pos], xd2 + + add pos, 32 ;Loop on 32 bytes at a time + cmp pos, len + jle .loop32 + + lea tmp, [len + 32] + cmp pos, tmp + je .return_pass + + ;; Tail len + mov pos, len ;Overlapped offset length-32 + vmovdqa xd1, xtmpd1 ;Restore xd1 + vmovdqa xd2, xtmpd2 ;Restore xd2 + jmp .loop32_overlap ;Do one more overlap pass + +.return_pass: + mov return, 0 + FUNC_RESTORE + ret + +.return_fail: + mov return, 1 + FUNC_RESTORE + ret + +endproc_frame + +section .data + +%macro slversion 4 +global %1_slver_%2%3%4 +global %1_slver +%1_slver: +%1_slver_%2%3%4: + dw 0x%4 + db 0x%3, 0x%2 +%endmacro +;;; func core, ver, snum +slversion gf_2vect_mad_avx2, 04, 00, 0205 diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_2vect_mad_sse.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_2vect_mad_sse.asm.s new file mode 100644 index 00000000000..a569a6ed268 --- /dev/null +++ b/src/erasure-code/isa/isa-l/erasure_code/gf_2vect_mad_sse.asm.s @@ -0,0 +1,245 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2015 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;; +;;; gf_2vect_mad_sse(len, vec, vec_i, mul_array, src, dest); +;;; + +%define PS 8 + +%ifidn __OUTPUT_FORMAT__, win64 + %define arg0 rcx + %define arg0.w ecx + %define arg1 rdx + %define arg2 r8 + %define arg3 r9 + %define arg4 r12 + %define arg5 r15 + %define tmp r11 + %define tmp2 r10 + %define return rax + %define return.w eax + %define stack_size 16*9 + 3*8 + %define arg(x) [rsp + stack_size + PS + PS*x] + %define func(x) proc_frame x + +%macro FUNC_SAVE 0 + sub rsp, stack_size + movdqa [rsp+16*0],xmm6 + movdqa [rsp+16*1],xmm7 + movdqa [rsp+16*2],xmm8 + movdqa [rsp+16*3],xmm9 + movdqa [rsp+16*4],xmm10 + movdqa [rsp+16*5],xmm11 + movdqa [rsp+16*6],xmm12 + movdqa [rsp+16*7],xmm13 + movdqa [rsp+16*8],xmm14 + save_reg r12, 9*16 + 0*8 + save_reg r15, 9*16 + 1*8 + end_prolog + mov arg4, arg(4) + mov arg5, arg(5) +%endmacro + +%macro FUNC_RESTORE 0 + movdqa xmm6, [rsp+16*0] + movdqa xmm7, [rsp+16*1] + movdqa xmm8, [rsp+16*2] + movdqa xmm9, [rsp+16*3] + movdqa xmm10, [rsp+16*4] + movdqa xmm11, [rsp+16*5] + movdqa xmm12, [rsp+16*6] + movdqa xmm13, [rsp+16*7] + movdqa xmm14, [rsp+16*8] + mov r12, [rsp + 9*16 + 0*8] + mov r15, [rsp + 9*16 + 1*8] + add rsp, stack_size +%endmacro + +%elifidn __OUTPUT_FORMAT__, elf64 + %define arg0 rdi + %define arg0.w edi + %define arg1 rsi + %define arg2 rdx + %define arg3 rcx + %define arg4 r8 + %define arg5 r9 + %define tmp r11 + %define tmp2 r10 + %define return rax + %define return.w eax + + %define func(x) x: + %define FUNC_SAVE + %define FUNC_RESTORE +%endif + +;;; gf_2vect_mad_sse(len, vec, vec_i, mul_array, src, dest) +%define len arg0 +%define len.w arg0.w +%define vec arg1 +%define vec_i arg2 +%define mul_array arg3 +%define src arg4 +%define dest1 arg5 +%define pos return +%define pos.w return.w + +%define dest2 tmp2 + +%ifndef EC_ALIGNED_ADDR +;;; Use Un-aligned load/store + %define XLDR movdqu + %define XSTR movdqu +%else +;;; Use Non-temporal load/stor + %ifdef NO_NT_LDST + %define XLDR movdqa + %define XSTR movdqa + %else + %define XLDR movntdqa + %define XSTR movntdq + %endif +%endif + +default rel + +[bits 64] +section .text + +%define xmask0f xmm14 +%define xgft1_lo xmm13 +%define xgft1_hi xmm12 +%define xgft2_lo xmm11 +%define xgft2_hi xmm10 + +%define x0 xmm0 +%define xtmpa xmm1 +%define xtmph1 xmm2 +%define xtmpl1 xmm3 +%define xtmph2 xmm4 +%define xtmpl2 xmm5 +%define xd1 xmm6 +%define xd2 xmm7 +%define xtmpd1 xmm8 +%define xtmpd2 xmm9 + + +align 16 +global gf_2vect_mad_sse:function +func(gf_2vect_mad_sse) + FUNC_SAVE + sub len, 16 + jl .return_fail + + xor pos, pos + movdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte + sal vec_i, 5 ;Multiply by 32 + sal vec, 5 + lea tmp, [mul_array + vec_i] + movdqu xgft1_lo,[tmp] ;Load array Ax{00}, Ax{01}, Ax{02}, ... + movdqu xgft1_hi, [tmp+16] ; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0} + movdqu xgft2_lo, [tmp+vec] ;Load array Bx{00}, Bx{01}, Bx{02}, ... + movdqu xgft2_hi, [tmp+vec+16] ; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0} + mov dest2, [dest1+PS] + mov dest1, [dest1] + + XLDR xtmpd1, [dest1+len] ;backup the last 16 bytes in dest + XLDR xtmpd2, [dest2+len] ;backup the last 16 bytes in dest + +.loop16: + XLDR xd1, [dest1+pos] ;Get next dest vector + XLDR xd2, [dest2+pos] ;Get next dest vector +.loop16_overlap: + XLDR x0, [src+pos] ;Get next source vector + movdqa xtmph1, xgft1_hi ;Reload const array registers + movdqa xtmpl1, xgft1_lo + movdqa xtmph2, xgft2_hi ;Reload const array registers + movdqa xtmpl2, xgft2_lo + movdqa xtmpa, x0 ;Keep unshifted copy of src + psraw x0, 4 ;Shift to put high nibble into bits 4-0 + pand x0, xmask0f ;Mask high src nibble in bits 4-0 + pand xtmpa, xmask0f ;Mask low src nibble in bits 4-0 + + pshufb xtmph1, x0 ;Lookup mul table of high nibble + pshufb xtmpl1, xtmpa ;Lookup mul table of low nibble + pxor xtmph1, xtmpl1 ;GF add high and low partials + pxor xd1, xtmph1 + + pshufb xtmph2, x0 ;Lookup mul table of high nibble + pshufb xtmpl2, xtmpa ;Lookup mul table of low nibble + pxor xtmph2, xtmpl2 ;GF add high and low partials + pxor xd2, xtmph2 + + XSTR [dest1+pos], xd1 ;Store result + XSTR [dest2+pos], xd2 ;Store result + + add pos, 16 ;Loop on 16 bytes at a time + cmp pos, len + jle .loop16 + + lea tmp, [len + 16] + cmp pos, tmp + je .return_pass + + ;; Tail len + mov pos, len ;Overlapped offset length-16 + movdqa xd1, xtmpd1 ;Restore xd1 + movdqa xd2, xtmpd2 ;Restore xd2 + jmp .loop16_overlap ;Do one more overlap pass + +.return_pass: + FUNC_RESTORE + mov return, 0 + ret + +.return_fail: + FUNC_RESTORE + mov return, 1 + ret + +endproc_frame + +section .data + +align 16 + +mask0f: + ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f + +%macro slversion 4 +global %1_slver_%2%3%4 +global %1_slver +%1_slver: +%1_slver_%2%3%4: + dw 0x%4 + db 0x%3, 0x%2 +%endmacro +;;; func core, ver, snum +slversion gf_2vect_mad_sse, 00, 00, 0203 diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_3vect_dot_prod_avx.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_3vect_dot_prod_avx.asm.s index 6935cb19347..14097e06d63 100644 --- a/src/erasure-code/isa/isa-l/erasure_code/gf_3vect_dot_prod_avx.asm.s +++ b/src/erasure-code/isa/isa-l/erasure_code/gf_3vect_dot_prod_avx.asm.s @@ -1,5 +1,5 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; Copyright(c) 2011-2014 Intel Corporation All rights reserved. +; Copyright(c) 2011-2015 Intel Corporation All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions @@ -30,8 +30,6 @@ ;;; ;;; gf_3vect_dot_prod_avx(len, vec, *g_tbls, **buffs, **dests); ;;; -;;; Author: Gregory Tucker - %ifidn __OUTPUT_FORMAT__, elf64 %define arg0 rdi @@ -46,6 +44,9 @@ %define tmp3 r13 ; must be saved and restored %define tmp4 r12 ; must be saved and restored %define return rax + %macro SLDR 2 + %endmacro + %define SSTR SLDR %define PS 8 %define LOG_PS 3 @@ -73,6 +74,9 @@ %define tmp3 r13 ; must be saved and restored %define tmp4 r14 ; must be saved and restored %define return rax + %macro SLDR 2 + %endmacro + %define SSTR SLDR %define PS 8 %define LOG_PS 3 %define stack_size 6*16 + 5*8 ; must be an odd multiple of 8 @@ -110,17 +114,97 @@ %endmacro %endif +%ifidn __OUTPUT_FORMAT__, elf32 + +;;;================== High Address; +;;; arg4 +;;; arg3 +;;; arg2 +;;; arg1 +;;; arg0 +;;; return +;;;<================= esp of caller +;;; ebp +;;;<================= ebp = esp +;;; var0 +;;; var1 +;;; esi +;;; edi +;;; ebx +;;;<================= esp of callee +;;; +;;;================== Low Address; + + %define PS 4 + %define LOG_PS 2 + %define func(x) x: + %define arg(x) [ebp + PS*2 + PS*x] + %define var(x) [ebp - PS - PS*x] + + %define trans ecx + %define trans2 esi + %define arg0 trans ;trans and trans2 are for the variables in stack + %define arg0_m arg(0) + %define arg1 ebx + %define arg2 arg2_m + %define arg2_m arg(2) + %define arg3 trans + %define arg3_m arg(3) + %define arg4 trans + %define arg4_m arg(4) + %define arg5 trans2 + %define tmp edx + %define tmp2 edi + %define tmp3 trans2 + %define tmp3_m var(0) + %define tmp4 trans2 + %define tmp4_m var(1) + %define return eax + %macro SLDR 2 ;; stack load/restore + mov %1, %2 + %endmacro + %define SSTR SLDR + + %macro FUNC_SAVE 0 + push ebp + mov ebp, esp + sub esp, PS*2 ;2 local variables + push esi + push edi + push ebx + mov arg1, arg(1) + %endmacro + + %macro FUNC_RESTORE 0 + pop ebx + pop edi + pop esi + add esp, PS*2 ;2 local variables + pop ebp + %endmacro + +%endif ; output formats + %define len arg0 %define vec arg1 %define mul_array arg2 %define src arg3 -%define dest1 arg4 +%define dest1 arg4 %define ptr arg5 + %define vec_i tmp2 %define dest2 tmp3 %define dest3 tmp4 %define pos return + %ifidn PS,4 ;32-bit code + %define len_m arg0_m + %define src_m arg3_m + %define dest1_m arg4_m + %define dest2_m tmp3_m + %define dest3_m tmp4_m + %endif + %ifndef EC_ALIGNED_ADDR ;;; Use Un-aligned load/store %define XLDR vmovdqu @@ -136,39 +220,62 @@ %endif %endif +%ifidn PS,8 ; 64-bit code + default rel + [bits 64] +%endif -default rel -[bits 64] section .text -%define xmask0f xmm11 -%define xgft1_lo xmm10 -%define xgft1_hi xmm9 -%define xgft2_lo xmm8 -%define xgft2_hi xmm7 -%define xgft3_lo xmm6 -%define xgft3_hi xmm5 - -%define x0 xmm0 -%define xtmpa xmm1 -%define xp1 xmm2 -%define xp2 xmm3 -%define xp3 xmm4 +%ifidn PS,8 ;64-bit code + %define xmask0f xmm11 + %define xgft1_lo xmm10 + %define xgft1_hi xmm9 + %define xgft2_lo xmm8 + %define xgft2_hi xmm7 + %define xgft3_lo xmm6 + %define xgft3_hi xmm5 + + %define x0 xmm0 + %define xtmpa xmm1 + %define xp1 xmm2 + %define xp2 xmm3 + %define xp3 xmm4 +%else + %define xmask0f xmm7 + %define xgft1_lo xmm6 + %define xgft1_hi xmm5 + %define xgft2_lo xgft1_lo + %define xgft2_hi xgft1_hi + %define xgft3_lo xgft1_lo + %define xgft3_hi xgft1_hi + + %define x0 xmm0 + %define xtmpa xmm1 + %define xp1 xmm2 + %define xp2 xmm3 + %define xp3 xmm4 +%endif align 16 global gf_3vect_dot_prod_avx:function func(gf_3vect_dot_prod_avx) FUNC_SAVE + SLDR len, len_m sub len, 16 + SSTR len_m, len jl .return_fail xor pos, pos vmovdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte sal vec, LOG_PS ;vec *= PS. Make vec_i count by PS + SLDR dest1, dest1_m mov dest2, [dest1+PS] + SSTR dest2_m, dest2 mov dest3, [dest1+2*PS] + SSTR dest3_m, dest3 mov dest1, [dest1] - + SSTR dest1_m, dest1 .loop16: vpxor xp1, xp1 @@ -178,17 +285,19 @@ func(gf_3vect_dot_prod_avx) xor vec_i, vec_i .next_vect: + SLDR src, src_m mov ptr, [src+vec_i] vmovdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, ..., Ax{0f} vmovdqu xgft1_hi, [tmp+16] ; " Ax{00}, Ax{10}, ..., Ax{f0} + %ifidn PS,8 ; 64-bit code vmovdqu xgft2_lo, [tmp+vec*(32/PS)] ;Load array Bx{00}, Bx{01}, ..., Bx{0f} vmovdqu xgft2_hi, [tmp+vec*(32/PS)+16] ; " Bx{00}, Bx{10}, ..., Bx{f0} vmovdqu xgft3_lo, [tmp+vec*(64/PS)] ;Load array Cx{00}, Cx{01}, ..., Cx{0f} vmovdqu xgft3_hi, [tmp+vec*(64/PS)+16] ; " Cx{00}, Cx{10}, ..., Cx{f0} - add tmp, 32 add vec_i, PS + %endif XLDR x0, [ptr+pos] ;Get next source vector vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0 @@ -200,11 +309,23 @@ func(gf_3vect_dot_prod_avx) vpxor xgft1_hi, xgft1_lo ;GF add high and low partials vpxor xp1, xgft1_hi ;xp1 += partial + %ifidn PS,4 ; 32-bit code + vmovdqu xgft2_lo, [tmp+vec*(32/PS)] ;Load array Bx{00}, Bx{01}, ..., Bx{0f} + vmovdqu xgft2_hi, [tmp+vec*(32/PS)+16] ; " Bx{00}, Bx{10}, ..., Bx{f0} + %endif vpshufb xgft2_hi, x0 ;Lookup mul table of high nibble vpshufb xgft2_lo, xtmpa ;Lookup mul table of low nibble vpxor xgft2_hi, xgft2_lo ;GF add high and low partials vpxor xp2, xgft2_hi ;xp2 += partial + %ifidn PS,4 ; 32-bit code + sal vec, 1 + vmovdqu xgft3_lo, [tmp+vec*(32/PS)] ;Load array Cx{00}, Cx{01}, ..., Cx{0f} + vmovdqu xgft3_hi, [tmp+vec*(32/PS)+16] ; " Cx{00}, Cx{10}, ..., Cx{f0} + sar vec, 1 + add tmp, 32 + add vec_i, PS + %endif vpshufb xgft3_hi, x0 ;Lookup mul table of high nibble vpshufb xgft3_lo, xtmpa ;Lookup mul table of low nibble vpxor xgft3_hi, xgft3_lo ;GF add high and low partials @@ -213,10 +334,14 @@ func(gf_3vect_dot_prod_avx) cmp vec_i, vec jl .next_vect + SLDR dest1, dest1_m + SLDR dest2, dest2_m XSTR [dest1+pos], xp1 XSTR [dest2+pos], xp2 + SLDR dest3, dest3_m XSTR [dest3+pos], xp3 + SLDR len, len_m add pos, 16 ;Loop on 16 bytes at a time cmp pos, len jle .loop16 @@ -255,6 +380,4 @@ global %1_slver db 0x%3, 0x%2 %endmacro ;;; func core, ver, snum -; inform linker that this doesn't require executable stack -section .note.GNU-stack noalloc noexec nowrite progbits -slversion gf_3vect_dot_prod_avx, 02, 03, 0192 +slversion gf_3vect_dot_prod_avx, 02, 04, 0192 diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_3vect_dot_prod_avx2.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_3vect_dot_prod_avx2.asm.s index 4ad01531712..d762104ba3d 100644 --- a/src/erasure-code/isa/isa-l/erasure_code/gf_3vect_dot_prod_avx2.asm.s +++ b/src/erasure-code/isa/isa-l/erasure_code/gf_3vect_dot_prod_avx2.asm.s @@ -1,5 +1,5 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; Copyright(c) 2011-2014 Intel Corporation All rights reserved. +; Copyright(c) 2011-2015 Intel Corporation All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions @@ -30,8 +30,6 @@ ;;; ;;; gf_3vect_dot_prod_avx2(len, vec, *g_tbls, **buffs, **dests); ;;; -;;; Author: Gregory Tucker - %ifidn __OUTPUT_FORMAT__, elf64 %define arg0 rdi @@ -48,7 +46,10 @@ %define tmp3 r13 ; must be saved and restored %define tmp4 r12 ; must be saved and restored %define return rax - %define PS 8 + %macro SLDR 2 + %endmacro + %define SSTR SLDR + %define PS 8 %define LOG_PS 3 %define func(x) x: @@ -77,6 +78,9 @@ %define tmp3 r13 ; must be saved and restored %define tmp4 r14 ; must be saved and restored %define return rax + %macro SLDR 2 + %endmacro + %define SSTR SLDR %define PS 8 %define LOG_PS 3 %define stack_size 6*16 + 5*8 ; must be an odd multiple of 8 @@ -114,17 +118,99 @@ %endmacro %endif +%ifidn __OUTPUT_FORMAT__, elf32 + +;;;================== High Address; +;;; arg4 +;;; arg3 +;;; arg2 +;;; arg1 +;;; arg0 +;;; return +;;;<================= esp of caller +;;; ebp +;;;<================= ebp = esp +;;; var0 +;;; var1 +;;; esi +;;; edi +;;; ebx +;;;<================= esp of callee +;;; +;;;================== Low Address; + + %define PS 4 + %define LOG_PS 2 + %define func(x) x: + %define arg(x) [ebp + PS*2 + PS*x] + %define var(x) [ebp - PS - PS*x] + + %define trans ecx + %define trans2 esi + %define arg0 trans ;trans and trans2 are for the variables in stack + %define arg0_m arg(0) + %define arg1 ebx + %define arg2 arg2_m + %define arg2_m arg(2) + %define arg3 trans + %define arg3_m arg(3) + %define arg4 trans + %define arg4_m arg(4) + %define arg5 trans2 + %define tmp edx + %define tmp.w edx + %define tmp.b dl + %define tmp2 edi + %define tmp3 trans2 + %define tmp3_m var(0) + %define tmp4 trans2 + %define tmp4_m var(1) + %define return eax + %macro SLDR 2 ;stack load/restore + mov %1, %2 + %endmacro + %define SSTR SLDR + + %macro FUNC_SAVE 0 + push ebp + mov ebp, esp + sub esp, PS*2 ;2 local variables + push esi + push edi + push ebx + mov arg1, arg(1) + %endmacro + + %macro FUNC_RESTORE 0 + pop ebx + pop edi + pop esi + add esp, PS*2 ;2 local variables + pop ebp + %endmacro + +%endif ; output formats + %define len arg0 %define vec arg1 %define mul_array arg2 %define src arg3 %define dest1 arg4 %define ptr arg5 + %define vec_i tmp2 %define dest2 tmp3 %define dest3 tmp4 %define pos return +%ifidn PS,4 ;32-bit code + %define len_m arg0_m + %define src_m arg3_m + %define dest1_m arg4_m + %define dest2_m tmp3_m + %define dest3_m tmp4_m +%endif + %ifndef EC_ALIGNED_ADDR ;;; Use Un-aligned load/store %define XLDR vmovdqu @@ -140,32 +226,53 @@ %endif %endif +%ifidn PS,8 ;64-bit code + default rel + [bits 64] +%endif -default rel - -[bits 64] section .text -%define xmask0f ymm11 -%define xmask0fx xmm11 -%define xgft1_lo ymm10 -%define xgft1_hi ymm9 -%define xgft2_lo ymm8 -%define xgft2_hi ymm7 -%define xgft3_lo ymm6 -%define xgft3_hi ymm5 - -%define x0 ymm0 -%define xtmpa ymm1 -%define xp1 ymm2 -%define xp2 ymm3 -%define xp3 ymm4 +%ifidn PS,8 ;64-bit code + %define xmask0f ymm11 + %define xmask0fx xmm11 + %define xgft1_lo ymm10 + %define xgft1_hi ymm9 + %define xgft2_lo ymm8 + %define xgft2_hi ymm7 + %define xgft3_lo ymm6 + %define xgft3_hi ymm5 + + %define x0 ymm0 + %define xtmpa ymm1 + %define xp1 ymm2 + %define xp2 ymm3 + %define xp3 ymm4 +%else + %define xmask0f ymm7 + %define xmask0fx xmm7 + %define xgft1_lo ymm6 + %define xgft1_hi ymm5 + %define xgft2_lo xgft1_lo + %define xgft2_hi xgft1_hi + %define xgft3_lo xgft1_lo + %define xgft3_hi xgft1_hi + + %define x0 ymm0 + %define xtmpa ymm1 + %define xp1 ymm2 + %define xp2 ymm3 + %define xp3 ymm4 + +%endif align 16 global gf_3vect_dot_prod_avx2:function func(gf_3vect_dot_prod_avx2) FUNC_SAVE + SLDR len, len_m sub len, 32 + SSTR len_m, len jl .return_fail xor pos, pos mov tmp.b, 0x0f @@ -173,10 +280,13 @@ func(gf_3vect_dot_prod_avx2) vpbroadcastb xmask0f, xmask0fx ;Construct mask 0x0f0f0f... sal vec, LOG_PS ;vec *= PS. Make vec_i count by PS + SLDR dest1, dest1_m mov dest2, [dest1+PS] + SSTR dest2_m, dest2 mov dest3, [dest1+2*PS] + SSTR dest3_m, dest3 mov dest1, [dest1] - + SSTR dest1_m, dest1 .loop32: vpxor xp1, xp1 @@ -186,25 +296,27 @@ func(gf_3vect_dot_prod_avx2) xor vec_i, vec_i .next_vect: + SLDR src, src_m mov ptr, [src+vec_i] vmovdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, ..., Ax{0f} ; " Ax{00}, Ax{10}, ..., Ax{f0} vperm2i128 xgft1_hi, xgft1_lo, xgft1_lo, 0x11 ; swapped to hi | hi vperm2i128 xgft1_lo, xgft1_lo, xgft1_lo, 0x00 ; swapped to lo | lo - - vmovdqu xgft2_lo, [tmp+vec*(32/PS)] ;Load array Bx{00}, Bx{01}, ..., Bx{0f} + %ifidn PS,8 ; 64-bit code + vmovdqu xgft2_lo, [tmp+vec*(32/PS)] ;Load array Bx{00}, Bx{01}, ..., Bx{0f} ; " Bx{00}, Bx{10}, ..., Bx{f0} vperm2i128 xgft2_hi, xgft2_lo, xgft2_lo, 0x11 ; swapped to hi | hi vperm2i128 xgft2_lo, xgft2_lo, xgft2_lo, 0x00 ; swapped to lo | lo - vmovdqu xgft3_lo, [tmp+vec*(64/PS)] ;Load array Cx{00}, Cx{01}, ..., Cx{0f} + vmovdqu xgft3_lo, [tmp+vec*(64/PS)] ;Load array Cx{00}, Cx{01}, ..., Cx{0f} ; " Cx{00}, Cx{10}, ..., Cx{f0} vperm2i128 xgft3_hi, xgft3_lo, xgft3_lo, 0x11 ; swapped to hi | hi vperm2i128 xgft3_lo, xgft3_lo, xgft3_lo, 0x00 ; swapped to lo | lo add tmp, 32 add vec_i, PS + %endif XLDR x0, [ptr+pos] ;Get next source vector vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0 @@ -216,11 +328,27 @@ func(gf_3vect_dot_prod_avx2) vpxor xgft1_hi, xgft1_lo ;GF add high and low partials vpxor xp1, xgft1_hi ;xp1 += partial - vpshufb xgft2_hi, x0 ;Lookup mul table of high nibble - vpshufb xgft2_lo, xtmpa ;Lookup mul table of low nibble - vpxor xgft2_hi, xgft2_lo ;GF add high and low partials - vpxor xp2, xgft2_hi ;xp2 += partial - + %ifidn PS,4 ; 32-bit code + vmovdqu xgft2_lo, [tmp+vec*(32/PS)] ;Load array Bx{00}, Bx{01}, ..., Bx{0f} + ; " Bx{00}, Bx{10}, ..., Bx{f0} + vperm2i128 xgft2_hi, xgft2_lo, xgft2_lo, 0x11 ; swapped to hi | hi + vperm2i128 xgft2_lo, xgft2_lo, xgft2_lo, 0x00 ; swapped to lo | lo + %endif + vpshufb xgft2_hi, x0 ;Lookup mul table of high nibble + vpshufb xgft2_lo, xtmpa ;Lookup mul table of low nibble + vpxor xgft2_hi, xgft2_lo ;GF add high and low partials + vpxor xp2, xgft2_hi ;xp2 += partial + + %ifidn PS,4 ; 32-bit code + sal vec, 1 + vmovdqu xgft3_lo, [tmp+vec*(32/PS)] ;Load array Cx{00}, Cx{01}, ..., Cx{0f} + ; " Cx{00}, Cx{10}, ..., Cx{f0} + vperm2i128 xgft3_hi, xgft3_lo, xgft3_lo, 0x11 ; swapped to hi | hi + vperm2i128 xgft3_lo, xgft3_lo, xgft3_lo, 0x00 ; swapped to lo | lo + sar vec, 1 + add tmp, 32 + add vec_i, PS + %endif vpshufb xgft3_hi, x0 ;Lookup mul table of high nibble vpshufb xgft3_lo, xtmpa ;Lookup mul table of low nibble vpxor xgft3_hi, xgft3_lo ;GF add high and low partials @@ -229,10 +357,14 @@ func(gf_3vect_dot_prod_avx2) cmp vec_i, vec jl .next_vect + SLDR dest1, dest1_m + SLDR dest2, dest2_m XSTR [dest1+pos], xp1 XSTR [dest2+pos], xp2 + SLDR dest3, dest3_m XSTR [dest3+pos], xp3 + SLDR len, len_m add pos, 32 ;Loop on 32 bytes at a time cmp pos, len jle .loop32 @@ -268,6 +400,4 @@ global %1_slver db 0x%3, 0x%2 %endmacro ;;; func core, ver, snum -slversion gf_3vect_dot_prod_avx2, 04, 03, 0197 -; inform linker that this doesn't require executable stack -section .note.GNU-stack noalloc noexec nowrite progbits +slversion gf_3vect_dot_prod_avx2, 04, 04, 0197 diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_3vect_dot_prod_sse.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_3vect_dot_prod_sse.asm.s index 925fd3414d6..bfaf2178223 100644 --- a/src/erasure-code/isa/isa-l/erasure_code/gf_3vect_dot_prod_sse.asm.s +++ b/src/erasure-code/isa/isa-l/erasure_code/gf_3vect_dot_prod_sse.asm.s @@ -1,5 +1,5 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; Copyright(c) 2011-2014 Intel Corporation All rights reserved. +; Copyright(c) 2011-2015 Intel Corporation All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions @@ -30,8 +30,6 @@ ;;; ;;; gf_3vect_dot_prod_sse(len, vec, *g_tbls, **buffs, **dests); ;;; -;;; Author: Gregory Tucker - %ifidn __OUTPUT_FORMAT__, elf64 %define arg0 rdi @@ -46,6 +44,9 @@ %define tmp3 r13 ; must be saved and restored %define tmp4 r12 ; must be saved and restored %define return rax + %macro SLDR 2 + %endmacro + %define SSTR SLDR %define PS 8 %define LOG_PS 3 @@ -73,6 +74,9 @@ %define tmp3 r13 ; must be saved and restored %define tmp4 r14 ; must be saved and restored %define return rax + %macro SLDR 2 + %endmacro + %define SSTR SLDR %define PS 8 %define LOG_PS 3 %define stack_size 6*16 + 5*8 ; must be an odd multiple of 8 @@ -110,17 +114,97 @@ %endmacro %endif +%ifidn __OUTPUT_FORMAT__, elf32 + +;;;================== High Address; +;;; arg4 +;;; arg3 +;;; arg2 +;;; arg1 +;;; arg0 +;;; return +;;;<================= esp of caller +;;; ebp +;;;<================= ebp = esp +;;; var0 +;;; var1 +;;; esi +;;; edi +;;; ebx +;;;<================= esp of callee +;;; +;;;================== Low Address; + + %define PS 4 + %define LOG_PS 2 + %define func(x) x: + %define arg(x) [ebp + PS*2 + PS*x] + %define var(x) [ebp - PS - PS*x] + + %define trans ecx + %define trans2 esi + %define arg0 trans ;trans and trans2 are for the variables in stack + %define arg0_m arg(0) + %define arg1 ebx + %define arg2 arg2_m + %define arg2_m arg(2) + %define arg3 trans + %define arg3_m arg(3) + %define arg4 trans + %define arg4_m arg(4) + %define arg5 trans2 + %define tmp edx + %define tmp2 edi + %define tmp3 trans2 + %define tmp3_m var(0) + %define tmp4 trans2 + %define tmp4_m var(1) + %define return eax + %macro SLDR 2 ;; stack load/restore + mov %1, %2 + %endmacro + %define SSTR SLDR + + %macro FUNC_SAVE 0 + push ebp + mov ebp, esp + sub esp, PS*2 ;2 local variables + push esi + push edi + push ebx + mov arg1, arg(1) + %endmacro + + %macro FUNC_RESTORE 0 + pop ebx + pop edi + pop esi + add esp, PS*2 ;2 local variables + pop ebp + %endmacro + +%endif ; output formats + %define len arg0 %define vec arg1 %define mul_array arg2 %define src arg3 -%define dest1 arg4 +%define dest1 arg4 %define ptr arg5 + %define vec_i tmp2 %define dest2 tmp3 %define dest3 tmp4 %define pos return + %ifidn PS,4 ;32-bit code + %define len_m arg0_m + %define src_m arg3_m + %define dest1_m arg4_m + %define dest2_m tmp3_m + %define dest3_m tmp4_m + %endif + %ifndef EC_ALIGNED_ADDR ;;; Use Un-aligned load/store %define XLDR movdqu @@ -136,39 +220,62 @@ %endif %endif +%ifidn PS,8 ; 64-bit code + default rel + [bits 64] +%endif -default rel -[bits 64] section .text -%define xmask0f xmm11 -%define xgft1_lo xmm10 -%define xgft1_hi xmm9 -%define xgft2_lo xmm8 -%define xgft2_hi xmm7 -%define xgft3_lo xmm6 -%define xgft3_hi xmm5 - -%define x0 xmm0 -%define xtmpa xmm1 -%define xp1 xmm2 -%define xp2 xmm3 -%define xp3 xmm4 +%ifidn PS,8 ;64-bit code + %define xmask0f xmm11 + %define xgft1_lo xmm2 + %define xgft1_hi xmm3 + %define xgft2_lo xmm4 + %define xgft2_hi xmm7 + %define xgft3_lo xmm6 + %define xgft3_hi xmm5 + + %define x0 xmm0 + %define xtmpa xmm1 + %define xp1 xmm10 + %define xp2 xmm9 + %define xp3 xmm8 +%else + %define xmask0f xmm7 + %define xgft1_lo xmm6 + %define xgft1_hi xmm5 + %define xgft2_lo xgft1_lo + %define xgft2_hi xgft1_hi + %define xgft3_lo xgft1_lo + %define xgft3_hi xgft1_hi + + %define x0 xmm0 + %define xtmpa xmm1 + %define xp1 xmm2 + %define xp2 xmm3 + %define xp3 xmm4 +%endif align 16 global gf_3vect_dot_prod_sse:function func(gf_3vect_dot_prod_sse) FUNC_SAVE + SLDR len, len_m sub len, 16 + SSTR len_m, len jl .return_fail xor pos, pos movdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte sal vec, LOG_PS ;vec *= PS. Make vec_i count by PS + SLDR dest1, dest1_m mov dest2, [dest1+PS] + SSTR dest2_m, dest2 mov dest3, [dest1+2*PS] + SSTR dest3_m, dest3 mov dest1, [dest1] - + SSTR dest1_m, dest1 .loop16: pxor xp1, xp1 @@ -178,17 +285,19 @@ func(gf_3vect_dot_prod_sse) xor vec_i, vec_i .next_vect: + SLDR src, src_m mov ptr, [src+vec_i] movdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, ..., Ax{0f} movdqu xgft1_hi, [tmp+16] ; " Ax{00}, Ax{10}, ..., Ax{f0} + %ifidn PS,8 ;64-bit code movdqu xgft2_lo, [tmp+vec*(32/PS)] ;Load array Bx{00}, Bx{01}, ..., Bx{0f} movdqu xgft2_hi, [tmp+vec*(32/PS)+16] ; " Bx{00}, Bx{10}, ..., Bx{f0} movdqu xgft3_lo, [tmp+vec*(64/PS)] ;Load array Cx{00}, Cx{01}, ..., Cx{0f} movdqu xgft3_hi, [tmp+vec*(64/PS)+16] ; " Cx{00}, Cx{10}, ..., Cx{f0} - add tmp, 32 add vec_i, PS + %endif XLDR x0, [ptr+pos] ;Get next source vector movdqa xtmpa, x0 ;Keep unshifted copy of src @@ -201,11 +310,23 @@ func(gf_3vect_dot_prod_sse) pxor xgft1_hi, xgft1_lo ;GF add high and low partials pxor xp1, xgft1_hi ;xp1 += partial + %ifidn PS,4 ;32-bit code + movdqu xgft2_lo, [tmp+vec*(32/PS)] ;Load array Bx{00}, Bx{01}, ..., Bx{0f} + movdqu xgft2_hi, [tmp+vec*(32/PS)+16] ; " Bx{00}, Bx{10}, ..., Bx{f0} + %endif pshufb xgft2_hi, x0 ;Lookup mul table of high nibble pshufb xgft2_lo, xtmpa ;Lookup mul table of low nibble pxor xgft2_hi, xgft2_lo ;GF add high and low partials pxor xp2, xgft2_hi ;xp2 += partial + %ifidn PS,4 ;32-bit code + sal vec, 1 + movdqu xgft3_lo, [tmp+vec*(32/PS)] ;Load array Cx{00}, Cx{01}, ..., Cx{0f} + movdqu xgft3_hi, [tmp+vec*(32/PS)+16] ; " Cx{00}, Cx{10}, ..., Cx{f0} + sar vec, 1 + add tmp, 32 + add vec_i, PS + %endif pshufb xgft3_hi, x0 ;Lookup mul table of high nibble pshufb xgft3_lo, xtmpa ;Lookup mul table of low nibble pxor xgft3_hi, xgft3_lo ;GF add high and low partials @@ -214,10 +335,14 @@ func(gf_3vect_dot_prod_sse) cmp vec_i, vec jl .next_vect + SLDR dest1, dest1_m + SLDR dest2, dest2_m XSTR [dest1+pos], xp1 XSTR [dest2+pos], xp2 + SLDR dest3, dest3_m XSTR [dest3+pos], xp3 + SLDR len, len_m add pos, 16 ;Loop on 16 bytes at a time cmp pos, len jle .loop16 @@ -256,6 +381,4 @@ global %1_slver db 0x%3, 0x%2 %endmacro ;;; func core, ver, snum -slversion gf_3vect_dot_prod_sse, 00, 03, 0063 -; inform linker that this doesn't require executable stack -section .note.GNU-stack noalloc noexec nowrite progbits +slversion gf_3vect_dot_prod_sse, 00, 05, 0063 diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_3vect_mad_avx.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_3vect_mad_avx.asm.s new file mode 100644 index 00000000000..5adbcccc6ad --- /dev/null +++ b/src/erasure-code/isa/isa-l/erasure_code/gf_3vect_mad_avx.asm.s @@ -0,0 +1,294 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2015 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;; +;;; gf_3vect_mad_avx(len, vec, vec_i, mul_array, src, dest); +;;; + +%define PS 8 + +%ifidn __OUTPUT_FORMAT__, win64 + %define arg0 rcx + %define arg0.w ecx + %define arg1 rdx + %define arg2 r8 + %define arg3 r9 + %define arg4 r12 + %define arg5 r15 + %define tmp r11 + %define return rax + %define return.w eax + %define stack_size 16*10 + 3*8 + %define arg(x) [rsp + stack_size + PS + PS*x] + %define func(x) proc_frame x + +%macro FUNC_SAVE 0 + sub rsp, stack_size + vmovdqa [rsp+16*0],xmm6 + vmovdqa [rsp+16*1],xmm7 + vmovdqa [rsp+16*2],xmm8 + vmovdqa [rsp+16*3],xmm9 + vmovdqa [rsp+16*4],xmm10 + vmovdqa [rsp+16*5],xmm11 + vmovdqa [rsp+16*6],xmm12 + vmovdqa [rsp+16*7],xmm13 + vmovdqa [rsp+16*8],xmm14 + vmovdqa [rsp+16*9],xmm15 + save_reg r12, 10*16 + 0*8 + save_reg r15, 10*16 + 1*8 + end_prolog + mov arg4, arg(4) + mov arg5, arg(5) +%endmacro + +%macro FUNC_RESTORE 0 + vmovdqa xmm6, [rsp+16*0] + vmovdqa xmm7, [rsp+16*1] + vmovdqa xmm8, [rsp+16*2] + vmovdqa xmm9, [rsp+16*3] + vmovdqa xmm10, [rsp+16*4] + vmovdqa xmm11, [rsp+16*5] + vmovdqa xmm12, [rsp+16*6] + vmovdqa xmm13, [rsp+16*7] + vmovdqa xmm14, [rsp+16*8] + vmovdqa xmm15, [rsp+16*9] + mov r12, [rsp + 10*16 + 0*8] + mov r15, [rsp + 10*16 + 1*8] + add rsp, stack_size +%endmacro + +%elifidn __OUTPUT_FORMAT__, elf64 + %define arg0 rdi + %define arg0.w edi + %define arg1 rsi + %define arg2 rdx + %define arg3 rcx + %define arg4 r8 + %define arg5 r9 + %define tmp r11 + %define return rax + %define return.w eax + + %define func(x) x: + %define FUNC_SAVE + %define FUNC_RESTORE +%endif + +;;; gf_3vect_mad_avx(len, vec, vec_i, mul_array, src, dest) +%define len arg0 +%define len.w arg0.w +%define vec arg1 +%define vec_i arg2 +%define mul_array arg3 +%define src arg4 +%define dest1 arg5 +%define pos return +%define pos.w return.w + +%define dest2 mul_array +%define dest3 vec_i + +%ifndef EC_ALIGNED_ADDR +;;; Use Un-aligned load/store + %define XLDR vmovdqu + %define XSTR vmovdqu +%else +;;; Use Non-temporal load/stor + %ifdef NO_NT_LDST + %define XLDR vmovdqa + %define XSTR vmovdqa + %else + %define XLDR vmovntdqa + %define XSTR vmovntdq + %endif +%endif + + +default rel + +[bits 64] +section .text + +%define xmask0f xmm15 +%define xgft1_lo xmm14 +%define xgft1_hi xmm13 +%define xgft2_lo xmm12 +%define xgft2_hi xmm11 +%define xgft3_lo xmm10 +%define xgft3_hi xmm9 + +%define x0 xmm0 +%define xtmpa xmm1 +%define xtmph1 xmm2 +%define xtmpl1 xmm3 +%define xtmph2 xmm4 +%define xtmpl2 xmm5 +%define xtmph3 xmm6 +%define xtmpl3 xmm7 +%define xd1 xmm8 +%define xd2 xtmpl1 +%define xd3 xtmph1 + +align 16 +global gf_3vect_mad_avx:function +func(gf_3vect_mad_avx) + FUNC_SAVE + sub len, 16 + jl .return_fail + xor pos, pos + vmovdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte + + sal vec_i, 5 ;Multiply by 32 + sal vec, 5 + lea tmp, [mul_array + vec_i] + vmovdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, Ax{02}, ... + vmovdqu xgft1_hi, [tmp+16] ; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0} + vmovdqu xgft2_lo, [tmp+vec] ;Load array Bx{00}, Bx{01}, Bx{02}, ... + vmovdqu xgft2_hi, [tmp+vec+16] ; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0} + vmovdqu xgft3_lo, [tmp+2*vec] ;Load array Cx{00}, Cx{01}, Cx{02}, ... + vmovdqu xgft3_hi, [tmp+2*vec+16]; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0} + mov dest2, [dest1+PS] ; reuse mul_array + mov dest3, [dest1+2*PS] ; reuse vec_i + mov dest1, [dest1] + +.loop16: + XLDR x0, [src+pos] ;Get next source vector + XLDR xd1, [dest1+pos] ;Get next dest vector + + vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0 + vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0 + vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0 + + ; dest1 + vpshufb xtmph1, xgft1_hi, x0 ;Lookup mul table of high nibble + vpshufb xtmpl1, xgft1_lo, xtmpa ;Lookup mul table of low nibble + vpxor xtmph1, xtmph1, xtmpl1 ;GF add high and low partials + vpxor xd1, xd1, xtmph1 ;xd1 += partial + + XLDR xd2, [dest2+pos] ;reuse xtmpl1. Get next dest vector + XLDR xd3, [dest3+pos] ;reuse xtmph1. Get next dest vector + + ; dest2 + vpshufb xtmph2, xgft2_hi, x0 ;Lookup mul table of high nibble + vpshufb xtmpl2, xgft2_lo, xtmpa ;Lookup mul table of low nibble + vpxor xtmph2, xtmph2, xtmpl2 ;GF add high and low partials + vpxor xd2, xd2, xtmph2 ;xd2 += partial + + ; dest3 + vpshufb xtmph3, xgft3_hi, x0 ;Lookup mul table of high nibble + vpshufb xtmpl3, xgft3_lo, xtmpa ;Lookup mul table of low nibble + vpxor xtmph3, xtmph3, xtmpl3 ;GF add high and low partials + vpxor xd3, xd3, xtmph3 ;xd3 += partial + + XSTR [dest1+pos], xd1 + XSTR [dest2+pos], xd2 + XSTR [dest3+pos], xd3 + + add pos, 16 ;Loop on 16 bytes at a time + cmp pos, len + jle .loop16 + + lea tmp, [len + 16] + cmp pos, tmp + je .return_pass + +.lessthan16: + ;; Tail len + ;; Do one more overlap pass + mov tmp, len ;Overlapped offset length-16 + XLDR x0, [src+tmp] ;Get next source vector + XLDR xd1, [dest1+tmp] ;Get next dest vector + XLDR xd2, [dest2+tmp] ;reuse xtmpl1. Get next dest vector + XLDR xd3, [dest3+tmp] ;reuse xtmph1. Get next dest vector + + sub len, pos + + movdqa xtmph3, [constip16] ;Load const of i + 16 + vpinsrb xtmpl3, xtmpl3, len.w, 15 + vpshufb xtmpl3, xtmpl3, xmask0f ;Broadcast len to all bytes + vpcmpgtb xtmpl3, xtmpl3, xtmph3 + + vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0 + vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0 + vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0 + + ; dest1 + vpshufb xgft1_hi, xgft1_hi, x0 ;Lookup mul table of high nibble + vpshufb xgft1_lo, xgft1_lo, xtmpa ;Lookup mul table of low nibble + vpxor xgft1_hi, xgft1_hi, xgft1_lo ;GF add high and low partials + vpand xgft1_hi, xgft1_hi, xtmpl3 + vpxor xd1, xd1, xgft1_hi + + ; dest2 + vpshufb xgft2_hi, xgft2_hi, x0 ;Lookup mul table of high nibble + vpshufb xgft2_lo, xgft2_lo, xtmpa ;Lookup mul table of low nibble + vpxor xgft2_hi, xgft2_hi, xgft2_lo ;GF add high and low partials + vpand xgft2_hi, xgft2_hi, xtmpl3 + vpxor xd2, xd2, xgft2_hi + + ; dest3 + vpshufb xgft3_hi, xgft3_hi, x0 ;Lookup mul table of high nibble + vpshufb xgft3_lo, xgft3_lo, xtmpa ;Lookup mul table of low nibble + vpxor xgft3_hi, xgft3_hi, xgft3_lo ;GF add high and low partials + vpand xgft3_hi, xgft3_hi, xtmpl3 + vpxor xd3, xd3, xgft3_hi + + XSTR [dest1+tmp], xd1 + XSTR [dest2+tmp], xd2 + XSTR [dest3+tmp], xd3 + +.return_pass: + mov return, 0 + FUNC_RESTORE + ret + +.return_fail: + mov return, 1 + FUNC_RESTORE + ret + +endproc_frame + +section .data + +align 16 +mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f +constip16: + ddq 0xf0f1f2f3f4f5f6f7f8f9fafbfcfdfeff + +%macro slversion 4 +global %1_slver_%2%3%4 +global %1_slver +%1_slver: +%1_slver_%2%3%4: + dw 0x%4 + db 0x%3, 0x%2 +%endmacro +;;; func core, ver, snum +slversion gf_3vect_mad_avx, 02, 00, 0207 diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_3vect_mad_avx2.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_3vect_mad_avx2.asm.s new file mode 100644 index 00000000000..077285c3768 --- /dev/null +++ b/src/erasure-code/isa/isa-l/erasure_code/gf_3vect_mad_avx2.asm.s @@ -0,0 +1,323 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2015 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;; +;;; gf_3vect_mad_avx2(len, vec, vec_i, mul_array, src, dest); +;;; + +%define PS 8 + +%ifidn __OUTPUT_FORMAT__, win64 + %define arg0 rcx + %define arg0.w ecx + %define arg1 rdx + %define arg2 r8 + %define arg3 r9 + %define arg4 r12 ; must be saved, loaded and restored + %define arg5 r15 ; must be saved and restored + + %define tmp r11 + %define tmp.w r11d + %define tmp.b r11b + %define return rax + %define return.w eax + %define stack_size 16*10 + 3*8 + %define arg(x) [rsp + stack_size + PS + PS*x] + %define func(x) proc_frame x + + %macro FUNC_SAVE 0 + sub rsp, stack_size + vmovdqa [rsp+16*0],xmm6 + vmovdqa [rsp+16*1],xmm7 + vmovdqa [rsp+16*2],xmm8 + vmovdqa [rsp+16*3],xmm9 + vmovdqa [rsp+16*4],xmm10 + vmovdqa [rsp+16*5],xmm11 + vmovdqa [rsp+16*6],xmm12 + vmovdqa [rsp+16*7],xmm13 + vmovdqa [rsp+16*8],xmm14 + vmovdqa [rsp+16*9],xmm15 + save_reg r12, 10*16 + 0*8 + save_reg r15, 10*16 + 1*8 + end_prolog + mov arg4, arg(4) + mov arg5, arg(5) + %endmacro + + %macro FUNC_RESTORE 0 + vmovdqa xmm6, [rsp+16*0] + vmovdqa xmm7, [rsp+16*1] + vmovdqa xmm8, [rsp+16*2] + vmovdqa xmm9, [rsp+16*3] + vmovdqa xmm10, [rsp+16*4] + vmovdqa xmm11, [rsp+16*5] + vmovdqa xmm12, [rsp+16*6] + vmovdqa xmm13, [rsp+16*7] + vmovdqa xmm14, [rsp+16*8] + vmovdqa xmm15, [rsp+16*9] + mov r12, [rsp + 10*16 + 0*8] + mov r15, [rsp + 10*16 + 1*8] + add rsp, stack_size + %endmacro + +%elifidn __OUTPUT_FORMAT__, elf64 + %define arg0 rdi + %define arg0.w edi + %define arg1 rsi + %define arg2 rdx + %define arg3 rcx + %define arg4 r8 + %define arg5 r9 + + %define tmp r11 + %define tmp.w r11d + %define tmp.b r11b + %define return rax + %define return.w eax + + %define func(x) x: + %define FUNC_SAVE + %define FUNC_RESTORE +%endif + +;;; gf_3vect_mad_avx2(len, vec, vec_i, mul_array, src, dest) +%define len arg0 +%define len.w arg0.w +%define vec arg1 +%define vec_i arg2 +%define mul_array arg3 +%define src arg4 +%define dest1 arg5 +%define pos return +%define pos.w return.w + +%define dest2 mul_array +%define dest3 vec_i + +%ifndef EC_ALIGNED_ADDR +;;; Use Un-aligned load/store + %define XLDR vmovdqu + %define XSTR vmovdqu +%else +;;; Use Non-temporal load/stor + %ifdef NO_NT_LDST + %define XLDR vmovdqa + %define XSTR vmovdqa + %else + %define XLDR vmovntdqa + %define XSTR vmovntdq + %endif +%endif + + +default rel + +[bits 64] +section .text + +%define xmask0f ymm15 +%define xmask0fx xmm15 +%define xgft1_lo ymm14 +%define xgft1_hi ymm13 +%define xgft2_lo ymm12 +%define xgft3_lo ymm11 + +%define x0 ymm0 +%define xtmpa ymm1 +%define xtmph1 ymm2 +%define xtmpl1 ymm3 +%define xtmph2 ymm4 +%define xtmpl2 ymm5 +%define xtmpl2x xmm5 +%define xtmph3 ymm6 +%define xtmpl3 ymm7 +%define xtmpl3x xmm7 +%define xd1 ymm8 +%define xd2 ymm9 +%define xd3 ymm10 + +align 16 +global gf_3vect_mad_avx2:function +func(gf_3vect_mad_avx2) + FUNC_SAVE + sub len, 32 + jl .return_fail + xor pos, pos + mov tmp.b, 0x0f + vpinsrb xmask0fx, xmask0fx, tmp.w, 0 + vpbroadcastb xmask0f, xmask0fx ;Construct mask 0x0f0f0f... + + sal vec_i, 5 ;Multiply by 32 + sal vec, 5 + lea tmp, [mul_array + vec_i] + + vmovdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, ..., Ax{0f} + ; " Ax{00}, Ax{10}, ..., Ax{f0} + vperm2i128 xgft1_hi, xgft1_lo, xgft1_lo, 0x11 ; swapped to hi | hi + vperm2i128 xgft1_lo, xgft1_lo, xgft1_lo, 0x00 ; swapped to lo | lo + + vmovdqu xgft2_lo, [tmp+vec] ;Load array Bx{00}, Bx{01}, Bx{02}, ... + ; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0} + vmovdqu xgft3_lo, [tmp+2*vec] ;Load array Cx{00}, Cx{01}, Cx{02}, ... + ; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0} + mov dest2, [dest1+PS] ; reuse mul_array + mov dest3, [dest1+2*PS] ; reuse vec_i + mov dest1, [dest1] + +.loop32: + XLDR x0, [src+pos] ;Get next source vector + XLDR xd1, [dest1+pos] ;Get next dest vector + XLDR xd2, [dest2+pos] ;Get next dest vector + XLDR xd3, [dest3+pos] ;Get next dest vector + vperm2i128 xtmph2, xgft2_lo, xgft2_lo, 0x11 ; swapped to hi | hi + vperm2i128 xtmpl2, xgft2_lo, xgft2_lo, 0x00 ; swapped to lo | lo + + vperm2i128 xtmph3, xgft3_lo, xgft3_lo, 0x11 ; swapped to hi | hi + vperm2i128 xtmpl3, xgft3_lo, xgft3_lo, 0x00 ; swapped to lo | lo + + vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0 + vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0 + vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0 + + ; dest1 + vpshufb xtmph1, xgft1_hi, x0 ;Lookup mul table of high nibble + vpshufb xtmpl1, xgft1_lo, xtmpa ;Lookup mul table of low nibble + vpxor xtmph1, xtmph1, xtmpl1 ;GF add high and low partials + vpxor xd1, xd1, xtmph1 ;xd1 += partial + + ; dest2 + vpshufb xtmph2, x0 ;Lookup mul table of high nibble + vpshufb xtmpl2, xtmpa ;Lookup mul table of low nibble + vpxor xtmph2, xtmpl2 ;GF add high and low partials + vpxor xd2, xtmph2 ;xd2 += partial + + ; dest3 + vpshufb xtmph3, x0 ;Lookup mul table of high nibble + vpshufb xtmpl3, xtmpa ;Lookup mul table of low nibble + vpxor xtmph3, xtmpl3 ;GF add high and low partials + vpxor xd3, xtmph3 ;xd3 += partial + + XSTR [dest1+pos], xd1 + XSTR [dest2+pos], xd2 + XSTR [dest3+pos], xd3 + + add pos, 32 ;Loop on 32 bytes at a time + cmp pos, len + jle .loop32 + + lea tmp, [len + 32] + cmp pos, tmp + je .return_pass + +.lessthan32: + ;; Tail len + ;; Do one more overlap pass + mov tmp.b, 0x1f + vpinsrb xtmpl2x, xtmpl2x, tmp.w, 0 + vpbroadcastb xtmpl2, xtmpl2x ;Construct mask 0x1f1f1f... + + mov tmp, len ;Overlapped offset length-32 + + XLDR x0, [src+tmp] ;Get next source vector + XLDR xd1, [dest1+tmp] ;Get next dest vector + XLDR xd2, [dest2+tmp] ;Get next dest vector + XLDR xd3, [dest3+tmp] ;Get next dest vector + + sub len, pos + + vmovdqa xtmph3, [constip32] ;Load const of i + 32 + vpinsrb xtmpl3x, xtmpl3x, len.w, 15 + vinserti128 xtmpl3, xtmpl3, xtmpl3x, 1 ;swapped to xtmpl3x | xtmpl3x + vpshufb xtmpl3, xtmpl3, xtmpl2 ;Broadcast len to all bytes. xtmpl2=0x1f1f1f... + vpcmpgtb xtmpl3, xtmpl3, xtmph3 + + vperm2i128 xtmph2, xgft2_lo, xgft2_lo, 0x11 ; swapped to hi | hi + vperm2i128 xgft2_lo, xgft2_lo, xgft2_lo, 0x00 ; swapped to lo | lo + + vperm2i128 xtmph3, xgft3_lo, xgft3_lo, 0x11 ; swapped to hi | hi + vperm2i128 xgft3_lo, xgft3_lo, xgft3_lo, 0x00 ; swapped to lo | lo + + vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0 + vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0 + vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0 + + ; dest1 + vpshufb xtmph1, xgft1_hi, x0 ;Lookup mul table of high nibble + vpshufb xtmpl1, xgft1_lo, xtmpa ;Lookup mul table of low nibble + vpxor xtmph1, xtmph1, xtmpl1 ;GF add high and low partials + vpand xtmph1, xtmph1, xtmpl3 + vpxor xd1, xd1, xtmph1 ;xd1 += partial + + ; dest2 + vpshufb xtmph2, xtmph2, x0 ;Lookup mul table of high nibble + vpshufb xgft2_lo, xgft2_lo, xtmpa ;Lookup mul table of low nibble + vpxor xtmph2, xtmph2, xgft2_lo ;GF add high and low partials + vpand xtmph2, xtmph2, xtmpl3 + vpxor xd2, xd2, xtmph2 ;xd2 += partial + + ; dest3 + vpshufb xtmph3, xtmph3, x0 ;Lookup mul table of high nibble + vpshufb xgft3_lo, xgft3_lo, xtmpa ;Lookup mul table of low nibble + vpxor xtmph3, xtmph3, xgft3_lo ;GF add high and low partials + vpand xtmph3, xtmph3, xtmpl3 + vpxor xd3, xd3, xtmph3 ;xd3 += partial + + XSTR [dest1+tmp], xd1 + XSTR [dest2+tmp], xd2 + XSTR [dest3+tmp], xd3 + +.return_pass: + mov return, 0 + FUNC_RESTORE + ret + +.return_fail: + mov return, 1 + FUNC_RESTORE + ret + +endproc_frame + +section .data + +align 32 +constip32: + ddq 0xf0f1f2f3f4f5f6f7f8f9fafbfcfdfeff + ddq 0xe0e1e2e3e4e5e6e7e8e9eaebecedeeef + +%macro slversion 4 +global %1_slver_%2%3%4 +global %1_slver +%1_slver: +%1_slver_%2%3%4: + dw 0x%4 + db 0x%3, 0x%2 +%endmacro +;;; func core, ver, snum +slversion gf_3vect_mad_avx2, 04, 00, 0208 diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_3vect_mad_sse.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_3vect_mad_sse.asm.s new file mode 100644 index 00000000000..55ead69f2a9 --- /dev/null +++ b/src/erasure-code/isa/isa-l/erasure_code/gf_3vect_mad_sse.asm.s @@ -0,0 +1,304 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2015 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;; +;;; gf_3vect_mad_sse(len, vec, vec_i, mul_array, src, dest); +;;; + +%define PS 8 +%ifidn __OUTPUT_FORMAT__, win64 + %define arg0 rcx + %define arg0.w ecx + %define arg1 rdx + %define arg2 r8 + %define arg3 r9 + %define arg4 r12 + %define arg5 r15 + %define tmp r11 + %define return rax + %define return.w eax + %define stack_size 16*10 + 3*8 + %define arg(x) [rsp + stack_size + PS + PS*x] + %define func(x) proc_frame x + +%macro FUNC_SAVE 0 + sub rsp, stack_size + movdqa [rsp+16*0],xmm6 + movdqa [rsp+16*1],xmm7 + movdqa [rsp+16*2],xmm8 + movdqa [rsp+16*3],xmm9 + movdqa [rsp+16*4],xmm10 + movdqa [rsp+16*5],xmm11 + movdqa [rsp+16*6],xmm12 + movdqa [rsp+16*7],xmm13 + movdqa [rsp+16*8],xmm14 + movdqa [rsp+16*9],xmm15 + save_reg r12, 10*16 + 0*8 + save_reg r15, 10*16 + 1*8 + end_prolog + mov arg4, arg(4) + mov arg5, arg(5) +%endmacro + +%macro FUNC_RESTORE 0 + movdqa xmm6, [rsp+16*0] + movdqa xmm7, [rsp+16*1] + movdqa xmm8, [rsp+16*2] + movdqa xmm9, [rsp+16*3] + movdqa xmm10, [rsp+16*4] + movdqa xmm11, [rsp+16*5] + movdqa xmm12, [rsp+16*6] + movdqa xmm13, [rsp+16*7] + movdqa xmm14, [rsp+16*8] + movdqa xmm15, [rsp+16*9] + mov r12, [rsp + 10*16 + 0*8] + mov r15, [rsp + 10*16 + 1*8] + add rsp, stack_size +%endmacro + +%elifidn __OUTPUT_FORMAT__, elf64 + %define arg0 rdi + %define arg0.w edi + %define arg1 rsi + %define arg2 rdx + %define arg3 rcx + %define arg4 r8 + %define arg5 r9 + %define tmp r11 + %define return rax + %define return.w eax + + %define func(x) x: + %define FUNC_SAVE + %define FUNC_RESTORE +%endif + +;;; gf_3vect_mad_sse(len, vec, vec_i, mul_array, src, dest) +%define len arg0 +%define len.w arg0.w +%define vec arg1 +%define vec_i arg2 +%define mul_array arg3 +%define src arg4 +%define dest1 arg5 +%define pos return +%define pos.w return.w + +%define dest2 mul_array +%define dest3 vec_i + +%ifndef EC_ALIGNED_ADDR +;;; Use Un-aligned load/store + %define XLDR movdqu + %define XSTR movdqu +%else +;;; Use Non-temporal load/stor + %ifdef NO_NT_LDST + %define XLDR movdqa + %define XSTR movdqa + %else + %define XLDR movntdqa + %define XSTR movntdq + %endif +%endif + +default rel + +[bits 64] +section .text + +%define xmask0f xmm15 +%define xgft1_lo xmm14 +%define xgft1_hi xmm13 +%define xgft2_lo xmm12 +%define xgft2_hi xmm11 +%define xgft3_lo xmm10 +%define xgft3_hi xmm9 + +%define x0 xmm0 +%define xtmpa xmm1 +%define xtmph1 xmm2 +%define xtmpl1 xmm3 +%define xtmph2 xmm4 +%define xtmpl2 xmm5 +%define xtmph3 xmm6 +%define xtmpl3 xmm7 +%define xd1 xmm8 +%define xd2 xtmpl1 +%define xd3 xtmph1 + +align 16 +global gf_3vect_mad_sse:function +func(gf_3vect_mad_sse) + FUNC_SAVE + sub len, 16 + jl .return_fail + xor pos, pos + movdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte + sal vec_i, 5 ;Multiply by 32 + sal vec, 5 + lea tmp, [mul_array + vec_i] + + movdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, Ax{02}, ... + movdqu xgft1_hi, [tmp+16] ; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0} + movdqu xgft2_lo, [tmp+vec] ;Load array Bx{00}, Bx{01}, Bx{02}, ... + movdqu xgft2_hi, [tmp+vec+16] ; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0} + movdqu xgft3_lo, [tmp+2*vec] ;Load array Cx{00}, Cx{01}, Cx{02}, ... + movdqu xgft3_hi, [tmp+2*vec+16] ; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0} + mov dest2, [dest1+PS] ; reuse mul_array + mov dest3, [dest1+2*PS] ; reuse vec_i + mov dest1, [dest1] + +.loop16: + XLDR x0, [src+pos] ;Get next source vector + movdqa xtmph1, xgft1_hi ;Reload const array registers + movdqa xtmpl1, xgft1_lo + movdqa xtmph2, xgft2_hi ;Reload const array registers + movdqa xtmpl2, xgft2_lo + movdqa xtmph3, xgft3_hi ;Reload const array registers + movdqa xtmpl3, xgft3_lo + + XLDR xd1, [dest1+pos] ;Get next dest vector + + movdqa xtmpa, x0 ;Keep unshifted copy of src + psraw x0, 4 ;Shift to put high nibble into bits 4-0 + pand x0, xmask0f ;Mask high src nibble in bits 4-0 + pand xtmpa, xmask0f ;Mask low src nibble in bits 4-0 + + ; dest1 + pshufb xtmph1, x0 ;Lookup mul table of high nibble + pshufb xtmpl1, xtmpa ;Lookup mul table of low nibble + pxor xtmph1, xtmpl1 ;GF add high and low partials + pxor xd1, xtmph1 + + XLDR xd2, [dest2+pos] ;reuse xtmpl1. Get next dest vector + XLDR xd3, [dest3+pos] ;reuse xtmph1. Get next dest vector + + ; dest2 + pshufb xtmph2, x0 ;Lookup mul table of high nibble + pshufb xtmpl2, xtmpa ;Lookup mul table of low nibble + pxor xtmph2, xtmpl2 ;GF add high and low partials + pxor xd2, xtmph2 + + ; dest3 + pshufb xtmph3, x0 ;Lookup mul table of high nibble + pshufb xtmpl3, xtmpa ;Lookup mul table of low nibble + pxor xtmph3, xtmpl3 ;GF add high and low partials + pxor xd3, xtmph3 + + XSTR [dest1+pos], xd1 ;Store result + XSTR [dest2+pos], xd2 ;Store result + XSTR [dest3+pos], xd3 ;Store result + + add pos, 16 ;Loop on 16 bytes at a time + cmp pos, len + jle .loop16 + + lea tmp, [len + 16] + cmp pos, tmp + je .return_pass + +.lessthan16: + ;; Tail len + ;; Do one more overlap pass + mov tmp, len ;Overlapped offset length-16 + + XLDR x0, [src+tmp] ;Get next source vector + XLDR xd1, [dest1+tmp] ;Get next dest vector + XLDR xd2, [dest2+tmp] ;reuse xtmpl1. Get next dest vector + XLDR xd3, [dest3+tmp] ;reuse xtmph1. Get next dest vector + + sub len, pos + + movdqa xtmph3, [constip16] ;Load const of i + 16 + pinsrb xtmpl3, len.w, 15 + pshufb xtmpl3, xmask0f ;Broadcast len to all bytes + pcmpgtb xtmpl3, xtmph3 + + movdqa xtmpa, x0 ;Keep unshifted copy of src + psraw x0, 4 ;Shift to put high nibble into bits 4-0 + pand x0, xmask0f ;Mask high src nibble in bits 4-0 + pand xtmpa, xmask0f ;Mask low src nibble in bits 4-0 + + ; dest1 + pshufb xgft1_hi, x0 ;Lookup mul table of high nibble + pshufb xgft1_lo, xtmpa ;Lookup mul table of low nibble + pxor xgft1_hi, xgft1_lo ;GF add high and low partials + pand xgft1_hi, xtmpl3 + pxor xd1, xgft1_hi + + ; dest2 + pshufb xgft2_hi, x0 ;Lookup mul table of high nibble + pshufb xgft2_lo, xtmpa ;Lookup mul table of low nibble + pxor xgft2_hi, xgft2_lo ;GF add high and low partials + pand xgft2_hi, xtmpl3 + pxor xd2, xgft2_hi + + ; dest3 + pshufb xgft3_hi, x0 ;Lookup mul table of high nibble + pshufb xgft3_lo, xtmpa ;Lookup mul table of low nibble + pxor xgft3_hi, xgft3_lo ;GF add high and low partials + pand xgft3_hi, xtmpl3 + pxor xd3, xgft3_hi + + XSTR [dest1+tmp], xd1 ;Store result + XSTR [dest2+tmp], xd2 ;Store result + XSTR [dest3+tmp], xd3 ;Store result + +.return_pass: + FUNC_RESTORE + mov return, 0 + ret + +.return_fail: + FUNC_RESTORE + mov return, 1 + ret + +endproc_frame + +section .data + +align 16 + +mask0f: + ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f +constip16: + ddq 0xf0f1f2f3f4f5f6f7f8f9fafbfcfdfeff + +%macro slversion 4 +global %1_slver_%2%3%4 +global %1_slver +%1_slver: +%1_slver_%2%3%4: + dw 0x%4 + db 0x%3, 0x%2 +%endmacro +;;; func core, ver, snum +slversion gf_3vect_mad_sse, 00, 00, 0206 diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_4vect_dot_prod_avx.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_4vect_dot_prod_avx.asm.s index 6197f017007..5649bc69fb1 100644 --- a/src/erasure-code/isa/isa-l/erasure_code/gf_4vect_dot_prod_avx.asm.s +++ b/src/erasure-code/isa/isa-l/erasure_code/gf_4vect_dot_prod_avx.asm.s @@ -1,5 +1,5 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; Copyright(c) 2011-2014 Intel Corporation All rights reserved. +; Copyright(c) 2011-2015 Intel Corporation All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions @@ -30,8 +30,6 @@ ;;; ;;; gf_4vect_dot_prod_avx(len, vec, *g_tbls, **buffs, **dests); ;;; -;;; Author: Gregory Tucker - %ifidn __OUTPUT_FORMAT__, elf64 %define arg0 rdi @@ -48,7 +46,10 @@ %define tmp5 r14 ; must be saved and restored %define tmp6 r15 ; must be saved and restored %define return rax - %define PS 8 + %macro SLDR 2 + %endmacro + %define SSTR SLDR + %define PS 8 %define LOG_PS 3 %define func(x) x: @@ -81,6 +82,9 @@ %define tmp5 rdi ; must be saved and restored %define tmp6 rsi ; must be saved and restored %define return rax + %macro SLDR 2 + %endmacro + %define SSTR SLDR %define PS 8 %define LOG_PS 3 %define stack_size 9*16 + 7*8 ; must be an odd multiple of 8 @@ -128,6 +132,82 @@ %endmacro %endif +%ifidn __OUTPUT_FORMAT__, elf32 + +;;;================== High Address; +;;; arg4 +;;; arg3 +;;; arg2 +;;; arg1 +;;; arg0 +;;; return +;;;<================= esp of caller +;;; ebp +;;;<================= ebp = esp +;;; var0 +;;; var1 +;;; var2 +;;; var3 +;;; esi +;;; edi +;;; ebx +;;;<================= esp of callee +;;; +;;;================== Low Address; + + %define PS 4 + %define LOG_PS 2 + %define func(x) x: + %define arg(x) [ebp + PS*2 + PS*x] + %define var(x) [ebp - PS - PS*x] + + %define trans ecx + %define trans2 esi + %define arg0 trans ;trans and trans2 are for the variables in stack + %define arg0_m arg(0) + %define arg1 ebx + %define arg2 arg2_m + %define arg2_m arg(2) + %define arg3 trans + %define arg3_m arg(3) + %define arg4 trans + %define arg4_m arg(4) + %define arg5 trans2 + %define tmp edx + %define tmp2 edi + %define tmp3 trans2 + %define tmp3_m var(0) + %define tmp4 trans2 + %define tmp4_m var(1) + %define tmp5 trans2 + %define tmp5_m var(2) + %define tmp6 trans2 + %define tmp6_m var(3) + %define return eax + %macro SLDR 2 ;stack load/restore + mov %1, %2 + %endmacro + %define SSTR SLDR + + %macro FUNC_SAVE 0 + push ebp + mov ebp, esp + sub esp, PS*4 ;4 local variables + push esi + push edi + push ebx + mov arg1, arg(1) + %endmacro + + %macro FUNC_RESTORE 0 + pop ebx + pop edi + pop esi + add esp, PS*4 ;4 local variables + pop ebp + %endmacro + +%endif ; output formats %define len arg0 %define vec arg1 @@ -140,7 +220,17 @@ %define dest3 tmp4 %define dest4 tmp5 %define vskip3 tmp6 -%define pos return +%define pos return + + %ifidn PS,4 ;32-bit code + %define len_m arg0_m + %define src_m arg3_m + %define dest1_m arg4_m + %define dest2_m tmp3_m + %define dest3_m tmp4_m + %define dest4_m tmp5_m + %define vskip3_m tmp6_m + %endif %ifndef EC_ALIGNED_ADDR ;;; Use Un-aligned load/store @@ -157,46 +247,73 @@ %endif %endif +%ifidn PS,8 ; 64-bit code + default rel + [bits 64] +%endif -default rel -[bits 64] section .text -%define xmask0f xmm14 -%define xgft1_lo xmm13 -%define xgft1_hi xmm12 -%define xgft2_lo xmm11 -%define xgft2_hi xmm10 -%define xgft3_lo xmm9 -%define xgft3_hi xmm8 -%define xgft4_lo xmm7 -%define xgft4_hi xmm6 - - -%define x0 xmm0 -%define xtmpa xmm1 -%define xp1 xmm2 -%define xp2 xmm3 -%define xp3 xmm4 -%define xp4 xmm5 - +%ifidn PS,8 ;64-bit code + %define xmask0f xmm14 + %define xgft1_lo xmm13 + %define xgft1_hi xmm12 + %define xgft2_lo xmm11 + %define xgft2_hi xmm10 + %define xgft3_lo xmm9 + %define xgft3_hi xmm8 + %define xgft4_lo xmm7 + %define xgft4_hi xmm6 + + %define x0 xmm0 + %define xtmpa xmm1 + %define xp1 xmm2 + %define xp2 xmm3 + %define xp3 xmm4 + %define xp4 xmm5 +%else + %define xmm_trans xmm7 ;reuse xmask0f and xgft1_lo + %define xmask0f xmm_trans + %define xgft1_lo xmm_trans + %define xgft1_hi xmm6 + %define xgft2_lo xgft1_lo + %define xgft2_hi xgft1_hi + %define xgft3_lo xgft1_lo + %define xgft3_hi xgft1_hi + %define xgft4_lo xgft1_lo + %define xgft4_hi xgft1_hi + + %define x0 xmm0 + %define xtmpa xmm1 + %define xp1 xmm2 + %define xp2 xmm3 + %define xp3 xmm4 + %define xp4 xmm5 +%endif align 16 global gf_4vect_dot_prod_avx:function func(gf_4vect_dot_prod_avx) FUNC_SAVE + SLDR len, len_m sub len, 16 + SSTR len_m, len jl .return_fail xor pos, pos vmovdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte mov vskip3, vec imul vskip3, 96 + SSTR vskip3_m, vskip3 sal vec, LOG_PS ;vec *= PS. Make vec_i count by PS + SLDR dest1, dest1_m mov dest2, [dest1+PS] + SSTR dest2_m, dest2 mov dest3, [dest1+2*PS] + SSTR dest3_m, dest3 mov dest4, [dest1+3*PS] + SSTR dest4_m, dest4 mov dest1, [dest1] - + SSTR dest1_m, dest1 .loop16: vpxor xp1, xp1 @@ -207,41 +324,70 @@ func(gf_4vect_dot_prod_avx) xor vec_i, vec_i .next_vect: + SLDR src, src_m mov ptr, [src+vec_i] + %ifidn PS,8 ;64-bit code vmovdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, ..., Ax{0f} vmovdqu xgft1_hi, [tmp+16] ; " Ax{00}, Ax{10}, ..., Ax{f0} vmovdqu xgft2_lo, [tmp+vec*(32/PS)] ;Load array Bx{00}, Bx{01}, ..., Bx{0f} vmovdqu xgft2_hi, [tmp+vec*(32/PS)+16] ; " Bx{00}, Bx{10}, ..., Bx{f0} vmovdqu xgft3_lo, [tmp+vec*(64/PS)] ;Load array Cx{00}, Cx{01}, ..., Cx{0f} vmovdqu xgft3_hi, [tmp+vec*(64/PS)+16] ; " Cx{00}, Cx{10}, ..., Cx{f0} - vmovdqu xgft4_lo, [tmp+vskip3] ;Load array Cx{00}, Cx{01}, ..., Cx{0f} - vmovdqu xgft4_hi, [tmp+vskip3+16] ; " Cx{00}, Cx{10}, ..., Cx{f0} + vmovdqu xgft4_lo, [tmp+vskip3] ;Load array Dx{00}, Dx{01}, ..., Dx{0f} + vmovdqu xgft4_hi, [tmp+vskip3+16] ; " Dx{00}, Dx{10}, ..., Dx{f0} - XLDR x0, [ptr+pos] ;Get next source vector + XLDR x0, [ptr+pos] ;Get next source vector add tmp, 32 add vec_i, PS vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0 vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0 vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0 + %else ;32-bit code + XLDR x0, [ptr+pos] ;Get next source vector + vmovdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte + + vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0 + vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0 + vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0 + vmovdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, ..., Ax{0f} + vmovdqu xgft1_hi, [tmp+16] ; " Ax{00}, Ax{10}, ..., Ax{f0} + %endif vpshufb xgft1_hi, x0 ;Lookup mul table of high nibble vpshufb xgft1_lo, xtmpa ;Lookup mul table of low nibble vpxor xgft1_hi, xgft1_lo ;GF add high and low partials vpxor xp1, xgft1_hi ;xp1 += partial + %ifidn PS,4 ;32-bit code + vmovdqu xgft2_lo, [tmp+vec*(32/PS)] ;Load array Bx{00}, Bx{01}, ..., Bx{0f} + vmovdqu xgft2_hi, [tmp+vec*(32/PS)+16] ; " Bx{00}, Bx{10}, ..., Bx{f0} + %endif vpshufb xgft2_hi, x0 ;Lookup mul table of high nibble vpshufb xgft2_lo, xtmpa ;Lookup mul table of low nibble vpxor xgft2_hi, xgft2_lo ;GF add high and low partials vpxor xp2, xgft2_hi ;xp2 += partial + %ifidn PS,4 ;32-bit code + sal vec, 1 + vmovdqu xgft3_lo, [tmp+vec*(32/PS)] ;Load array Cx{00}, Cx{01}, ..., Cx{0f} + vmovdqu xgft3_hi, [tmp+vec*(32/PS)+16] ; " Cx{00}, Cx{10}, ..., Cx{f0} + sar vec, 1 + %endif vpshufb xgft3_hi, x0 ;Lookup mul table of high nibble vpshufb xgft3_lo, xtmpa ;Lookup mul table of low nibble vpxor xgft3_hi, xgft3_lo ;GF add high and low partials vpxor xp3, xgft3_hi ;xp3 += partial + %ifidn PS,4 ;32-bit code + SLDR vskip3, vskip3_m + vmovdqu xgft4_lo, [tmp+vskip3] ;Load array Dx{00}, Dx{01}, ..., Dx{0f} + vmovdqu xgft4_hi, [tmp+vskip3+16] ; " Dx{00}, Dx{10}, ..., Dx{f0} + add tmp, 32 + add vec_i, PS + %endif vpshufb xgft4_hi, x0 ;Lookup mul table of high nibble vpshufb xgft4_lo, xtmpa ;Lookup mul table of low nibble vpxor xgft4_hi, xgft4_lo ;GF add high and low partials @@ -250,11 +396,16 @@ func(gf_4vect_dot_prod_avx) cmp vec_i, vec jl .next_vect + SLDR dest1, dest1_m + SLDR dest2, dest2_m XSTR [dest1+pos], xp1 XSTR [dest2+pos], xp2 + SLDR dest3, dest3_m XSTR [dest3+pos], xp3 + SLDR dest4, dest4_m XSTR [dest4+pos], xp4 + SLDR len, len_m add pos, 16 ;Loop on 16 bytes at a time cmp pos, len jle .loop16 @@ -293,6 +444,4 @@ global %1_slver db 0x%3, 0x%2 %endmacro ;;; func core, ver, snum -slversion gf_4vect_dot_prod_avx, 00, 02, 0064 -; inform linker that this doesn't require executable stack -section .note.GNU-stack noalloc noexec nowrite progbits +slversion gf_4vect_dot_prod_avx, 02, 04, 0193 diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_4vect_dot_prod_avx2.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_4vect_dot_prod_avx2.asm.s index e4267e201f2..dcd46f39cbf 100644 --- a/src/erasure-code/isa/isa-l/erasure_code/gf_4vect_dot_prod_avx2.asm.s +++ b/src/erasure-code/isa/isa-l/erasure_code/gf_4vect_dot_prod_avx2.asm.s @@ -1,5 +1,5 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; Copyright(c) 2011-2014 Intel Corporation All rights reserved. +; Copyright(c) 2011-2015 Intel Corporation All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions @@ -30,8 +30,6 @@ ;;; ;;; gf_4vect_dot_prod_avx2(len, vec, *g_tbls, **buffs, **dests); ;;; -;;; Author: Gregory Tucker - %ifidn __OUTPUT_FORMAT__, elf64 %define arg0 rdi @@ -50,7 +48,10 @@ %define tmp5 r14 ; must be saved and restored %define tmp6 r15 ; must be saved and restored %define return rax - %define PS 8 + %macro SLDR 2 + %endmacro + %define SSTR SLDR + %define PS 8 %define LOG_PS 3 %define func(x) x: @@ -85,6 +86,9 @@ %define tmp5 rdi ; must be saved and restored %define tmp6 rsi ; must be saved and restored %define return rax + %macro SLDR 2 + %endmacro + %define SSTR SLDR %define PS 8 %define LOG_PS 3 %define stack_size 9*16 + 7*8 ; must be an odd multiple of 8 @@ -132,6 +136,84 @@ %endmacro %endif +%ifidn __OUTPUT_FORMAT__, elf32 + +;;;================== High Address; +;;; arg4 +;;; arg3 +;;; arg2 +;;; arg1 +;;; arg0 +;;; return +;;;<================= esp of caller +;;; ebp +;;;<================= ebp = esp +;;; var0 +;;; var1 +;;; var2 +;;; var3 +;;; esi +;;; edi +;;; ebx +;;;<================= esp of callee +;;; +;;;================== Low Address; + + %define PS 4 + %define LOG_PS 2 + %define func(x) x: + %define arg(x) [ebp + PS*2 + PS*x] + %define var(x) [ebp - PS - PS*x] + + %define trans ecx + %define trans2 esi + %define arg0 trans ;trans and trans2 are for the variables in stack + %define arg0_m arg(0) + %define arg1 ebx + %define arg2 arg2_m + %define arg2_m arg(2) + %define arg3 trans + %define arg3_m arg(3) + %define arg4 trans + %define arg4_m arg(4) + %define arg5 trans2 + %define tmp edx + %define tmp.w edx + %define tmp.b dl + %define tmp2 edi + %define tmp3 trans2 + %define tmp3_m var(0) + %define tmp4 trans2 + %define tmp4_m var(1) + %define tmp5 trans2 + %define tmp5_m var(2) + %define tmp6 trans2 + %define tmp6_m var(3) + %define return eax + %macro SLDR 2 ;stack load/restore + mov %1, %2 + %endmacro + %define SSTR SLDR + + %macro FUNC_SAVE 0 + push ebp + mov ebp, esp + sub esp, PS*4 ;4 local variables + push esi + push edi + push ebx + mov arg1, arg(1) + %endmacro + + %macro FUNC_RESTORE 0 + pop ebx + pop edi + pop esi + add esp, PS*4 ;4 local variables + pop ebp + %endmacro + +%endif ; output formats %define len arg0 %define vec arg1 @@ -144,7 +226,17 @@ %define dest3 tmp4 %define dest4 tmp5 %define vskip3 tmp6 -%define pos return +%define pos return + + %ifidn PS,4 ;32-bit code + %define len_m arg0_m + %define src_m arg3_m + %define dest1_m arg4_m + %define dest2_m tmp3_m + %define dest3_m tmp4_m + %define dest4_m tmp5_m + %define vskip3_m tmp6_m + %endif %ifndef EC_ALIGNED_ADDR ;;; Use Un-aligned load/store @@ -161,36 +253,59 @@ %endif %endif +%ifidn PS,8 ;64-bit code + default rel + [bits 64] +%endif -default rel -[bits 64] section .text -%define xmask0f ymm14 -%define xmask0fx xmm14 -%define xgft1_lo ymm13 -%define xgft1_hi ymm12 -%define xgft2_lo ymm11 -%define xgft2_hi ymm10 -%define xgft3_lo ymm9 -%define xgft3_hi ymm8 -%define xgft4_lo ymm7 -%define xgft4_hi ymm6 - - -%define x0 ymm0 -%define xtmpa ymm1 -%define xp1 ymm2 -%define xp2 ymm3 -%define xp3 ymm4 -%define xp4 ymm5 - +%ifidn PS,8 ;64-bit code + %define xmask0f ymm14 + %define xmask0fx xmm14 + %define xgft1_lo ymm13 + %define xgft1_hi ymm12 + %define xgft2_lo ymm11 + %define xgft2_hi ymm10 + %define xgft3_lo ymm9 + %define xgft3_hi ymm8 + %define xgft4_lo ymm7 + %define xgft4_hi ymm6 + + %define x0 ymm0 + %define xtmpa ymm1 + %define xp1 ymm2 + %define xp2 ymm3 + %define xp3 ymm4 + %define xp4 ymm5 +%else + %define ymm_trans ymm7 ;reuse xmask0f and xgft1_hi + %define xmask0f ymm_trans + %define xmask0fx xmm7 + %define xgft1_lo ymm6 + %define xgft1_hi ymm_trans + %define xgft2_lo xgft1_lo + %define xgft2_hi xgft1_hi + %define xgft3_lo xgft1_lo + %define xgft3_hi xgft1_hi + %define xgft4_lo xgft1_lo + %define xgft4_hi xgft1_hi + + %define x0 ymm0 + %define xtmpa ymm1 + %define xp1 ymm2 + %define xp2 ymm3 + %define xp3 ymm4 + %define xp4 ymm5 +%endif align 16 global gf_4vect_dot_prod_avx2:function func(gf_4vect_dot_prod_avx2) FUNC_SAVE + SLDR len, len_m sub len, 32 + SSTR len_m, len jl .return_fail xor pos, pos mov tmp.b, 0x0f @@ -198,12 +313,17 @@ func(gf_4vect_dot_prod_avx2) vpbroadcastb xmask0f, xmask0fx ;Construct mask 0x0f0f0f... mov vskip3, vec imul vskip3, 96 + SSTR vskip3_m, vskip3 sal vec, LOG_PS ;vec *= PS. Make vec_i count by PS + SLDR dest1, dest1_m mov dest2, [dest1+PS] + SSTR dest2_m, dest2 mov dest3, [dest1+2*PS] + SSTR dest3_m, dest3 mov dest4, [dest1+3*PS] + SSTR dest4_m, dest4 mov dest1, [dest1] - + SSTR dest1_m, dest1 .loop32: vpxor xp1, xp1 @@ -214,10 +334,12 @@ func(gf_4vect_dot_prod_avx2) xor vec_i, vec_i .next_vect: + SLDR src, src_m mov ptr, [src+vec_i] XLDR x0, [ptr+pos] ;Get next source vector - add vec_i, PS + add vec_i, PS + %ifidn PS,8 ;64-bit code vpand xgft4_lo, x0, xmask0f ;Mask low src nibble in bits 4-0 vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0 vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0 @@ -230,30 +352,64 @@ func(gf_4vect_dot_prod_avx2) ; " Bx{00}, Bx{10}, ..., Bx{f0} vmovdqu xgft3_lo, [tmp+vec*(64/PS)] ;Load array Cx{00}, Cx{01}, ..., Cx{0f} ; " Cx{00}, Cx{10}, ..., Cx{f0} - vmovdqu xgft4_lo, [tmp+vskip3] ;Load array Cx{00}, Cx{01}, ..., Cx{0f} - ; " Cx{00}, Cx{10}, ..., Cx{f0} + vmovdqu xgft4_lo, [tmp+vskip3] ;Load array Dx{00}, Dx{01}, ..., Dx{0f} + ; " Dx{00}, Dx{10}, ..., Dx{f0} vperm2i128 xgft1_hi, xgft1_lo, xgft1_lo, 0x01 ; swapped to hi | lo vperm2i128 xgft2_hi, xgft2_lo, xgft2_lo, 0x01 ; swapped to hi | lo vperm2i128 xgft3_hi, xgft3_lo, xgft3_lo, 0x01 ; swapped to hi | lo vperm2i128 xgft4_hi, xgft4_lo, xgft4_lo, 0x01 ; swapped to hi | lo add tmp, 32 + %else ;32-bit code + mov cl, 0x0f ;use ecx as a temp variable + vpinsrb xmask0fx, xmask0fx, ecx, 0 + vpbroadcastb xmask0f, xmask0fx ;Construct mask 0x0f0f0f... + + vpand xgft4_lo, x0, xmask0f ;Mask low src nibble in bits 4-0 + vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0 + vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0 + vperm2i128 xtmpa, xgft4_lo, x0, 0x30 ;swap xtmpa from 1lo|2lo to 1lo|2hi + vperm2i128 x0, xgft4_lo, x0, 0x12 ;swap x0 from 1hi|2hi to 1hi|2lo + + vmovdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, ..., Ax{0f} + ; " Ax{00}, Ax{10}, ..., Ax{f0} + vperm2i128 xgft1_hi, xgft1_lo, xgft1_lo, 0x01 ; swapped to hi | lo + %endif vpshufb xgft1_hi, x0 ;Lookup mul table of high nibble vpshufb xgft1_lo, xtmpa ;Lookup mul table of low nibble vpxor xgft1_hi, xgft1_lo ;GF add high and low partials vpxor xp1, xgft1_hi ;xp1 += partial + %ifidn PS,4 ; 32-bit code + vmovdqu xgft2_lo, [tmp+vec*(32/PS)] ;Load array Bx{00}, Bx{01}, ..., Bx{0f} + ; " Bx{00}, Bx{10}, ..., Bx{f0} + vperm2i128 xgft2_hi, xgft2_lo, xgft2_lo, 0x01 ; swapped to hi | lo + %endif vpshufb xgft2_hi, x0 ;Lookup mul table of high nibble vpshufb xgft2_lo, xtmpa ;Lookup mul table of low nibble vpxor xgft2_hi, xgft2_lo ;GF add high and low partials vpxor xp2, xgft2_hi ;xp2 += partial + %ifidn PS,4 ; 32-bit code + sal vec, 1 + vmovdqu xgft3_lo, [tmp+vec*(32/PS)] ;Load array Cx{00}, Cx{01}, ..., Cx{0f} + ; " Cx{00}, Cx{10}, ..., Cx{f0} + vperm2i128 xgft3_hi, xgft3_lo, xgft3_lo, 0x01 ; swapped to hi | lo + sar vec, 1 + %endif vpshufb xgft3_hi, x0 ;Lookup mul table of high nibble vpshufb xgft3_lo, xtmpa ;Lookup mul table of low nibble vpxor xgft3_hi, xgft3_lo ;GF add high and low partials vpxor xp3, xgft3_hi ;xp3 += partial + %ifidn PS,4 ; 32-bit code + SLDR vskip3, vskip3_m + vmovdqu xgft4_lo, [tmp+vskip3] ;Load array Dx{00}, Dx{01}, ..., Dx{0f} + ; " DX{00}, Dx{10}, ..., Dx{f0} + vperm2i128 xgft4_hi, xgft4_lo, xgft4_lo, 0x01 ; swapped to hi | lo + add tmp, 32 + %endif vpshufb xgft4_hi, x0 ;Lookup mul table of high nibble vpshufb xgft4_lo, xtmpa ;Lookup mul table of low nibble vpxor xgft4_hi, xgft4_lo ;GF add high and low partials @@ -262,11 +418,16 @@ func(gf_4vect_dot_prod_avx2) cmp vec_i, vec jl .next_vect + SLDR dest1, dest1_m + SLDR dest2, dest2_m XSTR [dest1+pos], xp1 XSTR [dest2+pos], xp2 + SLDR dest3, dest3_m XSTR [dest3+pos], xp3 + SLDR dest4, dest4_m XSTR [dest4+pos], xp4 + SLDR len, len_m add pos, 32 ;Loop on 32 bytes at a time cmp pos, len jle .loop32 @@ -302,6 +463,4 @@ global %1_slver db 0x%3, 0x%2 %endmacro ;;; func core, ver, snum -slversion gf_4vect_dot_prod_avx2, 04, 03, 0064 -; inform linker that this doesn't require executable stack -section .note.GNU-stack noalloc noexec nowrite progbits +slversion gf_4vect_dot_prod_avx2, 04, 04, 0198 diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_4vect_dot_prod_sse.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_4vect_dot_prod_sse.asm.s index 920a8da71c8..4d716ef585b 100644 --- a/src/erasure-code/isa/isa-l/erasure_code/gf_4vect_dot_prod_sse.asm.s +++ b/src/erasure-code/isa/isa-l/erasure_code/gf_4vect_dot_prod_sse.asm.s @@ -1,5 +1,5 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; Copyright(c) 2011-2014 Intel Corporation All rights reserved. +; Copyright(c) 2011-2015 Intel Corporation All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions @@ -30,8 +30,6 @@ ;;; ;;; gf_4vect_dot_prod_sse(len, vec, *g_tbls, **buffs, **dests); ;;; -;;; Author: Gregory Tucker - %ifidn __OUTPUT_FORMAT__, elf64 %define arg0 rdi @@ -48,7 +46,10 @@ %define tmp5 r14 ; must be saved and restored %define tmp6 r15 ; must be saved and restored %define return rax - %define PS 8 + %macro SLDR 2 + %endmacro + %define SSTR SLDR + %define PS 8 %define LOG_PS 3 %define func(x) x: @@ -81,6 +82,9 @@ %define tmp5 rdi ; must be saved and restored %define tmp6 rsi ; must be saved and restored %define return rax + %macro SLDR 2 + %endmacro + %define SSTR SLDR %define PS 8 %define LOG_PS 3 %define stack_size 9*16 + 7*8 ; must be an odd multiple of 8 @@ -128,6 +132,82 @@ %endmacro %endif +%ifidn __OUTPUT_FORMAT__, elf32 + +;;;================== High Address; +;;; arg4 +;;; arg3 +;;; arg2 +;;; arg1 +;;; arg0 +;;; return +;;;<================= esp of caller +;;; ebp +;;;<================= ebp = esp +;;; var0 +;;; var1 +;;; var2 +;;; var3 +;;; esi +;;; edi +;;; ebx +;;;<================= esp of callee +;;; +;;;================== Low Address; + + %define PS 4 + %define LOG_PS 2 + %define func(x) x: + %define arg(x) [ebp + PS*2 + PS*x] + %define var(x) [ebp - PS - PS*x] + + %define trans ecx + %define trans2 esi + %define arg0 trans ;trans and trans2 are for the variables in stack + %define arg0_m arg(0) + %define arg1 ebx + %define arg2 arg2_m + %define arg2_m arg(2) + %define arg3 trans + %define arg3_m arg(3) + %define arg4 trans + %define arg4_m arg(4) + %define arg5 trans2 + %define tmp edx + %define tmp2 edi + %define tmp3 trans2 + %define tmp3_m var(0) + %define tmp4 trans2 + %define tmp4_m var(1) + %define tmp5 trans2 + %define tmp5_m var(2) + %define tmp6 trans2 + %define tmp6_m var(3) + %define return eax + %macro SLDR 2 ;stack load/restore + mov %1, %2 + %endmacro + %define SSTR SLDR + + %macro FUNC_SAVE 0 + push ebp + mov ebp, esp + sub esp, PS*4 ;4 local variables + push esi + push edi + push ebx + mov arg1, arg(1) + %endmacro + + %macro FUNC_RESTORE 0 + pop ebx + pop edi + pop esi + add esp, PS*4 ;4 local variables + pop ebp + %endmacro + +%endif ; output formats %define len arg0 %define vec arg1 @@ -140,7 +220,17 @@ %define dest3 tmp4 %define dest4 tmp5 %define vskip3 tmp6 -%define pos return +%define pos return + + %ifidn PS,4 ;32-bit code + %define len_m arg0_m + %define src_m arg3_m + %define dest1_m arg4_m + %define dest2_m tmp3_m + %define dest3_m tmp4_m + %define dest4_m tmp5_m + %define vskip3_m tmp6_m + %endif %ifndef EC_ALIGNED_ADDR ;;; Use Un-aligned load/store @@ -157,46 +247,73 @@ %endif %endif +%ifidn PS,8 ; 64-bit code + default rel + [bits 64] +%endif -default rel -[bits 64] section .text -%define xmask0f xmm14 -%define xgft1_lo xmm13 -%define xgft1_hi xmm12 -%define xgft2_lo xmm11 -%define xgft2_hi xmm10 -%define xgft3_lo xmm9 -%define xgft3_hi xmm8 -%define xgft4_lo xmm7 -%define xgft4_hi xmm6 - - -%define x0 xmm0 -%define xtmpa xmm1 -%define xp1 xmm2 -%define xp2 xmm3 -%define xp3 xmm4 -%define xp4 xmm5 - +%ifidn PS,8 ;64-bit code + %define xmask0f xmm14 + %define xgft1_lo xmm2 + %define xgft1_hi xmm3 + %define xgft2_lo xmm11 + %define xgft2_hi xmm4 + %define xgft3_lo xmm9 + %define xgft3_hi xmm5 + %define xgft4_lo xmm7 + %define xgft4_hi xmm6 + + %define x0 xmm0 + %define xtmpa xmm1 + %define xp1 xmm8 + %define xp2 xmm10 + %define xp3 xmm12 + %define xp4 xmm13 +%else + %define xmm_trans xmm7 ;reuse xmask0f and xgft1_lo + %define xmask0f xmm_trans + %define xgft1_lo xmm_trans + %define xgft1_hi xmm6 + %define xgft2_lo xgft1_lo + %define xgft2_hi xgft1_hi + %define xgft3_lo xgft1_lo + %define xgft3_hi xgft1_hi + %define xgft4_lo xgft1_lo + %define xgft4_hi xgft1_hi + + %define x0 xmm0 + %define xtmpa xmm1 + %define xp1 xmm2 + %define xp2 xmm3 + %define xp3 xmm4 + %define xp4 xmm5 +%endif align 16 global gf_4vect_dot_prod_sse:function func(gf_4vect_dot_prod_sse) FUNC_SAVE + SLDR len, len_m sub len, 16 + SSTR len_m, len jl .return_fail xor pos, pos movdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte - mov vskip3, vec - imul vskip3, 96 - sal vec, LOG_PS ;vec *= PS. Make vec_i count by PS - mov dest2, [dest1+PS] - mov dest3, [dest1+2*PS] - mov dest4, [dest1+3*PS] - mov dest1, [dest1] - + mov vskip3, vec + imul vskip3, 96 + SSTR vskip3_m, vskip3 + sal vec, LOG_PS ;vec *= PS. Make vec_i count by PS + SLDR dest1, dest1_m + mov dest2, [dest1+PS] + SSTR dest2_m, dest2 + mov dest3, [dest1+2*PS] + SSTR dest3_m, dest3 + mov dest4, [dest1+3*PS] + SSTR dest4_m, dest4 + mov dest1, [dest1] + SSTR dest1_m, dest1 .loop16: pxor xp1, xp1 @@ -207,41 +324,72 @@ func(gf_4vect_dot_prod_sse) xor vec_i, vec_i .next_vect: + SLDR src, src_m mov ptr, [src+vec_i] + %ifidn PS,8 ;64-bit code movdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, ..., Ax{0f} movdqu xgft1_hi, [tmp+16] ; " Ax{00}, Ax{10}, ..., Ax{f0} movdqu xgft2_lo, [tmp+vec*(32/PS)] ;Load array Bx{00}, Bx{01}, ..., Bx{0f} movdqu xgft2_hi, [tmp+vec*(32/PS)+16] ; " Bx{00}, Bx{10}, ..., Bx{f0} movdqu xgft3_lo, [tmp+vec*(64/PS)] ;Load array Cx{00}, Cx{01}, ..., Cx{0f} movdqu xgft3_hi, [tmp+vec*(64/PS)+16] ; " Cx{00}, Cx{10}, ..., Cx{f0} - movdqu xgft4_lo, [tmp+vskip3] ;Load array Cx{00}, Cx{01}, ..., Cx{0f} - movdqu xgft4_hi, [tmp+vskip3+16] ; " Cx{00}, Cx{10}, ..., Cx{f0} + movdqu xgft4_lo, [tmp+vskip3] ;Load array Dx{00}, Dx{01}, ..., Dx{0f} + movdqu xgft4_hi, [tmp+vskip3+16] ; " Dx{00}, Dx{10}, ..., Dx{f0} - XLDR x0, [ptr+pos] ;Get next source vector - add tmp, 32 - add vec_i, PS + XLDR x0, [ptr+pos] ;Get next source vector + add tmp, 32 + add vec_i, PS movdqa xtmpa, x0 ;Keep unshifted copy of src psraw x0, 4 ;Shift to put high nibble into bits 4-0 pand x0, xmask0f ;Mask high src nibble in bits 4-0 + pand xtmpa, xmask0f ;Mask low src nibble in bits 4-0 + %else ;32-bit code + XLDR x0, [ptr+pos] ;Get next source vector + movdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte + + movdqa xtmpa, x0 ;Keep unshifted copy of src + psraw x0, 4 ;Shift to put high nibble into bits 4-0 + pand x0, xmask0f ;Mask high src nibble in bits 4-0 pand xtmpa, xmask0f ;Mask low src nibble in bits 4-0 + movdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, ..., Ax{0f} + movdqu xgft1_hi, [tmp+16] ; " Ax{00}, Ax{10}, ..., Ax{f0} + %endif + pshufb xgft1_hi, x0 ;Lookup mul table of high nibble pshufb xgft1_lo, xtmpa ;Lookup mul table of low nibble pxor xgft1_hi, xgft1_lo ;GF add high and low partials pxor xp1, xgft1_hi ;xp1 += partial + %ifidn PS,4 ;32-bit code + movdqu xgft2_lo, [tmp+vec*(32/PS)] ;Load array Bx{00}, Bx{01}, ..., Bx{0f} + movdqu xgft2_hi, [tmp+vec*(32/PS)+16] ; " Bx{00}, Bx{10}, ..., Bx{f0} + %endif pshufb xgft2_hi, x0 ;Lookup mul table of high nibble pshufb xgft2_lo, xtmpa ;Lookup mul table of low nibble pxor xgft2_hi, xgft2_lo ;GF add high and low partials pxor xp2, xgft2_hi ;xp2 += partial + %ifidn PS,4 ;32-bit code + sal vec, 1 + movdqu xgft3_lo, [tmp+vec*(32/PS)] ;Load array Cx{00}, Cx{01}, ..., Cx{0f} + movdqu xgft3_hi, [tmp+vec*(32/PS)+16] ; " Cx{00}, Cx{10}, ..., Cx{f0} + sar vec, 1 + %endif pshufb xgft3_hi, x0 ;Lookup mul table of high nibble pshufb xgft3_lo, xtmpa ;Lookup mul table of low nibble pxor xgft3_hi, xgft3_lo ;GF add high and low partials pxor xp3, xgft3_hi ;xp3 += partial + %ifidn PS,4 ;32-bit code + SLDR vskip3, vskip3_m + movdqu xgft4_lo, [tmp+vskip3] ;Load array Dx{00}, Dx{01}, ..., Dx{0f} + movdqu xgft4_hi, [tmp+vskip3+16] ; " Dx{00}, Dx{10}, ..., Dx{f0} + add tmp, 32 + add vec_i, PS + %endif pshufb xgft4_hi, x0 ;Lookup mul table of high nibble pshufb xgft4_lo, xtmpa ;Lookup mul table of low nibble pxor xgft4_hi, xgft4_lo ;GF add high and low partials @@ -250,11 +398,16 @@ func(gf_4vect_dot_prod_sse) cmp vec_i, vec jl .next_vect + SLDR dest1, dest1_m + SLDR dest2, dest2_m XSTR [dest1+pos], xp1 XSTR [dest2+pos], xp2 + SLDR dest3, dest3_m XSTR [dest3+pos], xp3 + SLDR dest4, dest4_m XSTR [dest4+pos], xp4 + SLDR len, len_m add pos, 16 ;Loop on 16 bytes at a time cmp pos, len jle .loop16 @@ -293,6 +446,4 @@ global %1_slver db 0x%3, 0x%2 %endmacro ;;; func core, ver, snum -slversion gf_4vect_dot_prod_sse, 00, 03, 0064 -; inform linker that this doesn't require executable stack -section .note.GNU-stack noalloc noexec nowrite progbits +slversion gf_4vect_dot_prod_sse, 00, 05, 0064 diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_4vect_mad_avx.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_4vect_mad_avx.asm.s new file mode 100644 index 00000000000..605e42a901f --- /dev/null +++ b/src/erasure-code/isa/isa-l/erasure_code/gf_4vect_mad_avx.asm.s @@ -0,0 +1,343 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2015 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;; +;;; gf_4vect_mad_avx(len, vec, vec_i, mul_array, src, dest); +;;; + +%define PS 8 + +%ifidn __OUTPUT_FORMAT__, win64 + %define arg0 rcx + %define arg0.w ecx + %define arg1 rdx + %define arg2 r8 + %define arg3 r9 + %define arg4 r12 + %define arg5 r15 + %define tmp r11 + %define tmp2 r10 + %define tmp3 r13 + %define return rax + %define return.w eax + %define stack_size 16*10 + 3*8 + %define arg(x) [rsp + stack_size + PS + PS*x] + %define func(x) proc_frame x + +%macro FUNC_SAVE 0 + sub rsp, stack_size + movdqa [rsp+16*0],xmm6 + movdqa [rsp+16*1],xmm7 + movdqa [rsp+16*2],xmm8 + movdqa [rsp+16*3],xmm9 + movdqa [rsp+16*4],xmm10 + movdqa [rsp+16*5],xmm11 + movdqa [rsp+16*6],xmm12 + movdqa [rsp+16*7],xmm13 + movdqa [rsp+16*8],xmm14 + movdqa [rsp+16*9],xmm15 + save_reg r12, 10*16 + 0*8 + save_reg r13, 10*16 + 1*8 + save_reg r15, 10*16 + 2*8 + end_prolog + mov arg4, arg(4) + mov arg5, arg(5) +%endmacro + +%macro FUNC_RESTORE 0 + movdqa xmm6, [rsp+16*0] + movdqa xmm7, [rsp+16*1] + movdqa xmm8, [rsp+16*2] + movdqa xmm9, [rsp+16*3] + movdqa xmm10, [rsp+16*4] + movdqa xmm11, [rsp+16*5] + movdqa xmm12, [rsp+16*6] + movdqa xmm13, [rsp+16*7] + movdqa xmm14, [rsp+16*8] + movdqa xmm15, [rsp+16*9] + mov r12, [rsp + 10*16 + 0*8] + mov r13, [rsp + 10*16 + 1*8] + mov r15, [rsp + 10*16 + 2*8] + add rsp, stack_size +%endmacro + +%elifidn __OUTPUT_FORMAT__, elf64 + %define arg0 rdi + %define arg0.w edi + %define arg1 rsi + %define arg2 rdx + %define arg3 rcx + %define arg4 r8 + %define arg5 r9 + %define tmp r11 + %define tmp2 r10 + %define tmp3 r12 + %define return rax + %define return.w eax + + %define func(x) x: + %macro FUNC_SAVE 0 + push r12 + %endmacro + %macro FUNC_RESTORE 0 + pop r12 + %endmacro +%endif + +;;; gf_4vect_mad_avx(len, vec, vec_i, mul_array, src, dest) +%define len arg0 +%define len.w arg0.w +%define vec arg1 +%define vec_i arg2 +%define mul_array arg3 +%define src arg4 +%define dest1 arg5 +%define pos return +%define pos.w return.w + +%define dest2 mul_array +%define dest3 tmp2 +%define dest4 vec_i + +%ifndef EC_ALIGNED_ADDR +;;; Use Un-aligned load/store + %define XLDR vmovdqu + %define XSTR vmovdqu +%else +;;; Use Non-temporal load/stor + %ifdef NO_NT_LDST + %define XLDR vmovdqa + %define XSTR vmovdqa + %else + %define XLDR vmovntdqa + %define XSTR vmovntdq + %endif +%endif + + +default rel + +[bits 64] +section .text + +%define xmask0f xmm15 +%define xgft3_hi xmm14 +%define xgft4_hi xmm13 +%define xgft4_lo xmm12 + +%define x0 xmm0 +%define xtmpa xmm1 +%define xtmph1 xmm2 +%define xtmpl1 xmm3 +%define xtmph2 xmm4 +%define xtmpl2 xmm5 +%define xtmph3 xmm6 +%define xtmpl3 xmm7 +%define xtmph4 xmm8 +%define xtmpl4 xmm9 +%define xd1 xmm10 +%define xd2 xmm11 +%define xd3 xtmph1 +%define xd4 xtmpl1 + +align 16 +global gf_4vect_mad_avx:function +func(gf_4vect_mad_avx) + FUNC_SAVE + sub len, 16 + jl .return_fail + xor pos, pos + vmovdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte + + mov tmp, vec + + sal vec_i, 5 ;Multiply by 32 + lea tmp3, [mul_array + vec_i] + + sal tmp, 6 ;Multiply by 64 + vmovdqu xgft3_hi, [tmp3+tmp+16] ; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0} + sal vec, 5 ;Multiply by 32 + add tmp, vec + vmovdqu xgft4_lo, [tmp3+tmp] ;Load array Dx{00}, Dx{01}, Dx{02}, ... + vmovdqu xgft4_hi, [tmp3+tmp+16] ; " Dx{00}, Dx{10}, Dx{20}, ... , Dx{f0} + + mov dest2, [dest1+PS] ; reuse mul_array + mov dest3, [dest1+2*PS] + mov dest4, [dest1+3*PS] ; reuse vec_i + mov dest1, [dest1] + +.loop16: + XLDR x0, [src+pos] ;Get next source vector + vmovdqu xtmph1, [tmp3+16] ; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0} + vmovdqu xtmpl1, [tmp3] ;Load array Ax{00}, Ax{01}, Ax{02}, ... + vmovdqu xtmph2, [tmp3+vec+16] ; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0} + vmovdqu xtmpl2, [tmp3+vec] ;Load array Bx{00}, Bx{01}, Bx{02}, ... + vmovdqu xtmpl3, [tmp3+2*vec] ;Load array Cx{00}, Cx{01}, Cx{02}, ... + + XLDR xd1, [dest1+pos] ;Get next dest vector + XLDR xd2, [dest2+pos] ;Get next dest vector + + vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0 + vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0 + vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0 + + ; dest1 + vpshufb xtmph1, xtmph1, x0 ;Lookup mul table of high nibble + vpshufb xtmpl1, xtmpl1, xtmpa ;Lookup mul table of low nibble + vpxor xtmph1, xtmph1, xtmpl1 ;GF add high and low partials + vpxor xd1, xd1, xtmph1 + + XLDR xd3, [dest3+pos] ;Reuse xtmph1, Get next dest vector + XLDR xd4, [dest4+pos] ;Reuse xtmpl1, Get next dest vector + + ; dest2 + vpshufb xtmph2, xtmph2, x0 ;Lookup mul table of high nibble + vpshufb xtmpl2, xtmpl2, xtmpa ;Lookup mul table of low nibble + vpxor xtmph2, xtmph2, xtmpl2 ;GF add high and low partials + vpxor xd2, xd2, xtmph2 + + ; dest3 + vpshufb xtmph3, xgft3_hi, x0 ;Lookup mul table of high nibble + vpshufb xtmpl3, xtmpl3, xtmpa ;Lookup mul table of low nibble + vpxor xtmph3, xtmph3, xtmpl3 ;GF add high and low partials + vpxor xd3, xd3, xtmph3 + + ; dest4 + vpshufb xtmph4, xgft4_hi, x0 ;Lookup mul table of high nibble + vpshufb xtmpl4, xgft4_lo, xtmpa ;Lookup mul table of low nibble + vpxor xtmph4, xtmph4, xtmpl4 ;GF add high and low partials + vpxor xd4, xd4, xtmph4 + + XSTR [dest1+pos], xd1 ;Store result + XSTR [dest2+pos], xd2 ;Store result + XSTR [dest3+pos], xd3 ;Store result + XSTR [dest4+pos], xd4 ;Store result + + add pos, 16 ;Loop on 16 bytes at a time + cmp pos, len + jle .loop16 + + lea tmp, [len + 16] + cmp pos, tmp + je .return_pass + +.lessthan16: + ;; Tail len + ;; Do one more overlap pass + + mov tmp, len ;Overlapped offset length-16 + + XLDR x0, [src+tmp] ;Get next source vector + + vmovdqu xtmph1, [tmp3+16] ; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0} + vmovdqu xtmpl1, [tmp3] ;Load array Ax{00}, Ax{01}, Ax{02}, ... + vmovdqu xtmph2, [tmp3+vec+16] ; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0} + vmovdqu xtmpl2, [tmp3+vec] ;Load array Bx{00}, Bx{01}, Bx{02}, ... + vmovdqu xtmpl3, [tmp3+2*vec] ;Load array Cx{00}, Cx{01}, Cx{02}, ... + + XLDR xd1, [dest1+tmp] ;Get next dest vector + XLDR xd2, [dest2+tmp] ;Get next dest vector + XLDR xtmph4, [dest3+tmp] ;Get next dest vector + + sub len, pos + + vmovdqa xtmpl4, [constip16] ;Load const of i + 16 + vpinsrb xtmph3, xtmph3, len.w, 15 + vpshufb xtmph3, xtmph3, xmask0f ;Broadcast len to all bytes + vpcmpgtb xtmph3, xtmph3, xtmpl4 + + XLDR xtmpl4, [dest4+tmp] ;Get next dest vector + + vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0 + vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0 + vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0 + + ; dest1 + vpshufb xtmph1, xtmph1, x0 ;Lookup mul table of high nibble + vpshufb xtmpl1, xtmpl1, xtmpa ;Lookup mul table of low nibble + vpxor xtmph1, xtmph1, xtmpl1 ;GF add high and low partials + vpand xtmph1, xtmph1, xtmph3 + vpxor xd1, xd1, xtmph1 + + ; dest2 + vpshufb xtmph2, xtmph2, x0 ;Lookup mul table of high nibble + vpshufb xtmpl2, xtmpl2, xtmpa ;Lookup mul table of low nibble + vpxor xtmph2, xtmph2, xtmpl2 ;GF add high and low partials + vpand xtmph2, xtmph2, xtmph3 + vpxor xd2, xd2, xtmph2 + + ; dest3 + vpshufb xgft3_hi, xgft3_hi, x0 ;Lookup mul table of high nibble + vpshufb xtmpl3, xtmpl3, xtmpa ;Lookup mul table of low nibble + vpxor xgft3_hi, xgft3_hi, xtmpl3 ;GF add high and low partials + vpand xgft3_hi, xgft3_hi, xtmph3 + vpxor xtmph4, xtmph4, xgft3_hi + + ; dest4 + vpshufb xgft4_hi, xgft4_hi, x0 ;Lookup mul table of high nibble + vpshufb xgft4_lo, xgft4_lo, xtmpa ;Lookup mul table of low nibble + vpxor xgft4_hi, xgft4_hi, xgft4_lo ;GF add high and low partials + vpand xgft4_hi, xgft4_hi, xtmph3 + vpxor xtmpl4, xtmpl4, xgft4_hi + + XSTR [dest1+tmp], xd1 ;Store result + XSTR [dest2+tmp], xd2 ;Store result + XSTR [dest3+tmp], xtmph4 ;Store result + XSTR [dest4+tmp], xtmpl4 ;Store result + +.return_pass: + mov return, 0 + FUNC_RESTORE + ret + +.return_fail: + mov return, 1 + FUNC_RESTORE + ret + +endproc_frame + +section .data + +align 16 +mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f +constip16: + ddq 0xf0f1f2f3f4f5f6f7f8f9fafbfcfdfeff + + +%macro slversion 4 +global %1_slver_%2%3%4 +global %1_slver +%1_slver: +%1_slver_%2%3%4: + dw 0x%4 + db 0x%3, 0x%2 +%endmacro +;;; func core, ver, snum +slversion gf_4vect_mad_avx, 02, 00, 020a diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_4vect_mad_avx2.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_4vect_mad_avx2.asm.s new file mode 100644 index 00000000000..ad3eafa4db6 --- /dev/null +++ b/src/erasure-code/isa/isa-l/erasure_code/gf_4vect_mad_avx2.asm.s @@ -0,0 +1,348 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2015 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;; +;;; gf_4vect_mad_avx2(len, vec, vec_i, mul_array, src, dest); +;;; + +%define PS 8 + +%ifidn __OUTPUT_FORMAT__, win64 + %define arg0 rcx + %define arg0.w ecx + %define arg1 rdx + %define arg2 r8 + %define arg3 r9 + %define arg4 r12 + %define arg5 r15 + %define tmp r11 + %define tmp.w r11d + %define tmp.b r11b + %define return rax + %define return.w eax + %define stack_size 16*10 + 3*8 + %define arg(x) [rsp + stack_size + PS + PS*x] + %define func(x) proc_frame x + +%macro FUNC_SAVE 0 + sub rsp, stack_size + movdqa [rsp+16*0],xmm6 + movdqa [rsp+16*1],xmm7 + movdqa [rsp+16*2],xmm8 + movdqa [rsp+16*3],xmm9 + movdqa [rsp+16*4],xmm10 + movdqa [rsp+16*5],xmm11 + movdqa [rsp+16*6],xmm12 + movdqa [rsp+16*7],xmm13 + movdqa [rsp+16*8],xmm14 + movdqa [rsp+16*9],xmm15 + save_reg r12, 10*16 + 0*8 + save_reg r15, 10*16 + 1*8 + end_prolog + mov arg4, arg(4) + mov arg5, arg(5) +%endmacro + +%macro FUNC_RESTORE 0 + movdqa xmm6, [rsp+16*0] + movdqa xmm7, [rsp+16*1] + movdqa xmm8, [rsp+16*2] + movdqa xmm9, [rsp+16*3] + movdqa xmm10, [rsp+16*4] + movdqa xmm11, [rsp+16*5] + movdqa xmm12, [rsp+16*6] + movdqa xmm13, [rsp+16*7] + movdqa xmm14, [rsp+16*8] + movdqa xmm15, [rsp+16*9] + mov r12, [rsp + 10*16 + 0*8] + mov r15, [rsp + 10*16 + 1*8] + add rsp, stack_size +%endmacro + +%elifidn __OUTPUT_FORMAT__, elf64 + %define arg0 rdi + %define arg0.w edi + %define arg1 rsi + %define arg2 rdx + %define arg3 rcx + %define arg4 r8 + %define arg5 r9 + %define tmp r11 + %define tmp.w r11d + %define tmp.b r11b + %define return rax + %define return.w eax + + %define func(x) x: + %define FUNC_SAVE + %define FUNC_RESTORE +%endif + + +;;; gf_4vect_mad_avx2(len, vec, vec_i, mul_array, src, dest) +%define len arg0 +%define len.w arg0.w +%define vec arg1 +%define vec_i arg2 +%define mul_array arg3 +%define src arg4 +%define dest1 arg5 +%define pos return +%define pos.w return.w + +%define dest2 mul_array +%define dest3 vec +%define dest4 vec_i + +%ifndef EC_ALIGNED_ADDR +;;; Use Un-aligned load/store + %define XLDR vmovdqu + %define XSTR vmovdqu +%else +;;; Use Non-temporal load/stor + %ifdef NO_NT_LDST + %define XLDR vmovdqa + %define XSTR vmovdqa + %else + %define XLDR vmovntdqa + %define XSTR vmovntdq + %endif +%endif + + +default rel + +[bits 64] +section .text + +%define xmask0f ymm15 +%define xmask0fx xmm15 +%define xgft1_lo ymm14 +%define xgft2_lo ymm13 +%define xgft3_lo ymm12 +%define xgft4_lo ymm11 + +%define x0 ymm0 +%define xtmpa ymm1 +%define xtmpl ymm2 +%define xtmplx xmm2 +%define xtmph1 ymm3 +%define xtmph1x xmm3 +%define xtmph2 ymm4 +%define xtmph3 ymm5 +%define xtmph4 ymm6 +%define xd1 ymm7 +%define xd2 ymm8 +%define xd3 ymm9 +%define xd4 ymm10 + +align 16 +global gf_4vect_mad_avx2:function +func(gf_4vect_mad_avx2) + FUNC_SAVE + sub len, 32 + jl .return_fail + xor pos, pos + mov tmp.b, 0x0f + vpinsrb xmask0fx, xmask0fx, tmp.w, 0 + vpbroadcastb xmask0f, xmask0fx ;Construct mask 0x0f0f0f... + + sal vec_i, 5 ;Multiply by 32 + sal vec, 5 ;Multiply by 32 + lea tmp, [mul_array + vec_i] + + vmovdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, Ax{02}, ... + ; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0} + vmovdqu xgft2_lo, [tmp+vec] ;Load array Bx{00}, Bx{01}, Bx{02}, ... + ; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0} + vmovdqu xgft3_lo, [tmp+2*vec] ;Load array Cx{00}, Cx{01}, Cx{02}, ... + ; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0} + add tmp, vec + vmovdqu xgft4_lo, [tmp+2*vec] ;Load array Dx{00}, Dx{01}, Dx{02}, ... + ; " Dx{00}, Dx{10}, Dx{20}, ... , Dx{f0} + + mov dest2, [dest1+PS] ; reuse mul_array + mov dest3, [dest1+2*PS] ; reuse vec + mov dest4, [dest1+3*PS] ; reuse vec_i + mov dest1, [dest1] + +.loop32: + XLDR x0, [src+pos] ;Get next source vector + + XLDR xd1, [dest1+pos] ;Get next dest vector + XLDR xd2, [dest2+pos] ;Get next dest vector + XLDR xd3, [dest3+pos] ;Get next dest vector + XLDR xd4, [dest4+pos] ;reuse xtmpl1. Get next dest vector + + vpand xtmpl, x0, xmask0f ;Mask low src nibble in bits 4-0 + vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0 + vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0 + + vperm2i128 xtmpa, xtmpl, x0, 0x30 ;swap xtmpa from 1lo|2lo to 1lo|2hi + vperm2i128 x0, xtmpl, x0, 0x12 ;swap x0 from 1hi|2hi to 1hi|2lo + + vperm2i128 xtmph1, xgft1_lo, xgft1_lo, 0x01 ; swapped to hi | lo + vperm2i128 xtmph2, xgft2_lo, xgft2_lo, 0x01 ; swapped to hi | lo + vperm2i128 xtmph3, xgft3_lo, xgft3_lo, 0x01 ; swapped to hi | lo + vperm2i128 xtmph4, xgft4_lo, xgft4_lo, 0x01 ; swapped to hi | lo + + ; dest1 + vpshufb xtmph1, xtmph1, x0 ;Lookup mul table of high nibble + vpshufb xtmpl, xgft1_lo, xtmpa ;Lookup mul table of low nibble + vpxor xtmph1, xtmph1, xtmpl ;GF add high and low partials + vpxor xd1, xd1, xtmph1 ;xd1 += partial + + ; dest2 + vpshufb xtmph2, xtmph2, x0 ;Lookup mul table of high nibble + vpshufb xtmpl, xgft2_lo, xtmpa ;Lookup mul table of low nibble + vpxor xtmph2, xtmph2, xtmpl ;GF add high and low partials + vpxor xd2, xd2, xtmph2 ;xd2 += partial + + ; dest3 + vpshufb xtmph3, xtmph3, x0 ;Lookup mul table of high nibble + vpshufb xtmpl, xgft3_lo, xtmpa ;Lookup mul table of low nibble + vpxor xtmph3, xtmph3, xtmpl ;GF add high and low partials + vpxor xd3, xd3, xtmph3 ;xd3 += partial + + ; dest4 + vpshufb xtmph4, xtmph4, x0 ;Lookup mul table of high nibble + vpshufb xtmpl, xgft4_lo, xtmpa ;Lookup mul table of low nibble + vpxor xtmph4, xtmph4, xtmpl ;GF add high and low partials + vpxor xd4, xd4, xtmph4 ;xd4 += partial + + XSTR [dest1+pos], xd1 + XSTR [dest2+pos], xd2 + XSTR [dest3+pos], xd3 + XSTR [dest4+pos], xd4 + + add pos, 32 ;Loop on 32 bytes at a time + cmp pos, len + jle .loop32 + + lea tmp, [len + 32] + cmp pos, tmp + je .return_pass + +.lessthan32: + ;; Tail len + ;; Do one more overlap pass + mov tmp.b, 0x1f + vpinsrb xtmph1x, xtmph1x, tmp.w, 0 + vpbroadcastb xtmph1, xtmph1x ;Construct mask 0x1f1f1f... + + mov tmp, len ;Overlapped offset length-32 + + XLDR x0, [src+tmp] ;Get next source vector + + XLDR xd1, [dest1+tmp] ;Get next dest vector + XLDR xd2, [dest2+tmp] ;Get next dest vector + XLDR xd3, [dest3+tmp] ;Get next dest vector + XLDR xd4, [dest4+tmp] ;Get next dest vector + + sub len, pos + + vmovdqa xtmph2, [constip32] ;Load const of i + 32 + vpinsrb xtmplx, xtmplx, len.w, 15 + vinserti128 xtmpl, xtmpl, xtmplx, 1 ;swapped to xtmplx | xtmplx + vpshufb xtmpl, xtmpl, xtmph1 ;Broadcast len to all bytes. xtmph1=0x1f1f1f... + vpcmpgtb xtmpl, xtmpl, xtmph2 + + vpand xtmph1, x0, xmask0f ;Mask low src nibble in bits 4-0 + vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0 + vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0 + + vperm2i128 xtmpa, xtmph1, x0, 0x30 ;swap xtmpa from 1lo|2lo to 1lo|2hi + vperm2i128 x0, xtmph1, x0, 0x12 ;swap x0 from 1hi|2hi to 1hi|2lo + + vperm2i128 xtmph1, xgft1_lo, xgft1_lo, 0x01 ; swapped to hi | lo + vperm2i128 xtmph2, xgft2_lo, xgft2_lo, 0x01 ; swapped to hi | lo + vperm2i128 xtmph3, xgft3_lo, xgft3_lo, 0x01 ; swapped to hi | lo + vperm2i128 xtmph4, xgft4_lo, xgft4_lo, 0x01 ; swapped to hi | lo + + ; dest1 + vpshufb xtmph1, xtmph1, x0 ;Lookup mul table of high nibble + vpshufb xgft1_lo, xgft1_lo, xtmpa ;Lookup mul table of low nibble + vpxor xtmph1, xtmph1, xgft1_lo ;GF add high and low partials + vpand xtmph1, xtmph1, xtmpl + vpxor xd1, xd1, xtmph1 ;xd1 += partial + + ; dest2 + vpshufb xtmph2, xtmph2, x0 ;Lookup mul table of high nibble + vpshufb xgft2_lo, xgft2_lo, xtmpa ;Lookup mul table of low nibble + vpxor xtmph2, xtmph2, xgft2_lo ;GF add high and low partials + vpand xtmph2, xtmph2, xtmpl + vpxor xd2, xd2, xtmph2 ;xd2 += partial + + ; dest3 + vpshufb xtmph3, xtmph3, x0 ;Lookup mul table of high nibble + vpshufb xgft3_lo, xgft3_lo, xtmpa ;Lookup mul table of low nibble + vpxor xtmph3, xtmph3, xgft3_lo ;GF add high and low partials + vpand xtmph3, xtmph3, xtmpl + vpxor xd3, xd3, xtmph3 ;xd3 += partial + + ; dest4 + vpshufb xtmph4, xtmph4, x0 ;Lookup mul table of high nibble + vpshufb xgft4_lo, xgft4_lo, xtmpa ;Lookup mul table of low nibble + vpxor xtmph4, xtmph4, xgft4_lo ;GF add high and low partials + vpand xtmph4, xtmph4, xtmpl + vpxor xd4, xd4, xtmph4 ;xd4 += partial + + XSTR [dest1+tmp], xd1 + XSTR [dest2+tmp], xd2 + XSTR [dest3+tmp], xd3 + XSTR [dest4+tmp], xd4 + +.return_pass: + mov return, 0 + FUNC_RESTORE + ret + +.return_fail: + mov return, 1 + FUNC_RESTORE + ret + +endproc_frame + +section .data +align 32 +constip32: + ddq 0xf0f1f2f3f4f5f6f7f8f9fafbfcfdfeff + ddq 0xe0e1e2e3e4e5e6e7e8e9eaebecedeeef + +%macro slversion 4 +global %1_slver_%2%3%4 +global %1_slver +%1_slver: +%1_slver_%2%3%4: + dw 0x%4 + db 0x%3, 0x%2 +%endmacro +;;; func core, ver, snum +slversion gf_4vect_mad_avx2, 04, 00, 020b diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_4vect_mad_sse.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_4vect_mad_sse.asm.s new file mode 100644 index 00000000000..038f926de4f --- /dev/null +++ b/src/erasure-code/isa/isa-l/erasure_code/gf_4vect_mad_sse.asm.s @@ -0,0 +1,348 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2015 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;; +;;; gf_4vect_mad_sse(len, vec, vec_i, mul_array, src, dest); +;;; + +%define PS 8 + +%ifidn __OUTPUT_FORMAT__, win64 + %define arg0 rcx + %define arg0.w ecx + %define arg1 rdx + %define arg2 r8 + %define arg3 r9 + %define arg4 r12 + %define arg5 r15 + %define tmp r11 + %define tmp2 r10 + %define tmp3 r13 + %define return rax + %define return.w eax + %define stack_size 16*10 + 3*8 + %define arg(x) [rsp + stack_size + PS + PS*x] + %define func(x) proc_frame x + +%macro FUNC_SAVE 0 + sub rsp, stack_size + movdqa [rsp+16*0],xmm6 + movdqa [rsp+16*1],xmm7 + movdqa [rsp+16*2],xmm8 + movdqa [rsp+16*3],xmm9 + movdqa [rsp+16*4],xmm10 + movdqa [rsp+16*5],xmm11 + movdqa [rsp+16*6],xmm12 + movdqa [rsp+16*7],xmm13 + movdqa [rsp+16*8],xmm14 + movdqa [rsp+16*9],xmm15 + save_reg r12, 10*16 + 0*8 + save_reg r13, 10*16 + 1*8 + save_reg r15, 10*16 + 2*8 + end_prolog + mov arg4, arg(4) + mov arg5, arg(5) +%endmacro + +%macro FUNC_RESTORE 0 + movdqa xmm6, [rsp+16*0] + movdqa xmm7, [rsp+16*1] + movdqa xmm8, [rsp+16*2] + movdqa xmm9, [rsp+16*3] + movdqa xmm10, [rsp+16*4] + movdqa xmm11, [rsp+16*5] + movdqa xmm12, [rsp+16*6] + movdqa xmm13, [rsp+16*7] + movdqa xmm14, [rsp+16*8] + movdqa xmm15, [rsp+16*9] + mov r12, [rsp + 10*16 + 0*8] + mov r13, [rsp + 10*16 + 1*8] + mov r15, [rsp + 10*16 + 2*8] + add rsp, stack_size +%endmacro + +%elifidn __OUTPUT_FORMAT__, elf64 + %define arg0 rdi + %define arg0.w edi + %define arg1 rsi + %define arg2 rdx + %define arg3 rcx + %define arg4 r8 + %define arg5 r9 + %define tmp r11 + %define tmp2 r10 + %define tmp3 r12 + %define return rax + %define return.w eax + + %define func(x) x: + %macro FUNC_SAVE 0 + push r12 + %endmacro + %macro FUNC_RESTORE 0 + pop r12 + %endmacro +%endif + +;;; gf_4vect_mad_sse(len, vec, vec_i, mul_array, src, dest) +%define len arg0 +%define len.w arg0.w +%define vec arg1 +%define vec_i arg2 +%define mul_array arg3 +%define src arg4 +%define dest1 arg5 +%define pos return +%define pos.w return.w + +%define dest2 mul_array +%define dest3 tmp2 +%define dest4 vec_i + +%ifndef EC_ALIGNED_ADDR +;;; Use Un-aligned load/store + %define XLDR movdqu + %define XSTR movdqu +%else +;;; Use Non-temporal load/stor + %ifdef NO_NT_LDST + %define XLDR movdqa + %define XSTR movdqa + %else + %define XLDR movntdqa + %define XSTR movntdq + %endif +%endif + +default rel + +[bits 64] +section .text + +%define xmask0f xmm15 +%define xgft3_hi xmm14 +%define xgft4_hi xmm13 +%define xgft4_lo xmm12 + +%define x0 xmm0 +%define xtmpa xmm1 +%define xtmph1 xmm2 +%define xtmpl1 xmm3 +%define xtmph2 xmm4 +%define xtmpl2 xmm5 +%define xtmph3 xmm6 +%define xtmpl3 xmm7 +%define xtmph4 xmm8 +%define xtmpl4 xmm9 +%define xd1 xmm10 +%define xd2 xmm11 +%define xd3 xtmph1 +%define xd4 xtmpl1 + +align 16 +global gf_4vect_mad_sse:function +func(gf_4vect_mad_sse) + FUNC_SAVE + sub len, 16 + jl .return_fail + xor pos, pos + movdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte + mov tmp, vec + + sal vec_i, 5 ;Multiply by 32 + lea tmp3, [mul_array + vec_i] + + sal tmp, 6 ;Multiply by 64 + + movdqu xgft3_hi, [tmp3+tmp+16] ; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0} + sal vec, 5 ;Multiply by 32 + add tmp, vec + movdqu xgft4_lo, [tmp3+tmp] ;Load array Dx{00}, Dx{01}, Dx{02}, ... + movdqu xgft4_hi, [tmp3+tmp+16] ; " Dx{00}, Dx{10}, Dx{20}, ... , Dx{f0} + + mov dest2, [dest1+PS] ; reuse mul_array + mov dest3, [dest1+2*PS] + mov dest4, [dest1+3*PS] ; reuse vec_i + mov dest1, [dest1] + +.loop16: + XLDR x0, [src+pos] ;Get next source vector + movdqu xtmph1, [tmp3+16] ; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0} + movdqu xtmpl1, [tmp3] ;Load array Ax{00}, Ax{01}, Ax{02}, ... + movdqu xtmph2, [tmp3+vec+16] ; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0} + movdqu xtmpl2, [tmp3+vec] ;Load array Bx{00}, Bx{01}, Bx{02}, ... + movdqu xtmpl3, [tmp3+2*vec] ;Load array Cx{00}, Cx{01}, Cx{02}, ... + + movdqa xtmph3, xgft3_hi + movdqa xtmpl4, xgft4_lo + movdqa xtmph4, xgft4_hi + + XLDR xd1, [dest1+pos] ;Get next dest vector + XLDR xd2, [dest2+pos] ;Get next dest vector + + movdqa xtmpa, x0 ;Keep unshifted copy of src + psraw x0, 4 ;Shift to put high nibble into bits 4-0 + pand x0, xmask0f ;Mask high src nibble in bits 4-0 + pand xtmpa, xmask0f ;Mask low src nibble in bits 4-0 + + ; dest1 + pshufb xtmph1, x0 ;Lookup mul table of high nibble + pshufb xtmpl1, xtmpa ;Lookup mul table of low nibble + pxor xtmph1, xtmpl1 ;GF add high and low partials + pxor xd1, xtmph1 + + XLDR xd3, [dest3+pos] ;Reuse xtmph1, Get next dest vector + XLDR xd4, [dest4+pos] ;Reuse xtmpl1, Get next dest vector + + ; dest2 + pshufb xtmph2, x0 ;Lookup mul table of high nibble + pshufb xtmpl2, xtmpa ;Lookup mul table of low nibble + pxor xtmph2, xtmpl2 ;GF add high and low partials + pxor xd2, xtmph2 + + ; dest3 + pshufb xtmph3, x0 ;Lookup mul table of high nibble + pshufb xtmpl3, xtmpa ;Lookup mul table of low nibble + pxor xtmph3, xtmpl3 ;GF add high and low partials + pxor xd3, xtmph3 + + ; dest4 + pshufb xtmph4, x0 ;Lookup mul table of high nibble + pshufb xtmpl4, xtmpa ;Lookup mul table of low nibble + pxor xtmph4, xtmpl4 ;GF add high and low partials + pxor xd4, xtmph4 + + XSTR [dest1+pos], xd1 ;Store result + XSTR [dest2+pos], xd2 ;Store result + XSTR [dest3+pos], xd3 ;Store result + XSTR [dest4+pos], xd4 ;Store result + + add pos, 16 ;Loop on 16 bytes at a time + cmp pos, len + jle .loop16 + + lea tmp, [len + 16] + cmp pos, tmp + je .return_pass + +.lessthan16: + ;; Tail len + ;; Do one more overlap pass + mov tmp, len ;Overlapped offset length-16 + + XLDR x0, [src+tmp] ;Get next source vector + + movdqu xtmph1, [tmp3+16] ; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0} + movdqu xtmpl1, [tmp3] ;Load array Ax{00}, Ax{01}, Ax{02}, ... + movdqu xtmph2, [tmp3+vec+16] ; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0} + movdqu xtmpl2, [tmp3+vec] ;Load array Bx{00}, Bx{01}, Bx{02}, ... + movdqu xtmpl3, [tmp3+2*vec] ;Load array Cx{00}, Cx{01}, Cx{02}, ... + + XLDR xd1, [dest1+tmp] ;Get next dest vector + XLDR xd2, [dest2+tmp] ;Get next dest vector + XLDR xtmph4, [dest3+tmp] ;Reuse xtmph1. Get next dest vector + + sub len, pos + + movdqa xtmpl4, [constip16] ;Load const of i + 16 + pinsrb xtmph3, len.w, 15 + pshufb xtmph3, xmask0f ;Broadcast len to all bytes + pcmpgtb xtmph3, xtmpl4 + + XLDR xtmpl4, [dest4+tmp] ;Get next dest vector + + movdqa xtmpa, x0 ;Keep unshifted copy of src + psraw x0, 4 ;Shift to put high nibble into bits 4-0 + pand x0, xmask0f ;Mask high src nibble in bits 4-0 + pand xtmpa, xmask0f ;Mask low src nibble in bits 4-0 + + ; dest1 + pshufb xtmph1, x0 ;Lookup mul table of high nibble + pshufb xtmpl1, xtmpa ;Lookup mul table of low nibble + pxor xtmph1, xtmpl1 ;GF add high and low partials + pand xtmph1, xtmph3 + pxor xd1, xtmph1 + + ; dest2 + pshufb xtmph2, x0 ;Lookup mul table of high nibble + pshufb xtmpl2, xtmpa ;Lookup mul table of low nibble + pxor xtmph2, xtmpl2 ;GF add high and low partials + pand xtmph2, xtmph3 + pxor xd2, xtmph2 + + ; dest3 + pshufb xgft3_hi, x0 ;Lookup mul table of high nibble + pshufb xtmpl3, xtmpa ;Lookup mul table of low nibble + pxor xgft3_hi, xtmpl3 ;GF add high and low partials + pand xgft3_hi, xtmph3 + pxor xtmph4, xgft3_hi + + ; dest4 + pshufb xgft4_hi, x0 ;Lookup mul table of high nibble + pshufb xgft4_lo, xtmpa ;Lookup mul table of low nibble + pxor xgft4_hi, xgft4_lo ;GF add high and low partials + pand xgft4_hi, xtmph3 + pxor xtmpl4, xgft4_hi + + XSTR [dest1+tmp], xd1 ;Store result + XSTR [dest2+tmp], xd2 ;Store result + XSTR [dest3+tmp], xtmph4 ;Store result + XSTR [dest4+tmp], xtmpl4 ;Store result + +.return_pass: + FUNC_RESTORE + mov return, 0 + ret + +.return_fail: + FUNC_RESTORE + mov return, 1 + ret + +endproc_frame + +section .data + +align 16 + +mask0f: + ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f +constip16: + ddq 0xf0f1f2f3f4f5f6f7f8f9fafbfcfdfeff + +%macro slversion 4 +global %1_slver_%2%3%4 +global %1_slver +%1_slver: +%1_slver_%2%3%4: + dw 0x%4 + db 0x%3, 0x%2 +%endmacro +;;; func core, ver, snum +slversion gf_4vect_mad_sse, 00, 00, 0209 diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_5vect_dot_prod_avx.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_5vect_dot_prod_avx.asm.s index a5625659713..1ef451f2006 100644 --- a/src/erasure-code/isa/isa-l/erasure_code/gf_5vect_dot_prod_avx.asm.s +++ b/src/erasure-code/isa/isa-l/erasure_code/gf_5vect_dot_prod_avx.asm.s @@ -1,5 +1,5 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; Copyright(c) 2011-2014 Intel Corporation All rights reserved. +; Copyright(c) 2011-2015 Intel Corporation All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions @@ -30,8 +30,6 @@ ;;; ;;; gf_5vect_dot_prod_avx(len, vec, *g_tbls, **buffs, **dests); ;;; -;;; Author: Gregory Tucker - %ifidn __OUTPUT_FORMAT__, elf64 %define arg0 rdi @@ -309,5 +307,3 @@ global %1_slver %endmacro ;;; func core, ver, snum slversion gf_5vect_dot_prod_avx, 02, 03, 0194 -; inform linker that this doesn't require executable stack -section .note.GNU-stack noalloc noexec nowrite progbits diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_5vect_dot_prod_avx2.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_5vect_dot_prod_avx2.asm.s index 7f25c1622bb..a7a41c2d568 100644 --- a/src/erasure-code/isa/isa-l/erasure_code/gf_5vect_dot_prod_avx2.asm.s +++ b/src/erasure-code/isa/isa-l/erasure_code/gf_5vect_dot_prod_avx2.asm.s @@ -1,5 +1,5 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; Copyright(c) 2011-2014 Intel Corporation All rights reserved. +; Copyright(c) 2011-2015 Intel Corporation All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions @@ -30,8 +30,6 @@ ;;; ;;; gf_5vect_dot_prod_avx2(len, vec, *g_tbls, **buffs, **dests); ;;; -;;; Author: Gregory Tucker - %ifidn __OUTPUT_FORMAT__, elf64 %define arg0 rdi @@ -321,5 +319,3 @@ global %1_slver %endmacro ;;; func core, ver, snum slversion gf_5vect_dot_prod_avx2, 04, 03, 0199 -; inform linker that this doesn't require executable stack -section .note.GNU-stack noalloc noexec nowrite progbits diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_5vect_dot_prod_sse.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_5vect_dot_prod_sse.asm.s index 003ad261424..6264db60d0c 100644 --- a/src/erasure-code/isa/isa-l/erasure_code/gf_5vect_dot_prod_sse.asm.s +++ b/src/erasure-code/isa/isa-l/erasure_code/gf_5vect_dot_prod_sse.asm.s @@ -1,5 +1,5 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; Copyright(c) 2011-2014 Intel Corporation All rights reserved. +; Copyright(c) 2011-2015 Intel Corporation All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions @@ -30,8 +30,6 @@ ;;; ;;; gf_5vect_dot_prod_sse(len, vec, *g_tbls, **buffs, **dests); ;;; -;;; Author: Gregory Tucker - %ifidn __OUTPUT_FORMAT__, elf64 %define arg0 rdi @@ -165,23 +163,23 @@ default rel section .text %define xmask0f xmm15 -%define xgft1_lo xmm14 -%define xgft1_hi xmm13 -%define xgft2_lo xmm12 -%define xgft2_hi xmm11 +%define xgft1_lo xmm2 +%define xgft1_hi xmm3 +%define xgft2_lo xmm4 +%define xgft2_hi xmm5 %define xgft3_lo xmm10 -%define xgft3_hi xmm9 +%define xgft3_hi xmm6 %define xgft4_lo xmm8 %define xgft4_hi xmm7 %define x0 xmm0 %define xtmpa xmm1 -%define xp1 xmm2 -%define xp2 xmm3 -%define xp3 xmm4 -%define xp4 xmm5 -%define xp5 xmm6 +%define xp1 xmm9 +%define xp2 xmm11 +%define xp3 xmm12 +%define xp4 xmm13 +%define xp5 xmm14 align 16 global gf_5vect_dot_prod_sse:function @@ -309,6 +307,4 @@ global %1_slver db 0x%3, 0x%2 %endmacro ;;; func core, ver, snum -; inform linker that this doesn't require executable stack -section .note.GNU-stack noalloc noexec nowrite progbits -slversion gf_5vect_dot_prod_sse, 00, 03, 0065 +slversion gf_5vect_dot_prod_sse, 00, 04, 0065 diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_5vect_mad_avx.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_5vect_mad_avx.asm.s new file mode 100644 index 00000000000..4660a352278 --- /dev/null +++ b/src/erasure-code/isa/isa-l/erasure_code/gf_5vect_mad_avx.asm.s @@ -0,0 +1,371 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2015 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;; +;;; gf_5vect_mad_avx(len, vec, vec_i, mul_array, src, dest); +;;; + +%define PS 8 + +%ifidn __OUTPUT_FORMAT__, win64 + %define arg0 rcx + %define arg0.w ecx + %define arg1 rdx + %define arg2 r8 + %define arg3 r9 + %define arg4 r12 + %define arg5 r15 + %define tmp r11 + %define tmp2 r10 + %define tmp3 r13 + %define tmp4 r14 + %define return rax + %define return.w eax + %define stack_size 16*10 + 5*8 + %define arg(x) [rsp + stack_size + PS + PS*x] + %define func(x) proc_frame x + +%macro FUNC_SAVE 0 + sub rsp, stack_size + movdqa [rsp+16*0],xmm6 + movdqa [rsp+16*1],xmm7 + movdqa [rsp+16*2],xmm8 + movdqa [rsp+16*3],xmm9 + movdqa [rsp+16*4],xmm10 + movdqa [rsp+16*5],xmm11 + movdqa [rsp+16*6],xmm12 + movdqa [rsp+16*7],xmm13 + movdqa [rsp+16*8],xmm14 + movdqa [rsp+16*9],xmm15 + save_reg r12, 10*16 + 0*8 + save_reg r13, 10*16 + 1*8 + save_reg r14, 10*16 + 2*8 + save_reg r15, 10*16 + 3*8 + end_prolog + mov arg4, arg(4) + mov arg5, arg(5) +%endmacro + +%macro FUNC_RESTORE 0 + movdqa xmm6, [rsp+16*0] + movdqa xmm7, [rsp+16*1] + movdqa xmm8, [rsp+16*2] + movdqa xmm9, [rsp+16*3] + movdqa xmm10, [rsp+16*4] + movdqa xmm11, [rsp+16*5] + movdqa xmm12, [rsp+16*6] + movdqa xmm13, [rsp+16*7] + movdqa xmm14, [rsp+16*8] + movdqa xmm15, [rsp+16*9] + mov r12, [rsp + 10*16 + 0*8] + mov r13, [rsp + 10*16 + 1*8] + mov r14, [rsp + 10*16 + 2*8] + mov r15, [rsp + 10*16 + 3*8] + add rsp, stack_size +%endmacro + +%elifidn __OUTPUT_FORMAT__, elf64 + %define arg0 rdi + %define arg0.w edi + %define arg1 rsi + %define arg2 rdx + %define arg3 rcx + %define arg4 r8 + %define arg5 r9 + %define tmp r11 + %define tmp2 r10 + %define tmp3 r12 + %define tmp4 r13 + %define return rax + %define return.w eax + + %define func(x) x: + %macro FUNC_SAVE 0 + push r12 + push r13 + %endmacro + %macro FUNC_RESTORE 0 + pop r13 + pop r12 + %endmacro +%endif + +;;; gf_5vect_mad_avx(len, vec, vec_i, mul_array, src, dest) +%define len arg0 +%define len.w arg0.w +%define vec arg1 +%define vec_i arg2 +%define mul_array arg3 +%define src arg4 +%define dest1 arg5 +%define pos return +%define pos.w return.w + +%define dest2 tmp4 +%define dest3 mul_array +%define dest4 tmp2 +%define dest5 vec_i + + +%ifndef EC_ALIGNED_ADDR +;;; Use Un-aligned load/store + %define XLDR vmovdqu + %define XSTR vmovdqu +%else +;;; Use Non-temporal load/stor + %ifdef NO_NT_LDST + %define XLDR vmovdqa + %define XSTR vmovdqa + %else + %define XLDR vmovntdqa + %define XSTR vmovntdq + %endif +%endif + +default rel + +[bits 64] +section .text + +%define xmask0f xmm15 +%define xgft5_hi xmm14 +%define xgft4_lo xmm13 +%define xgft4_hi xmm12 + +%define x0 xmm0 +%define xtmpa xmm1 +%define xtmph1 xmm2 +%define xtmpl1 xmm3 +%define xtmph2 xmm4 +%define xtmpl2 xmm5 +%define xtmph3 xmm6 +%define xtmpl3 xmm7 +%define xtmph5 xmm8 +%define xtmpl5 xmm9 +%define xd1 xmm10 +%define xd2 xmm11 +%define xd3 xtmpl1 +%define xd4 xtmph1 +%define xd5 xtmpl2 + + +align 16 +global gf_5vect_mad_avx:function +func(gf_5vect_mad_avx) + FUNC_SAVE + sub len, 16 + jl .return_fail + xor pos, pos + vmovdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte + mov tmp, vec + sal vec_i, 5 ;Multiply by 32 + lea tmp3, [mul_array + vec_i] + sal tmp, 6 ;Multiply by 64 + vmovdqu xgft5_hi, [tmp3+2*tmp+16] ; " Ex{00}, Ex{10}, ..., Ex{f0} + sal vec, 5 ;Multiply by 32 + add tmp, vec + vmovdqu xgft4_hi, [tmp3+tmp+16] ; " Dx{00}, Dx{10}, Dx{20}, ... , Dx{f0} + vmovdqu xgft4_lo, [tmp3+tmp] ;Load array Dx{00}, Dx{01}, Dx{02}, ... + + mov dest3, [dest1+2*PS] ; reuse mul_array + mov dest4, [dest1+3*PS] + mov dest5, [dest1+4*PS] ; reuse vec_i + mov dest2, [dest1+PS] + mov dest1, [dest1] + +.loop16: + XLDR x0, [src+pos] ;Get next source vector + + vmovdqu xtmph1, [tmp3+16] ; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0} + vmovdqu xtmpl1, [tmp3] ;Load array Ax{00}, Ax{01}, Ax{02}, ... + vmovdqu xtmph2, [tmp3+vec+16] ; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0} + vmovdqu xtmpl2, [tmp3+vec] ;Load array Bx{00}, Bx{01}, Bx{02}, ... + vmovdqu xtmph3, [tmp3+2*vec+16] ; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0} + vmovdqu xtmpl3, [tmp3+2*vec] ;Load array Cx{00}, Cx{01}, Cx{02}, ... + vmovdqu xtmpl5, [tmp3+4*vec] ;Load array Ex{00}, Ex{01}, ..., Ex{0f} + + XLDR xd1, [dest1+pos] ;Get next dest vector + XLDR xd2, [dest2+pos] ;Get next dest vector + + vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0 + vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0 + vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0 + + ; dest1 + vpshufb xtmph1, xtmph1, x0 ;Lookup mul table of high nibble + vpshufb xtmpl1, xtmpl1, xtmpa ;Lookup mul table of low nibble + vpxor xtmph1, xtmph1, xtmpl1 ;GF add high and low partials + vpxor xd1, xd1, xtmph1 + + XLDR xd3, [dest3+pos] ;Reuse xtmpl1, Get next dest vector + XLDR xd4, [dest4+pos] ;Reuse xtmph1, Get next dest vector + + ; dest2 + vpshufb xtmph2, xtmph2, x0 ;Lookup mul table of high nibble + vpshufb xtmpl2, xtmpl2, xtmpa ;Lookup mul table of low nibble + vpxor xtmph2, xtmph2, xtmpl2 ;GF add high and low partials + vpxor xd2, xd2, xtmph2 + + XLDR xd5, [dest5+pos] ;Reuse xtmpl2. Get next dest vector + + ; dest3 + vpshufb xtmph3, xtmph3, x0 ;Lookup mul table of high nibble + vpshufb xtmpl3, xtmpl3, xtmpa ;Lookup mul table of low nibble + vpxor xtmph3, xtmph3, xtmpl3 ;GF add high and low partials + vpxor xd3, xd3, xtmph3 + + ; dest4 + vpshufb xtmph2, xgft4_hi, x0 ;Lookup mul table of high nibble + vpshufb xtmpl3, xgft4_lo, xtmpa ;Lookup mul table of low nibble + vpxor xtmph2, xtmph2, xtmpl3 ;GF add high and low partials + vpxor xd4, xd4, xtmph2 + + ; dest5 + vpshufb xtmph5, xgft5_hi, x0 ;Lookup mul table of high nibble + vpshufb xtmpl5, xtmpl5, xtmpa ;Lookup mul table of low nibble + vpxor xtmph5, xtmph5, xtmpl5 ;GF add high and low partials + vpxor xd5, xd5, xtmph5 + + XSTR [dest1+pos], xd1 ;Store result into dest1 + XSTR [dest2+pos], xd2 ;Store result into dest2 + XSTR [dest3+pos], xd3 ;Store result into dest3 + XSTR [dest4+pos], xd4 ;Store result into dest4 + XSTR [dest5+pos], xd5 ;Store result into dest5 + + add pos, 16 ;Loop on 16 bytes at a time + cmp pos, len + jle .loop16 + + lea tmp, [len + 16] + cmp pos, tmp + je .return_pass + +.lessthan16: + ;; Tail len + ;; Do one more overlap pass + mov tmp, len ;Overlapped offset length-16 + XLDR x0, [src+tmp] ;Get next source vector + + sub len, pos + + vmovdqa xtmph1, [constip16] ;Load const of i + 16 + vpinsrb xtmph5, len.w, 15 + vpshufb xtmph5, xmask0f ;Broadcast len to all bytes + vpcmpgtb xtmph5, xtmph5, xtmph1 + + vmovdqu xtmph1, [tmp3+16] ; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0} + vmovdqu xtmpl1, [tmp3] ;Load array Ax{00}, Ax{01}, Ax{02}, ... + vmovdqu xtmph2, [tmp3+vec+16] ; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0} + vmovdqu xtmpl2, [tmp3+vec] ;Load array Bx{00}, Bx{01}, Bx{02}, ... + vmovdqu xtmph3, [tmp3+2*vec+16] ; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0} + vmovdqu xtmpl3, [tmp3+2*vec] ;Load array Cx{00}, Cx{01}, Cx{02}, ... + vmovdqu xtmpl5, [tmp3+4*vec] ;Load array Ex{00}, Ex{01}, ..., Ex{0f} + + XLDR xd1, [dest1+tmp] ;Get next dest vector + XLDR xd2, [dest2+tmp] ;Get next dest vector + + vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0 + vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0 + vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0 + + ; dest1 + vpshufb xtmph1, xtmph1, x0 ;Lookup mul table of high nibble + vpshufb xtmpl1, xtmpl1, xtmpa ;Lookup mul table of low nibble + vpxor xtmph1, xtmph1, xtmpl1 ;GF add high and low partials + vpand xtmph1, xtmph1, xtmph5 + vpxor xd1, xd1, xtmph1 + + XLDR xd3, [dest3+tmp] ;Reuse xtmpl1, Get next dest vector + XLDR xd4, [dest4+tmp] ;Reuse xtmph1, Get next dest vector + + ; dest2 + vpshufb xtmph2, xtmph2, x0 ;Lookup mul table of high nibble + vpshufb xtmpl2, xtmpl2, xtmpa ;Lookup mul table of low nibble + vpxor xtmph2, xtmph2, xtmpl2 ;GF add high and low partials + vpand xtmph2, xtmph2, xtmph5 + vpxor xd2, xd2, xtmph2 + + XLDR xd5, [dest5+tmp] ;Reuse xtmpl2. Get next dest vector + + ; dest3 + vpshufb xtmph3, xtmph3, x0 ;Lookup mul table of high nibble + vpshufb xtmpl3, xtmpl3, xtmpa ;Lookup mul table of low nibble + vpxor xtmph3, xtmph3, xtmpl3 ;GF add high and low partials + vpand xtmph3, xtmph3, xtmph5 + vpxor xd3, xd3, xtmph3 + + ; dest4 + vpshufb xgft4_hi, xgft4_hi, x0 ;Lookup mul table of high nibble + vpshufb xgft4_lo, xgft4_lo, xtmpa ;Lookup mul table of low nibble + vpxor xgft4_hi, xgft4_hi, xgft4_lo ;GF add high and low partials + vpand xgft4_hi, xgft4_hi, xtmph5 + vpxor xd4, xd4, xgft4_hi + + ; dest5 + vpshufb xgft5_hi, xgft5_hi, x0 ;Lookup mul table of high nibble + vpshufb xtmpl5, xtmpl5, xtmpa ;Lookup mul table of low nibble + vpxor xgft5_hi, xgft5_hi, xtmpl5 ;GF add high and low partials + vpand xgft5_hi, xgft5_hi, xtmph5 + vpxor xd5, xd5, xgft5_hi + + XSTR [dest1+tmp], xd1 ;Store result into dest1 + XSTR [dest2+tmp], xd2 ;Store result into dest2 + XSTR [dest3+tmp], xd3 ;Store result into dest3 + XSTR [dest4+tmp], xd4 ;Store result into dest4 + XSTR [dest5+tmp], xd5 ;Store result into dest5 + +.return_pass: + FUNC_RESTORE + mov return, 0 + ret + +.return_fail: + FUNC_RESTORE + mov return, 1 + ret + +endproc_frame + +section .data + +align 16 +mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f +constip16: + ddq 0xf0f1f2f3f4f5f6f7f8f9fafbfcfdfeff + +%macro slversion 4 +global %1_slver_%2%3%4 +global %1_slver +%1_slver: +%1_slver_%2%3%4: + dw 0x%4 + db 0x%3, 0x%2 +%endmacro +;;; func core, ver, snum +slversion gf_5vect_mad_avx, 02, 00, 020d diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_5vect_mad_avx2.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_5vect_mad_avx2.asm.s new file mode 100644 index 00000000000..db84189af89 --- /dev/null +++ b/src/erasure-code/isa/isa-l/erasure_code/gf_5vect_mad_avx2.asm.s @@ -0,0 +1,369 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2015 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;; +;;; gf_5vect_mad_avx2(len, vec, vec_i, mul_array, src, dest); +;;; + +%define PS 8 + +%ifidn __OUTPUT_FORMAT__, win64 + %define arg0 rcx + %define arg0.w ecx + %define arg1 rdx + %define arg2 r8 + %define arg3 r9 + %define arg4 r12 + %define arg5 r15 + %define tmp r11 + %define tmp.w r11d + %define tmp.b r11b + %define tmp2 r10 + %define return rax + %define return.w eax + %define stack_size 16*10 + 3*8 + %define arg(x) [rsp + stack_size + PS + PS*x] + %define func(x) proc_frame x + +%macro FUNC_SAVE 0 + sub rsp, stack_size + movdqa [rsp+16*0],xmm6 + movdqa [rsp+16*1],xmm7 + movdqa [rsp+16*2],xmm8 + movdqa [rsp+16*3],xmm9 + movdqa [rsp+16*4],xmm10 + movdqa [rsp+16*5],xmm11 + movdqa [rsp+16*6],xmm12 + movdqa [rsp+16*7],xmm13 + movdqa [rsp+16*8],xmm14 + movdqa [rsp+16*9],xmm15 + save_reg r12, 10*16 + 0*8 + save_reg r15, 10*16 + 1*8 + end_prolog + mov arg4, arg(4) + mov arg5, arg(5) +%endmacro + +%macro FUNC_RESTORE 0 + movdqa xmm6, [rsp+16*0] + movdqa xmm7, [rsp+16*1] + movdqa xmm8, [rsp+16*2] + movdqa xmm9, [rsp+16*3] + movdqa xmm10, [rsp+16*4] + movdqa xmm11, [rsp+16*5] + movdqa xmm12, [rsp+16*6] + movdqa xmm13, [rsp+16*7] + movdqa xmm14, [rsp+16*8] + movdqa xmm15, [rsp+16*9] + mov r12, [rsp + 10*16 + 0*8] + mov r15, [rsp + 10*16 + 1*8] + add rsp, stack_size +%endmacro + +%elifidn __OUTPUT_FORMAT__, elf64 + %define arg0 rdi + %define arg0.w edi + %define arg1 rsi + %define arg2 rdx + %define arg3 rcx + %define arg4 r8 + %define arg5 r9 + %define tmp r11 + %define tmp.w r11d + %define tmp.b r11b + %define tmp2 r10 + %define return rax + %define return.w eax + + %define func(x) x: + %define FUNC_SAVE + %define FUNC_RESTORE +%endif + +;;; gf_5vect_mad_avx2(len, vec, vec_i, mul_array, src, dest) +%define len arg0 +%define len.w arg0.w +%define vec arg1 +%define vec_i arg2 +%define mul_array arg3 +%define src arg4 +%define dest1 arg5 +%define pos return +%define pos.w return.w + +%define dest2 tmp2 +%define dest3 mul_array +%define dest4 vec +%define dest5 vec_i + +%ifndef EC_ALIGNED_ADDR +;;; Use Un-aligned load/store + %define XLDR vmovdqu + %define XSTR vmovdqu +%else +;;; Use Non-temporal load/stor + %ifdef NO_NT_LDST + %define XLDR vmovdqa + %define XSTR vmovdqa + %else + %define XLDR vmovntdqa + %define XSTR vmovntdq + %endif +%endif + +default rel + +[bits 64] +section .text + +%define xmask0f ymm15 +%define xmask0fx xmm15 +%define xgft1_lo ymm14 +%define xgft2_lo ymm13 +%define xgft3_lo ymm12 +%define xgft4_lo ymm11 +%define xgft5_lo ymm10 + +%define x0 ymm0 +%define xtmpa ymm1 +%define xtmpl ymm2 +%define xtmplx xmm2 +%define xtmph1 ymm3 +%define xtmph1x xmm3 +%define xtmph2 ymm4 +%define xd1 ymm5 +%define xd2 ymm6 +%define xd3 ymm7 +%define xd4 ymm8 +%define xd5 ymm9 + +align 16 +global gf_5vect_mad_avx2:function +func(gf_5vect_mad_avx2) + FUNC_SAVE + sub len, 32 + jl .return_fail + xor pos, pos + mov tmp.b, 0x0f + vpinsrb xmask0fx, xmask0fx, tmp.w, 0 + vpbroadcastb xmask0f, xmask0fx ;Construct mask 0x0f0f0f... + + sal vec_i, 5 ;Multiply by 32 + sal vec, 5 ;Multiply by 32 + lea tmp, [mul_array + vec_i] + + vmovdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, ..., Ax{0f} + ; " Ax{00}, Ax{10}, ..., Ax{f0} + vmovdqu xgft2_lo, [tmp+vec] ;Load array Bx{00}, Bx{01}, ..., Bx{0f} + ; " Bx{00}, Bx{10}, ..., Bx{f0} + vmovdqu xgft3_lo, [tmp+2*vec] ;Load array Cx{00}, Cx{01}, ..., Cx{0f} + ; " Cx{00}, Cx{10}, ..., Cx{f0} + vmovdqu xgft5_lo, [tmp+4*vec] ;Load array Ex{00}, Ex{01}, ..., Ex{0f} + ; " Ex{00}, Ex{10}, ..., Ex{f0} + add tmp, vec + vmovdqu xgft4_lo, [tmp+2*vec] ;Load array Dx{00}, Dx{01}, ..., Dx{0f} + ; " Dx{00}, Dx{10}, ..., Dx{f0} + + mov dest3, [dest1+2*PS] ; reuse mul_array + mov dest4, [dest1+3*PS] ; reuse vec + mov dest5, [dest1+4*PS] ; reuse vec_i + mov dest2, [dest1+PS] + mov dest1, [dest1] + +.loop32: + XLDR x0, [src+pos] ;Get next source vector + + XLDR xd1, [dest1+pos] ;Get next dest vector + XLDR xd2, [dest2+pos] ;Get next dest vector + XLDR xd3, [dest3+pos] ;Get next dest vector + XLDR xd4, [dest4+pos] ;Get next dest vector + XLDR xd5, [dest5+pos] ;Get next dest vector + + vpand xtmpl, x0, xmask0f ;Mask low src nibble in bits 4-0 + vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0 + vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0 + vperm2i128 xtmpa, xtmpl, x0, 0x30 ;swap xtmpa from 1lo|2lo to 1lo|2hi + vperm2i128 x0, xtmpl, x0, 0x12 ;swap x0 from 1hi|2hi to 1hi|2lo + + vperm2i128 xtmph1, xgft1_lo, xgft1_lo, 0x01 ; swapped to hi | lo + vperm2i128 xtmph2, xgft2_lo, xgft2_lo, 0x01 ; swapped to hi | lo + + ; dest1 + vpshufb xtmph1, xtmph1, x0 ;Lookup mul table of high nibble + vpshufb xtmpl, xgft1_lo, xtmpa ;Lookup mul table of low nibble + vpxor xtmph1, xtmph1, xtmpl ;GF add high and low partials + vpxor xd1, xd1, xtmph1 ;xd1 += partial + + vperm2i128 xtmph1, xgft3_lo, xgft3_lo, 0x01 ; swapped to hi | lo + ; dest2 + vpshufb xtmph2, xtmph2, x0 ;Lookup mul table of high nibble + vpshufb xtmpl, xgft2_lo, xtmpa ;Lookup mul table of low nibble + vpxor xtmph2, xtmph2, xtmpl ;GF add high and low partials + vpxor xd2, xd2, xtmph2 ;xd2 += partial + + vperm2i128 xtmph2, xgft4_lo, xgft4_lo, 0x01 ; swapped to hi | lo + ; dest3 + vpshufb xtmph1, xtmph1, x0 ;Lookup mul table of high nibble + vpshufb xtmpl, xgft3_lo, xtmpa ;Lookup mul table of low nibble + vpxor xtmph1, xtmph1, xtmpl ;GF add high and low partials + vpxor xd3, xd3, xtmph1 ;xd3 += partial + + vperm2i128 xtmph1, xgft5_lo, xgft5_lo, 0x01 ; swapped to hi | lo + ; dest4 + vpshufb xtmph2, xtmph2, x0 ;Lookup mul table of high nibble + vpshufb xtmpl, xgft4_lo, xtmpa ;Lookup mul table of low nibble + vpxor xtmph2, xtmph2, xtmpl ;GF add high and low partials + vpxor xd4, xd4, xtmph2 ;xd4 += partial + + ; dest5 + vpshufb xtmph1, xtmph1, x0 ;Lookup mul table of high nibble + vpshufb xtmpl, xgft5_lo, xtmpa ;Lookup mul table of low nibble + vpxor xtmph1, xtmph1, xtmpl ;GF add high and low partials + vpxor xd5, xd5, xtmph1 ;xd5 += partial + + XSTR [dest1+pos], xd1 + XSTR [dest2+pos], xd2 + XSTR [dest3+pos], xd3 + XSTR [dest4+pos], xd4 + XSTR [dest5+pos], xd5 + + add pos, 32 ;Loop on 32 bytes at a time + cmp pos, len + jle .loop32 + + lea tmp, [len + 32] + cmp pos, tmp + je .return_pass + +.lessthan32: + ;; Tail len + ;; Do one more overlap pass + mov tmp.b, 0x1f + vpinsrb xtmph1x, xtmph1x, tmp.w, 0 + vpbroadcastb xtmph1, xtmph1x ;Construct mask 0x1f1f1f... + + mov tmp, len ;Overlapped offset length-32 + + XLDR x0, [src+tmp] ;Get next source vector + + XLDR xd1, [dest1+tmp] ;Get next dest vector + XLDR xd2, [dest2+tmp] ;Get next dest vector + XLDR xd3, [dest3+tmp] ;Get next dest vector + XLDR xd4, [dest4+tmp] ;Get next dest vector + XLDR xd5, [dest5+tmp] ;Get next dest vector + + sub len, pos + + vmovdqa xtmph2, [constip32] ;Load const of i + 32 + vpinsrb xtmplx, xtmplx, len.w, 15 + vinserti128 xtmpl, xtmpl, xtmplx, 1 ;swapped to xtmplx | xtmplx + vpshufb xtmpl, xtmpl, xtmph1 ;Broadcast len to all bytes. xtmph1=0x1f1f1f... + vpcmpgtb xtmpl, xtmpl, xtmph2 + + vpand xtmph1, x0, xmask0f ;Mask low src nibble in bits 4-0 + vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0 + vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0 + vperm2i128 xtmpa, xtmph1, x0, 0x30 ;swap xtmpa from 1lo|2lo to 1lo|2hi + vperm2i128 x0, xtmph1, x0, 0x12 ;swap x0 from 1hi|2hi to 1hi|2lo + + vperm2i128 xtmph1, xgft1_lo, xgft1_lo, 0x01 ; swapped to hi | lo + vperm2i128 xtmph2, xgft2_lo, xgft2_lo, 0x01 ; swapped to hi | lo + + ; dest1 + vpshufb xtmph1, xtmph1, x0 ;Lookup mul table of high nibble + vpshufb xgft1_lo, xgft1_lo, xtmpa ;Lookup mul table of low nibble + vpxor xtmph1, xtmph1, xgft1_lo ;GF add high and low partials + vpand xtmph1, xtmph1, xtmpl + vpxor xd1, xd1, xtmph1 ;xd1 += partial + + vperm2i128 xtmph1, xgft3_lo, xgft3_lo, 0x01 ; swapped to hi | lo + ; dest2 + vpshufb xtmph2, xtmph2, x0 ;Lookup mul table of high nibble + vpshufb xgft2_lo, xgft2_lo, xtmpa ;Lookup mul table of low nibble + vpxor xtmph2, xtmph2, xgft2_lo ;GF add high and low partials + vpand xtmph2, xtmph2, xtmpl + vpxor xd2, xd2, xtmph2 ;xd2 += partial + + vperm2i128 xtmph2, xgft4_lo, xgft4_lo, 0x01 ; swapped to hi | lo + ; dest3 + vpshufb xtmph1, xtmph1, x0 ;Lookup mul table of high nibble + vpshufb xgft3_lo, xgft3_lo, xtmpa ;Lookup mul table of low nibble + vpxor xtmph1, xtmph1, xgft3_lo ;GF add high and low partials + vpand xtmph1, xtmph1, xtmpl + vpxor xd3, xd3, xtmph1 ;xd3 += partial + + vperm2i128 xtmph1, xgft5_lo, xgft5_lo, 0x01 ; swapped to hi | lo + ; dest4 + vpshufb xtmph2, xtmph2, x0 ;Lookup mul table of high nibble + vpshufb xgft4_lo, xgft4_lo, xtmpa ;Lookup mul table of low nibble + vpxor xtmph2, xtmph2, xgft4_lo ;GF add high and low partials + vpand xtmph2, xtmph2, xtmpl + vpxor xd4, xd4, xtmph2 ;xd4 += partial + + ; dest5 + vpshufb xtmph1, xtmph1, x0 ;Lookup mul table of high nibble + vpshufb xgft5_lo, xgft5_lo, xtmpa ;Lookup mul table of low nibble + vpxor xtmph1, xtmph1, xgft5_lo ;GF add high and low partials + vpand xtmph1, xtmph1, xtmpl + vpxor xd5, xd5, xtmph1 ;xd5 += partial + + XSTR [dest1+tmp], xd1 + XSTR [dest2+tmp], xd2 + XSTR [dest3+tmp], xd3 + XSTR [dest4+tmp], xd4 + XSTR [dest5+tmp], xd5 + +.return_pass: + FUNC_RESTORE + mov return, 0 + ret + +.return_fail: + FUNC_RESTORE + mov return, 1 + ret + +endproc_frame + +section .data +align 32 +constip32: + ddq 0xf0f1f2f3f4f5f6f7f8f9fafbfcfdfeff + ddq 0xe0e1e2e3e4e5e6e7e8e9eaebecedeeef + +%macro slversion 4 +global %1_slver_%2%3%4 +global %1_slver +%1_slver: +%1_slver_%2%3%4: + dw 0x%4 + db 0x%3, 0x%2 +%endmacro +;;; func core, ver, snum +slversion gf_5vect_mad_avx2, 04, 00, 020e diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_5vect_mad_sse.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_5vect_mad_sse.asm.s new file mode 100644 index 00000000000..615a7f769e0 --- /dev/null +++ b/src/erasure-code/isa/isa-l/erasure_code/gf_5vect_mad_sse.asm.s @@ -0,0 +1,379 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2015 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;; +;;; gf_5vect_mad_sse(len, vec, vec_i, mul_array, src, dest); +;;; + +%define PS 8 + +%ifidn __OUTPUT_FORMAT__, win64 + %define arg0 rcx + %define arg0.w ecx + %define arg1 rdx + %define arg2 r8 + %define arg3 r9 + %define arg4 r12 + %define arg5 r15 + %define tmp r11 + %define tmp2 r10 + %define tmp3 r13 + %define tmp4 r14 + %define return rax + %define return.w eax + %define stack_size 16*10 + 5*8 + %define arg(x) [rsp + stack_size + PS + PS*x] + %define func(x) proc_frame x + +%macro FUNC_SAVE 0 + sub rsp, stack_size + movdqa [rsp+16*0],xmm6 + movdqa [rsp+16*1],xmm7 + movdqa [rsp+16*2],xmm8 + movdqa [rsp+16*3],xmm9 + movdqa [rsp+16*4],xmm10 + movdqa [rsp+16*5],xmm11 + movdqa [rsp+16*6],xmm12 + movdqa [rsp+16*7],xmm13 + movdqa [rsp+16*8],xmm14 + movdqa [rsp+16*9],xmm15 + save_reg r12, 10*16 + 0*8 + save_reg r13, 10*16 + 1*8 + save_reg r14, 10*16 + 2*8 + save_reg r15, 10*16 + 3*8 + end_prolog + mov arg4, arg(4) + mov arg5, arg(5) +%endmacro + +%macro FUNC_RESTORE 0 + movdqa xmm6, [rsp+16*0] + movdqa xmm7, [rsp+16*1] + movdqa xmm8, [rsp+16*2] + movdqa xmm9, [rsp+16*3] + movdqa xmm10, [rsp+16*4] + movdqa xmm11, [rsp+16*5] + movdqa xmm12, [rsp+16*6] + movdqa xmm13, [rsp+16*7] + movdqa xmm14, [rsp+16*8] + movdqa xmm15, [rsp+16*9] + mov r12, [rsp + 10*16 + 0*8] + mov r13, [rsp + 10*16 + 1*8] + mov r14, [rsp + 10*16 + 2*8] + mov r15, [rsp + 10*16 + 3*8] + add rsp, stack_size +%endmacro + +%elifidn __OUTPUT_FORMAT__, elf64 + %define arg0 rdi + %define arg0.w edi + %define arg1 rsi + %define arg2 rdx + %define arg3 rcx + %define arg4 r8 + %define arg5 r9 + %define tmp r11 + %define tmp2 r10 + %define tmp3 r12 + %define tmp4 r13 + %define return rax + %define return.w eax + + %define func(x) x: + %macro FUNC_SAVE 0 + push r12 + push r13 + %endmacro + %macro FUNC_RESTORE 0 + pop r13 + pop r12 + %endmacro +%endif + +;;; gf_5vect_mad_sse(len, vec, vec_i, mul_array, src, dest) +%define len arg0 +%define len.w arg0.w +%define vec arg1 +%define vec_i arg2 +%define mul_array arg3 +%define src arg4 +%define dest1 arg5 +%define pos return +%define pos.w return.w + +%define dest2 tmp4 +%define dest3 mul_array +%define dest4 tmp2 +%define dest5 vec_i + +%ifndef EC_ALIGNED_ADDR +;;; Use Un-aligned load/store + %define XLDR movdqu + %define XSTR movdqu +%else +;;; Use Non-temporal load/stor + %ifdef NO_NT_LDST + %define XLDR movdqa + %define XSTR movdqa + %else + %define XLDR movntdqa + %define XSTR movntdq + %endif +%endif + +default rel + +[bits 64] +section .text + +%define xmask0f xmm15 +%define xgft5_hi xmm14 +%define xgft4_lo xmm13 +%define xgft4_hi xmm12 + +%define x0 xmm0 +%define xtmpa xmm1 +%define xtmph1 xmm2 +%define xtmpl1 xmm3 +%define xtmph2 xmm4 +%define xtmpl2 xmm5 +%define xtmph3 xmm6 +%define xtmpl3 xmm7 +%define xtmph5 xmm8 +%define xtmpl5 xmm9 +%define xd1 xmm10 +%define xd2 xmm11 +%define xd3 xtmpl1 +%define xd4 xtmph1 +%define xd5 xtmpl2 + + +align 16 +global gf_5vect_mad_sse:function +func(gf_5vect_mad_sse) + FUNC_SAVE + sub len, 16 + jl .return_fail + xor pos, pos + + movdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte + mov tmp, vec + sal vec_i, 5 ;Multiply by 32 + lea tmp3, [mul_array + vec_i] + sal tmp, 6 ;Multiply by 64 + movdqu xgft5_hi, [tmp3+2*tmp+16] ; " Ex{00}, Ex{10}, ..., Ex{f0} + sal vec, 5 ;Multiply by 32 + add tmp, vec + movdqu xgft4_hi, [tmp3+tmp+16] ; " Dx{00}, Dx{10}, Dx{20}, ... , Dx{f0} + movdqu xgft4_lo, [tmp3+tmp] ;Load array Dx{00}, Dx{01}, Dx{02}, ... + + mov dest3, [dest1+2*PS] ; reuse mul_array + mov dest4, [dest1+3*PS] + mov dest5, [dest1+4*PS] ; reuse vec_i + mov dest2, [dest1+PS] + mov dest1, [dest1] + +.loop16: + XLDR x0, [src+pos] ;Get next source vector + + movdqu xtmph1, [tmp3+16] ; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0} + movdqu xtmpl1, [tmp3] ;Load array Ax{00}, Ax{01}, Ax{02}, ... + movdqu xtmph2, [tmp3+vec+16] ; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0} + movdqu xtmpl2, [tmp3+vec] ;Load array Bx{00}, Bx{01}, Bx{02}, ... + movdqu xtmph3, [tmp3+2*vec+16] ; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0} + movdqu xtmpl3, [tmp3+2*vec] ;Load array Cx{00}, Cx{01}, Cx{02}, ... + movdqu xtmpl5, [tmp3+4*vec] ;Load array Ex{00}, Ex{01}, ..., Ex{0f} + movdqa xtmph5, xgft5_hi ;Reload const array registers + + XLDR xd1, [dest1+pos] ;Get next dest vector + XLDR xd2, [dest2+pos] ;Get next dest vector + + movdqa xtmpa, x0 ;Keep unshifted copy of src + psraw x0, 4 ;Shift to put high nibble into bits 4-0 + pand x0, xmask0f ;Mask high src nibble in bits 4-0 + pand xtmpa, xmask0f ;Mask low src nibble in bits 4-0 + + ; dest1 + pshufb xtmph1, x0 ;Lookup mul table of high nibble + pshufb xtmpl1, xtmpa ;Lookup mul table of low nibble + pxor xtmph1, xtmpl1 ;GF add high and low partials + pxor xd1, xtmph1 + + XLDR xd3, [dest3+pos] ;Reuse xtmpl1, Get next dest vector + XLDR xd4, [dest4+pos] ;Reuse xtmph1. Get next dest vector + + ; dest2 + pshufb xtmph2, x0 ;Lookup mul table of high nibble + pshufb xtmpl2, xtmpa ;Lookup mul table of low nibble + pxor xtmph2, xtmpl2 ;GF add high and low partials + pxor xd2, xtmph2 + + XLDR xd5, [dest5+pos] ;Reuse xtmpl2. Get next dest vector + + ; dest3 + pshufb xtmph3, x0 ;Lookup mul table of high nibble + pshufb xtmpl3, xtmpa ;Lookup mul table of low nibble + pxor xtmph3, xtmpl3 ;GF add high and low partials + pxor xd3, xtmph3 + + movdqa xtmph2, xgft4_hi ;Reload const array registers + movdqa xtmpl3, xgft4_lo ;Reload const array registers + + ; dest5 + pshufb xtmph5, x0 ;Lookup mul table of high nibble + pshufb xtmpl5, xtmpa ;Lookup mul table of low nibble + pxor xtmph5, xtmpl5 ;GF add high and low partials + pxor xd5, xtmph5 + + ; dest4 + pshufb xtmph2, x0 ;Lookup mul table of high nibble + pshufb xtmpl3, xtmpa ;Lookup mul table of low nibble + pxor xtmph2, xtmpl3 ;GF add high and low partials + pxor xd4, xtmph2 + + XSTR [dest1+pos], xd1 ;Store result into dest1 + XSTR [dest2+pos], xd2 ;Store result into dest2 + XSTR [dest3+pos], xd3 ;Store result into dest3 + XSTR [dest4+pos], xd4 ;Store result into dest4 + XSTR [dest5+pos], xd5 ;Store result into dest5 + + add pos, 16 ;Loop on 16 bytes at a time + cmp pos, len + jle .loop16 + + lea tmp, [len + 16] + cmp pos, tmp + je .return_pass + +.lessthan16: + ;; Tail len + ;; Do one more overlap pass + mov tmp, len ;Overlapped offset length-16 + XLDR x0, [src+tmp] ;Get next source vector + + sub len, pos + + movdqa xtmpl1, [constip16] ;Load const of i + 16 + pinsrb xtmph5, len.w, 15 + pshufb xtmph5, xmask0f ;Broadcast len to all bytes + pcmpgtb xtmph5, xtmpl1 + + movdqu xtmph1, [tmp3+16] ; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0} + movdqu xtmpl1, [tmp3] ;Load array Ax{00}, Ax{01}, Ax{02}, ... + movdqu xtmph2, [tmp3+vec+16] ; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0} + movdqu xtmpl2, [tmp3+vec] ;Load array Bx{00}, Bx{01}, Bx{02}, ... + movdqu xtmph3, [tmp3+2*vec+16] ; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0} + movdqu xtmpl3, [tmp3+2*vec] ;Load array Cx{00}, Cx{01}, Cx{02}, ... + movdqu xtmpl5, [tmp3+4*vec] ;Load array Ex{00}, Ex{01}, ..., Ex{0f} + + XLDR xd1, [dest1+tmp] ;Get next dest vector + XLDR xd2, [dest2+tmp] ;Get next dest vector + + movdqa xtmpa, x0 ;Keep unshifted copy of src + psraw x0, 4 ;Shift to put high nibble into bits 4-0 + pand x0, xmask0f ;Mask high src nibble in bits 4-0 + pand xtmpa, xmask0f ;Mask low src nibble in bits 4-0 + + ; dest1 + pshufb xtmph1, x0 ;Lookup mul table of high nibble + pshufb xtmpl1, xtmpa ;Lookup mul table of low nibble + pxor xtmph1, xtmpl1 ;GF add high and low partials + pand xtmph1, xtmph5 + pxor xd1, xtmph1 + + XLDR xd3, [dest3+tmp] ;Reuse xtmpl1, Get next dest vector + XLDR xd4, [dest4+tmp] ;Reuse xtmph1. Get next dest vector + + ; dest2 + pshufb xtmph2, x0 ;Lookup mul table of high nibble + pshufb xtmpl2, xtmpa ;Lookup mul table of low nibble + pxor xtmph2, xtmpl2 ;GF add high and low partials + pand xtmph2, xtmph5 + pxor xd2, xtmph2 + + XLDR xd5, [dest5+tmp] ;Reuse xtmpl2. Get next dest vector + + ; dest3 + pshufb xtmph3, x0 ;Lookup mul table of high nibble + pshufb xtmpl3, xtmpa ;Lookup mul table of low nibble + pxor xtmph3, xtmpl3 ;GF add high and low partials + pand xtmph3, xtmph5 + pxor xd3, xtmph3 + + ; dest4 + pshufb xgft4_hi, x0 ;Lookup mul table of high nibble + pshufb xgft4_lo, xtmpa ;Lookup mul table of low nibble + pxor xgft4_hi, xgft4_lo ;GF add high and low partials + pand xgft4_hi, xtmph5 + pxor xd4, xgft4_hi + + ; dest5 + pshufb xgft5_hi, x0 ;Lookup mul table of high nibble + pshufb xtmpl5, xtmpa ;Lookup mul table of low nibble + pxor xgft5_hi, xtmpl5 ;GF add high and low partials + pand xgft5_hi, xtmph5 + pxor xd5, xgft5_hi + + XSTR [dest1+tmp], xd1 ;Store result into dest1 + XSTR [dest2+tmp], xd2 ;Store result into dest2 + XSTR [dest3+tmp], xd3 ;Store result into dest3 + XSTR [dest4+tmp], xd4 ;Store result into dest4 + XSTR [dest5+tmp], xd5 ;Store result into dest5 + +.return_pass: + FUNC_RESTORE + mov return, 0 + ret + +.return_fail: + FUNC_RESTORE + mov return, 1 + ret + +endproc_frame + +section .data + +align 16 + +mask0f: + ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f +constip16: + ddq 0xf0f1f2f3f4f5f6f7f8f9fafbfcfdfeff + +%macro slversion 4 +global %1_slver_%2%3%4 +global %1_slver +%1_slver: +%1_slver_%2%3%4: + dw 0x%4 + db 0x%3, 0x%2 +%endmacro +;;; func core, ver, snum +slversion gf_5vect_mad_sse, 00, 00, 020c diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_6vect_dot_prod_avx.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_6vect_dot_prod_avx.asm.s index 28ca861357a..f439fbbbc16 100644 --- a/src/erasure-code/isa/isa-l/erasure_code/gf_6vect_dot_prod_avx.asm.s +++ b/src/erasure-code/isa/isa-l/erasure_code/gf_6vect_dot_prod_avx.asm.s @@ -1,5 +1,5 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; Copyright(c) 2011-2014 Intel Corporation All rights reserved. +; Copyright(c) 2011-2015 Intel Corporation All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions @@ -30,8 +30,6 @@ ;;; ;;; gf_6vect_dot_prod_avx(len, vec, *g_tbls, **buffs, **dests); ;;; -;;; Author: Gregory Tucker - %ifidn __OUTPUT_FORMAT__, elf64 %define arg0 rdi @@ -321,5 +319,3 @@ global %1_slver %endmacro ;;; func core, ver, snum slversion gf_6vect_dot_prod_avx, 02, 03, 0195 -; inform linker that this doesn't require executable stack -section .note.GNU-stack noalloc noexec nowrite progbits diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_6vect_dot_prod_avx2.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_6vect_dot_prod_avx2.asm.s index a957c9ecc0c..fac63022ac0 100644 --- a/src/erasure-code/isa/isa-l/erasure_code/gf_6vect_dot_prod_avx2.asm.s +++ b/src/erasure-code/isa/isa-l/erasure_code/gf_6vect_dot_prod_avx2.asm.s @@ -1,5 +1,5 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; Copyright(c) 2011-2014 Intel Corporation All rights reserved. +; Copyright(c) 2011-2015 Intel Corporation All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions @@ -30,8 +30,6 @@ ;;; ;;; gf_6vect_dot_prod_avx2(len, vec, *g_tbls, **buffs, **dests); ;;; -;;; Author: Gregory Tucker - %ifidn __OUTPUT_FORMAT__, elf64 %define arg0 rdi @@ -332,5 +330,3 @@ global %1_slver %endmacro ;;; func core, ver, snum slversion gf_6vect_dot_prod_avx2, 04, 03, 019a -; inform linker that this doesn't require executable stack -section .note.GNU-stack noalloc noexec nowrite progbits diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_6vect_dot_prod_sse.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_6vect_dot_prod_sse.asm.s index 4910ddd703a..c3cfa14b821 100644 --- a/src/erasure-code/isa/isa-l/erasure_code/gf_6vect_dot_prod_sse.asm.s +++ b/src/erasure-code/isa/isa-l/erasure_code/gf_6vect_dot_prod_sse.asm.s @@ -1,5 +1,5 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; Copyright(c) 2011-2014 Intel Corporation All rights reserved. +; Copyright(c) 2011-2015 Intel Corporation All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions @@ -30,8 +30,6 @@ ;;; ;;; gf_6vect_dot_prod_sse(len, vec, *g_tbls, **buffs, **dests); ;;; -;;; Author: Gregory Tucker - %ifidn __OUTPUT_FORMAT__, elf64 %define arg0 rdi @@ -166,20 +164,20 @@ default rel section .text %define xmask0f xmm15 -%define xgft1_lo xmm14 -%define xgft1_hi xmm13 -%define xgft2_lo xmm12 -%define xgft2_hi xmm11 -%define xgft3_lo xmm10 -%define xgft3_hi xmm9 +%define xgft1_lo xmm2 +%define xgft1_hi xmm3 +%define xgft2_lo xmm4 +%define xgft2_hi xmm5 +%define xgft3_lo xmm6 +%define xgft3_hi xmm7 %define x0 xmm0 %define xtmpa xmm1 -%define xp1 xmm2 -%define xp2 xmm3 -%define xp3 xmm4 -%define xp4 xmm5 -%define xp5 xmm6 -%define xp6 xmm7 +%define xp1 xmm8 +%define xp2 xmm9 +%define xp3 xmm10 +%define xp4 xmm11 +%define xp5 xmm12 +%define xp6 xmm13 align 16 global gf_6vect_dot_prod_sse:function @@ -320,6 +318,4 @@ global %1_slver db 0x%3, 0x%2 %endmacro ;;; func core, ver, snum -slversion gf_6vect_dot_prod_sse, 00, 03, 0066 -; inform linker that this doesn't require executable stack -section .note.GNU-stack noalloc noexec nowrite progbits +slversion gf_6vect_dot_prod_sse, 00, 04, 0066 diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_6vect_mad_avx.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_6vect_mad_avx.asm.s new file mode 100644 index 00000000000..84b2eca5de6 --- /dev/null +++ b/src/erasure-code/isa/isa-l/erasure_code/gf_6vect_mad_avx.asm.s @@ -0,0 +1,400 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2015 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;; +;;; gf_6vect_mad_avx(len, vec, vec_i, mul_array, src, dest); +;;; + +%define PS 8 + +%ifidn __OUTPUT_FORMAT__, win64 + %define arg0 rcx + %define arg0.w ecx + %define arg1 rdx + %define arg2 r8 + %define arg3 r9 + %define arg4 r12 + %define arg5 r15 + %define tmp r11 + %define tmp2 r10 + %define tmp3 r13 + %define tmp4 r14 + %define tmp5 rdi + %define return rax + %define return.w eax + %define stack_size 16*10 + 5*8 + %define arg(x) [rsp + stack_size + PS + PS*x] + %define func(x) proc_frame x + +%macro FUNC_SAVE 0 + sub rsp, stack_size + movdqa [rsp+16*0],xmm6 + movdqa [rsp+16*1],xmm7 + movdqa [rsp+16*2],xmm8 + movdqa [rsp+16*3],xmm9 + movdqa [rsp+16*4],xmm10 + movdqa [rsp+16*5],xmm11 + movdqa [rsp+16*6],xmm12 + movdqa [rsp+16*7],xmm13 + movdqa [rsp+16*8],xmm14 + movdqa [rsp+16*9],xmm15 + save_reg r12, 10*16 + 0*8 + save_reg r13, 10*16 + 1*8 + save_reg r14, 10*16 + 2*8 + save_reg r15, 10*16 + 3*8 + save_reg rdi, 10*16 + 4*8 + end_prolog + mov arg4, arg(4) + mov arg5, arg(5) +%endmacro + +%macro FUNC_RESTORE 0 + movdqa xmm6, [rsp+16*0] + movdqa xmm7, [rsp+16*1] + movdqa xmm8, [rsp+16*2] + movdqa xmm9, [rsp+16*3] + movdqa xmm10, [rsp+16*4] + movdqa xmm11, [rsp+16*5] + movdqa xmm12, [rsp+16*6] + movdqa xmm13, [rsp+16*7] + movdqa xmm14, [rsp+16*8] + movdqa xmm15, [rsp+16*9] + mov r12, [rsp + 10*16 + 0*8] + mov r13, [rsp + 10*16 + 1*8] + mov r14, [rsp + 10*16 + 2*8] + mov r15, [rsp + 10*16 + 3*8] + mov rdi, [rsp + 10*16 + 4*8] + add rsp, stack_size +%endmacro + +%elifidn __OUTPUT_FORMAT__, elf64 + %define arg0 rdi + %define arg0.w edi + %define arg1 rsi + %define arg2 rdx + %define arg3 rcx + %define arg4 r8 + %define arg5 r9 + %define tmp r11 + %define tmp2 r10 + %define tmp3 r12 + %define tmp4 r13 + %define tmp5 r14 + %define return rax + %define return.w eax + + %define func(x) x: + %macro FUNC_SAVE 0 + push r12 + push r13 + push r14 + %endmacro + %macro FUNC_RESTORE 0 + pop r14 + pop r13 + pop r12 + %endmacro +%endif + +;;; gf_6vect_mad_avx(len, vec, vec_i, mul_array, src, dest) +%define len arg0 +%define len.w arg0.w +%define vec arg1 +%define vec_i arg2 +%define mul_array arg3 +%define src arg4 +%define dest1 arg5 +%define pos return +%define pos.w return.w + +%define dest2 tmp4 +%define dest3 tmp2 +%define dest4 mul_array +%define dest5 tmp5 +%define dest6 vec_i + +%ifndef EC_ALIGNED_ADDR +;;; Use Un-aligned load/store + %define XLDR vmovdqu + %define XSTR vmovdqu +%else +;;; Use Non-temporal load/stor + %ifdef NO_NT_LDST + %define XLDR vmovdqa + %define XSTR vmovdqa + %else + %define XLDR vmovntdqa + %define XSTR vmovntdq + %endif +%endif + + +default rel + +[bits 64] +section .text + +%define xmask0f xmm15 +%define xgft4_lo xmm14 +%define xgft4_hi xmm13 +%define xgft5_lo xmm12 +%define xgft5_hi xmm11 +%define xgft6_lo xmm10 +%define xgft6_hi xmm9 + +%define x0 xmm0 +%define xtmpa xmm1 +%define xtmph1 xmm2 +%define xtmpl1 xmm3 +%define xtmph2 xmm4 +%define xtmpl2 xmm5 +%define xtmph3 xmm6 +%define xtmpl3 xmm7 +%define xd1 xmm8 +%define xd2 xtmpl1 +%define xd3 xtmph1 + + +align 16 +global gf_6vect_mad_avx:function +func(gf_6vect_mad_avx) + FUNC_SAVE + sub len, 16 + jl .return_fail + xor pos, pos + vmovdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte + mov tmp, vec + sal vec_i, 5 ;Multiply by 32 + lea tmp3, [mul_array + vec_i] + sal tmp, 6 ;Multiply by 64 + + sal vec, 5 ;Multiply by 32 + lea vec_i, [tmp + vec] ;vec_i = vec*96 + lea mul_array, [tmp + vec_i] ;mul_array = vec*160 + + vmovdqu xgft5_lo, [tmp3+2*tmp] ;Load array Ex{00}, Ex{01}, ..., Ex{0f} + vmovdqu xgft5_hi, [tmp3+2*tmp+16] ; " Ex{00}, Ex{10}, ..., Ex{f0} + vmovdqu xgft4_lo, [tmp3+vec_i] ;Load array Dx{00}, Dx{01}, Dx{02}, ... + vmovdqu xgft4_hi, [tmp3+vec_i+16] ; " Dx{00}, Dx{10}, Dx{20}, ... , Dx{f0} + vmovdqu xgft6_lo, [tmp3+mul_array] ;Load array Fx{00}, Fx{01}, ..., Fx{0f} + vmovdqu xgft6_hi, [tmp3+mul_array+16] ; " Fx{00}, Fx{10}, ..., Fx{f0} + + mov dest2, [dest1+PS] + mov dest3, [dest1+2*PS] + mov dest4, [dest1+3*PS] ; reuse mul_array + mov dest5, [dest1+4*PS] + mov dest6, [dest1+5*PS] ; reuse vec_i + mov dest1, [dest1] + +.loop16: + XLDR x0, [src+pos] ;Get next source vector + + vmovdqu xtmpl1, [tmp3] ;Load array Ax{00}, Ax{01}, Ax{02}, ... + vmovdqu xtmph1, [tmp3+16] ; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0} + vmovdqu xtmpl2, [tmp3+vec] ;Load array Bx{00}, Bx{01}, Bx{02}, ... + vmovdqu xtmph2, [tmp3+vec+16] ; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0} + vmovdqu xtmpl3, [tmp3+2*vec] ;Load array Cx{00}, Cx{01}, Cx{02}, ... + vmovdqu xtmph3, [tmp3+2*vec+16] ; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0} + XLDR xd1, [dest1+pos] ;Get next dest vector + + vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0 + vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0 + vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0 + + + ;dest1 + vpshufb xtmph1, x0 ;Lookup mul table of high nibble + vpshufb xtmpl1, xtmpa ;Lookup mul table of low nibble + vpxor xtmph1, xtmpl1 ;GF add high and low partials + vpxor xd1, xtmph1 + + XLDR xd2, [dest2+pos] ;reuse xtmpl1. Get next dest vector + XLDR xd3, [dest3+pos] ;reuse xtmph1. Get next dest vector + + ;dest2 + vpshufb xtmph2, x0 ;Lookup mul table of high nibble + vpshufb xtmpl2, xtmpa ;Lookup mul table of low nibble + vpxor xtmph2, xtmpl2 ;GF add high and low partials + vpxor xd2, xtmph2 + + ;dest3 + vpshufb xtmph3, x0 ;Lookup mul table of high nibble + vpshufb xtmpl3, xtmpa ;Lookup mul table of low nibble + vpxor xtmph3, xtmpl3 ;GF add high and low partials + vpxor xd3, xtmph3 + + XSTR [dest1+pos], xd1 ;Store result into dest1 + XSTR [dest2+pos], xd2 ;Store result into dest2 + XSTR [dest3+pos], xd3 ;Store result into dest3 + + ;dest4 + XLDR xd1, [dest4+pos] ;Get next dest vector + vpshufb xtmph1, xgft4_hi, x0 ;Lookup mul table of high nibble + vpshufb xtmpl1, xgft4_lo, xtmpa ;Lookup mul table of low nibble + vpxor xtmph1, xtmph1, xtmpl1 ;GF add high and low partials + vpxor xd1, xd1, xtmph1 + + XLDR xd2, [dest5+pos] ;reuse xtmpl1. Get next dest vector + XLDR xd3, [dest6+pos] ;reuse xtmph1. Get next dest vector + + ;dest5 + vpshufb xtmph2, xgft5_hi, x0 ;Lookup mul table of high nibble + vpshufb xtmpl2, xgft5_lo, xtmpa ;Lookup mul table of low nibble + vpxor xtmph2, xtmph2, xtmpl2 ;GF add high and low partials + vpxor xd2, xd2, xtmph2 + + ;dest6 + vpshufb xtmph3, xgft6_hi, x0 ;Lookup mul table of high nibble + vpshufb xtmpl3, xgft6_lo, xtmpa ;Lookup mul table of low nibble + vpxor xtmph3, xtmph3, xtmpl3 ;GF add high and low partials + vpxor xd3, xd3, xtmph3 + + XSTR [dest4+pos], xd1 ;Store result into dest4 + XSTR [dest5+pos], xd2 ;Store result into dest5 + XSTR [dest6+pos], xd3 ;Store result into dest6 + + add pos, 16 ;Loop on 16 bytes at a time + cmp pos, len + jle .loop16 + + lea tmp, [len + 16] + cmp pos, tmp + je .return_pass + +.lessthan16: + ;; Tail len + ;; Do one more overlap pass + ;; Overlapped offset length-16 + mov tmp, len ;Backup len as len=rdi + + XLDR x0, [src+tmp] ;Get next source vector + XLDR xd1, [dest4+tmp] ;Get next dest vector + XLDR xd2, [dest5+tmp] ;reuse xtmpl1. Get next dest vector + XLDR xd3, [dest6+tmp] ;reuse xtmph1. Get next dest vector + + sub len, pos + + vmovdqa xtmph3, [constip16] ;Load const of i + 16 + vpinsrb xtmpl3, len.w, 15 + vpshufb xtmpl3, xmask0f ;Broadcast len to all bytes + vpcmpgtb xtmpl3, xtmpl3, xtmph3 + + vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0 + vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0 + vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0 + + ;dest4 + vpshufb xgft4_hi, xgft4_hi, x0 ;Lookup mul table of high nibble + vpshufb xgft4_lo, xgft4_lo, xtmpa ;Lookup mul table of low nibble + vpxor xgft4_hi, xgft4_hi, xgft4_lo ;GF add high and low partials + vpand xgft4_hi, xgft4_hi, xtmpl3 + vpxor xd1, xd1, xgft4_hi + + ;dest5 + vpshufb xgft5_hi, xgft5_hi, x0 ;Lookup mul table of high nibble + vpshufb xgft5_lo, xgft5_lo, xtmpa ;Lookup mul table of low nibble + vpxor xgft5_hi, xgft5_hi, xgft5_lo ;GF add high and low partials + vpand xgft5_hi, xgft5_hi, xtmpl3 + vpxor xd2, xd2, xgft5_hi + + ;dest6 + vpshufb xgft6_hi, xgft6_hi, x0 ;Lookup mul table of high nibble + vpshufb xgft6_lo, xgft6_lo, xtmpa ;Lookup mul table of low nibble + vpxor xgft6_hi, xgft6_hi, xgft6_lo ;GF add high and low partials + vpand xgft6_hi, xgft6_hi, xtmpl3 + vpxor xd3, xd3, xgft6_hi + + XSTR [dest4+tmp], xd1 ;Store result into dest4 + XSTR [dest5+tmp], xd2 ;Store result into dest5 + XSTR [dest6+tmp], xd3 ;Store result into dest6 + + vmovdqu xgft4_lo, [tmp3] ;Load array Ax{00}, Ax{01}, Ax{02}, ... + vmovdqu xgft4_hi, [tmp3+16] ; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0} + vmovdqu xgft5_lo, [tmp3+vec] ;Load array Bx{00}, Bx{01}, Bx{02}, ... + vmovdqu xgft5_hi, [tmp3+vec+16] ; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0} + vmovdqu xgft6_lo, [tmp3+2*vec] ;Load array Cx{00}, Cx{01}, Cx{02}, ... + vmovdqu xgft6_hi, [tmp3+2*vec+16] ; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0} + XLDR xd1, [dest1+tmp] ;Get next dest vector + XLDR xd2, [dest2+tmp] ;reuse xtmpl1. Get next dest vector + XLDR xd3, [dest3+tmp] ;reuse xtmph1. Get next dest3 vector + + ;dest1 + vpshufb xgft4_hi, xgft4_hi, x0 ;Lookup mul table of high nibble + vpshufb xgft4_lo, xgft4_lo, xtmpa ;Lookup mul table of low nibble + vpxor xgft4_hi, xgft4_hi, xgft4_lo ;GF add high and low partials + vpand xgft4_hi, xgft4_hi, xtmpl3 + vpxor xd1, xd1, xgft4_hi + + ;dest2 + vpshufb xgft5_hi, xgft5_hi, x0 ;Lookup mul table of high nibble + vpshufb xgft5_lo, xgft5_lo, xtmpa ;Lookup mul table of low nibble + vpxor xgft5_hi, xgft5_hi, xgft5_lo ;GF add high and low partials + vpand xgft5_hi, xgft5_hi, xtmpl3 + vpxor xd2, xd2, xgft5_hi + + ;dest3 + vpshufb xgft6_hi, xgft6_hi, x0 ;Lookup mul table of high nibble + vpshufb xgft6_lo, xgft6_lo, xtmpa ;Lookup mul table of low nibble + vpxor xgft6_hi, xgft6_hi, xgft6_lo ;GF add high and low partials + vpand xgft6_hi, xgft6_hi, xtmpl3 + vpxor xd3, xd3, xgft6_hi + + XSTR [dest1+tmp], xd1 ;Store result into dest1 + XSTR [dest2+tmp], xd2 ;Store result into dest2 + XSTR [dest3+tmp], xd3 ;Store result into dest3 + +.return_pass: + FUNC_RESTORE + mov return, 0 + ret + +.return_fail: + FUNC_RESTORE + mov return, 1 + ret + +endproc_frame + +section .data + +align 16 +mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f +constip16: + ddq 0xf0f1f2f3f4f5f6f7f8f9fafbfcfdfeff + +%macro slversion 4 +global %1_slver_%2%3%4 +global %1_slver +%1_slver: +%1_slver_%2%3%4: + dw 0x%4 + db 0x%3, 0x%2 +%endmacro +;;; func core, ver, snum +slversion gf_6vect_mad_avx, 02, 00, 0210 diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_6vect_mad_avx2.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_6vect_mad_avx2.asm.s new file mode 100644 index 00000000000..d83847ab6c8 --- /dev/null +++ b/src/erasure-code/isa/isa-l/erasure_code/gf_6vect_mad_avx2.asm.s @@ -0,0 +1,407 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2015 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;; +;;; gf_6vect_mad_avx2(len, vec, vec_i, mul_array, src, dest); +;;; + + +%define PS 8 + +%ifidn __OUTPUT_FORMAT__, win64 + %define arg0 rcx + %define arg0.w ecx + %define arg1 rdx + %define arg2 r8 + %define arg3 r9 + %define arg4 r12 + %define arg5 r15 + %define tmp r11 + %define tmp.w r11d + %define tmp.b r11b + %define tmp2 r10 + %define tmp3 r13 + %define return rax + %define return.w eax + %define stack_size 16*10 + 3*8 + %define arg(x) [rsp + stack_size + PS + PS*x] + %define func(x) proc_frame x + +%macro FUNC_SAVE 0 + sub rsp, stack_size + movdqa [rsp+16*0],xmm6 + movdqa [rsp+16*1],xmm7 + movdqa [rsp+16*2],xmm8 + movdqa [rsp+16*3],xmm9 + movdqa [rsp+16*4],xmm10 + movdqa [rsp+16*5],xmm11 + movdqa [rsp+16*6],xmm12 + movdqa [rsp+16*7],xmm13 + movdqa [rsp+16*8],xmm14 + movdqa [rsp+16*9],xmm15 + save_reg r12, 10*16 + 0*8 + save_reg r13, 10*16 + 1*8 + save_reg r15, 10*16 + 2*8 + end_prolog + mov arg4, arg(4) + mov arg5, arg(5) +%endmacro + +%macro FUNC_RESTORE 0 + movdqa xmm6, [rsp+16*0] + movdqa xmm7, [rsp+16*1] + movdqa xmm8, [rsp+16*2] + movdqa xmm9, [rsp+16*3] + movdqa xmm10, [rsp+16*4] + movdqa xmm11, [rsp+16*5] + movdqa xmm12, [rsp+16*6] + movdqa xmm13, [rsp+16*7] + movdqa xmm14, [rsp+16*8] + movdqa xmm15, [rsp+16*9] + mov r12, [rsp + 10*16 + 0*8] + mov r13, [rsp + 10*16 + 1*8] + mov r15, [rsp + 10*16 + 3*8] + add rsp, stack_size +%endmacro + +%elifidn __OUTPUT_FORMAT__, elf64 + %define arg0 rdi + %define arg0.w edi + %define arg1 rsi + %define arg2 rdx + %define arg3 rcx + %define arg4 r8 + %define arg5 r9 + %define tmp r11 + %define tmp.w r11d + %define tmp.b r11b + %define tmp2 r10 + %define tmp3 r12 + %define return rax + %define return.w eax + + %define func(x) x: + %macro FUNC_SAVE 0 + push r12 + %endmacro + %macro FUNC_RESTORE 0 + pop r12 + %endmacro +%endif + +;;; gf_6vect_mad_avx2(len, vec, vec_i, mul_array, src, dest) +%define len arg0 +%define len.w arg0.w +%define vec arg1 +%define vec_i arg2 +%define mul_array arg3 +%define src arg4 +%define dest1 arg5 +%define pos return +%define pos.w return.w + +%define dest2 tmp3 +%define dest3 tmp2 +%define dest4 mul_array +%define dest5 vec +%define dest6 vec_i + +%ifndef EC_ALIGNED_ADDR +;;; Use Un-aligned load/store + %define XLDR vmovdqu + %define XSTR vmovdqu +%else +;;; Use Non-temporal load/stor + %ifdef NO_NT_LDST + %define XLDR vmovdqa + %define XSTR vmovdqa + %else + %define XLDR vmovntdqa + %define XSTR vmovntdq + %endif +%endif + + +default rel + +[bits 64] +section .text + +%define xmask0f ymm15 +%define xmask0fx xmm15 +%define xgft1_lo ymm14 +%define xgft2_lo ymm13 +%define xgft3_lo ymm12 +%define xgft4_lo ymm11 +%define xgft5_lo ymm10 +%define xgft6_lo ymm9 + +%define x0 ymm0 +%define xtmpa ymm1 +%define xtmpl ymm2 +%define xtmplx xmm2 +%define xtmph ymm3 +%define xtmphx xmm3 +%define xd1 ymm4 +%define xd2 ymm5 +%define xd3 ymm6 +%define xd4 ymm7 +%define xd5 ymm8 +%define xd6 xd1 + +align 16 +global gf_6vect_mad_avx2:function +func(gf_6vect_mad_avx2) + FUNC_SAVE + sub len, 32 + jl .return_fail + xor pos, pos + mov tmp.b, 0x0f + vpinsrb xmask0fx, xmask0fx, tmp.w, 0 + vpbroadcastb xmask0f, xmask0fx ;Construct mask 0x0f0f0f... + + sal vec_i, 5 ;Multiply by 32 + sal vec, 5 ;Multiply by 32 + lea tmp, [mul_array + vec_i] + mov vec_i, vec + mov mul_array, vec + sal vec_i, 1 + sal mul_array, 1 + add vec_i, vec ;vec_i=vec*96 + add mul_array, vec_i ;vec_i=vec*160 + + vmovdqu xgft1_lo, [tmp] ;Load array Ax{00}, Ax{01}, ..., Ax{0f} + ; " Ax{00}, Ax{10}, ..., Ax{f0} + vmovdqu xgft2_lo, [tmp+vec] ;Load array Bx{00}, Bx{01}, ..., Bx{0f} + ; " Bx{00}, Bx{10}, ..., Bx{f0} + vmovdqu xgft3_lo, [tmp+2*vec] ;Load array Cx{00}, Cx{01}, ..., Cx{0f} + ; " Cx{00}, Cx{10}, ..., Cx{f0} + vmovdqu xgft4_lo, [tmp+vec_i] ;Load array Fx{00}, Fx{01}, ..., Fx{0f} + ; " Fx{00}, Fx{10}, ..., Fx{f0} + vmovdqu xgft5_lo, [tmp+4*vec] ;Load array Ex{00}, Ex{01}, ..., Ex{0f} + ; " Ex{00}, Ex{10}, ..., Ex{f0} + vmovdqu xgft6_lo, [tmp+mul_array] ;Load array Dx{00}, Dx{01}, ..., Dx{0f} + ; " Dx{00}, Dx{10}, ..., Dx{f0} + + mov dest2, [dest1+PS] ; reuse tmp3 + mov dest3, [dest1+2*PS] ; reuse tmp2 + mov dest4, [dest1+3*PS] ; reuse mul_array + mov dest5, [dest1+4*PS] ; reuse vec + mov dest6, [dest1+5*PS] ; reuse vec_i + mov dest1, [dest1] + +.loop32: + XLDR x0, [src+pos] ;Get next source vector + XLDR xd1, [dest1+pos] ;Get next dest vector + XLDR xd2, [dest2+pos] ;Get next dest vector + XLDR xd3, [dest3+pos] ;Get next dest vector + XLDR xd4, [dest4+pos] ;Get next dest vector + XLDR xd5, [dest5+pos] ;Get next dest vector + + vpand xtmpl, x0, xmask0f ;Mask low src nibble in bits 4-0 + vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0 + vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0 + vperm2i128 xtmpa, xtmpl, x0, 0x30 ;swap xtmpa from 1lo|2lo to 1lo|2hi + vperm2i128 x0, xtmpl, x0, 0x12 ;swap x0 from 1hi|2hi to 1hi|2lo + + ;dest1 + vperm2i128 xtmph, xgft1_lo, xgft1_lo, 0x01 ; swapped to hi | lo + vpshufb xtmph, xtmph, x0 ;Lookup mul table of high nibble + vpshufb xtmpl, xgft1_lo, xtmpa ;Lookup mul table of low nibble + vpxor xtmph, xtmph, xtmpl ;GF add high and low partials + vpxor xd1, xd1, xtmph ;xd1 += partial + + XSTR [dest1+pos], xd1 ;Store result into dest1 + + ;dest2 + vperm2i128 xtmph, xgft2_lo, xgft2_lo, 0x01 ; swapped to hi | lo + vpshufb xtmph, xtmph, x0 ;Lookup mul table of high nibble + vpshufb xtmpl, xgft2_lo, xtmpa ;Lookup mul table of low nibble + vpxor xtmph, xtmph, xtmpl ;GF add high and low partials + vpxor xd2, xd2, xtmph ;xd2 += partial + + ;dest3 + vperm2i128 xtmph, xgft3_lo, xgft3_lo, 0x01 ; swapped to hi | lo + vpshufb xtmph, xtmph, x0 ;Lookup mul table of high nibble + vpshufb xtmpl, xgft3_lo, xtmpa ;Lookup mul table of low nibble + vpxor xtmph, xtmph, xtmpl ;GF add high and low partials + vpxor xd3, xd3, xtmph ;xd3 += partial + + XLDR xd6, [dest6+pos] ;reuse xd1. Get next dest vector + + ;dest4 + vperm2i128 xtmph, xgft4_lo, xgft4_lo, 0x01 ; swapped to hi | lo + vpshufb xtmph, xtmph, x0 ;Lookup mul table of high nibble + vpshufb xtmpl, xgft4_lo, xtmpa ;Lookup mul table of low nibble + vpxor xtmph, xtmph, xtmpl ;GF add high and low partials + vpxor xd4, xd4, xtmph ;xd4 += partial + + ;dest5 + vperm2i128 xtmph, xgft5_lo, xgft5_lo, 0x01 ; swapped to hi | lo + vpshufb xtmph, xtmph, x0 ;Lookup mul table of high nibble + vpshufb xtmpl, xgft5_lo, xtmpa ;Lookup mul table of low nibble + vpxor xtmph, xtmph, xtmpl ;GF add high and low partials + vpxor xd5, xd5, xtmph ;xd5 += partial + + ;dest6 + vperm2i128 xtmph, xgft6_lo, xgft6_lo, 0x01 ; swapped to hi | lo + vpshufb xtmph, xtmph, x0 ;Lookup mul table of high nibble + vpshufb xtmpl, xgft6_lo, xtmpa ;Lookup mul table of low nibble + vpxor xtmph, xtmph, xtmpl ;GF add high and low partials + vpxor xd6, xd6, xtmph ;xd6 += partial + + XSTR [dest2+pos], xd2 ;Store result into dest2 + XSTR [dest3+pos], xd3 ;Store result into dest3 + XSTR [dest4+pos], xd4 ;Store result into dest4 + XSTR [dest5+pos], xd5 ;Store result into dest5 + XSTR [dest6+pos], xd6 ;Store result into dest6 + + add pos, 32 ;Loop on 32 bytes at a time + cmp pos, len + jle .loop32 + + lea tmp, [len + 32] + cmp pos, tmp + je .return_pass + +.lessthan32: + ;; Tail len + ;; Do one more overlap pass + mov tmp.b, 0x1f + vpinsrb xtmphx, xtmphx, tmp.w, 0 + vpbroadcastb xtmph, xtmphx ;Construct mask 0x1f1f1f... + + mov tmp, len ;Overlapped offset length-32 + + XLDR x0, [src+tmp] ;Get next source vector + XLDR xd1, [dest1+tmp] ;Get next dest vector + XLDR xd2, [dest2+tmp] ;Get next dest vector + XLDR xd3, [dest3+tmp] ;Get next dest vector + XLDR xd4, [dest4+tmp] ;Get next dest vector + XLDR xd5, [dest5+tmp] ;Get next dest vector + + sub len, pos + + vpinsrb xtmplx, xtmplx, len.w, 15 + vinserti128 xtmpl, xtmpl, xtmplx, 1 ;swapped to xtmplx | xtmplx + vpshufb xtmpl, xtmpl, xtmph ;Broadcast len to all bytes. xtmph=0x1f1f1f... + vpcmpgtb xtmpl, xtmpl, [constip32] + + vpand xtmph, x0, xmask0f ;Mask low src nibble in bits 4-0 + vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0 + vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0 + vperm2i128 xtmpa, xtmph, x0, 0x30 ;swap xtmpa from 1lo|2lo to 1lo|2hi + vperm2i128 x0, xtmph, x0, 0x12 ;swap x0 from 1hi|2hi to 1hi|2lo + + ;dest1 + vperm2i128 xtmph, xgft1_lo, xgft1_lo, 0x01 ; swapped to hi | lo + vpshufb xtmph, xtmph, x0 ;Lookup mul table of high nibble + vpshufb xgft1_lo, xgft1_lo, xtmpa ;Lookup mul table of low nibble + vpxor xtmph, xtmph, xgft1_lo ;GF add high and low partials + vpand xtmph, xtmph, xtmpl + vpxor xd1, xd1, xtmph ;xd1 += partial + + XSTR [dest1+tmp], xd1 ;Store result into dest1 + + ;dest2 + vperm2i128 xtmph, xgft2_lo, xgft2_lo, 0x01 ; swapped to hi | lo + vpshufb xtmph, xtmph, x0 ;Lookup mul table of high nibble + vpshufb xgft2_lo, xgft2_lo, xtmpa ;Lookup mul table of low nibble + vpxor xtmph, xtmph, xgft2_lo ;GF add high and low partials + vpand xtmph, xtmph, xtmpl + vpxor xd2, xd2, xtmph ;xd2 += partial + + ;dest3 + vperm2i128 xtmph, xgft3_lo, xgft3_lo, 0x01 ; swapped to hi | lo + vpshufb xtmph, xtmph, x0 ;Lookup mul table of high nibble + vpshufb xgft3_lo, xgft3_lo, xtmpa ;Lookup mul table of low nibble + vpxor xtmph, xtmph, xgft3_lo ;GF add high and low partials + vpand xtmph, xtmph, xtmpl + vpxor xd3, xd3, xtmph ;xd3 += partial + + XLDR xd6, [dest6+tmp] ;reuse xd1. Get next dest vector + + ;dest4 + vperm2i128 xtmph, xgft4_lo, xgft4_lo, 0x01 ; swapped to hi | lo + vpshufb xtmph, xtmph, x0 ;Lookup mul table of high nibble + vpshufb xgft4_lo, xgft4_lo, xtmpa ;Lookup mul table of low nibble + vpxor xtmph, xtmph, xgft4_lo ;GF add high and low partials + vpand xtmph, xtmph, xtmpl + vpxor xd4, xd4, xtmph ;xd4 += partial + + ;dest5 + vperm2i128 xtmph, xgft5_lo, xgft5_lo, 0x01 ; swapped to hi | lo + vpshufb xtmph, xtmph, x0 ;Lookup mul table of high nibble + vpshufb xgft5_lo, xgft5_lo, xtmpa ;Lookup mul table of low nibble + vpxor xtmph, xtmph, xgft5_lo ;GF add high and low partials + vpand xtmph, xtmph, xtmpl + vpxor xd5, xd5, xtmph ;xd5 += partial + + ;dest6 + vperm2i128 xtmph, xgft6_lo, xgft6_lo, 0x01 ; swapped to hi | lo + vpshufb xtmph, xtmph, x0 ;Lookup mul table of high nibble + vpshufb xgft6_lo, xgft6_lo, xtmpa ;Lookup mul table of low nibble + vpxor xtmph, xtmph, xgft6_lo ;GF add high and low partials + vpand xtmph, xtmph, xtmpl + vpxor xd6, xd6, xtmph ;xd6 += partial + + XSTR [dest2+tmp], xd2 ;Store result into dest2 + XSTR [dest3+tmp], xd3 ;Store result into dest3 + XSTR [dest4+tmp], xd4 ;Store result into dest4 + XSTR [dest5+tmp], xd5 ;Store result into dest5 + XSTR [dest6+tmp], xd6 ;Store result into dest6 + +.return_pass: + FUNC_RESTORE + mov return, 0 + ret + +.return_fail: + FUNC_RESTORE + mov return, 1 + ret + +endproc_frame + +section .data +align 32 +constip32: + ddq 0xf0f1f2f3f4f5f6f7f8f9fafbfcfdfeff + ddq 0xe0e1e2e3e4e5e6e7e8e9eaebecedeeef + +%macro slversion 4 +global %1_slver_%2%3%4 +global %1_slver +%1_slver: +%1_slver_%2%3%4: + dw 0x%4 + db 0x%3, 0x%2 +%endmacro +;;; func core, ver, snum +slversion gf_6vect_mad_avx2, 04, 00, 0211 diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_6vect_mad_sse.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_6vect_mad_sse.asm.s new file mode 100644 index 00000000000..f9b4eecd171 --- /dev/null +++ b/src/erasure-code/isa/isa-l/erasure_code/gf_6vect_mad_sse.asm.s @@ -0,0 +1,412 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2015 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;; +;;; gf_6vect_mad_sse(len, vec, vec_i, mul_array, src, dest); +;;; + +%define PS 8 + +%ifidn __OUTPUT_FORMAT__, win64 + %define arg0 rcx + %define arg0.w ecx + %define arg1 rdx + %define arg2 r8 + %define arg3 r9 + %define arg4 r12 + %define arg5 r15 + %define tmp r11 + %define tmp.w r11d + %define tmp2 r10 + %define tmp3 r13 + %define tmp4 r14 + %define tmp5 rdi + %define return rax + %define return.w eax + %define stack_size 16*10 + 5*8 + %define arg(x) [rsp + stack_size + PS + PS*x] + %define func(x) proc_frame x + +%macro FUNC_SAVE 0 + sub rsp, stack_size + movdqa [rsp+16*0],xmm6 + movdqa [rsp+16*1],xmm7 + movdqa [rsp+16*2],xmm8 + movdqa [rsp+16*3],xmm9 + movdqa [rsp+16*4],xmm10 + movdqa [rsp+16*5],xmm11 + movdqa [rsp+16*6],xmm12 + movdqa [rsp+16*7],xmm13 + movdqa [rsp+16*8],xmm14 + movdqa [rsp+16*9],xmm15 + save_reg r12, 10*16 + 0*8 + save_reg r13, 10*16 + 1*8 + save_reg r14, 10*16 + 2*8 + save_reg r15, 10*16 + 3*8 + save_reg rdi, 10*16 + 4*8 + end_prolog + mov arg4, arg(4) + mov arg5, arg(5) +%endmacro + +%macro FUNC_RESTORE 0 + movdqa xmm6, [rsp+16*0] + movdqa xmm7, [rsp+16*1] + movdqa xmm8, [rsp+16*2] + movdqa xmm9, [rsp+16*3] + movdqa xmm10, [rsp+16*4] + movdqa xmm11, [rsp+16*5] + movdqa xmm12, [rsp+16*6] + movdqa xmm13, [rsp+16*7] + movdqa xmm14, [rsp+16*8] + movdqa xmm15, [rsp+16*9] + mov r12, [rsp + 10*16 + 0*8] + mov r13, [rsp + 10*16 + 1*8] + mov r14, [rsp + 10*16 + 2*8] + mov r15, [rsp + 10*16 + 3*8] + mov rdi, [rsp + 10*16 + 4*8] + add rsp, stack_size +%endmacro + +%elifidn __OUTPUT_FORMAT__, elf64 + %define arg0 rdi + %define arg0.w edi + %define arg1 rsi + %define arg2 rdx + %define arg3 rcx + %define arg4 r8 + %define arg5 r9 + %define tmp r11 + %define tmp.w r11d + %define tmp2 r10 + %define tmp3 r12 + %define tmp4 r13 + %define tmp5 r14 + %define return rax + %define return.w eax + + %define func(x) x: + %macro FUNC_SAVE 0 + push r12 + push r13 + push r14 + %endmacro + %macro FUNC_RESTORE 0 + pop r14 + pop r13 + pop r12 + %endmacro +%endif + +;;; gf_6vect_mad_sse(len, vec, vec_i, mul_array, src, dest) +%define len arg0 +%define len.w arg0.w +%define vec arg1 +%define vec_i arg2 +%define mul_array arg3 +%define src arg4 +%define dest1 arg5 +%define pos return +%define pos.w return.w + +%define dest2 mul_array +%define dest3 tmp2 +%define dest4 tmp4 +%define dest5 tmp5 +%define dest6 vec_i + +%ifndef EC_ALIGNED_ADDR +;;; Use Un-aligned load/store + %define XLDR movdqu + %define XSTR movdqu +%else +;;; Use Non-temporal load/stor + %ifdef NO_NT_LDST + %define XLDR movdqa + %define XSTR movdqa + %else + %define XLDR movntdqa + %define XSTR movntdq + %endif +%endif + +default rel + +[bits 64] +section .text + +%define xmask0f xmm15 +%define xgft4_lo xmm14 +%define xgft4_hi xmm13 +%define xgft5_lo xmm12 +%define xgft5_hi xmm11 +%define xgft6_lo xmm10 +%define xgft6_hi xmm9 + +%define x0 xmm0 +%define xtmpa xmm1 +%define xtmph1 xmm2 +%define xtmpl1 xmm3 +%define xtmph2 xmm4 +%define xtmpl2 xmm5 +%define xtmph3 xmm6 +%define xtmpl3 xmm7 +%define xd1 xmm8 +%define xd2 xtmpl1 +%define xd3 xtmph1 + + +align 16 +global gf_6vect_mad_sse:function +func(gf_6vect_mad_sse) + FUNC_SAVE + sub len, 16 + jl .return_fail + + xor pos, pos + movdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte + + mov tmp, vec + sal vec_i, 5 ;Multiply by 32 + lea tmp3, [mul_array + vec_i] + sal tmp, 6 ;Multiply by 64 + + sal vec, 5 ;Multiply by 32 + lea vec_i, [tmp + vec] ;vec_i = 96 + lea mul_array, [tmp + vec_i] ;mul_array = 160 + + movdqu xgft5_lo, [tmp3+2*tmp] ;Load array Ex{00}, Ex{01}, ..., Ex{0f} + movdqu xgft5_hi, [tmp3+2*tmp+16] ; " Ex{00}, Ex{10}, ..., Ex{f0} + movdqu xgft4_lo, [tmp3+vec_i] ;Load array Dx{00}, Dx{01}, Dx{02}, ... + movdqu xgft4_hi, [tmp3+vec_i+16] ; " Dx{00}, Dx{10}, Dx{20}, ... , Dx{f0} + movdqu xgft6_lo, [tmp3+mul_array] ;Load array Fx{00}, Fx{01}, ..., Fx{0f} + movdqu xgft6_hi, [tmp3+mul_array+16] ; " Fx{00}, Fx{10}, ..., Fx{f0} + + mov dest2, [dest1+PS] + mov dest3, [dest1+2*PS] + mov dest4, [dest1+3*PS] ; reuse mul_array + mov dest5, [dest1+4*PS] + mov dest6, [dest1+5*PS] ; reuse vec_i + mov dest1, [dest1] + +.loop16: + XLDR x0, [src+pos] ;Get next source vector + + movdqu xtmpl1, [tmp3] ;Load array Ax{00}, Ax{01}, Ax{02}, ... + movdqu xtmph1, [tmp3+16] ; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0} + movdqu xtmpl2, [tmp3+vec] ;Load array Bx{00}, Bx{01}, Bx{02}, ... + movdqu xtmph2, [tmp3+vec+16] ; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0} + movdqu xtmpl3, [tmp3+2*vec] ;Load array Cx{00}, Cx{01}, Cx{02}, ... + movdqu xtmph3, [tmp3+2*vec+16] ; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0} + XLDR xd1, [dest1+pos] ;Get next dest vector + + movdqa xtmpa, x0 ;Keep unshifted copy of src + psraw x0, 4 ;Shift to put high nibble into bits 4-0 + pand x0, xmask0f ;Mask high src nibble in bits 4-0 + pand xtmpa, xmask0f ;Mask low src nibble in bits 4-0 + + ;dest1 + pshufb xtmph1, x0 ;Lookup mul table of high nibble + pshufb xtmpl1, xtmpa ;Lookup mul table of low nibble + pxor xtmph1, xtmpl1 ;GF add high and low partials + pxor xd1, xtmph1 + + XLDR xd2, [dest2+pos] ;reuse xtmpl1. Get next dest vector + XLDR xd3, [dest3+pos] ;reuse xtmph1. Get next dest3 vector + + ;dest2 + pshufb xtmph2, x0 ;Lookup mul table of high nibble + pshufb xtmpl2, xtmpa ;Lookup mul table of low nibble + pxor xtmph2, xtmpl2 ;GF add high and low partials + pxor xd2, xtmph2 + + ;dest3 + pshufb xtmph3, x0 ;Lookup mul table of high nibble + pshufb xtmpl3, xtmpa ;Lookup mul table of low nibble + pxor xtmph3, xtmpl3 ;GF add high and low partials + pxor xd3, xtmph3 + + XSTR [dest1+pos], xd1 ;Store result into dest1 + XSTR [dest2+pos], xd2 ;Store result into dest2 + XSTR [dest3+pos], xd3 ;Store result into dest3 + + movdqa xtmph1, xgft4_hi ;Reload const array registers + movdqa xtmpl1, xgft4_lo ;Reload const array registers + movdqa xtmph2, xgft5_hi ;Reload const array registers + movdqa xtmpl2, xgft5_lo ;Reload const array registers + movdqa xtmph3, xgft6_hi ;Reload const array registers + movdqa xtmpl3, xgft6_lo ;Reload const array registers + + ;dest4 + XLDR xd1, [dest4+pos] ;Get next dest vector + pshufb xtmph1, x0 ;Lookup mul table of high nibble + pshufb xtmpl1, xtmpa ;Lookup mul table of low nibble + pxor xtmph1, xtmpl1 ;GF add high and low partials + pxor xd1, xtmph1 + + XLDR xd2, [dest5+pos] ;reuse xtmpl1. Get next dest vector + XLDR xd3, [dest6+pos] ;reuse xtmph1. Get next dest vector + + ;dest5 + pshufb xtmph2, x0 ;Lookup mul table of high nibble + pshufb xtmpl2, xtmpa ;Lookup mul table of low nibble + pxor xtmph2, xtmpl2 ;GF add high and low partials + pxor xd2, xtmph2 + + ;dest6 + pshufb xtmph3, x0 ;Lookup mul table of high nibble + pshufb xtmpl3, xtmpa ;Lookup mul table of low nibble + pxor xtmph3, xtmpl3 ;GF add high and low partials + pxor xd3, xtmph3 + + XSTR [dest4+pos], xd1 ;Store result into dest4 + XSTR [dest5+pos], xd2 ;Store result into dest5 + XSTR [dest6+pos], xd3 ;Store result into dest6 + + add pos, 16 ;Loop on 16 bytes at a time + cmp pos, len + jle .loop16 + + lea tmp, [len + 16] + cmp pos, tmp + je .return_pass + +.lessthan16: + ;; Tail len + ;; Do one more overlap pass + ;; Overlapped offset length-16 + mov tmp, len ;Backup len as len=rdi + + XLDR x0, [src+tmp] ;Get next source vector + XLDR xd1, [dest4+tmp] ;Get next dest vector + XLDR xd2, [dest5+tmp] ;reuse xtmpl1. Get next dest vector + XLDR xd3, [dest6+tmp] ;reuse xtmph1. Get next dest vector + + sub len, pos + + movdqa xtmph3, [constip16] ;Load const of i + 16 + pinsrb xtmpl3, len.w, 15 + pshufb xtmpl3, xmask0f ;Broadcast len to all bytes + pcmpgtb xtmpl3, xtmph3 + + movdqa xtmpa, x0 ;Keep unshifted copy of src + psraw x0, 4 ;Shift to put high nibble into bits 4-0 + pand x0, xmask0f ;Mask high src nibble in bits 4-0 + pand xtmpa, xmask0f ;Mask low src nibble in bits 4-0 + + ;dest4 + pshufb xgft4_hi, x0 ;Lookup mul table of high nibble + pshufb xgft4_lo, xtmpa ;Lookup mul table of low nibble + pxor xgft4_hi, xgft4_lo ;GF add high and low partials + pand xgft4_hi, xtmpl3 + pxor xd1, xgft4_hi + + ;dest5 + pshufb xgft5_hi, x0 ;Lookup mul table of high nibble + pshufb xgft5_lo, xtmpa ;Lookup mul table of low nibble + pxor xgft5_hi, xgft5_lo ;GF add high and low partials + pand xgft5_hi, xtmpl3 + pxor xd2, xgft5_hi + + ;dest6 + pshufb xgft6_hi, x0 ;Lookup mul table of high nibble + pshufb xgft6_lo, xtmpa ;Lookup mul table of low nibble + pxor xgft6_hi, xgft6_lo ;GF add high and low partials + pand xgft6_hi, xtmpl3 + pxor xd3, xgft6_hi + + XSTR [dest4+tmp], xd1 ;Store result into dest4 + XSTR [dest5+tmp], xd2 ;Store result into dest5 + XSTR [dest6+tmp], xd3 ;Store result into dest6 + + movdqu xgft4_lo, [tmp3] ;Load array Ax{00}, Ax{01}, Ax{02}, ... + movdqu xgft4_hi, [tmp3+16] ; " Ax{00}, Ax{10}, Ax{20}, ... , Ax{f0} + movdqu xgft5_lo, [tmp3+vec] ;Load array Bx{00}, Bx{01}, Bx{02}, ... + movdqu xgft5_hi, [tmp3+vec+16] ; " Bx{00}, Bx{10}, Bx{20}, ... , Bx{f0} + movdqu xgft6_lo, [tmp3+2*vec] ;Load array Cx{00}, Cx{01}, Cx{02}, ... + movdqu xgft6_hi, [tmp3+2*vec+16] ; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0} + XLDR xd1, [dest1+tmp] ;Get next dest vector + XLDR xd2, [dest2+tmp] ;reuse xtmpl1. Get next dest vector + XLDR xd3, [dest3+tmp] ;reuse xtmph1. Get next dest3 vector + + ;dest1 + pshufb xgft4_hi, x0 ;Lookup mul table of high nibble + pshufb xgft4_lo, xtmpa ;Lookup mul table of low nibble + pxor xgft4_hi, xgft4_lo ;GF add high and low partials + pand xgft4_hi, xtmpl3 + pxor xd1, xgft4_hi + + ;dest2 + pshufb xgft5_hi, x0 ;Lookup mul table of high nibble + pshufb xgft5_lo, xtmpa ;Lookup mul table of low nibble + pxor xgft5_hi, xgft5_lo ;GF add high and low partials + pand xgft5_hi, xtmpl3 + pxor xd2, xgft5_hi + + ;dest3 + pshufb xgft6_hi, x0 ;Lookup mul table of high nibble + pshufb xgft6_lo, xtmpa ;Lookup mul table of low nibble + pxor xgft6_hi, xgft6_lo ;GF add high and low partials + pand xgft6_hi, xtmpl3 + pxor xd3, xgft6_hi + + XSTR [dest1+tmp], xd1 ;Store result into dest1 + XSTR [dest2+tmp], xd2 ;Store result into dest2 + XSTR [dest3+tmp], xd3 ;Store result into dest3 + +.return_pass: + FUNC_RESTORE + mov return, 0 + ret + +.return_fail: + FUNC_RESTORE + mov return, 1 + ret + +endproc_frame + +section .data + +align 16 + +mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f +constip16: + ddq 0xf0f1f2f3f4f5f6f7f8f9fafbfcfdfeff + +%macro slversion 4 +global %1_slver_%2%3%4 +global %1_slver +%1_slver: +%1_slver_%2%3%4: + dw 0x%4 + db 0x%3, 0x%2 +%endmacro +;;; func core, ver, snum +slversion gf_6vect_mad_sse, 00, 00, 020f diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_vect_dot_prod_avx.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_vect_dot_prod_avx.asm.s index 894783f6346..2fad367a887 100644 --- a/src/erasure-code/isa/isa-l/erasure_code/gf_vect_dot_prod_avx.asm.s +++ b/src/erasure-code/isa/isa-l/erasure_code/gf_vect_dot_prod_avx.asm.s @@ -1,5 +1,5 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; Copyright(c) 2011-2014 Intel Corporation All rights reserved. +; Copyright(c) 2011-2015 Intel Corporation All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions @@ -30,8 +30,6 @@ ;;; ;;; gf_vect_dot_prod_avx(len, vec, *g_tbls, **buffs, *dest); ;;; -;;; Author: Gregory Tucker - %ifidn __OUTPUT_FORMAT__, elf64 %define arg0 rdi @@ -39,12 +37,14 @@ %define arg2 rdx %define arg3 rcx %define arg4 r8 - %define arg5 r9 %define tmp r11 %define tmp2 r10 %define tmp3 r9 %define return rax + %macro SLDR 2 + %endmacro + %define SSTR SLDR %define PS 8 %define func(x) x: %define FUNC_SAVE @@ -62,6 +62,9 @@ %define tmp2 r10 %define tmp3 rdi ; must be saved and loaded %define return rax + %macro SLDR 2 + %endmacro + %define SSTR SLDR %define PS 8 %define frame_size 2*8 %define arg(x) [rsp + frame_size + PS + PS*x] @@ -80,6 +83,67 @@ %endmacro %endif +%ifidn __OUTPUT_FORMAT__, elf32 + +;;;================== High Address; +;;; arg4 +;;; arg3 +;;; arg2 +;;; arg1 +;;; arg0 +;;; return +;;;<================= esp of caller +;;; ebp +;;;<================= ebp = esp +;;; esi +;;; edi +;;; ebx +;;;<================= esp of callee +;;; +;;;================== Low Address; + + %define PS 4 + %define LOG_PS 2 + %define func(x) x: + %define arg(x) [ebp + PS*2 + PS*x] + + %define trans ecx ;trans is for the variables in stack + %define arg0 trans + %define arg0_m arg(0) + %define arg1 trans + %define arg1_m arg(1) + %define arg2 arg2_m + %define arg2_m arg(2) + %define arg3 ebx + %define arg4 trans + %define arg4_m arg(4) + %define tmp edx + %define tmp2 edi + %define tmp3 esi + %define return eax + %macro SLDR 2 ;; stack load/restore + mov %1, %2 + %endmacro + %define SSTR SLDR + + %macro FUNC_SAVE 0 + push ebp + mov ebp, esp + push esi + push edi + push ebx + mov arg3, arg(3) + %endmacro + + %macro FUNC_RESTORE 0 + pop ebx + pop edi + pop esi + mov esp, ebp + pop ebp + %endmacro + +%endif ; output formats %define len arg0 %define vec arg1 @@ -91,6 +155,12 @@ %define ptr tmp3 %define pos return + %ifidn PS,4 ;32-bit code + %define vec_m arg1_m + %define len_m arg0_m + %define dest_m arg4_m + %endif + %ifndef EC_ALIGNED_ADDR ;;; Use Un-aligned load/store %define XLDR vmovdqu @@ -106,10 +176,11 @@ %endif %endif +%ifidn PS,8 ; 64-bit code + default rel + [bits 64] +%endif -default rel - -[bits 64] section .text %define xmask0f xmm5 @@ -124,7 +195,9 @@ align 16 global gf_vect_dot_prod_avx:function func(gf_vect_dot_prod_avx) FUNC_SAVE + SLDR len, len_m sub len, 16 + SSTR len_m, len jl .return_fail xor pos, pos vmovdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte @@ -135,10 +208,12 @@ func(gf_vect_dot_prod_avx) xor vec_i, vec_i .next_vect: + mov ptr, [src+vec_i*PS] vmovdqu xgft_lo, [tmp] ;Load array Cx{00}, Cx{01}, ..., Cx{0f} vmovdqu xgft_hi, [tmp+16] ; " Cx{00}, Cx{10}, ..., Cx{f0} XLDR x0, [ptr+pos] ;Get next source vector + add tmp, 32 add vec_i, 1 @@ -150,11 +225,16 @@ func(gf_vect_dot_prod_avx) vpshufb xgft_lo, xgft_lo, xtmpa ;Lookup mul table of low nibble vpxor xgft_hi, xgft_hi, xgft_lo ;GF add high and low partials vpxor xp, xp, xgft_hi ;xp += partial + + SLDR vec, vec_m cmp vec_i, vec jl .next_vect + SLDR dest, dest_m XSTR [dest+pos], xp + add pos, 16 ;Loop on 16 bytes at a time + SLDR len, len_m cmp pos, len jle .loop16 @@ -182,7 +262,6 @@ section .data align 16 -poly: mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f @@ -195,6 +274,4 @@ global %1_slver db 0x%3, 0x%2 %endmacro ;;; func core, ver, snum -slversion gf_vect_dot_prod_avx, 02, 03, 0061 -; inform linker that this doesn't require executable stack -section .note.GNU-stack noalloc noexec nowrite progbits +slversion gf_vect_dot_prod_avx, 02, 04, 0061 diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_vect_dot_prod_avx2.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_vect_dot_prod_avx2.asm.s index f5f928748cd..737378058b3 100644 --- a/src/erasure-code/isa/isa-l/erasure_code/gf_vect_dot_prod_avx2.asm.s +++ b/src/erasure-code/isa/isa-l/erasure_code/gf_vect_dot_prod_avx2.asm.s @@ -1,5 +1,5 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; Copyright(c) 2011-2014 Intel Corporation All rights reserved. +; Copyright(c) 2011-2015 Intel Corporation All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions @@ -30,8 +30,6 @@ ;;; ;;; gf_vect_dot_prod_avx2(len, vec, *g_tbls, **buffs, *dest); ;;; -;;; Author: Gregory Tucker - %ifidn __OUTPUT_FORMAT__, elf64 %define arg0 rdi @@ -47,7 +45,10 @@ %define tmp2 r10 %define tmp3 r9 %define return rax - %define PS 8 + %macro SLDR 2 + %endmacro + %define SSTR SLDR + %define PS 8 %define func(x) x: %define FUNC_SAVE %define FUNC_RESTORE @@ -66,7 +67,10 @@ %define tmp2 r10 %define tmp3 rdi ; must be saved and loaded %define return rax - %define PS 8 + %macro SLDR 2 + %endmacro + %define SSTR SLDR + %define PS 8 %define frame_size 2*8 %define arg(x) [rsp + frame_size + PS + PS*x] @@ -84,6 +88,69 @@ %endmacro %endif +%ifidn __OUTPUT_FORMAT__, elf32 + +;;;================== High Address; +;;; arg4 +;;; arg3 +;;; arg2 +;;; arg1 +;;; arg0 +;;; return +;;;<================= esp of caller +;;; ebp +;;;<================= ebp = esp +;;; esi +;;; edi +;;; ebx +;;;<================= esp of callee +;;; +;;;================== Low Address; + + %define PS 4 + %define LOG_PS 2 + %define func(x) x: + %define arg(x) [ebp + PS*2 + PS*x] + + %define trans ecx ;trans is for the variables in stack + %define arg0 trans + %define arg0_m arg(0) + %define arg1 trans + %define arg1_m arg(1) + %define arg2 arg2_m + %define arg2_m arg(2) + %define arg3 ebx + %define arg4 trans + %define arg4_m arg(4) + %define tmp edx + %define tmp.w edx + %define tmp.b dl + %define tmp2 edi + %define tmp3 esi + %define return eax + %macro SLDR 2 ;stack load/restore + mov %1, %2 + %endmacro + %define SSTR SLDR + + %macro FUNC_SAVE 0 + push ebp + mov ebp, esp + push esi + push edi + push ebx + mov arg3, arg(3) + %endmacro + + %macro FUNC_RESTORE 0 + pop ebx + pop edi + pop esi + mov esp, ebp + pop ebp + %endmacro + +%endif ; output formats %define len arg0 %define vec arg1 @@ -95,6 +162,12 @@ %define ptr tmp3 %define pos return +%ifidn PS,4 ;32-bit code + %define vec_m arg1_m + %define len_m arg0_m + %define dest_m arg4_m +%endif + %ifndef EC_ALIGNED_ADDR ;;; Use Un-aligned load/store %define XLDR vmovdqu @@ -110,10 +183,11 @@ %endif %endif +%ifidn PS,8 ;64-bit code + default rel + [bits 64] +%endif -default rel - -[bits 64] section .text %define xmask0f ymm3 @@ -129,7 +203,9 @@ align 16 global gf_vect_dot_prod_avx2:function func(gf_vect_dot_prod_avx2) FUNC_SAVE + SLDR len, len_m sub len, 32 + SSTR len_m, len jl .return_fail xor pos, pos mov tmp.b, 0x0f @@ -142,6 +218,7 @@ func(gf_vect_dot_prod_avx2) xor vec_i, vec_i .next_vect: + mov ptr, [src+vec_i*PS] vmovdqu xgft_lo, [tmp] ;Load array Cx{00}, Cx{01}, Cx{02}, ... @@ -150,6 +227,7 @@ func(gf_vect_dot_prod_avx2) vperm2i128 xgft_lo, xgft_lo, xgft_lo, 0x00 ; swapped to lo | lo XLDR x0, [ptr+pos] ;Get next source vector + add tmp, 32 add vec_i, 1 @@ -161,11 +239,16 @@ func(gf_vect_dot_prod_avx2) vpshufb xgft_lo, xgft_lo, xtmpa ;Lookup mul table of low nibble vpxor xgft_hi, xgft_hi, xgft_lo ;GF add high and low partials vpxor xp, xp, xgft_hi ;xp += partial + + SLDR vec, vec_m cmp vec_i, vec jl .next_vect + SLDR dest, dest_m XSTR [dest+pos], xp + add pos, 32 ;Loop on 32 bytes at a time + SLDR len, len_m cmp pos, len jle .loop32 @@ -200,6 +283,4 @@ global %1_slver db 0x%3, 0x%2 %endmacro ;;; func core, ver, snum -slversion gf_vect_dot_prod_avx2, 04, 03, 0190 -; inform linker that this doesn't require executable stack -section .note.GNU-stack noalloc noexec nowrite progbits +slversion gf_vect_dot_prod_avx2, 04, 04, 0190 diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_vect_dot_prod_sse.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_vect_dot_prod_sse.asm.s index 2e13c186673..4d2c77f438c 100644 --- a/src/erasure-code/isa/isa-l/erasure_code/gf_vect_dot_prod_sse.asm.s +++ b/src/erasure-code/isa/isa-l/erasure_code/gf_vect_dot_prod_sse.asm.s @@ -1,5 +1,5 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; Copyright(c) 2011-2014 Intel Corporation All rights reserved. +; Copyright(c) 2011-2015 Intel Corporation All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions @@ -30,8 +30,6 @@ ;;; ;;; gf_vect_dot_prod_sse(len, vec, *g_tbls, **buffs, *dest); ;;; -;;; Author: Gregory Tucker - %ifidn __OUTPUT_FORMAT__, elf64 %define arg0 rdi @@ -44,6 +42,9 @@ %define tmp2 r10 %define tmp3 r9 %define return rax + %macro SLDR 2 + %endmacro + %define SSTR SLDR %define PS 8 %define func(x) x: %define FUNC_SAVE @@ -61,6 +62,9 @@ %define tmp2 r10 %define tmp3 rdi ; must be saved and loaded %define return rax + %macro SLDR 2 + %endmacro + %define SSTR SLDR %define PS 8 %define frame_size 2*8 %define arg(x) [rsp + frame_size + PS + PS*x] @@ -79,6 +83,67 @@ %endmacro %endif +%ifidn __OUTPUT_FORMAT__, elf32 + +;;;================== High Address; +;;; arg4 +;;; arg3 +;;; arg2 +;;; arg1 +;;; arg0 +;;; return +;;;<================= esp of caller +;;; ebp +;;;<================= ebp = esp +;;; esi +;;; edi +;;; ebx +;;;<================= esp of callee +;;; +;;;================== Low Address; + + %define PS 4 + %define LOG_PS 2 + %define func(x) x: + %define arg(x) [ebp + PS*2 + PS*x] + + %define trans ecx ;trans is for the variables in stack + %define arg0 trans + %define arg0_m arg(0) + %define arg1 trans + %define arg1_m arg(1) + %define arg2 arg2_m + %define arg2_m arg(2) + %define arg3 ebx + %define arg4 trans + %define arg4_m arg(4) + %define tmp edx + %define tmp2 edi + %define tmp3 esi + %define return eax + %macro SLDR 2 ;; stack load/restore + mov %1, %2 + %endmacro + %define SSTR SLDR + + %macro FUNC_SAVE 0 + push ebp + mov ebp, esp + push esi + push edi + push ebx + mov arg3, arg(3) + %endmacro + + %macro FUNC_RESTORE 0 + pop ebx + pop edi + pop esi + mov esp, ebp + pop ebp + %endmacro + +%endif ; output formats %define len arg0 %define vec arg1 @@ -90,6 +155,11 @@ %define ptr tmp3 %define pos return + %ifidn PS,4 ;32-bit code + %define vec_m arg1_m + %define len_m arg0_m + %define dest_m arg4_m + %endif %ifndef EC_ALIGNED_ADDR ;;; Use Un-aligned load/store @@ -106,10 +176,11 @@ %endif %endif +%ifidn PS,8 ;64-bit code + default rel + [bits 64] +%endif -default rel - -[bits 64] section .text %define xmask0f xmm5 @@ -124,7 +195,9 @@ align 16 global gf_vect_dot_prod_sse:function func(gf_vect_dot_prod_sse) FUNC_SAVE + SLDR len, len_m sub len, 16 + SSTR len_m, len jl .return_fail xor pos, pos movdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte @@ -135,26 +208,34 @@ func(gf_vect_dot_prod_sse) xor vec_i, vec_i .next_vect: + mov ptr, [src+vec_i*PS] movdqu xgft_lo, [tmp] ;Load array Cx{00}, Cx{01}, ..., Cx{0f} movdqu xgft_hi, [tmp+16] ; " Cx{00}, Cx{10}, ..., Cx{f0} XLDR x0, [ptr+pos] ;Get next source vector + add tmp, 32 add vec_i, 1 + movdqa xtmpa, x0 ;Keep unshifted copy of src psraw x0, 4 ;Shift to put high nibble into bits 4-0 pand x0, xmask0f ;Mask high src nibble in bits 4-0 pand xtmpa, xmask0f ;Mask low src nibble in bits 4-0 + pshufb xgft_hi, x0 ;Lookup mul table of high nibble pshufb xgft_lo, xtmpa ;Lookup mul table of low nibble pxor xgft_hi, xgft_lo ;GF add high and low partials pxor xp, xgft_hi ;xp += partial + + SLDR vec, vec_m cmp vec_i, vec jl .next_vect + SLDR dest, dest_m XSTR [dest+pos], xp add pos, 16 ;Loop on 16 bytes at a time + SLDR len, len_m cmp pos, len jle .loop16 @@ -181,6 +262,7 @@ endproc_frame section .data align 16 + mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f %macro slversion 4 @@ -192,6 +274,4 @@ global %1_slver db 0x%3, 0x%2 %endmacro ;;; func core, ver, snum -slversion gf_vect_dot_prod_sse, 00, 03, 0060 -; inform linker that this doesn't require executable stack -section .note.GNU-stack noalloc noexec nowrite progbits +slversion gf_vect_dot_prod_sse, 00, 04, 0060 diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_vect_mad_avx.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_vect_mad_avx.asm.s new file mode 100644 index 00000000000..4874b030c96 --- /dev/null +++ b/src/erasure-code/isa/isa-l/erasure_code/gf_vect_mad_avx.asm.s @@ -0,0 +1,202 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2015 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;; +;;; gf_vect_mad_avx(len, vec, vec_i, mul_array, src, dest); +;;; + +%ifidn __OUTPUT_FORMAT__, win64 + %define arg0 rcx + %define arg0.w ecx + %define arg1 rdx + %define arg2 r8 + %define arg3 r9 + %define arg4 r12 + %define arg5 r15 + %define tmp r11 + %define return rax + %define return.w eax + %define PS 8 + %define stack_size 16*3 + 3*8 + %define arg(x) [rsp + stack_size + PS + PS*x] + %define func(x) proc_frame x + +%macro FUNC_SAVE 0 + sub rsp, stack_size + vmovdqa [rsp+16*0],xmm6 + vmovdqa [rsp+16*1],xmm7 + vmovdqa [rsp+16*2],xmm8 + save_reg r12, 3*16 + 0*8 + save_reg r15, 3*16 + 1*8 + end_prolog + mov arg4, arg(4) + mov arg5, arg(5) +%endmacro + +%macro FUNC_RESTORE 0 + vmovdqa xmm6, [rsp+16*0] + vmovdqa xmm7, [rsp+16*1] + vmovdqa xmm8, [rsp+16*2] + mov r12, [rsp + 3*16 + 0*8] + mov r15, [rsp + 3*16 + 1*8] + add rsp, stack_size +%endmacro + +%elifidn __OUTPUT_FORMAT__, elf64 + %define arg0 rdi + %define arg0.w edi + %define arg1 rsi + %define arg2 rdx + %define arg3 rcx + %define arg4 r8 + %define arg5 r9 + %define tmp r11 + %define return rax + %define return.w eax + + %define func(x) x: + %define FUNC_SAVE + %define FUNC_RESTORE +%endif + +;;; gf_vect_mad_avx(len, vec, vec_i, mul_array, src, dest) +%define len arg0 +%define len.w arg0.w +%define vec arg1 +%define vec_i arg2 +%define mul_array arg3 +%define src arg4 +%define dest arg5 +%define pos return +%define pos.w return.w + +%ifndef EC_ALIGNED_ADDR +;;; Use Un-aligned load/store + %define XLDR vmovdqu + %define XSTR vmovdqu +%else +;;; Use Non-temporal load/stor + %ifdef NO_NT_LDST + %define XLDR vmovdqa + %define XSTR vmovdqa + %else + %define XLDR vmovntdqa + %define XSTR vmovntdq + %endif +%endif + + +default rel + +[bits 64] +section .text + +%define xmask0f xmm8 +%define xgft_lo xmm7 +%define xgft_hi xmm6 + +%define x0 xmm0 +%define xtmpa xmm1 +%define xtmph xmm2 +%define xtmpl xmm3 +%define xd xmm4 +%define xtmpd xmm5 + +align 16 +global gf_vect_mad_avx:function +func(gf_vect_mad_avx) + FUNC_SAVE + sub len, 16 + jl .return_fail + + xor pos, pos + vmovdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte + + sal vec_i, 5 ;Multiply by 32 + vmovdqu xgft_lo, [vec_i+mul_array] ;Load array Cx{00}, Cx{01}, Cx{02}, ... + vmovdqu xgft_hi, [vec_i+mul_array+16] ; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0} + + XLDR xtmpd, [dest+len] ;backup the last 16 bytes in dest + +.loop16: + XLDR xd, [dest+pos] ;Get next dest vector +.loop16_overlap: + XLDR x0, [src+pos] ;Get next source vector + + vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0 + vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0 + vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0 + + vpshufb xtmph, xgft_hi, x0 ;Lookup mul table of high nibble + vpshufb xtmpl, xgft_lo, xtmpa ;Lookup mul table of low nibble + vpxor xtmph, xtmph, xtmpl ;GF add high and low partials + vpxor xd, xd, xtmph ;xd += partial + + XSTR [dest+pos], xd + add pos, 16 ;Loop on 16 bytes at a time + cmp pos, len + jle .loop16 + + lea tmp, [len + 16] + cmp pos, tmp + je .return_pass + + ;; Tail len + mov pos, len ;Overlapped offset length-16 + vmovdqa xd, xtmpd ;Restore xd + jmp .loop16_overlap ;Do one more overlap pass + +.return_pass: + mov return, 0 + FUNC_RESTORE + ret + +.return_fail: + mov return, 1 + FUNC_RESTORE + ret + +endproc_frame + +section .data + +align 16 + +mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f + +%macro slversion 4 +global %1_slver_%2%3%4 +global %1_slver +%1_slver: +%1_slver_%2%3%4: + dw 0x%4 + db 0x%3, 0x%2 +%endmacro +;;; func core, ver, snum +slversion gf_vect_mad_avx, 02, 00, 0201 diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_vect_mad_avx2.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_vect_mad_avx2.asm.s new file mode 100644 index 00000000000..5ee9c2f93ca --- /dev/null +++ b/src/erasure-code/isa/isa-l/erasure_code/gf_vect_mad_avx2.asm.s @@ -0,0 +1,209 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2015 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;; +;;; gf_vect_mad_avx2(len, vec, vec_i, mul_array, src, dest); +;;; + +%ifidn __OUTPUT_FORMAT__, win64 + %define arg0 rcx + %define arg0.w ecx + %define arg1 rdx + %define arg2 r8 + %define arg3 r9 + %define arg4 r12 ; must be saved and loaded + %define arg5 r15 + + %define tmp r11 + %define tmp.w r11d + %define tmp.b r11b + %define return rax + %define return.w eax + %define PS 8 + %define stack_size 16*3 + 3*8 + %define arg(x) [rsp + stack_size + PS + PS*x] + %define func(x) proc_frame x + + %macro FUNC_SAVE 0 + sub rsp, stack_size + vmovdqa [rsp+16*0],xmm6 + vmovdqa [rsp+16*1],xmm7 + vmovdqa [rsp+16*2],xmm8 + save_reg r12, 3*16 + 0*8 + save_reg r15, 3*16 + 1*8 + end_prolog + mov arg4, arg(4) + mov arg5, arg(5) + %endmacro + + %macro FUNC_RESTORE 0 + vmovdqa xmm6, [rsp+16*0] + vmovdqa xmm7, [rsp+16*1] + vmovdqa xmm8, [rsp+16*2] + mov r12, [rsp + 3*16 + 0*8] + mov r15, [rsp + 3*16 + 1*8] + add rsp, stack_size + %endmacro + +%elifidn __OUTPUT_FORMAT__, elf64 + %define arg0 rdi + %define arg0.w edi + %define arg1 rsi + %define arg2 rdx + %define arg3 rcx + %define arg4 r8 + %define arg5 r9 + + %define tmp r11 + %define tmp.w r11d + %define tmp.b r11b + %define return rax + %define return.w eax + + %define func(x) x: + %define FUNC_SAVE + %define FUNC_RESTORE +%endif + + +;;; gf_vect_mad_avx2(len, vec, vec_i, mul_array, src, dest) +%define len arg0 +%define len.w arg0.w +%define vec arg1 +%define vec_i arg2 +%define mul_array arg3 +%define src arg4 +%define dest arg5 +%define pos return +%define pos.w return.w + +%ifndef EC_ALIGNED_ADDR +;;; Use Un-aligned load/store + %define XLDR vmovdqu + %define XSTR vmovdqu +%else +;;; Use Non-temporal load/stor + %ifdef NO_NT_LDST + %define XLDR vmovdqa + %define XSTR vmovdqa + %else + %define XLDR vmovntdqa + %define XSTR vmovntdq + %endif +%endif + + +default rel + +[bits 64] +section .text + +%define xmask0f ymm8 +%define xmask0fx xmm8 +%define xgft_lo ymm7 +%define xgft_hi ymm6 + +%define x0 ymm0 +%define xtmpa ymm1 +%define xtmph ymm2 +%define xtmpl ymm3 +%define xd ymm4 +%define xtmpd ymm5 + +align 16 +global gf_vect_mad_avx2:function +func(gf_vect_mad_avx2) + FUNC_SAVE + sub len, 32 + jl .return_fail + xor pos, pos + mov tmp.b, 0x0f + vpinsrb xmask0fx, xmask0fx, tmp.w, 0 + vpbroadcastb xmask0f, xmask0fx ;Construct mask 0x0f0f0f... + + sal vec_i, 5 ;Multiply by 32 + vmovdqu xgft_lo, [vec_i+mul_array] ;Load array Cx{00}, Cx{01}, Cx{02}, ... + ; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0} + vperm2i128 xgft_hi, xgft_lo, xgft_lo, 0x11 ; swapped to hi | hi + vperm2i128 xgft_lo, xgft_lo, xgft_lo, 0x00 ; swapped to lo | lo + + XLDR xtmpd, [dest+len] ;backup the last 32 bytes in dest + +.loop32: + XLDR xd, [dest+pos] ;Get next dest vector +.loop32_overlap: + XLDR x0, [src+pos] ;Get next source vector + + vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0 + vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0 + vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0 + + vpshufb xtmph, xgft_hi, x0 ;Lookup mul table of high nibble + vpshufb xtmpl, xgft_lo, xtmpa ;Lookup mul table of low nibble + vpxor xtmph, xtmph, xtmpl ;GF add high and low partials + vpxor xd, xd, xtmph ;xd += partial + + XSTR [dest+pos], xd + add pos, 32 ;Loop on 32 bytes at a time + cmp pos, len + jle .loop32 + + lea tmp, [len + 32] + cmp pos, tmp + je .return_pass + + ;; Tail len + mov pos, len ;Overlapped offset length-32 + vmovdqa xd, xtmpd ;Restore xd + jmp .loop32_overlap ;Do one more overlap pass + +.return_pass: + mov return, 0 + FUNC_RESTORE + ret + +.return_fail: + mov return, 1 + FUNC_RESTORE + ret + +endproc_frame + +section .data + +%macro slversion 4 +global %1_slver_%2%3%4 +global %1_slver +%1_slver: +%1_slver_%2%3%4: + dw 0x%4 + db 0x%3, 0x%2 +%endmacro +;;; func core, ver, snum +slversion gf_vect_mad_avx2, 04, 00, 0202 diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_vect_mad_sse.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_vect_mad_sse.asm.s new file mode 100644 index 00000000000..bfe8f31de82 --- /dev/null +++ b/src/erasure-code/isa/isa-l/erasure_code/gf_vect_mad_sse.asm.s @@ -0,0 +1,203 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2015 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;; +;;; gf_vect_mad_sse(len, vec, vec_i, mul_array, src, dest); +;;; + +%ifidn __OUTPUT_FORMAT__, win64 + %define arg0 rcx + %define arg0.w ecx + %define arg1 rdx + %define arg2 r8 + %define arg3 r9 + %define arg4 r12 + %define arg5 r15 + %define tmp r11 + %define return rax + %define return.w eax + %define PS 8 + %define stack_size 16*3 + 3*8 + %define arg(x) [rsp + stack_size + PS + PS*x] + %define func(x) proc_frame x + +%macro FUNC_SAVE 0 + sub rsp, stack_size + movdqa [rsp+16*0],xmm6 + movdqa [rsp+16*1],xmm7 + movdqa [rsp+16*2],xmm8 + save_reg r12, 3*16 + 0*8 + save_reg r15, 3*16 + 1*8 + end_prolog + mov arg4, arg(4) + mov arg5, arg(5) +%endmacro + +%macro FUNC_RESTORE 0 + movdqa xmm6, [rsp+16*0] + movdqa xmm7, [rsp+16*1] + movdqa xmm8, [rsp+16*2] + mov r12, [rsp + 3*16 + 0*8] + mov r15, [rsp + 3*16 + 1*8] + add rsp, stack_size +%endmacro + +%elifidn __OUTPUT_FORMAT__, elf64 + %define arg0 rdi + %define arg0.w edi + %define arg1 rsi + %define arg2 rdx + %define arg3 rcx + %define arg4 r8 + %define arg5 r9 + %define tmp r11 + %define return rax + %define return.w eax + + %define func(x) x: + %define FUNC_SAVE + %define FUNC_RESTORE +%endif + +;;; gf_vect_mad_sse(len, vec, vec_i, mul_array, src, dest) +%define len arg0 +%define len.w arg0.w +%define vec arg1 +%define vec_i arg2 +%define mul_array arg3 +%define src arg4 +%define dest arg5 +%define pos return +%define pos.w return.w + +%ifndef EC_ALIGNED_ADDR +;;; Use Un-aligned load/store + %define XLDR movdqu + %define XSTR movdqu +%else +;;; Use Non-temporal load/stor + %ifdef NO_NT_LDST + %define XLDR movdqa + %define XSTR movdqa + %else + %define XLDR movntdqa + %define XSTR movntdq + %endif +%endif + +default rel + +[bits 64] +section .text + +%define xmask0f xmm8 +%define xgft_lo xmm7 +%define xgft_hi xmm6 + +%define x0 xmm0 +%define xtmpa xmm1 +%define xtmph xmm2 +%define xtmpl xmm3 +%define xd xmm4 +%define xtmpd xmm5 + + +align 16 +global gf_vect_mad_sse:function +func(gf_vect_mad_sse) + FUNC_SAVE + sub len, 16 + jl .return_fail + + xor pos, pos + movdqa xmask0f, [mask0f] ;Load mask of lower nibble in each byte + sal vec_i, 5 ;Multiply by 32 + movdqu xgft_lo, [vec_i+mul_array] ;Load array Cx{00}, Cx{01}, Cx{02}, ... + movdqu xgft_hi, [vec_i+mul_array+16] ; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0} + + XLDR xtmpd, [dest+len] ;backup the last 16 bytes in dest + +.loop16: + XLDR xd, [dest+pos] ;Get next dest vector +.loop16_overlap: + XLDR x0, [src+pos] ;Get next source vector + movdqa xtmph, xgft_hi ;Reload const array registers + movdqa xtmpl, xgft_lo + movdqa xtmpa, x0 ;Keep unshifted copy of src + psraw x0, 4 ;Shift to put high nibble into bits 4-0 + pand x0, xmask0f ;Mask high src nibble in bits 4-0 + pand xtmpa, xmask0f ;Mask low src nibble in bits 4-0 + pshufb xtmph, x0 ;Lookup mul table of high nibble + pshufb xtmpl, xtmpa ;Lookup mul table of low nibble + pxor xtmph, xtmpl ;GF add high and low partials + + pxor xd, xtmph + XSTR [dest+pos], xd ;Store result + + add pos, 16 ;Loop on 16 bytes at a time + cmp pos, len + jle .loop16 + + lea tmp, [len + 16] + cmp pos, tmp + je .return_pass + + ;; Tail len + mov pos, len ;Overlapped offset length-16 + movdqa xd, xtmpd ;Restore xd + jmp .loop16_overlap ;Do one more overlap pass + +.return_pass: + mov return, 0 + FUNC_RESTORE + ret + +.return_fail: + mov return, 1 + FUNC_RESTORE + ret + +endproc_frame + +section .data + +align 16 + +mask0f: ddq 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f + +%macro slversion 4 +global %1_slver_%2%3%4 +global %1_slver +%1_slver: +%1_slver_%2%3%4: + dw 0x%4 + db 0x%3, 0x%2 +%endmacro +;;; func core, ver, snum +slversion gf_vect_mad_sse, 00, 00, 0200 diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_vect_mul_avx.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_vect_mul_avx.asm.s index 0536ed7950a..1924da71566 100644 --- a/src/erasure-code/isa/isa-l/erasure_code/gf_vect_mul_avx.asm.s +++ b/src/erasure-code/isa/isa-l/erasure_code/gf_vect_mul_avx.asm.s @@ -1,5 +1,5 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; Copyright(c) 2011-2014 Intel Corporation All rights reserved. +; Copyright(c) 2011-2015 Intel Corporation All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions @@ -30,8 +30,6 @@ ;;; ;;; gf_vect_mul_avx(len, mul_array, src, dest) ;;; -;;; Author: Gregory Tucker - %ifidn __OUTPUT_FORMAT__, elf64 %define arg0 rdi @@ -170,5 +168,3 @@ global %1_slver %endmacro ;;; func core, ver, snum slversion gf_vect_mul_avx, 01, 02, 0036 -; inform linker that this doesn't require executable stack -section .note.GNU-stack noalloc noexec nowrite progbits diff --git a/src/erasure-code/isa/isa-l/erasure_code/gf_vect_mul_sse.asm.s b/src/erasure-code/isa/isa-l/erasure_code/gf_vect_mul_sse.asm.s index c6d7d586967..61ecaac9dd6 100644 --- a/src/erasure-code/isa/isa-l/erasure_code/gf_vect_mul_sse.asm.s +++ b/src/erasure-code/isa/isa-l/erasure_code/gf_vect_mul_sse.asm.s @@ -1,5 +1,5 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; Copyright(c) 2011-2014 Intel Corporation All rights reserved. +; Copyright(c) 2011-2015 Intel Corporation All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions @@ -30,8 +30,6 @@ ;;; ;;; gf_vect_mul_sse(len, mul_array, src, dest) ;;; -;;; Author: Gregory Tucker - %ifidn __OUTPUT_FORMAT__, elf64 %define arg0 rdi @@ -176,5 +174,3 @@ global %1_slver %endmacro ;;; func core, ver, snum slversion gf_vect_mul_sse, 00, 02, 0034 -; inform linker that this doesn't require executable stack -section .note.GNU-stack noalloc noexec nowrite progbits diff --git a/src/erasure-code/isa/isa-l/include/erasure_code.h b/src/erasure-code/isa/isa-l/include/erasure_code.h index 0f3b6db0825..53e480f0193 100644 --- a/src/erasure-code/isa/isa-l/include/erasure_code.h +++ b/src/erasure-code/isa/isa-l/include/erasure_code.h @@ -1,5 +1,5 @@ /********************************************************************** - Copyright(c) 2011-2014 Intel Corporation All rights reserved. + Copyright(c) 2011-2015 Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions @@ -74,73 +74,128 @@ extern "C" { void ec_init_tables(int k, int rows, unsigned char* a, unsigned char* gftbls); /** - * @brief Generate or decode erasure codes on blocks of data. + * @brief Generate or decode erasure codes on blocks of data, runs appropriate version. * * Given a list of source data blocks, generate one or multiple blocks of * encoded data as specified by a matrix of GF(2^8) coefficients. When given a * suitable set of coefficients, this function will perform the fast generation * or decoding of Reed-Solomon type erasure codes. * - * @requires SSE4.1 + * This function determines what instruction sets are enabled and + * selects the appropriate version at runtime. + * * @param len Length of each block of data (vector) of source or dest data. * @param k The number of vector sources or rows in the generator matrix * for coding. * @param rows The number of output vectors to concurrently encode/decode. * @param gftbls Pointer to array of input tables generated from coding - * coefficients in ec_init_tables(). Must be of size 32*k*rows + * coefficients in ec_init_tables(). Must be of size 32*k*rows * @param data Array of pointers to source input buffers. * @param coding Array of pointers to coded output buffers. * @returns none */ -void ec_encode_data_sse(int len, int k, int rows, unsigned char *gftbls, unsigned char **data, unsigned char **coding); +void ec_encode_data(int len, int k, int rows, unsigned char *gftbls, unsigned char **data, + unsigned char **coding); +/** + * @brief Generate or decode erasure codes on blocks of data. + * + * Arch specific version of ec_encode_data() with same parameters. + * @requires SSE4.1 + */ +void ec_encode_data_sse(int len, int k, int rows, unsigned char *gftbls, unsigned char **data, + unsigned char **coding); /** - * @brief Generate or decode erasure codes on blocks of data, runs appropriate version. + * @brief Generate or decode erasure codes on blocks of data. * - * Given a list of source data blocks, generate one or multiple blocks of - * encoded data as specified by a matrix of GF(2^8) coefficients. When given a - * suitable set of coefficients, this function will perform the fast generation - * or decoding of Reed-Solomon type erasure codes. + * Arch specific version of ec_encode_data() with same parameters. + * @requires AVX + */ +void ec_encode_data_avx(int len, int k, int rows, unsigned char *gftbls, unsigned char **data, + unsigned char **coding); + +/** + * @brief Generate or decode erasure codes on blocks of data. * - * This function determines what instruction sets are enabled and - * selects the appropriate version at runtime. + * Arch specific version of ec_encode_data() with same parameters. + * @requires AVX2 + */ +void ec_encode_data_avx2(int len, int k, int rows, unsigned char *gftbls, unsigned char **data, + unsigned char **coding); + +/** + * @brief Generate or decode erasure codes on blocks of data, runs baseline version. + * + * Baseline version of ec_encode_data() with same parameters. + */ +void ec_encode_data_base(int len, int srcs, int dests, unsigned char *v, unsigned char **src, + unsigned char **dest); + +/** + * @brief Generate update for encode or decode of erasure codes from single source, runs appropriate version. + * + * Given one source data block, update one or multiple blocks of encoded data as + * specified by a matrix of GF(2^8) coefficients. When given a suitable set of + * coefficients, this function will perform the fast generation or decoding of + * Reed-Solomon type erasure codes from one input source at a time. + * + * This function determines what instruction sets are enabled and selects the + * appropriate version at runtime. * * @param len Length of each block of data (vector) of source or dest data. * @param k The number of vector sources or rows in the generator matrix * for coding. * @param rows The number of output vectors to concurrently encode/decode. - * @param gftbls Pointer to array of input tables generated from coding + * @param vec_i The vector index corresponding to the single input source. + * @param g_tbls Pointer to array of input tables generated from coding * coefficients in ec_init_tables(). Must be of size 32*k*rows - * @param data Array of pointers to source input buffers. + * @param data Pointer to single input source used to update output parity. * @param coding Array of pointers to coded output buffers. * @returns none */ +void ec_encode_data_update(int len, int k, int rows, int vec_i, unsigned char *g_tbls, + unsigned char *data, unsigned char **coding); -void ec_encode_data(int len, int k, int rows, unsigned char *gftbls, unsigned char **data, unsigned char **coding); +/** + * @brief Generate update for encode or decode of erasure codes from single source. + * + * Arch specific version of ec_encode_data_update() with same parameters. + * @requires SSE4.1 + */ +void ec_encode_data_update_sse(int len, int k, int rows, int vec_i, unsigned char *g_tbls, + unsigned char *data, unsigned char **coding); /** - * @brief Generate or decode erasure codes on blocks of data, runs baseline version. + * @brief Generate update for encode or decode of erasure codes from single source. * - * Given a list of source data blocks, generate one or multiple blocks of - * encoded data as specified by a matrix of GF(2^8) coefficients. When given a - * suitable set of coefficients, this function will perform the fast generation - * or decoding of Reed-Solomon type erasure codes. + * Arch specific version of ec_encode_data_update() with same parameters. + * @requires AVX + */ + +void ec_encode_data_update_avx(int len, int k, int rows, int vec_i, unsigned char *g_tbls, + unsigned char *data, unsigned char **coding); + +/** + * @brief Generate update for encode or decode of erasure codes from single source. * - * @param len Length of each block of data (vector) of source or dest data. - * @param srcs The number of vector sources or rows in the generator matrix - * for coding. - * @param dests The number of output vectors to concurrently encode/decode. - * @param v Pointer to array of input tables generated from coding - * coefficients in ec_init_tables(). Must be of size 32*k*rows - * @param src Array of pointers to source input buffers. - * @param dest Array of pointers to coded output buffers. - * @returns none + * Arch specific version of ec_encode_data_update() with same parameters. + * @requires AVX2 + */ + +void ec_encode_data_update_avx2(int len, int k, int rows, int vec_i, unsigned char *g_tbls, + unsigned char *data, unsigned char **coding); + +/** + * @brief Generate update for encode or decode of erasure codes from single source. + * + * Baseline version of ec_encode_data_update(). */ -void ec_encode_data_base(int len, int srcs, int dests, unsigned char *v, unsigned char **src, unsigned char **dest); +void ec_encode_data_update_base(int len, int k, int rows, int vec_i, unsigned char *v, + unsigned char *data, unsigned char **dest); /** @@ -150,8 +205,8 @@ void ec_encode_data_base(int len, int srcs, int dests, unsigned char *v, unsigne * set of coefficients to produce each byte of the output. Can be used for * erasure coding encode and decode. Function requires pre-calculation of a * 32*vlen byte constant array based on the input coefficients. - * * @requires SSE4.1 + * * @param len Length of each vector in bytes. Must be >= 16. * @param vlen Number of vector sources. * @param gftbls Pointer to 32*vlen byte array of pre-calculated constants based @@ -171,8 +226,8 @@ void gf_vect_dot_prod_sse(int len, int vlen, unsigned char *gftbls, * set of coefficients to produce each byte of the output. Can be used for * erasure coding encode and decode. Function requires pre-calculation of a * 32*vlen byte constant array based on the input coefficients. - * * @requires AVX + * * @param len Length of each vector in bytes. Must be >= 16. * @param vlen Number of vector sources. * @param gftbls Pointer to 32*vlen byte array of pre-calculated constants based @@ -192,8 +247,8 @@ void gf_vect_dot_prod_avx(int len, int vlen, unsigned char *gftbls, * set of coefficients to produce each byte of the output. Can be used for * erasure coding encode and decode. Function requires pre-calculation of a * 32*vlen byte constant array based on the input coefficients. - * * @requires AVX2 + * * @param len Length of each vector in bytes. Must be >= 32. * @param vlen Number of vector sources. * @param gftbls Pointer to 32*vlen byte array of pre-calculated constants based @@ -214,8 +269,8 @@ void gf_vect_dot_prod_avx2(int len, int vlen, unsigned char *gftbls, * sets of coefficients to produce each byte of the outputs. Can be used for * erasure coding encode and decode. Function requires pre-calculation of a * 2*32*vlen byte constant array based on the two sets of input coefficients. - * * @requires SSE4.1 + * * @param len Length of each vector in bytes. Must be >= 16. * @param vlen Number of vector sources. * @param gftbls Pointer to 2*32*vlen byte array of pre-calculated constants @@ -236,8 +291,8 @@ void gf_2vect_dot_prod_sse(int len, int vlen, unsigned char *gftbls, * sets of coefficients to produce each byte of the outputs. Can be used for * erasure coding encode and decode. Function requires pre-calculation of a * 2*32*vlen byte constant array based on the two sets of input coefficients. - * * @requires AVX + * * @param len Length of each vector in bytes. Must be >= 16. * @param vlen Number of vector sources. * @param gftbls Pointer to 2*32*vlen byte array of pre-calculated constants @@ -258,8 +313,8 @@ void gf_2vect_dot_prod_avx(int len, int vlen, unsigned char *gftbls, * sets of coefficients to produce each byte of the outputs. Can be used for * erasure coding encode and decode. Function requires pre-calculation of a * 2*32*vlen byte constant array based on the two sets of input coefficients. - * * @requires AVX2 + * * @param len Length of each vector in bytes. Must be >= 32. * @param vlen Number of vector sources. * @param gftbls Pointer to 2*32*vlen byte array of pre-calculated constants @@ -280,8 +335,8 @@ void gf_2vect_dot_prod_avx2(int len, int vlen, unsigned char *gftbls, * sets of coefficients to produce each byte of the outputs. Can be used for * erasure coding encode and decode. Function requires pre-calculation of a * 3*32*vlen byte constant array based on the three sets of input coefficients. - * * @requires SSE4.1 + * * @param len Length of each vector in bytes. Must be >= 16. * @param vlen Number of vector sources. * @param gftbls Pointer to 3*32*vlen byte array of pre-calculated constants @@ -302,8 +357,8 @@ void gf_3vect_dot_prod_sse(int len, int vlen, unsigned char *gftbls, * sets of coefficients to produce each byte of the outputs. Can be used for * erasure coding encode and decode. Function requires pre-calculation of a * 3*32*vlen byte constant array based on the three sets of input coefficients. - * * @requires AVX + * * @param len Length of each vector in bytes. Must be >= 16. * @param vlen Number of vector sources. * @param gftbls Pointer to 3*32*vlen byte array of pre-calculated constants @@ -324,8 +379,8 @@ void gf_3vect_dot_prod_avx(int len, int vlen, unsigned char *gftbls, * sets of coefficients to produce each byte of the outputs. Can be used for * erasure coding encode and decode. Function requires pre-calculation of a * 3*32*vlen byte constant array based on the three sets of input coefficients. - * * @requires AVX2 + * * @param len Length of each vector in bytes. Must be >= 32. * @param vlen Number of vector sources. * @param gftbls Pointer to 3*32*vlen byte array of pre-calculated constants @@ -346,8 +401,8 @@ void gf_3vect_dot_prod_avx2(int len, int vlen, unsigned char *gftbls, * sets of coefficients to produce each byte of the outputs. Can be used for * erasure coding encode and decode. Function requires pre-calculation of a * 4*32*vlen byte constant array based on the four sets of input coefficients. - * * @requires SSE4.1 + * * @param len Length of each vector in bytes. Must be >= 16. * @param vlen Number of vector sources. * @param gftbls Pointer to 4*32*vlen byte array of pre-calculated constants @@ -368,8 +423,8 @@ void gf_4vect_dot_prod_sse(int len, int vlen, unsigned char *gftbls, * sets of coefficients to produce each byte of the outputs. Can be used for * erasure coding encode and decode. Function requires pre-calculation of a * 4*32*vlen byte constant array based on the four sets of input coefficients. - * * @requires AVX + * * @param len Length of each vector in bytes. Must be >= 16. * @param vlen Number of vector sources. * @param gftbls Pointer to 4*32*vlen byte array of pre-calculated constants @@ -390,8 +445,8 @@ void gf_4vect_dot_prod_avx(int len, int vlen, unsigned char *gftbls, * sets of coefficients to produce each byte of the outputs. Can be used for * erasure coding encode and decode. Function requires pre-calculation of a * 4*32*vlen byte constant array based on the four sets of input coefficients. - * * @requires AVX2 + * * @param len Length of each vector in bytes. Must be >= 32. * @param vlen Number of vector sources. * @param gftbls Pointer to 4*32*vlen byte array of pre-calculated constants @@ -412,8 +467,8 @@ void gf_4vect_dot_prod_avx2(int len, int vlen, unsigned char *gftbls, * sets of coefficients to produce each byte of the outputs. Can be used for * erasure coding encode and decode. Function requires pre-calculation of a * 5*32*vlen byte constant array based on the five sets of input coefficients. - * * @requires SSE4.1 + * * @param len Length of each vector in bytes. Must >= 16. * @param vlen Number of vector sources. * @param gftbls Pointer to 5*32*vlen byte array of pre-calculated constants @@ -434,8 +489,8 @@ void gf_5vect_dot_prod_sse(int len, int vlen, unsigned char *gftbls, * sets of coefficients to produce each byte of the outputs. Can be used for * erasure coding encode and decode. Function requires pre-calculation of a * 5*32*vlen byte constant array based on the five sets of input coefficients. - * * @requires AVX + * * @param len Length of each vector in bytes. Must >= 16. * @param vlen Number of vector sources. * @param gftbls Pointer to 5*32*vlen byte array of pre-calculated constants @@ -456,8 +511,8 @@ void gf_5vect_dot_prod_avx(int len, int vlen, unsigned char *gftbls, * sets of coefficients to produce each byte of the outputs. Can be used for * erasure coding encode and decode. Function requires pre-calculation of a * 5*32*vlen byte constant array based on the five sets of input coefficients. - * * @requires AVX2 + * * @param len Length of each vector in bytes. Must >= 32. * @param vlen Number of vector sources. * @param gftbls Pointer to 5*32*vlen byte array of pre-calculated constants @@ -478,8 +533,8 @@ void gf_5vect_dot_prod_avx2(int len, int vlen, unsigned char *gftbls, * sets of coefficients to produce each byte of the outputs. Can be used for * erasure coding encode and decode. Function requires pre-calculation of a * 6*32*vlen byte constant array based on the six sets of input coefficients. - * * @requires SSE4.1 + * * @param len Length of each vector in bytes. Must be >= 16. * @param vlen Number of vector sources. * @param gftbls Pointer to 6*32*vlen byte array of pre-calculated constants @@ -500,8 +555,8 @@ void gf_6vect_dot_prod_sse(int len, int vlen, unsigned char *gftbls, * sets of coefficients to produce each byte of the outputs. Can be used for * erasure coding encode and decode. Function requires pre-calculation of a * 6*32*vlen byte constant array based on the six sets of input coefficients. - * * @requires AVX + * * @param len Length of each vector in bytes. Must be >= 16. * @param vlen Number of vector sources. * @param gftbls Pointer to 6*32*vlen byte array of pre-calculated constants @@ -522,8 +577,8 @@ void gf_6vect_dot_prod_avx(int len, int vlen, unsigned char *gftbls, * sets of coefficients to produce each byte of the outputs. Can be used for * erasure coding encode and decode. Function requires pre-calculation of a * 6*32*vlen byte constant array based on the six sets of input coefficients. - * * @requires AVX2 + * * @param len Length of each vector in bytes. Must be >= 32. * @param vlen Number of vector sources. * @param gftbls Pointer to 6*32*vlen byte array of pre-calculated constants @@ -582,6 +637,224 @@ void gf_vect_dot_prod_base(int len, int vlen, unsigned char *gftbls, void gf_vect_dot_prod(int len, int vlen, unsigned char *gftbls, unsigned char **src, unsigned char *dest); + +/** + * @brief GF(2^8) vector multiply accumulate, runs appropriate version. + * + * Does a GF(2^8) multiply across each byte of input source with expanded + * constant and add to destination array. Can be used for erasure coding encode + * and decode update when only one source is available at a time. Function + * requires pre-calculation of a 32*vec byte constant array based on the input + * coefficients. + * + * This function determines what instruction sets are enabled and selects the + * appropriate version at runtime. + * + * @param len Length of each vector in bytes. Must be >= 32. + * @param vec The number of vector sources or rows in the generator matrix + * for coding. + * @param vec_i The vector index corresponding to the single input source. + * @param gftbls Pointer to array of input tables generated from coding + * coefficients in ec_init_tables(). Must be of size 32*vec. + * @param src Array of pointers to source inputs. + * @param dest Pointer to destination data array. + * @returns none + */ + +void gf_vect_mad(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src, + unsigned char *dest); + +/** + * @brief GF(2^8) vector multiply accumulate, arch specific version. + * + * Arch specific version of gf_vect_mad() with same parameters. + * @requires SSE4.1 + */ + +void gf_vect_mad_sse(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src, + unsigned char *dest); +/** + * @brief GF(2^8) vector multiply accumulate, arch specific version. + * + * Arch specific version of gf_vect_mad() with same parameters. + * @requires AVX + */ + +void gf_vect_mad_avx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src, + unsigned char *dest); + +/** + * @brief GF(2^8) vector multiply accumulate, arch specific version. + * + * Arch specific version of gf_vect_mad() with same parameters. + * @requires AVX2 + */ + +void gf_vect_mad_avx2(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src, + unsigned char *dest); + +/** + * @brief GF(2^8) vector multiply accumulate, baseline version. + * + * Baseline version of gf_vect_mad() with same parameters. + */ + +void gf_vect_mad_base(int len, int vec, int vec_i, unsigned char *v, unsigned char *src, + unsigned char *dest); + +/** + * @brief GF(2^8) vector multiply with 2 accumulate. SSE version. + * + * Does a GF(2^8) multiply across each byte of input source with expanded + * constants and add to destination arrays. Can be used for erasure coding + * encode and decode update when only one source is available at a + * time. Function requires pre-calculation of a 32*vec byte constant array based + * on the input coefficients. + * @requires SSE4.1 + * + * @param len Length of each vector in bytes. Must be >= 32. + * @param vec The number of vector sources or rows in the generator matrix + * for coding. + * @param vec_i The vector index corresponding to the single input source. + * @param gftbls Pointer to array of input tables generated from coding + * coefficients in ec_init_tables(). Must be of size 32*vec. + * @param src Pointer to source input array. + * @param dest Array of pointers to destination input/outputs. + * @returns none + */ + +void gf_2vect_mad_sse(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src, + unsigned char **dest); + +/** + * @brief GF(2^8) vector multiply with 2 accumulate. AVX version of gf_2vect_mad_sse(). + * @requires AVX + */ +void gf_2vect_mad_avx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src, + unsigned char **dest); +/** + * @brief GF(2^8) vector multiply with 2 accumulate. AVX2 version of gf_2vect_mad_sse(). + * @requires AVX2 + */ +void gf_2vect_mad_avx2(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src, + unsigned char **dest); + +/** + * @brief GF(2^8) vector multiply with 3 accumulate. SSE version. + * + * Does a GF(2^8) multiply across each byte of input source with expanded + * constants and add to destination arrays. Can be used for erasure coding + * encode and decode update when only one source is available at a + * time. Function requires pre-calculation of a 32*vec byte constant array based + * on the input coefficients. + * @requires SSE4.1 + * + * @param len Length of each vector in bytes. Must be >= 32. + * @param vec The number of vector sources or rows in the generator matrix + * for coding. + * @param vec_i The vector index corresponding to the single input source. + * @param gftbls Pointer to array of input tables generated from coding + * coefficients in ec_init_tables(). Must be of size 32*vec. + * @param src Pointer to source input array. + * @param dest Array of pointers to destination input/outputs. + * @returns none + */ + +void gf_3vect_mad_sse(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src, + unsigned char **dest); + +/** + * @brief GF(2^8) vector multiply with 3 accumulate. AVX version of gf_3vect_mad_sse(). + * @requires AVX + */ +void gf_3vect_mad_avx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src, + unsigned char **dest); + +/** + * @brief GF(2^8) vector multiply with 3 accumulate. AVX2 version of gf_3vect_mad_sse(). + * @requires AVX2 + */ +void gf_3vect_mad_avx2(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src, + unsigned char **dest); + +/** + * @brief GF(2^8) vector multiply with 4 accumulate. SSE version. + * + * Does a GF(2^8) multiply across each byte of input source with expanded + * constants and add to destination arrays. Can be used for erasure coding + * encode and decode update when only one source is available at a + * time. Function requires pre-calculation of a 32*vec byte constant array based + * on the input coefficients. + * @requires SSE4.1 + * + * @param len Length of each vector in bytes. Must be >= 32. + * @param vec The number of vector sources or rows in the generator matrix + * for coding. + * @param vec_i The vector index corresponding to the single input source. + * @param gftbls Pointer to array of input tables generated from coding + * coefficients in ec_init_tables(). Must be of size 32*vec. + * @param src Pointer to source input array. + * @param dest Array of pointers to destination input/outputs. + * @returns none + */ + +void gf_4vect_mad_sse(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src, + unsigned char **dest); + +/** + * @brief GF(2^8) vector multiply with 4 accumulate. AVX version of gf_4vect_mad_sse(). + * @requires AVX + */ +void gf_4vect_mad_avx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src, + unsigned char **dest); +/** + * @brief GF(2^8) vector multiply with 4 accumulate. AVX2 version of gf_4vect_mad_sse(). + * @requires AVX2 + */ +void gf_4vect_mad_avx2(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src, + unsigned char **dest); + +/** + * @brief GF(2^8) vector multiply with 5 accumulate. SSE version. + * @requires SSE4.1 + */ +void gf_5vect_mad_sse(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src, + unsigned char **dest); + +/** + * @brief GF(2^8) vector multiply with 5 accumulate. AVX version. + * @requires AVX + */ +void gf_5vect_mad_avx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src, + unsigned char **dest); +/** + * @brief GF(2^8) vector multiply with 5 accumulate. AVX2 version. + * @requires AVX2 + */ +void gf_5vect_mad_avx2(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src, + unsigned char **dest); + +/** + * @brief GF(2^8) vector multiply with 6 accumulate. SSE version. + * @requires SSE4.1 + */ +void gf_6vect_mad_sse(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src, + unsigned char **dest); +/** + * @brief GF(2^8) vector multiply with 6 accumulate. AVX version. + * @requires AVX + */ +void gf_6vect_mad_avx(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src, + unsigned char **dest); + +/** + * @brief GF(2^8) vector multiply with 6 accumulate. AVX2 version. + * @requires AVX2 + */ +void gf_6vect_mad_avx2(int len, int vec, int vec_i, unsigned char *gftbls, unsigned char *src, + unsigned char **dest); + + /********************************************************************** * The remaining are lib support functions used in GF(2^8) operations. */ @@ -650,6 +923,7 @@ void gf_gen_cauchy1_matrix(unsigned char *a, int m, int k); int gf_invert_matrix(unsigned char *in, unsigned char *out, const int n); + /*************************************************************/ #ifdef __cplusplus diff --git a/src/erasure-code/isa/isa-l/include/gf_vect_mul.h b/src/erasure-code/isa/isa-l/include/gf_vect_mul.h index ef19845788d..bf4fd01a60b 100644 --- a/src/erasure-code/isa/isa-l/include/gf_vect_mul.h +++ b/src/erasure-code/isa/isa-l/include/gf_vect_mul.h @@ -1,5 +1,5 @@ /********************************************************************** - Copyright(c) 2011-2014 Intel Corporation All rights reserved. + Copyright(c) 2011-2015 Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions @@ -52,8 +52,8 @@ extern "C" { * 32-element constant array based on constant C. gftbl(C) = {C{00}, * C{01}, C{02}, ... , C{0f} }, {C{00}, C{10}, C{20}, ... , C{f0} }. Len * and src must be aligned to 32B. - * @requires SSE4.1 + * * @param len Length of vector in bytes. Must be aligned to 32B. * @param gftbl Pointer to 32-byte array of pre-calculated constants based on C. * @param src Pointer to src data array. Must be aligned to 32B. @@ -73,8 +73,8 @@ int gf_vect_mul_sse(int len, unsigned char *gftbl, void *src, void *dest); * 32-element constant array based on constant C. gftbl(C) = {C{00}, * C{01}, C{02}, ... , C{0f} }, {C{00}, C{10}, C{20}, ... , C{f0} }. Len * and src must be aligned to 32B. - * @requires AVX + * * @param len Length of vector in bytes. Must be aligned to 32B. * @param gftbl Pointer to 32-byte array of pre-calculated constants based on C. * @param src Pointer to src data array. Must be aligned to 32B. diff --git a/src/erasure-code/isa/isa-l/include/reg_sizes.asm b/src/erasure-code/isa/isa-l/include/reg_sizes.asm index ed212520bf6..219ba069ebd 100644 --- a/src/erasure-code/isa/isa-l/include/reg_sizes.asm +++ b/src/erasure-code/isa/isa-l/include/reg_sizes.asm @@ -1,5 +1,5 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; Copyright(c) 2011-2014 Intel Corporation All rights reserved. +; Copyright(c) 2011-2015 Intel Corporation All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions diff --git a/src/erasure-code/isa/isa-l/include/types.h b/src/erasure-code/isa/isa-l/include/types.h index 0feed472e6a..695d94eefa7 100644 --- a/src/erasure-code/isa/isa-l/include/types.h +++ b/src/erasure-code/isa/isa-l/include/types.h @@ -1,5 +1,5 @@ /********************************************************************** - Copyright(c) 2011-2014 Intel Corporation All rights reserved. + Copyright(c) 2011-2015 Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions |