avx2: Improve speed when > 10 input or output shards. (#174)

Speeds include limiting the number of goroutines on all AVX2 paths.

Before/after
```
benchmark                                 old ns/op     new ns/op     delta
BenchmarkGalois128K-32                    2240          2240          +0.00%
BenchmarkGalois1M-32                      19578         18891         -3.51%
BenchmarkGaloisXor128K-32                 2798          2852          +1.93%
BenchmarkGaloisXor1M-32                   23334         23345         +0.05%
BenchmarkEncode2x1x1M-32                  34357         34370         +0.04%
BenchmarkEncode10x2x10000-32              3210          3093          -3.64%
BenchmarkEncode100x20x10000-32            362925        148214        -59.16%
BenchmarkEncode17x3x1M-32                 323767        224157        -30.77%
BenchmarkEncode10x4x16M-32                8376895       8376737       -0.00%
BenchmarkEncode5x2x1M-32                  68365         66861         -2.20%
BenchmarkEncode10x2x1M-32                 101407        93023         -8.27%
BenchmarkEncode10x4x1M-32                 171880        155477        -9.54%
BenchmarkEncode50x20x1M-32                3704691       3015047       -18.62%
BenchmarkEncode17x3x16M-32                10279233      10106658      -1.68%
BenchmarkEncode_8x4x8M-32                 3438245       3326479       -3.25%
BenchmarkEncode_12x4x12M-32               6632257       6581637       -0.76%
BenchmarkEncode_16x4x16M-32               10815755      10788377      -0.25%
BenchmarkEncode_16x4x32M-32               21029061      21507995      +2.28%
BenchmarkEncode_16x4x64M-32               42145450      43876850      +4.11%
BenchmarkEncode_8x5x8M-32                 4543208       3846378       -15.34%
BenchmarkEncode_8x6x8M-32                 5065494       4397218       -13.19%
BenchmarkEncode_8x7x8M-32                 5818995       4962884       -14.71%
BenchmarkEncode_8x9x8M-32                 6215449       6114898       -1.62%
BenchmarkEncode_8x10x8M-32                6923415       6610501       -4.52%
BenchmarkEncode_8x11x8M-32                7365988       7010473       -4.83%
BenchmarkEncode_8x8x05M-32                150857        136820        -9.30%
BenchmarkEncode_8x8x1M-32                 256722        254854        -0.73%
BenchmarkEncode_8x8x8M-32                 5547790       5422048       -2.27%
BenchmarkEncode_8x8x32M-32                23038643      22705859      -1.44%
BenchmarkEncode_24x8x24M-32               27729259      30332216      +9.39%
BenchmarkEncode_24x8x48M-32               53865705      61187658      +13.59%
BenchmarkVerify10x2x10000-32              8769          8154          -7.01%
BenchmarkVerify10x2x1M-32                 516149        476180        -7.74%
BenchmarkVerify5x2x1M-32                  443888        419541        -5.48%
BenchmarkVerify10x4x1M-32                 1030299       948021        -7.99%
BenchmarkVerify50x20x1M-32                7209689       6186891       -14.19%
BenchmarkVerify10x4x16M-32                17774456      17681879      -0.52%
BenchmarkReconstruct10x2x10000-32         3352          3256          -2.86%
BenchmarkReconstruct50x5x50000-32         166417        140900        -15.33%
BenchmarkReconstruct10x2x1M-32            189711        174615        -7.96%
BenchmarkReconstruct5x2x1M-32             128080        126520        -1.22%
BenchmarkReconstruct10x4x1M-32            273312        254017        -7.06%
BenchmarkReconstruct50x20x1M-32           3628812       3192474       -12.02%
BenchmarkReconstruct10x4x16M-32           8562186       8781479       +2.56%
BenchmarkReconstructData10x2x10000-32     3241          3116          -3.86%
BenchmarkReconstructData50x5x50000-32     162520        134794        -17.06%
BenchmarkReconstructData10x2x1M-32        171253        161955        -5.43%
BenchmarkReconstructData5x2x1M-32         102215        106942        +4.62%
BenchmarkReconstructData10x4x1M-32        225593        219969        -2.49%
BenchmarkReconstructData50x20x1M-32       2515311       2129721       -15.33%
BenchmarkReconstructData10x4x16M-32       6980308       6698111       -4.04%
BenchmarkReconstructP10x2x10000-32        924           937           +1.35%
BenchmarkReconstructP10x5x20000-32        1639          1703          +3.90%
BenchmarkSplit10x4x160M-32                4984993       4898045       -1.74%
BenchmarkSplit5x2x5M-32                   380415        221446        -41.79%
BenchmarkSplit10x2x1M-32                  58761         53335         -9.23%
BenchmarkSplit10x4x10M-32                 643188        410959        -36.11%
BenchmarkSplit50x20x50M-32                1843879       1647205       -10.67%
BenchmarkSplit17x3x272M-32                3684920       3613951       -1.93%
BenchmarkParallel_8x8x64K-32              7022          6630          -5.58%
BenchmarkParallel_8x8x05M-32              348308        348369        +0.02%
BenchmarkParallel_20x10x05M-32            575672        581028        +0.93%
BenchmarkParallel_8x8x1M-32               716033        697167        -2.63%
BenchmarkParallel_8x8x8M-32               5716048       5616437       -1.74%
BenchmarkParallel_8x8x32M-32              22650878      22098667      -2.44%
BenchmarkParallel_8x3x1M-32               406839        399125        -1.90%
BenchmarkParallel_8x4x1M-32               459107        463890        +1.04%
BenchmarkParallel_8x5x1M-32               527488        520334        -1.36%
BenchmarkStreamEncode10x2x10000-32        6013          5878          -2.25%
BenchmarkStreamEncode100x20x10000-32      503124        267894        -46.75%
BenchmarkStreamEncode17x3x1M-32           1561838       1376618       -11.86%
BenchmarkStreamEncode10x4x16M-32          19124427      17762582      -7.12%
BenchmarkStreamEncode5x2x1M-32            429701        384666        -10.48%
BenchmarkStreamEncode10x2x1M-32           801257        763637        -4.70%
BenchmarkStreamEncode10x4x1M-32           876065        820744        -6.31%
BenchmarkStreamEncode50x20x1M-32          7205112       6081398       -15.60%
BenchmarkStreamEncode17x3x16M-32          27182786      26117143      -3.92%
BenchmarkStreamVerify10x2x10000-32        13767         14026         +1.88%
BenchmarkStreamVerify50x5x50000-32        826983        690453        -16.51%
BenchmarkStreamVerify10x2x1M-32           1238566       1182591       -4.52%
BenchmarkStreamVerify5x2x1M-32            892661        806301        -9.67%
BenchmarkStreamVerify10x4x1M-32           1676394       1631495       -2.68%
BenchmarkStreamVerify50x20x1M-32          10877875      10037678      -7.72%
BenchmarkStreamVerify10x4x16M-32          27599576      30435400      +10.27%

benchmark                                 old MB/s      new MB/s      speedup
BenchmarkGalois128K-32                    58518.53      58510.17      1.00x
BenchmarkGalois1M-32                      53558.10      55507.44      1.04x
BenchmarkGaloisXor128K-32                 46839.74      45961.09      0.98x
BenchmarkGaloisXor1M-32                   44936.98      44917.46      1.00x
BenchmarkEncode2x1x1M-32                  91561.27      91524.11      1.00x
BenchmarkEncode10x2x10000-32              37385.54      38792.54      1.04x
BenchmarkEncode100x20x10000-32            3306.47       8096.40       2.45x
BenchmarkEncode17x3x1M-32                 64773.49      93557.14      1.44x
BenchmarkEncode10x4x16M-32                28039.15      28039.68      1.00x
BenchmarkEncode5x2x1M-32                  107365.88     109781.16     1.02x
BenchmarkEncode10x2x1M-32                 124083.62     135266.27     1.09x
BenchmarkEncode10x4x1M-32                 85408.99      94419.71      1.11x
BenchmarkEncode50x20x1M-32                19812.81      24344.67      1.23x
BenchmarkEncode17x3x16M-32                32642.93      33200.32      1.02x
BenchmarkEncode_8x4x8M-32                 29277.52      30261.21      1.03x
BenchmarkEncode_12x4x12M-32               30355.67      30589.14      1.01x
BenchmarkEncode_16x4x16M-32               31023.66      31102.39      1.00x
BenchmarkEncode_16x4x32M-32               31912.44      31201.82      0.98x
BenchmarkEncode_16x4x64M-32               31846.32      30589.65      0.96x
BenchmarkEncode_8x5x8M-32                 24003.28      28351.84      1.18x
BenchmarkEncode_8x6x8M-32                 23184.41      26707.91      1.15x
BenchmarkEncode_8x7x8M-32                 21623.86      25354.03      1.17x
BenchmarkEncode_8x9x8M-32                 22943.85      23321.13      1.02x
BenchmarkEncode_8x10x8M-32                21809.31      22841.68      1.05x
BenchmarkEncode_8x11x8M-32                21637.77      22735.06      1.05x
BenchmarkEncode_8x8x05M-32                55606.22      61311.47      1.10x
BenchmarkEncode_8x8x1M-32                 65351.80      65830.73      1.01x
BenchmarkEncode_8x8x8M-32                 24193.01      24754.07      1.02x
BenchmarkEncode_8x8x32M-32                23303.06      23644.60      1.01x
BenchmarkEncode_24x8x24M-32               29041.76      26549.54      0.91x
BenchmarkEncode_24x8x48M-32               29900.52      26322.51      0.88x
BenchmarkVerify10x2x10000-32              13685.12      14717.10      1.08x
BenchmarkVerify10x2x1M-32                 24378.43      26424.72      1.08x
BenchmarkVerify5x2x1M-32                  16535.79      17495.41      1.06x
BenchmarkVerify10x4x1M-32                 14248.35      15484.96      1.09x
BenchmarkVerify50x20x1M-32                10180.79      11863.85      1.17x
BenchmarkVerify10x4x16M-32                13214.53      13283.71      1.01x
BenchmarkReconstruct10x2x10000-32         35799.16      36854.89      1.03x
BenchmarkReconstruct50x5x50000-32         33049.47      39034.89      1.18x
BenchmarkReconstruct10x2x1M-32            66326.88      72061.06      1.09x
BenchmarkReconstruct5x2x1M-32             57308.21      58014.92      1.01x
BenchmarkReconstruct10x4x1M-32            53711.74      57791.66      1.08x
BenchmarkReconstruct50x20x1M-32           20227.09      22991.67      1.14x
BenchmarkReconstruct10x4x16M-32           27432.37      26747.32      0.98x
BenchmarkReconstructData10x2x10000-32     37030.86      38511.87      1.04x
BenchmarkReconstructData50x5x50000-32     33842.07      40802.85      1.21x
BenchmarkReconstructData10x2x1M-32        73475.57      77693.87      1.06x
BenchmarkReconstructData5x2x1M-32         71809.58      68635.57      0.96x
BenchmarkReconstructData10x4x1M-32        65073.27      66736.88      1.03x
BenchmarkReconstructData50x20x1M-32       29181.41      34464.76      1.18x
BenchmarkReconstructData10x4x16M-32       33649.09      35066.75      1.04x
BenchmarkReconstructP10x2x10000-32        129819.98     128086.76     0.99x
BenchmarkReconstructP10x5x20000-32        183073.89     176202.21     0.96x
BenchmarkParallel_8x8x64K-32              149327.33     158153.67     1.06x
BenchmarkParallel_8x8x05M-32              24083.89      24079.69      1.00x
BenchmarkParallel_20x10x05M-32            27322.20      27070.35      0.99x
BenchmarkParallel_8x8x1M-32               23430.78      24064.83      1.03x
BenchmarkParallel_8x8x8M-32               23480.86      23897.31      1.02x
BenchmarkParallel_8x8x32M-32              23701.99      24294.27      1.02x
BenchmarkParallel_8x3x1M-32               28351.11      28899.03      1.02x
BenchmarkParallel_8x4x1M-32               27407.34      27124.76      0.99x
BenchmarkParallel_8x5x1M-32               25842.27      26197.58      1.01x
BenchmarkStreamEncode10x2x10000-32        16629.76      17012.26      1.02x
BenchmarkStreamEncode100x20x10000-32      1987.58       3732.83       1.88x
BenchmarkStreamEncode17x3x1M-32           11413.34      12948.97      1.13x
BenchmarkStreamEncode10x4x16M-32          8772.66       9445.26       1.08x
BenchmarkStreamEncode5x2x1M-32            12201.21      13629.70      1.12x
BenchmarkStreamEncode10x2x1M-32           13086.64      13731.34      1.05x
BenchmarkStreamEncode10x4x1M-32           11969.16      12775.92      1.07x
BenchmarkStreamEncode50x20x1M-32          7276.61       8621.18       1.18x
BenchmarkStreamEncode17x3x16M-32          10492.40      10920.52      1.04x
BenchmarkStreamVerify10x2x10000-32        7264.00       7129.49       0.98x
BenchmarkStreamVerify50x5x50000-32        6046.07       7241.62       1.20x
BenchmarkStreamVerify10x2x1M-32           8466.05       8866.77       1.05x
BenchmarkStreamVerify5x2x1M-32            5873.31       6502.39       1.11x
BenchmarkStreamVerify10x4x1M-32           6254.95       6427.09       1.03x
BenchmarkStreamVerify50x20x1M-32          4819.76       5223.20       1.08x
BenchmarkStreamVerify10x4x16M-32          6078.79       5512.40       0.91x 
```
master
Klaus Post 2021-12-09 03:28:44 -08:00 committed by GitHub
parent 5593e2b2dd
commit 1bb4d699e1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
12 changed files with 39146 additions and 3655 deletions

View File

@ -2,7 +2,8 @@
// +build generate
//go:generate go run gen.go -out ../galois_gen_amd64.s -stubs ../galois_gen_amd64.go -pkg=reedsolomon
//go:generate gofmt -w ../galois_gen_switch_amd64.go
//go:generate go fmt ../galois_gen_switch_amd64.go
//go:generate go fmt ../galois_gen_amd64.go
package main
@ -36,14 +37,15 @@ func main() {
Constraint(buildtags.Not("nogen").ToConstraint())
Constraint(buildtags.Term("gc").ToConstraint())
const perLoopBits = 5
const perLoopBits = 6
const perLoop = 1 << perLoopBits
for i := 1; i <= inputMax; i++ {
for j := 1; j <= outputMax; j++ {
//genMulAvx2(fmt.Sprintf("mulAvxTwoXor_%dx%d", i, j), i, j, true)
genMulAvx2(fmt.Sprintf("mulAvxTwo_%dx%d", i, j), i, j, false)
genMulAvx2Sixty64(fmt.Sprintf("mulAvxTwo_%dx%d_64", i, j), i, j, false)
genMulAvx2(fmt.Sprintf("mulAvxTwo_%dx%dXor", i, j), i, j, true)
genMulAvx2Sixty64(fmt.Sprintf("mulAvxTwo_%dx%d_64Xor", i, j), i, j, true)
}
}
f, err := os.Create("../galois_gen_switch_amd64.go")
@ -62,19 +64,26 @@ func main() {
package reedsolomon
import "fmt"
import (
"fmt"
)
`)
w.WriteString("const avx2CodeGen = true\n")
w.WriteString(fmt.Sprintf("const maxAvx2Inputs = %d\nconst maxAvx2Outputs = %d\n", inputMax, outputMax))
w.WriteString(fmt.Sprintf(`const (
avx2CodeGen = true
maxAvx2Inputs = %d
maxAvx2Outputs = %d
minAvx2Size = %d
avxSizeMask = maxInt - (minAvx2Size-1)
)`, inputMax, outputMax, perLoop))
w.WriteString(`
func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int {
n := stop-start
n := (stop-start) & avxSizeMask
`)
w.WriteString(fmt.Sprintf("n = (n>>%d)<<%d\n\n", perLoopBits, perLoopBits))
w.WriteString(`switch len(in) {
`)
for in, defs := range switchDefs[:] {
@ -88,6 +97,25 @@ func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int {
w.WriteString(`}
panic(fmt.Sprintf("unhandled size: %dx%d", len(in), len(out)))
}
func galMulSlicesAvx2Xor(matrix []byte, in, out [][]byte, start, stop int) int {
n := (stop-start) & avxSizeMask
`)
w.WriteString(`switch len(in) {
`)
for in, defs := range switchDefsX[:] {
w.WriteString(fmt.Sprintf(" case %d:\n switch len(out) {\n", in+1))
for out, def := range defs[:] {
w.WriteString(fmt.Sprintf(" case %d:\n", out+1))
w.WriteString(def)
}
w.WriteString("}\n")
}
w.WriteString(`}
panic(fmt.Sprintf("unhandled size: %dx%d", len(in), len(out)))
}
`)
Generate()
}
@ -129,12 +157,21 @@ func genMulAvx2(name string, inputs int, outputs int, xor bool) {
}
}
x := ""
if xor {
x = "Xor"
}
TEXT(name, attr.NOSPLIT, fmt.Sprintf("func(matrix []byte, in [][]byte, out [][]byte, start, n int)"))
// SWITCH DEFINITION:
s := fmt.Sprintf(" mulAvxTwo_%dx%d(matrix, in, out, start, n)\n", inputs, outputs)
s := fmt.Sprintf(" mulAvxTwo_%dx%d%s(matrix, in, out, start, n)\n", inputs, outputs, x)
s += fmt.Sprintf("\t\t\t\treturn n\n")
switchDefs[inputs-1][outputs-1] = s
if xor {
switchDefsX[inputs-1][outputs-1] = s
} else {
switchDefs[inputs-1][outputs-1] = s
}
if loadNone {
Comment("Loading no tables to registers")
@ -197,7 +234,6 @@ func genMulAvx2(name string, inputs int, outputs int, xor bool) {
if err != nil {
panic(err)
}
outBase := addr.Addr
outSlicePtr := GP64()
MOVQ(addr.Addr, outSlicePtr)
for i := range dst {
@ -241,13 +277,13 @@ func genMulAvx2(name string, inputs int, outputs int, xor bool) {
SHRQ(U8(perLoopBits), length)
}
Label(name + "_loop")
if xor {
// Load data before loop or during first iteration?
// No clear winner.
preloadInput := xor && false
if preloadInput {
Commentf("Load %d outputs", outputs)
} else {
Commentf("Clear %d outputs", outputs)
}
for i := range dst {
if xor {
for i := range dst {
if regDst {
VMOVDQU(Mem{Base: dstPtr[i]}, dst[i])
if prefetchDst > 0 {
@ -256,13 +292,11 @@ func genMulAvx2(name string, inputs int, outputs int, xor bool) {
continue
}
ptr := GP64()
MOVQ(outBase, ptr)
MOVQ(Mem{Base: outSlicePtr, Disp: i * 24}, ptr)
VMOVDQU(Mem{Base: ptr, Index: offset, Scale: 1}, dst[i])
if prefetchDst > 0 {
PREFETCHT0(Mem{Base: ptr, Disp: prefetchDst, Index: offset, Scale: 1})
}
} else {
VPXOR(dst[i], dst[i], dst[i])
}
}
@ -279,6 +313,22 @@ func genMulAvx2(name string, inputs int, outputs int, xor bool) {
VPAND(lowMask, inLow, inLow)
VPAND(lowMask, inHigh, inHigh)
for j := range dst {
//Commentf(" xor:%v i: %v", xor, i)
if !preloadInput && xor && i == 0 {
if regDst {
VMOVDQU(Mem{Base: dstPtr[j]}, dst[j])
if prefetchDst > 0 {
PREFETCHT0(Mem{Base: dstPtr[j], Disp: prefetchDst})
}
} else {
ptr := GP64()
MOVQ(Mem{Base: outSlicePtr, Disp: j * 24}, ptr)
VMOVDQU(Mem{Base: ptr, Index: offset, Scale: 1}, dst[j])
if prefetchDst > 0 {
PREFETCHT0(Mem{Base: ptr, Disp: prefetchDst, Index: offset, Scale: 1})
}
}
}
if loadNone {
VMOVDQU(Mem{Base: matrixBase, Disp: 64 * (i*outputs + j)}, lookLow)
VMOVDQU(Mem{Base: matrixBase, Disp: 32 + 64*(i*outputs+j)}, lookHigh)
@ -288,8 +338,13 @@ func genMulAvx2(name string, inputs int, outputs int, xor bool) {
VPSHUFB(inLow, inLo[i*outputs+j], lookLow)
VPSHUFB(inHigh, inHi[i*outputs+j], lookHigh)
}
VPXOR(lookLow, lookHigh, lookLow)
VPXOR(lookLow, dst[j], dst[j])
if i == 0 && !xor {
// We don't have any existing data, write directly.
VPXOR(lookLow, lookHigh, dst[j])
} else {
VPXOR(lookLow, lookHigh, lookLow)
VPXOR(lookLow, dst[j], dst[j])
}
}
}
Commentf("Store %d outputs", outputs)
@ -340,35 +395,42 @@ func genMulAvx2Sixty64(name string, inputs int, outputs int, xor bool) {
// Load shuffle masks on every use.
var loadNone bool
// Use registers for destination registers.
var regDst = false
var regDst = true
var reloadLength = false
// lo, hi, 1 in, 1 out, 2 tmp, 1 mask
est := total*2 + outputs + 5
est := total*4 + outputs + 7
if outputs == 1 {
// We don't need to keep a copy of the input if only 1 output.
est -= 2
}
if true || est > 16 {
if est > 16 {
loadNone = true
// We run out of GP registers first, now.
if inputs+outputs > 13 {
regDst = false
}
// Save one register by reloading length.
if true || inputs+outputs > 12 && regDst {
if inputs+outputs > 12 && regDst {
reloadLength = true
}
}
TEXT(name, 0, fmt.Sprintf("func(matrix []byte, in [][]byte, out [][]byte, start, n int)"))
x := ""
if xor {
x = "Xor"
}
// SWITCH DEFINITION:
s := fmt.Sprintf("n = (n>>%d)<<%d\n", perLoopBits, perLoopBits)
s += fmt.Sprintf(" mulAvxTwo_%dx%d_64(matrix, in, out, start, n)\n", inputs, outputs)
//s := fmt.Sprintf("n = (n>>%d)<<%d\n", perLoopBits, perLoopBits)
s := fmt.Sprintf(" mulAvxTwo_%dx%d_64%s(matrix, in, out, start, n)\n", inputs, outputs, x)
s += fmt.Sprintf("\t\t\t\treturn n\n")
switchDefs[inputs-1][outputs-1] = s
if xor {
switchDefsX[inputs-1][outputs-1] = s
} else {
switchDefs[inputs-1][outputs-1] = s
}
if loadNone {
Comment("Loading no tables to registers")
@ -474,33 +536,31 @@ func genMulAvx2Sixty64(name string, inputs int, outputs int, xor bool) {
VPBROADCASTB(lowMask.AsX(), lowMask)
if reloadLength {
Commentf("Reload length to save a register")
length = Load(Param("n"), GP64())
SHRQ(U8(perLoopBits), length)
}
Label(name + "_loop")
if xor {
Commentf("Load %d outputs", outputs)
} else {
Commentf("Clear %d outputs", outputs)
}
for i := range dst {
if xor {
for i := range dst {
if regDst {
VMOVDQU(Mem{Base: dstPtr[i]}, dst[i])
VMOVDQU(Mem{Base: dstPtr[i], Disp: 32}, dst2[i])
if prefetchDst > 0 {
PREFETCHT0(Mem{Base: dstPtr[i], Disp: prefetchDst})
}
continue
}
ptr := GP64()
MOVQ(outBase, ptr)
MOVQ(Mem{Base: outSlicePtr, Disp: i * 24}, ptr)
VMOVDQU(Mem{Base: ptr, Index: offset, Scale: 1}, dst[i])
VMOVDQU(Mem{Base: ptr, Index: offset, Scale: 1, Disp: 32}, dst2[i])
if prefetchDst > 0 {
PREFETCHT0(Mem{Base: ptr, Disp: prefetchDst, Index: offset, Scale: 1})
}
} else {
VPXOR(dst[i], dst[i], dst[i])
VPXOR(dst2[i], dst2[i], dst2[i])
}
}
@ -536,10 +596,16 @@ func genMulAvx2Sixty64(name string, inputs int, outputs int, xor bool) {
VPSHUFB(inHigh, inHi[i*outputs+j], lookHigh)
VPSHUFB(in2High, inHi[i*outputs+j], lookHigh2)
}
VPXOR(lookLow, lookHigh, lookLow)
VPXOR(lookLow2, lookHigh2, lookLow2)
VPXOR(lookLow, dst[j], dst[j])
VPXOR(lookLow2, dst2[j], dst2[j])
if i == 0 && !xor {
// We don't have any existing data, write directly.
VPXOR(lookLow, lookHigh, dst[j])
VPXOR(lookLow2, lookHigh2, dst2[j])
} else {
VPXOR(lookLow, lookHigh, lookLow)
VPXOR(lookLow2, lookHigh2, lookLow2)
VPXOR(lookLow, dst[j], dst[j])
VPXOR(lookLow2, dst2[j], dst2[j])
}
}
}
Commentf("Store %d outputs", outputs)

View File

@ -901,7 +901,7 @@ func galExp(a byte, n int) byte {
return expTable[logResult]
}
func genAvx2Matrix(matrixRows [][]byte, inputs, outputs int, dst []byte) []byte {
func genAvx2Matrix(matrixRows [][]byte, inputs, inIdx, outputs int, dst []byte) []byte {
if !avx2CodeGen {
panic("codegen not enabled")
}
@ -915,7 +915,7 @@ func genAvx2Matrix(matrixRows [][]byte, inputs, outputs int, dst []byte) []byte
dst = dst[:wantBytes]
}
for i, row := range matrixRows[:outputs] {
for j, idx := range row[:inputs] {
for j, idx := range row[inIdx : inIdx+inputs] {
dstIdx := (j*outputs + i) * 64
dstPart := dst[dstIdx:]
dstPart = dstPart[:64]

View File

@ -225,8 +225,9 @@ func galMulAVX512LastInput(inputOffset int, inputEnd int, outputOffset int, outp
// Perform the same as codeSomeShards, but taking advantage of
// AVX512 parallelism for up to 4x faster execution as compared to AVX2
func (r *reedSolomon) codeSomeShardsAvx512(matrixRows, inputs, outputs [][]byte, outputCount, byteCount int) {
func (r *reedSolomon) codeSomeShardsAvx512(matrixRows, inputs, outputs [][]byte, byteCount int) {
// Process using no goroutines
outputCount := len(outputs)
start, end := 0, r.o.perRound
if end > byteCount {
end = byteCount
@ -272,7 +273,8 @@ func (r *reedSolomon) codeSomeShardsAvx512(matrixRows, inputs, outputs [][]byte,
// Perform the same as codeSomeShards, but taking advantage of
// AVX512 parallelism for up to 4x faster execution as compared to AVX2
func (r *reedSolomon) codeSomeShardsAvx512P(matrixRows, inputs, outputs [][]byte, outputCount, byteCount int) {
func (r *reedSolomon) codeSomeShardsAvx512P(matrixRows, inputs, outputs [][]byte, byteCount int) {
outputCount := len(outputs)
var wg sync.WaitGroup
do := byteCount / r.o.maxGoroutines
if do < r.o.minSplitSize {

View File

@ -331,9 +331,9 @@ func testCodeSomeShardsAvx512WithLength(t *testing.T, ds, ps, l int, parallel bo
}
if parallel {
r.codeSomeShardsAvx512P(r.parity, shards[:r.DataShards], shards[r.DataShards:], r.ParityShards, len(shards[0]))
r.codeSomeShardsAvx512P(r.parity, shards[:r.DataShards], shards[r.DataShards:], len(shards[0]))
} else {
r.codeSomeShardsAvx512(r.parity, shards[:r.DataShards], shards[r.DataShards:], r.ParityShards, len(shards[0]))
r.codeSomeShardsAvx512(r.parity, shards[:r.DataShards], shards[r.DataShards:r.DataShards+r.ParityShards], len(shards[0]))
}
correct, _ := r.Verify(shards)

View File

@ -107,6 +107,9 @@ func galMulSliceXor(c byte, in, out []byte, o *options) {
in = in[done:]
out = out[done:]
}
if len(in) == 0 {
return
}
out = out[:len(in)]
mt := mulTable[c][:256]
for i := range in {

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -3,10 +3,16 @@
package reedsolomon
const maxAvx2Inputs = 0
const maxAvx2Outputs = 0
const maxAvx2Inputs = 1
const maxAvx2Outputs = 1
const minAvx2Size = 1
const avxSizeMask = 0
const avx2CodeGen = false
// galMulSlicesAvx2 is the stub used when AVX2 code generation is disabled
// (avx2CodeGen == false); callers must check avx2CodeGen before calling,
// as this always panics.
func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int {
panic("avx2 codegen not available")
}
// galMulSlicesAvx2Xor is the XOR-variant stub used when AVX2 code generation
// is disabled (avx2CodeGen == false); callers must check avx2CodeGen before
// calling, as this always panics.
func galMulSlicesAvx2Xor(matrix []byte, in, out [][]byte, start, stop int) int {
panic("avx2 codegen not available")
}

View File

@ -5,29 +5,31 @@
package reedsolomon
import "fmt"
import (
"fmt"
)
const avx2CodeGen = true
const maxAvx2Inputs = 10
const maxAvx2Outputs = 10
const (
avx2CodeGen = true
maxAvx2Inputs = 10
maxAvx2Outputs = 10
minAvx2Size = 64
avxSizeMask = maxInt - (minAvx2Size - 1)
)
func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int {
n := stop - start
n = (n >> 5) << 5
n := (stop - start) & avxSizeMask
switch len(in) {
case 1:
switch len(out) {
case 1:
n = (n >> 6) << 6
mulAvxTwo_1x1_64(matrix, in, out, start, n)
return n
case 2:
n = (n >> 6) << 6
mulAvxTwo_1x2_64(matrix, in, out, start, n)
return n
case 3:
n = (n >> 6) << 6
mulAvxTwo_1x3_64(matrix, in, out, start, n)
return n
case 4:
@ -55,15 +57,12 @@ func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int {
case 2:
switch len(out) {
case 1:
n = (n >> 6) << 6
mulAvxTwo_2x1_64(matrix, in, out, start, n)
return n
case 2:
n = (n >> 6) << 6
mulAvxTwo_2x2_64(matrix, in, out, start, n)
return n
case 3:
n = (n >> 6) << 6
mulAvxTwo_2x3_64(matrix, in, out, start, n)
return n
case 4:
@ -91,15 +90,12 @@ func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int {
case 3:
switch len(out) {
case 1:
n = (n >> 6) << 6
mulAvxTwo_3x1_64(matrix, in, out, start, n)
return n
case 2:
n = (n >> 6) << 6
mulAvxTwo_3x2_64(matrix, in, out, start, n)
return n
case 3:
n = (n >> 6) << 6
mulAvxTwo_3x3_64(matrix, in, out, start, n)
return n
case 4:
@ -127,15 +123,12 @@ func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int {
case 4:
switch len(out) {
case 1:
n = (n >> 6) << 6
mulAvxTwo_4x1_64(matrix, in, out, start, n)
return n
case 2:
n = (n >> 6) << 6
mulAvxTwo_4x2_64(matrix, in, out, start, n)
return n
case 3:
n = (n >> 6) << 6
mulAvxTwo_4x3_64(matrix, in, out, start, n)
return n
case 4:
@ -163,15 +156,12 @@ func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int {
case 5:
switch len(out) {
case 1:
n = (n >> 6) << 6
mulAvxTwo_5x1_64(matrix, in, out, start, n)
return n
case 2:
n = (n >> 6) << 6
mulAvxTwo_5x2_64(matrix, in, out, start, n)
return n
case 3:
n = (n >> 6) << 6
mulAvxTwo_5x3_64(matrix, in, out, start, n)
return n
case 4:
@ -199,15 +189,12 @@ func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int {
case 6:
switch len(out) {
case 1:
n = (n >> 6) << 6
mulAvxTwo_6x1_64(matrix, in, out, start, n)
return n
case 2:
n = (n >> 6) << 6
mulAvxTwo_6x2_64(matrix, in, out, start, n)
return n
case 3:
n = (n >> 6) << 6
mulAvxTwo_6x3_64(matrix, in, out, start, n)
return n
case 4:
@ -235,15 +222,12 @@ func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int {
case 7:
switch len(out) {
case 1:
n = (n >> 6) << 6
mulAvxTwo_7x1_64(matrix, in, out, start, n)
return n
case 2:
n = (n >> 6) << 6
mulAvxTwo_7x2_64(matrix, in, out, start, n)
return n
case 3:
n = (n >> 6) << 6
mulAvxTwo_7x3_64(matrix, in, out, start, n)
return n
case 4:
@ -271,15 +255,12 @@ func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int {
case 8:
switch len(out) {
case 1:
n = (n >> 6) << 6
mulAvxTwo_8x1_64(matrix, in, out, start, n)
return n
case 2:
n = (n >> 6) << 6
mulAvxTwo_8x2_64(matrix, in, out, start, n)
return n
case 3:
n = (n >> 6) << 6
mulAvxTwo_8x3_64(matrix, in, out, start, n)
return n
case 4:
@ -307,15 +288,12 @@ func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int {
case 9:
switch len(out) {
case 1:
n = (n >> 6) << 6
mulAvxTwo_9x1_64(matrix, in, out, start, n)
return n
case 2:
n = (n >> 6) << 6
mulAvxTwo_9x2_64(matrix, in, out, start, n)
return n
case 3:
n = (n >> 6) << 6
mulAvxTwo_9x3_64(matrix, in, out, start, n)
return n
case 4:
@ -343,15 +321,12 @@ func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int {
case 10:
switch len(out) {
case 1:
n = (n >> 6) << 6
mulAvxTwo_10x1_64(matrix, in, out, start, n)
return n
case 2:
n = (n >> 6) << 6
mulAvxTwo_10x2_64(matrix, in, out, start, n)
return n
case 3:
n = (n >> 6) << 6
mulAvxTwo_10x3_64(matrix, in, out, start, n)
return n
case 4:
@ -379,3 +354,341 @@ func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int {
}
panic(fmt.Sprintf("unhandled size: %dx%d", len(in), len(out)))
}
// galMulSlicesAvx2Xor multiplies the input shards by the (pre-expanded)
// matrix and XORs the product into the output shards, using a code-generated
// AVX2 kernel selected by the exact (len(in), len(out)) pair.
//
// Unlike galMulSlicesAvx2, the ...Xor kernels accumulate into the outputs
// instead of overwriting them — callers use this when the outputs already
// hold partial results from a previous input group (see the split-processing
// loops in codeSomeShards/codeSomeShardsAVXP).
//
// matrix must be in the layout produced by genAvx2Matrix for these
// dimensions. Only bytes [start, start+n) are processed, where n is
// (stop-start) rounded down via avxSizeMask to the kernel's block size;
// n is returned so the caller can finish the unaligned tail with the
// generic path. Panics if the shard-count pair has no generated kernel.
func galMulSlicesAvx2Xor(matrix []byte, in, out [][]byte, start, stop int) int {
	// Round the byte range down to a whole number of AVX2 blocks.
	n := (stop - start) & avxSizeMask

	// Dispatch on input count, then output count. The 1-3 output cases use
	// the wider "_64" kernels (64-byte blocks); 4-10 use 32-byte kernels.
	switch len(in) {
	case 1:
		switch len(out) {
		case 1:
			mulAvxTwo_1x1_64Xor(matrix, in, out, start, n)
			return n
		case 2:
			mulAvxTwo_1x2_64Xor(matrix, in, out, start, n)
			return n
		case 3:
			mulAvxTwo_1x3_64Xor(matrix, in, out, start, n)
			return n
		case 4:
			mulAvxTwo_1x4Xor(matrix, in, out, start, n)
			return n
		case 5:
			mulAvxTwo_1x5Xor(matrix, in, out, start, n)
			return n
		case 6:
			mulAvxTwo_1x6Xor(matrix, in, out, start, n)
			return n
		case 7:
			mulAvxTwo_1x7Xor(matrix, in, out, start, n)
			return n
		case 8:
			mulAvxTwo_1x8Xor(matrix, in, out, start, n)
			return n
		case 9:
			mulAvxTwo_1x9Xor(matrix, in, out, start, n)
			return n
		case 10:
			mulAvxTwo_1x10Xor(matrix, in, out, start, n)
			return n
		}
	case 2:
		switch len(out) {
		case 1:
			mulAvxTwo_2x1_64Xor(matrix, in, out, start, n)
			return n
		case 2:
			mulAvxTwo_2x2_64Xor(matrix, in, out, start, n)
			return n
		case 3:
			mulAvxTwo_2x3_64Xor(matrix, in, out, start, n)
			return n
		case 4:
			mulAvxTwo_2x4Xor(matrix, in, out, start, n)
			return n
		case 5:
			mulAvxTwo_2x5Xor(matrix, in, out, start, n)
			return n
		case 6:
			mulAvxTwo_2x6Xor(matrix, in, out, start, n)
			return n
		case 7:
			mulAvxTwo_2x7Xor(matrix, in, out, start, n)
			return n
		case 8:
			mulAvxTwo_2x8Xor(matrix, in, out, start, n)
			return n
		case 9:
			mulAvxTwo_2x9Xor(matrix, in, out, start, n)
			return n
		case 10:
			mulAvxTwo_2x10Xor(matrix, in, out, start, n)
			return n
		}
	case 3:
		switch len(out) {
		case 1:
			mulAvxTwo_3x1_64Xor(matrix, in, out, start, n)
			return n
		case 2:
			mulAvxTwo_3x2_64Xor(matrix, in, out, start, n)
			return n
		case 3:
			mulAvxTwo_3x3_64Xor(matrix, in, out, start, n)
			return n
		case 4:
			mulAvxTwo_3x4Xor(matrix, in, out, start, n)
			return n
		case 5:
			mulAvxTwo_3x5Xor(matrix, in, out, start, n)
			return n
		case 6:
			mulAvxTwo_3x6Xor(matrix, in, out, start, n)
			return n
		case 7:
			mulAvxTwo_3x7Xor(matrix, in, out, start, n)
			return n
		case 8:
			mulAvxTwo_3x8Xor(matrix, in, out, start, n)
			return n
		case 9:
			mulAvxTwo_3x9Xor(matrix, in, out, start, n)
			return n
		case 10:
			mulAvxTwo_3x10Xor(matrix, in, out, start, n)
			return n
		}
	case 4:
		switch len(out) {
		case 1:
			mulAvxTwo_4x1_64Xor(matrix, in, out, start, n)
			return n
		case 2:
			mulAvxTwo_4x2_64Xor(matrix, in, out, start, n)
			return n
		case 3:
			mulAvxTwo_4x3_64Xor(matrix, in, out, start, n)
			return n
		case 4:
			mulAvxTwo_4x4Xor(matrix, in, out, start, n)
			return n
		case 5:
			mulAvxTwo_4x5Xor(matrix, in, out, start, n)
			return n
		case 6:
			mulAvxTwo_4x6Xor(matrix, in, out, start, n)
			return n
		case 7:
			mulAvxTwo_4x7Xor(matrix, in, out, start, n)
			return n
		case 8:
			mulAvxTwo_4x8Xor(matrix, in, out, start, n)
			return n
		case 9:
			mulAvxTwo_4x9Xor(matrix, in, out, start, n)
			return n
		case 10:
			mulAvxTwo_4x10Xor(matrix, in, out, start, n)
			return n
		}
	case 5:
		switch len(out) {
		case 1:
			mulAvxTwo_5x1_64Xor(matrix, in, out, start, n)
			return n
		case 2:
			mulAvxTwo_5x2_64Xor(matrix, in, out, start, n)
			return n
		case 3:
			mulAvxTwo_5x3_64Xor(matrix, in, out, start, n)
			return n
		case 4:
			mulAvxTwo_5x4Xor(matrix, in, out, start, n)
			return n
		case 5:
			mulAvxTwo_5x5Xor(matrix, in, out, start, n)
			return n
		case 6:
			mulAvxTwo_5x6Xor(matrix, in, out, start, n)
			return n
		case 7:
			mulAvxTwo_5x7Xor(matrix, in, out, start, n)
			return n
		case 8:
			mulAvxTwo_5x8Xor(matrix, in, out, start, n)
			return n
		case 9:
			mulAvxTwo_5x9Xor(matrix, in, out, start, n)
			return n
		case 10:
			mulAvxTwo_5x10Xor(matrix, in, out, start, n)
			return n
		}
	case 6:
		switch len(out) {
		case 1:
			mulAvxTwo_6x1_64Xor(matrix, in, out, start, n)
			return n
		case 2:
			mulAvxTwo_6x2_64Xor(matrix, in, out, start, n)
			return n
		case 3:
			mulAvxTwo_6x3_64Xor(matrix, in, out, start, n)
			return n
		case 4:
			mulAvxTwo_6x4Xor(matrix, in, out, start, n)
			return n
		case 5:
			mulAvxTwo_6x5Xor(matrix, in, out, start, n)
			return n
		case 6:
			mulAvxTwo_6x6Xor(matrix, in, out, start, n)
			return n
		case 7:
			mulAvxTwo_6x7Xor(matrix, in, out, start, n)
			return n
		case 8:
			mulAvxTwo_6x8Xor(matrix, in, out, start, n)
			return n
		case 9:
			mulAvxTwo_6x9Xor(matrix, in, out, start, n)
			return n
		case 10:
			mulAvxTwo_6x10Xor(matrix, in, out, start, n)
			return n
		}
	case 7:
		switch len(out) {
		case 1:
			mulAvxTwo_7x1_64Xor(matrix, in, out, start, n)
			return n
		case 2:
			mulAvxTwo_7x2_64Xor(matrix, in, out, start, n)
			return n
		case 3:
			mulAvxTwo_7x3_64Xor(matrix, in, out, start, n)
			return n
		case 4:
			mulAvxTwo_7x4Xor(matrix, in, out, start, n)
			return n
		case 5:
			mulAvxTwo_7x5Xor(matrix, in, out, start, n)
			return n
		case 6:
			mulAvxTwo_7x6Xor(matrix, in, out, start, n)
			return n
		case 7:
			mulAvxTwo_7x7Xor(matrix, in, out, start, n)
			return n
		case 8:
			mulAvxTwo_7x8Xor(matrix, in, out, start, n)
			return n
		case 9:
			mulAvxTwo_7x9Xor(matrix, in, out, start, n)
			return n
		case 10:
			mulAvxTwo_7x10Xor(matrix, in, out, start, n)
			return n
		}
	case 8:
		switch len(out) {
		case 1:
			mulAvxTwo_8x1_64Xor(matrix, in, out, start, n)
			return n
		case 2:
			mulAvxTwo_8x2_64Xor(matrix, in, out, start, n)
			return n
		case 3:
			mulAvxTwo_8x3_64Xor(matrix, in, out, start, n)
			return n
		case 4:
			mulAvxTwo_8x4Xor(matrix, in, out, start, n)
			return n
		case 5:
			mulAvxTwo_8x5Xor(matrix, in, out, start, n)
			return n
		case 6:
			mulAvxTwo_8x6Xor(matrix, in, out, start, n)
			return n
		case 7:
			mulAvxTwo_8x7Xor(matrix, in, out, start, n)
			return n
		case 8:
			mulAvxTwo_8x8Xor(matrix, in, out, start, n)
			return n
		case 9:
			mulAvxTwo_8x9Xor(matrix, in, out, start, n)
			return n
		case 10:
			mulAvxTwo_8x10Xor(matrix, in, out, start, n)
			return n
		}
	case 9:
		switch len(out) {
		case 1:
			mulAvxTwo_9x1_64Xor(matrix, in, out, start, n)
			return n
		case 2:
			mulAvxTwo_9x2_64Xor(matrix, in, out, start, n)
			return n
		case 3:
			mulAvxTwo_9x3_64Xor(matrix, in, out, start, n)
			return n
		case 4:
			mulAvxTwo_9x4Xor(matrix, in, out, start, n)
			return n
		case 5:
			mulAvxTwo_9x5Xor(matrix, in, out, start, n)
			return n
		case 6:
			mulAvxTwo_9x6Xor(matrix, in, out, start, n)
			return n
		case 7:
			mulAvxTwo_9x7Xor(matrix, in, out, start, n)
			return n
		case 8:
			mulAvxTwo_9x8Xor(matrix, in, out, start, n)
			return n
		case 9:
			mulAvxTwo_9x9Xor(matrix, in, out, start, n)
			return n
		case 10:
			mulAvxTwo_9x10Xor(matrix, in, out, start, n)
			return n
		}
	case 10:
		switch len(out) {
		case 1:
			mulAvxTwo_10x1_64Xor(matrix, in, out, start, n)
			return n
		case 2:
			mulAvxTwo_10x2_64Xor(matrix, in, out, start, n)
			return n
		case 3:
			mulAvxTwo_10x3_64Xor(matrix, in, out, start, n)
			return n
		case 4:
			mulAvxTwo_10x4Xor(matrix, in, out, start, n)
			return n
		case 5:
			mulAvxTwo_10x5Xor(matrix, in, out, start, n)
			return n
		case 6:
			mulAvxTwo_10x6Xor(matrix, in, out, start, n)
			return n
		case 7:
			mulAvxTwo_10x7Xor(matrix, in, out, start, n)
			return n
		case 8:
			mulAvxTwo_10x8Xor(matrix, in, out, start, n)
			return n
		case 9:
			mulAvxTwo_10x9Xor(matrix, in, out, start, n)
			return n
		case 10:
			mulAvxTwo_10x10Xor(matrix, in, out, start, n)
			return n
		}
	}
	// No generated kernel for this shard combination — callers must gate on
	// maxAvx2Inputs/maxAvx2Outputs before reaching here.
	panic(fmt.Sprintf("unhandled size: %dx%d", len(in), len(out)))
}

View File

@ -5,10 +5,10 @@
package reedsolomon
func (r *reedSolomon) codeSomeShardsAvx512(matrixRows, inputs, outputs [][]byte, outputCount, byteCount int) {
func (r *reedSolomon) codeSomeShardsAvx512(matrixRows, inputs, outputs [][]byte, byteCount int) {
panic("codeSomeShardsAvx512 should not be called if built without asm")
}
func (r *reedSolomon) codeSomeShardsAvx512P(matrixRows, inputs, outputs [][]byte, outputCount, byteCount int) {
func (r *reedSolomon) codeSomeShardsAvx512P(matrixRows, inputs, outputs [][]byte, byteCount int) {
panic("codeSomeShardsAvx512P should not be called if built without asm")
}

View File

@ -112,6 +112,9 @@ const (
avx2CodeGenMinSize = 64
avx2CodeGenMinShards = 3
avx2CodeGenMaxGoroutines = 8
intSize = 32 << (^uint(0) >> 63) // 32 or 64
maxInt = 1<<(intSize-1) - 1
)
// reedSolomon contains a matrix for a specific
@ -291,6 +294,24 @@ func New(dataShards, parityShards int, opts ...Option) (Encoder, error) {
// Calculate what we want per round
r.o.perRound = cpuid.CPU.Cache.L2
divide := parityShards + 1
if avx2CodeGen && r.o.useAVX2 && (dataShards > maxAvx2Inputs || parityShards > maxAvx2Outputs) {
// Base the per-round size on the L1 cache if we have many inputs.
r.o.perRound = cpuid.CPU.Cache.L1D
divide = 0
if dataShards > maxAvx2Inputs {
divide += maxAvx2Inputs
} else {
divide += dataShards
}
if parityShards > maxAvx2Inputs {
divide += maxAvx2Outputs
} else {
divide += parityShards
}
}
if r.o.perRound <= 0 {
// Set to 128K if undetectable.
r.o.perRound = 128 << 10
@ -300,8 +321,9 @@ func New(dataShards, parityShards int, opts ...Option) (Encoder, error) {
// If multiple threads per core, make sure they don't contend for cache.
r.o.perRound /= cpuid.CPU.ThreadsPerCore
}
// 1 input + parity must fit in cache, and we add one more to be safer.
r.o.perRound = r.o.perRound / (1 + parityShards)
r.o.perRound = r.o.perRound / divide
// Align to 64 bytes.
r.o.perRound = ((r.o.perRound + 63) / 64) * 64
@ -319,10 +341,6 @@ func New(dataShards, parityShards int, opts ...Option) (Encoder, error) {
}
}
if r.o.perRound < r.o.minSplitSize {
r.o.perRound = r.o.minSplitSize
}
if r.o.shardSize > 0 {
p := runtime.GOMAXPROCS(0)
if p == 1 || r.o.shardSize <= r.o.minSplitSize*2 {
@ -347,7 +365,7 @@ func New(dataShards, parityShards int, opts ...Option) (Encoder, error) {
// Generated AVX2 does not need data to stay in L1 cache between runs.
// We will be purely limited by RAM speed.
if r.canAVX2C(avx2CodeGenMinSize, r.DataShards, r.ParityShards) && r.o.maxGoroutines > avx2CodeGenMaxGoroutines {
if r.canAVX2C(avx2CodeGenMinSize, maxAvx2Inputs, maxAvx2Outputs) && r.o.maxGoroutines > avx2CodeGenMaxGoroutines {
r.o.maxGoroutines = avx2CodeGenMaxGoroutines
}
@ -366,8 +384,9 @@ func New(dataShards, parityShards int, opts ...Option) (Encoder, error) {
}
if avx2CodeGen && r.o.useAVX2 {
sz := r.DataShards * r.ParityShards * 2 * 32
r.mPool.New = func() interface{} {
return make([]byte, r.Shards*2*32)
return make([]byte, sz)
}
}
return &r, err
@ -398,7 +417,7 @@ func (r *reedSolomon) Encode(shards [][]byte) error {
output := shards[r.DataShards:]
// Do the coding.
r.codeSomeShards(r.parity, shards[0:r.DataShards], output, r.ParityShards, len(shards[0]))
r.codeSomeShards(r.parity, shards[0:r.DataShards], output[:r.ParityShards], len(shards[0]))
return nil
}
@ -558,7 +577,7 @@ func (r *reedSolomon) Verify(shards [][]byte) (bool, error) {
toCheck := shards[r.DataShards:]
// Do the checking.
return r.checkSomeShards(r.parity, shards[0:r.DataShards], toCheck, r.ParityShards, len(shards[0])), nil
return r.checkSomeShards(r.parity, shards[:r.DataShards], toCheck[:r.ParityShards], len(shards[0])), nil
}
func (r *reedSolomon) canAVX2C(byteCount int, inputs, outputs int) bool {
@ -576,19 +595,19 @@ func (r *reedSolomon) canAVX2C(byteCount int, inputs, outputs int) bool {
// The number of outputs computed, and the
// number of matrix rows used, is determined by
// outputCount, which is the number of outputs to compute.
func (r *reedSolomon) codeSomeShards(matrixRows, inputs, outputs [][]byte, outputCount, byteCount int) {
func (r *reedSolomon) codeSomeShards(matrixRows, inputs, outputs [][]byte, byteCount int) {
if len(outputs) == 0 {
return
}
switch {
case r.o.useAVX512 && r.o.maxGoroutines > 1 && byteCount > r.o.minSplitSize && len(inputs) >= 4 && len(outputs) >= 2:
r.codeSomeShardsAvx512P(matrixRows, inputs, outputs, outputCount, byteCount)
r.codeSomeShardsAvx512P(matrixRows, inputs, outputs, byteCount)
return
case r.o.useAVX512 && len(inputs) >= 4 && len(outputs) >= 2:
r.codeSomeShardsAvx512(matrixRows, inputs, outputs, outputCount, byteCount)
r.codeSomeShardsAvx512(matrixRows, inputs, outputs, byteCount)
return
case r.o.maxGoroutines > 1 && byteCount > r.o.minSplitSize:
r.codeSomeShardsP(matrixRows, inputs, outputs, outputCount, byteCount)
case byteCount > r.o.minSplitSize:
r.codeSomeShardsP(matrixRows, inputs, outputs, byteCount)
return
}
@ -598,16 +617,49 @@ func (r *reedSolomon) codeSomeShards(matrixRows, inputs, outputs [][]byte, outpu
end = len(inputs[0])
}
if r.canAVX2C(byteCount, len(inputs), len(outputs)) {
m := genAvx2Matrix(matrixRows, len(inputs), len(outputs), r.mPool.Get().([]byte))
m := genAvx2Matrix(matrixRows, len(inputs), 0, len(outputs), r.mPool.Get().([]byte))
start += galMulSlicesAvx2(m, inputs, outputs, 0, byteCount)
r.mPool.Put(m)
end = len(inputs[0])
} else if len(inputs)+len(outputs) > avx2CodeGenMinShards && r.canAVX2C(byteCount, maxAvx2Inputs, maxAvx2Outputs) {
end = len(inputs[0])
inIdx := 0
m := r.mPool.Get().([]byte)
defer r.mPool.Put(m)
ins := inputs
for len(ins) > 0 {
inPer := ins
if len(inPer) > maxAvx2Inputs {
inPer = inPer[:maxAvx2Inputs]
}
outs := outputs
outIdx := 0
for len(outs) > 0 {
outPer := outs
if len(outPer) > maxAvx2Outputs {
outPer = outPer[:maxAvx2Outputs]
}
m = genAvx2Matrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), m)
if inIdx == 0 {
galMulSlicesAvx2(m, inPer, outPer, 0, byteCount)
} else {
galMulSlicesAvx2Xor(m, inPer, outPer, 0, byteCount)
}
start = byteCount & avxSizeMask
outIdx += len(outPer)
outs = outs[len(outPer):]
}
inIdx += len(inPer)
ins = ins[len(inPer):]
}
if start >= end {
return
}
}
for start < len(inputs[0]) {
for c := 0; c < r.DataShards; c++ {
for c := 0; c < len(inputs); c++ {
in := inputs[c][start:end]
for iRow := 0; iRow < outputCount; iRow++ {
for iRow := 0; iRow < len(outputs); iRow++ {
if c == 0 {
galMulSlice(matrixRows[iRow][c], in, outputs[iRow][start:end], &r.o)
} else {
@ -625,15 +677,21 @@ func (r *reedSolomon) codeSomeShards(matrixRows, inputs, outputs [][]byte, outpu
// Perform the same as codeSomeShards, but split the workload into
// several goroutines.
func (r *reedSolomon) codeSomeShardsP(matrixRows, inputs, outputs [][]byte, outputCount, byteCount int) {
func (r *reedSolomon) codeSomeShardsP(matrixRows, inputs, outputs [][]byte, byteCount int) {
var wg sync.WaitGroup
gor := r.o.maxGoroutines
var avx2Matrix []byte
useAvx2 := r.canAVX2C(byteCount, len(inputs), len(outputs))
if useAvx2 {
avx2Matrix = genAvx2Matrix(matrixRows, len(inputs), len(outputs), r.mPool.Get().([]byte))
avx2Matrix = genAvx2Matrix(matrixRows, len(inputs), 0, len(outputs), r.mPool.Get().([]byte))
defer r.mPool.Put(avx2Matrix)
} else if byteCount < 10<<20 && len(inputs)+len(outputs) > avx2CodeGenMinShards &&
r.canAVX2C(byteCount/4, maxAvx2Inputs, maxAvx2Outputs) {
// It appears there is a switchover point at around 10MB where
// regular processing is faster...
r.codeSomeShardsAVXP(matrixRows, inputs, outputs, byteCount)
return
}
do := byteCount / gor
@ -641,6 +699,40 @@ func (r *reedSolomon) codeSomeShardsP(matrixRows, inputs, outputs [][]byte, outp
do = r.o.minSplitSize
}
exec := func(start, stop int) {
if useAvx2 && stop-start >= 64 {
start += galMulSlicesAvx2(avx2Matrix, inputs, outputs, start, stop)
}
lstart, lstop := start, start+r.o.perRound
if lstop > stop {
lstop = stop
}
for lstart < stop {
for c := 0; c < len(inputs); c++ {
in := inputs[c][lstart:lstop]
for iRow := 0; iRow < len(outputs); iRow++ {
if c == 0 {
galMulSlice(matrixRows[iRow][c], in, outputs[iRow][lstart:lstop], &r.o)
} else {
galMulSliceXor(matrixRows[iRow][c], in, outputs[iRow][lstart:lstop], &r.o)
}
}
}
lstart = lstop
lstop += r.o.perRound
if lstop > stop {
lstop = stop
}
}
wg.Done()
}
if gor <= 1 {
wg.Add(1)
exec(0, byteCount)
return
}
// Make sizes divisible by 64
do = (do + 63) & (^63)
start := 0
@ -650,34 +742,162 @@ func (r *reedSolomon) codeSomeShardsP(matrixRows, inputs, outputs [][]byte, outp
}
wg.Add(1)
go func(start, stop int) {
if useAvx2 && stop-start >= 64 {
start += galMulSlicesAvx2(avx2Matrix, inputs, outputs, start, stop)
go exec(start, start+do)
start += do
}
wg.Wait()
}
// Perform the same as codeSomeShards, but split the workload into
// several goroutines.
func (r *reedSolomon) codeSomeShardsAVXP(matrixRows, inputs, outputs [][]byte, byteCount int) {
var wg sync.WaitGroup
gor := r.o.maxGoroutines
type state struct {
input [][]byte
output [][]byte
m []byte
first bool
}
// Make a plan...
plan := make([]state, 0, ((len(inputs)+maxAvx2Inputs-1)/maxAvx2Inputs)*((len(outputs)+maxAvx2Outputs-1)/maxAvx2Outputs))
tmp := r.mPool.Get().([]byte)
defer func(b []byte) {
r.mPool.Put(b)
}(tmp)
// Flip between input-first and output-first ordering.
// We put the smallest data load in the inner loop.
if len(inputs) > len(outputs) {
inIdx := 0
ins := inputs
for len(ins) > 0 {
inPer := ins
if len(inPer) > maxAvx2Inputs {
inPer = inPer[:maxAvx2Inputs]
}
outs := outputs
outIdx := 0
for len(outs) > 0 {
outPer := outs
if len(outPer) > maxAvx2Outputs {
outPer = outPer[:maxAvx2Outputs]
}
// Generate local matrix
m := genAvx2Matrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), tmp)
tmp = tmp[len(m):]
plan = append(plan, state{
input: inPer,
output: outPer,
m: m,
first: inIdx == 0,
})
outIdx += len(outPer)
outs = outs[len(outPer):]
}
inIdx += len(inPer)
ins = ins[len(inPer):]
}
} else {
outs := outputs
outIdx := 0
for len(outs) > 0 {
outPer := outs
if len(outPer) > maxAvx2Outputs {
outPer = outPer[:maxAvx2Outputs]
}
lstart, lstop := start, start+r.o.perRound
inIdx := 0
ins := inputs
for len(ins) > 0 {
inPer := ins
if len(inPer) > maxAvx2Inputs {
inPer = inPer[:maxAvx2Inputs]
}
// Generate local matrix
m := genAvx2Matrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), tmp)
tmp = tmp[len(m):]
//fmt.Println("bytes:", len(inPer)*r.o.perRound, "out:", len(outPer)*r.o.perRound)
plan = append(plan, state{
input: inPer,
output: outPer,
m: m,
first: inIdx == 0,
})
inIdx += len(inPer)
ins = ins[len(inPer):]
}
outIdx += len(outPer)
outs = outs[len(outPer):]
}
}
do := byteCount / gor
if do < r.o.minSplitSize {
do = r.o.minSplitSize
}
exec := func(start, stop int) {
lstart, lstop := start, start+r.o.perRound
if lstop > stop {
lstop = stop
}
for lstart < stop {
if lstop-lstart >= minAvx2Size {
// Execute plan...
for _, p := range plan {
if p.first {
galMulSlicesAvx2(p.m, p.input, p.output, lstart, lstop)
} else {
galMulSlicesAvx2Xor(p.m, p.input, p.output, lstart, lstop)
}
}
lstart += (lstop - lstart) & avxSizeMask
if lstart == lstop {
lstop += r.o.perRound
if lstop > stop {
lstop = stop
}
continue
}
}
for c := range inputs {
in := inputs[c][lstart:lstop]
for iRow := 0; iRow < len(outputs); iRow++ {
if c == 0 {
galMulSlice(matrixRows[iRow][c], in, outputs[iRow][lstart:lstop], &r.o)
} else {
galMulSliceXor(matrixRows[iRow][c], in, outputs[iRow][lstart:lstop], &r.o)
}
}
}
lstart = lstop
lstop += r.o.perRound
if lstop > stop {
lstop = stop
}
for lstart < stop {
for c := 0; c < r.DataShards; c++ {
in := inputs[c][lstart:lstop]
for iRow := 0; iRow < outputCount; iRow++ {
if c == 0 {
galMulSlice(matrixRows[iRow][c], in, outputs[iRow][lstart:lstop], &r.o)
} else {
galMulSliceXor(matrixRows[iRow][c], in, outputs[iRow][lstart:lstop], &r.o)
}
}
}
lstart = lstop
lstop += r.o.perRound
if lstop > stop {
lstop = stop
}
}
wg.Done()
}(start, start+do)
}
wg.Done()
}
if gor == 1 {
wg.Add(1)
exec(0, byteCount)
return
}
// Make sizes divisible by 64
do = (do + 63) & (^63)
start := 0
for start < byteCount {
if start+do > byteCount {
do = byteCount - start
}
wg.Add(1)
go exec(start, start+do)
start += do
}
wg.Wait()
@ -686,7 +906,7 @@ func (r *reedSolomon) codeSomeShardsP(matrixRows, inputs, outputs [][]byte, outp
// checkSomeShards is mostly the same as codeSomeShards,
// except this will check values and return
// as soon as a difference is found.
func (r *reedSolomon) checkSomeShards(matrixRows, inputs, toCheck [][]byte, outputCount, byteCount int) bool {
func (r *reedSolomon) checkSomeShards(matrixRows, inputs, toCheck [][]byte, byteCount int) bool {
if len(toCheck) == 0 {
return true
}
@ -695,7 +915,7 @@ func (r *reedSolomon) checkSomeShards(matrixRows, inputs, toCheck [][]byte, outp
for i := range outputs {
outputs[i] = make([]byte, byteCount)
}
r.codeSomeShards(matrixRows, inputs, outputs, outputCount, byteCount)
r.codeSomeShards(matrixRows, inputs, outputs, byteCount)
for i, calc := range outputs {
if !bytes.Equal(calc, toCheck[i]) {
@ -902,7 +1122,7 @@ func (r *reedSolomon) reconstruct(shards [][]byte, dataOnly bool) error {
outputCount++
}
}
r.codeSomeShards(matrixRows, subShards, outputs[:outputCount], outputCount, shardSize)
r.codeSomeShards(matrixRows, subShards, outputs[:outputCount], shardSize)
if dataOnly {
// Exit out early if we are only interested in the data shards
@ -928,7 +1148,7 @@ func (r *reedSolomon) reconstruct(shards [][]byte, dataOnly bool) error {
outputCount++
}
}
r.codeSomeShards(matrixRows, shards[:r.DataShards], outputs[:outputCount], outputCount, shardSize)
r.codeSomeShards(matrixRows, shards[:r.DataShards], outputs[:outputCount], shardSize)
return nil
}

View File

@ -191,7 +191,7 @@ func TestEncoding(t *testing.T) {
// note that par1 matrix will fail on some combinations.
var testSizes = [][2]int{
{1, 0}, {3, 0}, {5, 0}, {8, 0}, {10, 0}, {12, 0}, {14, 0}, {41, 0}, {49, 0},
{1, 1}, {1, 2}, {3, 3}, {3, 1}, {5, 3}, {8, 4}, {10, 30}, {12, 10}, {14, 7}, {41, 17}, {49, 1}}
{1, 1}, {1, 2}, {3, 3}, {3, 1}, {5, 3}, {8, 4}, {10, 30}, {12, 10}, {14, 7}, {41, 17}, {49, 1}, {5, 20}}
var testDataSizes = []int{10, 100, 1000, 10001, 100003, 1000055}
var testDataSizesShort = []int{10, 10001, 100003}
@ -893,6 +893,7 @@ func benchmarkEncode(b *testing.B, dataShards, parityShards, shardSize int) {
b.SetBytes(int64(shardSize * (dataShards + parityShards)))
b.ResetTimer()
b.ReportAllocs()
for i := 0; i < b.N; i++ {
err = r.Encode(shards)
if err != nil {
@ -937,7 +938,7 @@ func BenchmarkEncode10x4x1M(b *testing.B) {
benchmarkEncode(b, 10, 4, 1024*1024)
}
// Benchmark 50 data shards and 20 parity shards with 1MB each.
// Benchmark 50 data shards and 20 parity shards with 1M each.
func BenchmarkEncode50x20x1M(b *testing.B) {
benchmarkEncode(b, 50, 20, 1024*1024)
}
@ -989,6 +990,7 @@ func benchmarkVerify(b *testing.B, dataShards, parityShards, shardSize int) {
b.SetBytes(int64(shardSize * (dataShards + parityShards)))
b.ResetTimer()
b.ReportAllocs()
for i := 0; i < b.N; i++ {
_, err = r.Verify(shards)
if err != nil {
@ -1003,7 +1005,7 @@ func BenchmarkVerify10x2x10000(b *testing.B) {
}
// Benchmark 50 data slices with 5 parity slices holding 100000 bytes each
func BenchmarkVerify50x5x50000(b *testing.B) {
func BenchmarkVerify50x5x100000(b *testing.B) {
benchmarkVerify(b, 50, 5, 100000)
}
@ -1359,11 +1361,11 @@ func TestCodeSomeShards(t *testing.T) {
shards, _ := enc.Split(data)
old := runtime.GOMAXPROCS(1)
r.codeSomeShards(r.parity, shards[:r.DataShards], shards[r.DataShards:], r.ParityShards, len(shards[0]))
r.codeSomeShards(r.parity, shards[:r.DataShards], shards[r.DataShards:r.DataShards+r.ParityShards], len(shards[0]))
// hopefully more than 1 CPU
runtime.GOMAXPROCS(runtime.NumCPU())
r.codeSomeShards(r.parity, shards[:r.DataShards], shards[r.DataShards:], r.ParityShards, len(shards[0]))
r.codeSomeShards(r.parity, shards[:r.DataShards], shards[r.DataShards:r.DataShards+r.ParityShards], len(shards[0]))
// reset MAXPROCS, otherwise testing complains
runtime.GOMAXPROCS(old)
@ -1642,7 +1644,9 @@ func benchmarkParallel(b *testing.B, dataShards, parityShards, shardSize int) {
c := runtime.GOMAXPROCS(0)
// Note that concurrency also affects total data size and will make caches less effective.
b.Log("Total data:", (c*dataShards*shardSize)>>20, "MiB", "parity:", (c*parityShards*shardSize)>>20, "MiB")
if testing.Verbose() {
b.Log("Total data:", (c*dataShards*shardSize)>>20, "MiB", "parity:", (c*parityShards*shardSize)>>20, "MiB")
}
// Create independent shards
shardsCh := make(chan [][]byte, c)
for i := 0; i < c; i++ {