Add GF16 AVX2, AVX512 and SSSE3 (#193)

* Add GF16 AVX2
* Add SSSE3 fallback.
* Fix: reconstruction was skipped if the first shard was empty.
* Combine lookups in pure Go.
* Faster XOR in pure Go.
* Add 4-way butterfly AVX2.
* Add fftDIT4 AVX2 and AVX512 versions. Add noescape directives.
* Remove +build space. Add size-varied 800x200 benchmarks.
* Use VPTERNLOGD for AVX512.
* Remove refMulAdd inner-loop bounds checks (~10-20% faster).
Klaus Post 2022-07-26 03:37:28 -07:00 committed by GitHub
parent 49be604db0
commit 3a82d28edb
11 changed files with 2483 additions and 84 deletions
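The GF16 code paths added here are used when data+parity exceeds 256 shards. A minimal usage sketch exercising that path (shard counts mirror the new 800x200 benchmarks; shard sizes are kept a multiple of 64 bytes, which this codec requires):

package main

import (
	"log"

	"github.com/klauspost/reedsolomon"
)

func main() {
	// More than 256 total shards selects the GF16 codec.
	enc, err := reedsolomon.New(800, 200)
	if err != nil {
		log.Fatal(err)
	}
	shards := make([][]byte, 1000)
	for i := range shards {
		shards[i] = make([]byte, 64*1024) // multiple of 64 bytes
	}
	// Fill shards[:800] with data here, then encode the 200 parity shards.
	if err := enc.Encode(shards); err != nil {
		log.Fatal(err)
	}
	// Drop a data shard and a parity shard, then reconstruct them.
	shards[0], shards[900] = nil, nil
	if err := enc.Reconstruct(shards); err != nil {
		log.Fatal(err)
	}
}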


@@ -1,7 +1,7 @@
//go:build generate
// +build generate
//go:generate go run gen.go -out ../galois_gen_amd64.s -stubs ../galois_gen_amd64.go -pkg=reedsolomon
//go:generate go run -tags=generate . -out ../galois_gen_amd64.s -stubs ../galois_gen_amd64.go -pkg=reedsolomon
//go:generate go fmt ../galois_gen_switch_amd64.go
//go:generate go fmt ../galois_gen_amd64.go
//go:generate go run cleanup.go ../galois_gen_amd64.s
@@ -129,6 +129,7 @@ func galMulSlicesAvx2Xor(matrix []byte, in, out [][]byte, start, stop int) int {
panic(fmt.Sprintf("unhandled size: %dx%d", len(in), len(out)))
}
`)
genGF16()
Generate()
}

_gen/gf16.go (new file, 846 lines)

@@ -0,0 +1,846 @@
//go:build generate
// +build generate
package main
import (
"fmt"
"github.com/mmcloughlin/avo/attr"
. "github.com/mmcloughlin/avo/build"
. "github.com/mmcloughlin/avo/operand"
"github.com/mmcloughlin/avo/reg"
)
type table256 struct {
Lo, Hi Op
loadLo128, loadHi128 *Mem
loadLo256, loadHi256 *Mem
useZmmLo, useZmmHi *reg.VecPhysical
}
func (t *table256) prepare() {
t.prepareLo()
t.prepareHi()
}
func (t *table256) prepareHi() {
if t.loadHi128 != nil {
t.Hi = YMM()
// Load and expand tables
VBROADCASTI128(*t.loadHi128, t.Hi)
}
if t.loadHi256 != nil {
t.Hi = YMM()
// Load and expand tables
VMOVDQU(*t.loadHi256, t.Hi)
}
if t.useZmmHi != nil {
r := *t.useZmmHi
t.Hi = r.AsY()
}
}
func (t *table256) prepareLo() {
if t.loadLo128 != nil {
t.Lo = YMM()
// Load and expand tables
VBROADCASTI128(*t.loadLo128, t.Lo)
}
if t.loadLo256 != nil {
t.Lo = YMM()
// Load and expand tables
VMOVDQU(*t.loadLo256, t.Lo)
}
if t.useZmmLo != nil {
r := *t.useZmmLo
t.Lo = r.AsY()
}
}
// table128 contains memory pointers to tables
type table128 struct {
Lo, Hi Op
}
type gf16ctx struct {
clrMask reg.VecVirtual
clrMask128 reg.VecVirtual
avx512 bool
}
func genGF16() {
var ctx gf16ctx
// Ported from static void IFFT_DIT2
// https://github.com/catid/leopard/blob/master/LeopardFF16.cpp#L629
{
TEXT("ifftDIT2_avx2", attr.NOSPLIT, fmt.Sprintf("func(x, y []byte, table *[8*16]uint8)"))
Pragma("noescape")
tablePtr := Load(Param("table"), GP64())
tables := [4]table256{}
for i, t := range tables {
t.Lo, t.Hi = YMM(), YMM()
// Load and expand tables
VBROADCASTI128(Mem{Base: tablePtr, Disp: i * 16}, t.Lo)
VBROADCASTI128(Mem{Base: tablePtr, Disp: i*16 + 16*4}, t.Hi)
tables[i] = t
}
bytes := Load(Param("x").Len(), GP64())
x := Load(Param("x").Base(), GP64())
y := Load(Param("y").Base(), GP64())
// Generate mask
ctx.clrMask = YMM()
tmpMask := GP64()
MOVQ(U32(15), tmpMask)
MOVQ(tmpMask, ctx.clrMask.AsX())
VPBROADCASTB(ctx.clrMask.AsX(), ctx.clrMask)
xLo, xHi, yLo, yHi := YMM(), YMM(), YMM(), YMM()
Label("loop")
VMOVDQU(Mem{Base: x, Disp: 0}, xLo)
VMOVDQU(Mem{Base: x, Disp: 32}, xHi)
VMOVDQU(Mem{Base: y, Disp: 0}, yLo)
VMOVDQU(Mem{Base: y, Disp: 32}, yHi)
VPXOR(yLo, xLo, yLo)
VPXOR(yHi, xHi, yHi)
VMOVDQU(yLo, Mem{Base: y, Disp: 0})
VMOVDQU(yHi, Mem{Base: y, Disp: 32})
leoMulAdd256(ctx, xLo, xHi, yLo, yHi, tables)
VMOVDQU(xLo, Mem{Base: x, Disp: 0})
VMOVDQU(xHi, Mem{Base: x, Disp: 32})
ADDQ(U8(64), x)
ADDQ(U8(64), y)
SUBQ(U8(64), bytes)
JNZ(LabelRef("loop"))
VZEROUPPER()
RET()
}
{
TEXT("fftDIT2_avx2", attr.NOSPLIT, fmt.Sprintf("func(x, y []byte, table *[8*16]uint8)"))
Pragma("noescape")
tablePtr := Load(Param("table"), GP64())
tables := [4]table256{}
for i, t := range tables {
t.Lo, t.Hi = YMM(), YMM()
// Load and expand tables
VBROADCASTI128(Mem{Base: tablePtr, Disp: i * 16}, t.Lo)
VBROADCASTI128(Mem{Base: tablePtr, Disp: i*16 + 16*4}, t.Hi)
tables[i] = t
}
bytes := Load(Param("x").Len(), GP64())
x := Load(Param("x").Base(), GP64())
y := Load(Param("y").Base(), GP64())
// Generate mask
ctx.clrMask = YMM()
tmpMask := GP64()
MOVQ(U32(15), tmpMask)
MOVQ(tmpMask, ctx.clrMask.AsX())
VPBROADCASTB(ctx.clrMask.AsX(), ctx.clrMask)
xLo, xHi, yLo, yHi := YMM(), YMM(), YMM(), YMM()
Label("loop")
VMOVDQU(Mem{Base: x, Disp: 0}, xLo)
VMOVDQU(Mem{Base: x, Disp: 32}, xHi)
VMOVDQU(Mem{Base: y, Disp: 0}, yLo)
VMOVDQU(Mem{Base: y, Disp: 32}, yHi)
leoMulAdd256(ctx, xLo, xHi, yLo, yHi, tables)
VMOVDQU(xLo, Mem{Base: x, Disp: 0})
VMOVDQU(xHi, Mem{Base: x, Disp: 32})
// Reload, or we go beyond 16 regs..
if true {
yLo, yHi = YMM(), YMM()
VMOVDQU(Mem{Base: y, Disp: 0}, yLo)
VMOVDQU(Mem{Base: y, Disp: 32}, yHi)
}
VPXOR(yLo, xLo, yLo)
VPXOR(yHi, xHi, yHi)
VMOVDQU(yLo, Mem{Base: y, Disp: 0})
VMOVDQU(yHi, Mem{Base: y, Disp: 32})
ADDQ(U8(64), x)
ADDQ(U8(64), y)
SUBQ(U8(64), bytes)
JNZ(LabelRef("loop"))
VZEROUPPER()
RET()
}
{
TEXT("mulgf16_avx2", attr.NOSPLIT, fmt.Sprintf("func(x, y []byte, table *[8*16]uint8)"))
Pragma("noescape")
tablePtr := Load(Param("table"), GP64())
tables := [4]table256{}
for i, t := range tables {
t.Lo, t.Hi = YMM(), YMM()
// Load and expand tables
VBROADCASTI128(Mem{Base: tablePtr, Disp: i * 16}, t.Lo)
VBROADCASTI128(Mem{Base: tablePtr, Disp: i*16 + 16*4}, t.Hi)
tables[i] = t
}
bytes := Load(Param("x").Len(), GP64())
x := Load(Param("x").Base(), GP64())
y := Load(Param("y").Base(), GP64())
// Generate mask
ctx.clrMask = YMM()
tmpMask := GP64()
MOVQ(U32(15), tmpMask)
MOVQ(tmpMask, ctx.clrMask.AsX())
VPBROADCASTB(ctx.clrMask.AsX(), ctx.clrMask)
dataLo, dataHi := YMM(), YMM()
Label("loop")
VMOVDQU(Mem{Base: y, Disp: 0}, dataLo)
VMOVDQU(Mem{Base: y, Disp: 32}, dataHi)
prodLo, prodHi := leoMul256(ctx, dataLo, dataHi, tables)
VMOVDQU(prodLo, Mem{Base: x, Disp: 0})
VMOVDQU(prodHi, Mem{Base: x, Disp: 32})
ADDQ(U8(64), x)
ADDQ(U8(64), y)
SUBQ(U8(64), bytes)
JNZ(LabelRef("loop"))
VZEROUPPER()
RET()
}
for _, avx512 := range []bool{true, false} {
// AVX-512 only uses more registers for tables.
var suffix = "avx2"
if avx512 {
suffix = "avx512"
}
ctx.avx512 = avx512
extZMMs := []reg.VecPhysical{reg.Z16, reg.Z17, reg.Z18, reg.Z19, reg.Z20, reg.Z21, reg.Z22, reg.Z23, reg.Z24, reg.Z25, reg.Z26, reg.Z27, reg.Z28, reg.Z29, reg.Z30, reg.Z31}
{
TEXT("ifftDIT4_"+suffix, attr.NOSPLIT, fmt.Sprintf("func(work [][]byte, dist int, table01 *[8*16]uint8, table23 *[8*16]uint8, table02 *[8*16]uint8, logMask uint8)"))
Pragma("noescape")
Comment("dist must be multiplied by 24 (size of slice header)")
Comment("logmask must be log_m01==kModulus, log_m23==kModulus, log_m02==kModulus from lowest to bit 3")
// Unpack tables to stack. Slower.
const unpackTables = false
table01Ptr := Load(Param("table01"), GP64())
table23Ptr := Load(Param("table23"), GP64())
table02Ptr := Load(Param("table02"), GP64())
// Prepare table pointers.
table01 := [4]table256{}
table23 := [4]table256{}
table02 := [4]table256{}
if avx512 {
usedZmm := 0
fill := func(t *[4]table256, ptr reg.Register) {
for i := range table01 {
t := &t[i]
if len(extZMMs)-usedZmm >= 2 {
tmpLo, tmpHi := YMM(), YMM()
t.useZmmLo, t.useZmmHi = &extZMMs[usedZmm], &extZMMs[usedZmm+1]
usedZmm += 2
// Load and expand tables
VBROADCASTI128(Mem{Base: ptr, Disp: i * 16}, tmpLo)
VBROADCASTI128(Mem{Base: ptr, Disp: i*16 + 16*4}, tmpHi)
VMOVAPS(tmpLo.AsZ(), *t.useZmmLo)
VMOVAPS(tmpHi.AsZ(), *t.useZmmHi)
} else {
t.loadLo128 = &Mem{Base: ptr, Disp: i * 16}
t.loadHi128 = &Mem{Base: ptr, Disp: i*16 + 16*4}
}
}
}
fill(&table02, table02Ptr)
fill(&table01, table01Ptr)
fill(&table23, table23Ptr)
}
for i := range table01 {
if avx512 {
continue
}
if unpackTables {
toStack := func(m Mem) *Mem {
stack := AllocLocal(32)
y := YMM()
VBROADCASTI128(m, y)
VMOVDQU(y, stack)
return &stack
}
table01[i].loadLo256 = toStack(Mem{Base: table01Ptr, Disp: i * 16})
table23[i].loadLo256 = toStack(Mem{Base: table23Ptr, Disp: i * 16})
table02[i].loadLo256 = toStack(Mem{Base: table02Ptr, Disp: i * 16})
table01[i].loadHi256 = toStack(Mem{Base: table01Ptr, Disp: i*16 + 16*4})
table23[i].loadHi256 = toStack(Mem{Base: table23Ptr, Disp: i*16 + 16*4})
table02[i].loadHi256 = toStack(Mem{Base: table02Ptr, Disp: i*16 + 16*4})
} else {
table01[i].loadLo128 = &Mem{Base: table01Ptr, Disp: i * 16}
table23[i].loadLo128 = &Mem{Base: table23Ptr, Disp: i * 16}
table02[i].loadLo128 = &Mem{Base: table02Ptr, Disp: i * 16}
table01[i].loadHi128 = &Mem{Base: table01Ptr, Disp: i*16 + 16*4}
table23[i].loadHi128 = &Mem{Base: table23Ptr, Disp: i*16 + 16*4}
table02[i].loadHi128 = &Mem{Base: table02Ptr, Disp: i*16 + 16*4}
}
}
// Generate mask
ctx.clrMask = YMM()
tmpMask := GP64()
MOVQ(U32(15), tmpMask)
MOVQ(tmpMask, ctx.clrMask.AsX())
VPBROADCASTB(ctx.clrMask.AsX(), ctx.clrMask)
dist := Load(Param("dist"), GP64())
// Pointers to each "work"
var work [4]reg.GPVirtual
workTable := Load(Param("work").Base(), GP64()) // &work[0]
bytes := GP64()
// Load length of work[0]
MOVQ(Mem{Base: workTable, Disp: 8}, bytes)
offset := GP64()
XORQ(offset, offset)
for i := range work {
work[i] = GP64()
// work[i] = &workTable[dist*i]
MOVQ(Mem{Base: workTable, Index: offset, Scale: 1}, work[i])
if i < len(work)-1 {
ADDQ(dist, offset)
}
}
var workRegLo [4]reg.VecVirtual
var workRegHi [4]reg.VecVirtual
workRegLo[0], workRegHi[0] = YMM(), YMM()
workRegLo[1], workRegHi[1] = YMM(), YMM()
mask := Load(Param("logMask"), GP64())
Label("loop")
VMOVDQU(Mem{Base: work[0], Disp: 0}, workRegLo[0])
VMOVDQU(Mem{Base: work[0], Disp: 32}, workRegHi[0])
VMOVDQU(Mem{Base: work[1], Disp: 0}, workRegLo[1])
VMOVDQU(Mem{Base: work[1], Disp: 32}, workRegHi[1])
// First layer:
VPXOR(workRegLo[0], workRegLo[1], workRegLo[1])
VPXOR(workRegHi[0], workRegHi[1], workRegHi[1])
// Test bit 0
BTQ(U8(0), mask)
JC(LabelRef("skip_m01"))
leoMulAdd256(ctx, workRegLo[0], workRegHi[0], workRegLo[1], workRegHi[1], table01)
Label("skip_m01")
workRegLo[2], workRegHi[2] = YMM(), YMM()
workRegLo[3], workRegHi[3] = YMM(), YMM()
VMOVDQU(Mem{Base: work[2], Disp: 0}, workRegLo[2])
VMOVDQU(Mem{Base: work[2], Disp: 32}, workRegHi[2])
VMOVDQU(Mem{Base: work[3], Disp: 0}, workRegLo[3])
VMOVDQU(Mem{Base: work[3], Disp: 32}, workRegHi[3])
VPXOR(workRegLo[2], workRegLo[3], workRegLo[3])
VPXOR(workRegHi[2], workRegHi[3], workRegHi[3])
// Test bit 1
BTQ(U8(1), mask)
JC(LabelRef("skip_m23"))
leoMulAdd256(ctx, workRegLo[2], workRegHi[2], workRegLo[3], workRegHi[3], table23)
Label("skip_m23")
// Second layer:
VPXOR(workRegLo[0], workRegLo[2], workRegLo[2])
VPXOR(workRegHi[0], workRegHi[2], workRegHi[2])
VPXOR(workRegLo[1], workRegLo[3], workRegLo[3])
VPXOR(workRegHi[1], workRegHi[3], workRegHi[3])
// Test bit 2
BTQ(U8(2), mask)
JC(LabelRef("skip_m02"))
leoMulAdd256(ctx, workRegLo[0], workRegHi[0], workRegLo[2], workRegHi[2], table02)
leoMulAdd256(ctx, workRegLo[1], workRegHi[1], workRegLo[3], workRegHi[3], table02)
Label("skip_m02")
// Store + Next loop:
for i := range work {
VMOVDQU(workRegLo[i], Mem{Base: work[i], Disp: 0})
VMOVDQU(workRegHi[i], Mem{Base: work[i], Disp: 32})
ADDQ(U8(64), work[i])
}
SUBQ(U8(64), bytes)
JNZ(LabelRef("loop"))
VZEROUPPER()
RET()
}
{
TEXT("fftDIT4_"+suffix, attr.NOSPLIT, fmt.Sprintf("func(work [][]byte, dist int, table01 *[8*16]uint8, table23 *[8*16]uint8, table02 *[8*16]uint8, logMask uint8)"))
Pragma("noescape")
Comment("dist must be multiplied by 24 (size of slice header)")
Comment("logmask must be log_m01==kModulus, log_m23==kModulus, log_m02==kModulus from lowest to bit 3")
// Unpack tables to stack. Slower.
const unpackTables = false
table01Ptr := Load(Param("table01"), GP64())
table23Ptr := Load(Param("table23"), GP64())
table02Ptr := Load(Param("table02"), GP64())
// Prepare table pointers.
table01 := [4]table256{}
table23 := [4]table256{}
table02 := [4]table256{}
if avx512 {
usedZmm := 0
fill := func(t *[4]table256, ptr reg.Register) {
for i := range table01 {
t := &t[i]
if len(extZMMs)-usedZmm >= 2 {
tmpLo, tmpHi := YMM(), YMM()
t.useZmmLo, t.useZmmHi = &extZMMs[usedZmm], &extZMMs[usedZmm+1]
usedZmm += 2
// Load and expand tables
VBROADCASTI128(Mem{Base: ptr, Disp: i * 16}, tmpLo)
VBROADCASTI128(Mem{Base: ptr, Disp: i*16 + 16*4}, tmpHi)
VMOVAPS(tmpLo.AsZ(), *t.useZmmLo)
VMOVAPS(tmpHi.AsZ(), *t.useZmmHi)
} else {
t.loadLo128 = &Mem{Base: ptr, Disp: i * 16}
t.loadHi128 = &Mem{Base: ptr, Disp: i*16 + 16*4}
}
}
}
fill(&table02, table02Ptr)
fill(&table01, table01Ptr)
fill(&table23, table23Ptr)
}
for i := range table01 {
if avx512 {
continue
}
if unpackTables {
toStack := func(m Mem) *Mem {
stack := AllocLocal(32)
y := YMM()
VBROADCASTI128(m, y)
VMOVDQU(y, stack)
return &stack
}
table01[i].loadLo256 = toStack(Mem{Base: table01Ptr, Disp: i * 16})
table23[i].loadLo256 = toStack(Mem{Base: table23Ptr, Disp: i * 16})
table02[i].loadLo256 = toStack(Mem{Base: table02Ptr, Disp: i * 16})
table01[i].loadHi256 = toStack(Mem{Base: table01Ptr, Disp: i*16 + 16*4})
table23[i].loadHi256 = toStack(Mem{Base: table23Ptr, Disp: i*16 + 16*4})
table02[i].loadHi256 = toStack(Mem{Base: table02Ptr, Disp: i*16 + 16*4})
} else {
table01[i].loadLo128 = &Mem{Base: table01Ptr, Disp: i * 16}
table23[i].loadLo128 = &Mem{Base: table23Ptr, Disp: i * 16}
table02[i].loadLo128 = &Mem{Base: table02Ptr, Disp: i * 16}
table01[i].loadHi128 = &Mem{Base: table01Ptr, Disp: i*16 + 16*4}
table23[i].loadHi128 = &Mem{Base: table23Ptr, Disp: i*16 + 16*4}
table02[i].loadHi128 = &Mem{Base: table02Ptr, Disp: i*16 + 16*4}
}
}
// Generate mask
ctx.clrMask = YMM()
tmpMask := GP64()
MOVQ(U32(15), tmpMask)
MOVQ(tmpMask, ctx.clrMask.AsX())
VPBROADCASTB(ctx.clrMask.AsX(), ctx.clrMask)
dist := Load(Param("dist"), GP64())
// Pointers to each "work"
var work [4]reg.GPVirtual
workTable := Load(Param("work").Base(), GP64()) // &work[0]
bytes := GP64()
// Load length of work[0]
MOVQ(Mem{Base: workTable, Disp: 8}, bytes)
offset := GP64()
XORQ(offset, offset)
for i := range work {
work[i] = GP64()
// work[i] = &workTable[dist*i]
MOVQ(Mem{Base: workTable, Index: offset, Scale: 1}, work[i])
if i < len(work)-1 {
ADDQ(dist, offset)
}
}
var workRegLo [4]reg.VecVirtual
var workRegHi [4]reg.VecVirtual
workRegLo[0], workRegHi[0] = YMM(), YMM()
workRegLo[1], workRegHi[1] = YMM(), YMM()
workRegLo[2], workRegHi[2] = YMM(), YMM()
workRegLo[3], workRegHi[3] = YMM(), YMM()
mask := Load(Param("logMask"), GP64())
Label("loop")
VMOVDQU(Mem{Base: work[0], Disp: 0}, workRegLo[0])
VMOVDQU(Mem{Base: work[0], Disp: 32}, workRegHi[0])
VMOVDQU(Mem{Base: work[2], Disp: 0}, workRegLo[2])
VMOVDQU(Mem{Base: work[2], Disp: 32}, workRegHi[2])
VMOVDQU(Mem{Base: work[1], Disp: 0}, workRegLo[1])
VMOVDQU(Mem{Base: work[1], Disp: 32}, workRegHi[1])
VMOVDQU(Mem{Base: work[3], Disp: 0}, workRegLo[3])
VMOVDQU(Mem{Base: work[3], Disp: 32}, workRegHi[3])
// First layer:
// Test bit 0
BTQ(U8(0), mask)
JC(LabelRef("skip_m02"))
leoMulAdd256(ctx, workRegLo[0], workRegHi[0], workRegLo[2], workRegHi[2], table02)
leoMulAdd256(ctx, workRegLo[1], workRegHi[1], workRegLo[3], workRegHi[3], table02)
Label("skip_m02")
VPXOR(workRegLo[0], workRegLo[2], workRegLo[2])
VPXOR(workRegHi[0], workRegHi[2], workRegHi[2])
VPXOR(workRegLo[1], workRegLo[3], workRegLo[3])
VPXOR(workRegHi[1], workRegHi[3], workRegHi[3])
// Second layer:
// Test bit 1
BTQ(U8(1), mask)
JC(LabelRef("skip_m01"))
leoMulAdd256(ctx, workRegLo[0], workRegHi[0], workRegLo[1], workRegHi[1], table01)
Label("skip_m01")
VPXOR(workRegLo[0], workRegLo[1], workRegLo[1])
VPXOR(workRegHi[0], workRegHi[1], workRegHi[1])
// Store...
for i := range work[:2] {
VMOVDQU(workRegLo[i], Mem{Base: work[i], Disp: 0})
VMOVDQU(workRegHi[i], Mem{Base: work[i], Disp: 32})
ADDQ(U8(64), work[i])
}
// Test bit 2
BTQ(U8(2), mask)
JC(LabelRef("skip_m23"))
leoMulAdd256(ctx, workRegLo[2], workRegHi[2], workRegLo[3], workRegHi[3], table23)
Label("skip_m23")
VPXOR(workRegLo[2], workRegLo[3], workRegLo[3])
VPXOR(workRegHi[2], workRegHi[3], workRegHi[3])
// Store + Next loop:
for i := range work[2:] {
i := i + 2
VMOVDQU(workRegLo[i], Mem{Base: work[i], Disp: 0})
VMOVDQU(workRegHi[i], Mem{Base: work[i], Disp: 32})
ADDQ(U8(64), work[i])
}
SUBQ(U8(64), bytes)
JNZ(LabelRef("loop"))
VZEROUPPER()
RET()
}
}
// SSSE3:
ctx.avx512 = false
{
TEXT("ifftDIT2_ssse3", attr.NOSPLIT, fmt.Sprintf("func(x, y []byte, table *[8*16]uint8)"))
Pragma("noescape")
tablePtr := Load(Param("table"), GP64())
tables := [4]table128{}
for i, t := range tables {
// We almost have enough space for all tables.
if i > 2 {
t.Lo, t.Hi = Mem{Base: tablePtr, Disp: i * 16}, Mem{Base: tablePtr, Disp: i*16 + 16*4}
} else {
t.Lo, t.Hi = XMM(), XMM()
MOVUPS(Mem{Base: tablePtr, Disp: i * 16}, t.Lo)
MOVUPS(Mem{Base: tablePtr, Disp: i*16 + 16*4}, t.Hi)
}
tables[i] = t
}
// Generate mask
zero := XMM()
XORPS(zero, zero) // All-zero shuffle indices, so PSHUFB below broadcasts byte 0.
fifteen, mask := GP64(), XMM()
MOVQ(U32(0xf), fifteen)
MOVQ(fifteen, mask)
PSHUFB(zero, mask)
ctx.clrMask128 = mask
bytes := Load(Param("x").Len(), GP64())
x := Load(Param("x").Base(), GP64())
y := Load(Param("y").Base(), GP64())
Label("loop")
for i := 0; i < 2; i++ {
xLo, xHi, yLo, yHi := XMM(), XMM(), XMM(), XMM()
MOVUPS(Mem{Base: x, Disp: i*16 + 0}, xLo)
MOVUPS(Mem{Base: x, Disp: i*16 + 32}, xHi)
MOVUPS(Mem{Base: y, Disp: i*16 + 0}, yLo)
MOVUPS(Mem{Base: y, Disp: i*16 + 32}, yHi)
PXOR(xLo, yLo)
PXOR(xHi, yHi)
MOVUPS(yLo, Mem{Base: y, Disp: i*16 + 0})
MOVUPS(yHi, Mem{Base: y, Disp: i*16 + 32})
leoMulAdd128(ctx, xLo, xHi, yLo, yHi, tables)
MOVUPS(xLo, Mem{Base: x, Disp: i*16 + 0})
MOVUPS(xHi, Mem{Base: x, Disp: i*16 + 32})
}
ADDQ(U8(64), x)
ADDQ(U8(64), y)
SUBQ(U8(64), bytes)
JNZ(LabelRef("loop"))
RET()
}
{
TEXT("fftDIT2_ssse3", attr.NOSPLIT, fmt.Sprintf("func(x, y []byte, table *[8*16]uint8)"))
Pragma("noescape")
tablePtr := Load(Param("table"), GP64())
tables := [4]table128{}
for i, t := range tables {
// We almost have enough space for all tables.
if i > 2 {
t.Lo, t.Hi = Mem{Base: tablePtr, Disp: i * 16}, Mem{Base: tablePtr, Disp: i*16 + 16*4}
} else {
t.Lo, t.Hi = XMM(), XMM()
MOVUPS(Mem{Base: tablePtr, Disp: i * 16}, t.Lo)
MOVUPS(Mem{Base: tablePtr, Disp: i*16 + 16*4}, t.Hi)
}
tables[i] = t
}
// Generate mask
zero := XMM()
XORPS(zero, zero) // All-zero shuffle indices, so PSHUFB below broadcasts byte 0.
fifteen, mask := GP64(), XMM()
MOVQ(U32(0xf), fifteen)
MOVQ(fifteen, mask)
PSHUFB(zero, mask)
ctx.clrMask128 = mask
bytes := Load(Param("x").Len(), GP64())
x := Load(Param("x").Base(), GP64())
y := Load(Param("y").Base(), GP64())
Label("loop")
for i := 0; i < 2; i++ {
xLo, xHi, yLo, yHi := XMM(), XMM(), XMM(), XMM()
MOVUPS(Mem{Base: y, Disp: i*16 + 0}, yLo)
MOVUPS(Mem{Base: y, Disp: i*16 + 32}, yHi)
prodLo, prodHi := leoMul128(ctx, yLo, yHi, tables)
MOVUPS(Mem{Base: x, Disp: i*16 + 0}, xLo)
MOVUPS(Mem{Base: x, Disp: i*16 + 32}, xHi)
PXOR(prodLo, xLo)
PXOR(prodHi, xHi)
MOVUPS(xLo, Mem{Base: x, Disp: i*16 + 0})
MOVUPS(xHi, Mem{Base: x, Disp: i*16 + 32})
PXOR(xLo, yLo)
PXOR(xHi, yHi)
MOVUPS(yLo, Mem{Base: y, Disp: i*16 + 0})
MOVUPS(yHi, Mem{Base: y, Disp: i*16 + 32})
}
ADDQ(U8(64), x)
ADDQ(U8(64), y)
SUBQ(U8(64), bytes)
JNZ(LabelRef("loop"))
RET()
}
{
TEXT("mulgf16_ssse3", attr.NOSPLIT, fmt.Sprintf("func(x, y []byte, table *[8*16]uint8)"))
Pragma("noescape")
tablePtr := Load(Param("table"), GP64())
tables := [4]table128{}
for i, t := range tables {
// We have enough space for all tables.
if i > 3 {
t.Lo, t.Hi = Mem{Base: tablePtr, Disp: i * 16}, Mem{Base: tablePtr, Disp: i*16 + 16*4}
} else {
t.Lo, t.Hi = XMM(), XMM()
MOVUPS(Mem{Base: tablePtr, Disp: i * 16}, t.Lo)
MOVUPS(Mem{Base: tablePtr, Disp: i*16 + 16*4}, t.Hi)
}
tables[i] = t
}
bytes := Load(Param("x").Len(), GP64())
x := Load(Param("x").Base(), GP64())
y := Load(Param("y").Base(), GP64())
// Generate mask
zero := XMM()
XORPS(zero, zero) // All-zero shuffle indices, so PSHUFB below broadcasts byte 0.
fifteen, mask := GP64(), XMM()
MOVQ(U32(0xf), fifteen)
MOVQ(fifteen, mask)
PSHUFB(zero, mask)
ctx.clrMask128 = mask
Label("loop")
for i := 0; i < 2; i++ {
dataLo, dataHi := XMM(), XMM()
MOVUPS(Mem{Base: y, Disp: i*16 + 0}, dataLo)
MOVUPS(Mem{Base: y, Disp: i*16 + 32}, dataHi)
prodLo, prodHi := leoMul128(ctx, dataLo, dataHi, tables)
MOVUPS(prodLo, Mem{Base: x, Disp: i*16 + 0})
MOVUPS(prodHi, Mem{Base: x, Disp: i*16 + 32})
}
ADDQ(U8(64), x)
ADDQ(U8(64), y)
SUBQ(U8(64), bytes)
JNZ(LabelRef("loop"))
RET()
}
}
// xLo, xHi updated, yLo, yHi preserved...
func leoMulAdd256(ctx gf16ctx, xLo, xHi, yLo, yHi reg.VecVirtual, table [4]table256) {
// inlined:
// prodLo, prodHi := leoMul256(ctx, yLo, yHi, table)
lo := yLo
hi := yHi
data0, data1 := YMM(), YMM()
VPSRLQ(U8(4), lo, data1) // data1 = lo >> 4
VPAND(ctx.clrMask, lo, data0) // data0 = lo&0xf
VPAND(ctx.clrMask, data1, data1) // data1 = data1 & 0xf
prodLo, prodHi := YMM(), YMM()
table[0].prepare()
VPSHUFB(data0, table[0].Lo, prodLo)
VPSHUFB(data0, table[0].Hi, prodHi)
tmpLo, tmpHi := YMM(), YMM()
table[1].prepare()
VPSHUFB(data1, table[1].Lo, tmpLo)
VPSHUFB(data1, table[1].Hi, tmpHi)
VPXOR(prodLo, tmpLo, prodLo)
VPXOR(prodHi, tmpHi, prodHi)
// Now process high
data0, data1 = YMM(), YMM() // Realloc to break dep
VPAND(hi, ctx.clrMask, data0)
VPSRLQ(U8(4), hi, data1)
VPAND(ctx.clrMask, data1, data1)
tmpLo, tmpHi = YMM(), YMM() // Realloc to break dep
table[2].prepare()
VPSHUFB(data0, table[2].Lo, tmpLo)
VPSHUFB(data0, table[2].Hi, tmpHi)
VPXOR(prodLo, tmpLo, prodLo)
VPXOR(prodHi, tmpHi, prodHi)
table[3].prepare()
VPSHUFB(data1, table[3].Lo, tmpLo)
VPSHUFB(data1, table[3].Hi, tmpHi)
if ctx.avx512 {
VPTERNLOGD(U8(0x96), prodLo, tmpLo, xLo)
VPTERNLOGD(U8(0x96), prodHi, tmpHi, xHi)
} else {
VPXOR3way(prodLo, tmpLo, xLo)
VPXOR3way(prodHi, tmpHi, xHi)
}
}
// leoMul256 lo, hi preserved...
func leoMul256(ctx gf16ctx, lo, hi reg.VecVirtual, table [4]table256) (prodLo, prodHi reg.VecVirtual) {
data0, data1 := YMM(), YMM()
VPSRLQ(U8(4), lo, data1) // data1 = lo >> 4
VPAND(ctx.clrMask, lo, data0) // data0 = lo&0xf
VPAND(ctx.clrMask, data1, data1) // data1 = data1 & 0xf
prodLo, prodHi = YMM(), YMM()
table[0].prepare()
VPSHUFB(data0, table[0].Lo, prodLo)
VPSHUFB(data0, table[0].Hi, prodHi)
tmpLo, tmpHi := YMM(), YMM()
table[1].prepare()
VPSHUFB(data1, table[1].Lo, tmpLo)
VPSHUFB(data1, table[1].Hi, tmpHi)
VPXOR(prodLo, tmpLo, prodLo)
VPXOR(prodHi, tmpHi, prodHi)
// Now process high
data0, data1 = YMM(), YMM() // Realloc to break dep
VPAND(hi, ctx.clrMask, data0)
VPSRLQ(U8(4), hi, data1)
VPAND(ctx.clrMask, data1, data1)
tmpLo, tmpHi = YMM(), YMM() // Realloc to break dep
table[2].prepare()
VPSHUFB(data0, table[2].Lo, tmpLo)
VPSHUFB(data0, table[2].Hi, tmpHi)
VPXOR(prodLo, tmpLo, prodLo)
VPXOR(prodHi, tmpHi, prodHi)
table[3].prepare()
VPSHUFB(data1, table[3].Lo, tmpLo)
VPSHUFB(data1, table[3].Hi, tmpHi)
VPXOR(prodLo, tmpLo, prodLo)
VPXOR(prodHi, tmpHi, prodHi)
return
}
func leoMulAdd128(ctx gf16ctx, xLo, xHi, yLo, yHi reg.VecVirtual, table [4]table128) {
prodLo, prodHi := leoMul128(ctx, yLo, yHi, table)
PXOR(prodLo, xLo)
PXOR(prodHi, xHi)
}
// leoMul128 lo, hi preserved (but likely will take extra regs to reuse)
func leoMul128(ctx gf16ctx, lo, hi reg.VecVirtual, table [4]table128) (prodLo, prodHi reg.VecVirtual) {
data0, data1 := XMM(), XMM()
MOVAPS(lo, data1)
PSRLQ(U8(4), data1) // data1 = lo >> 4
MOVAPS(lo, data0)
PAND(ctx.clrMask128, data0) // data0 = lo&0xf
PAND(ctx.clrMask128, data1) // data 1 = data1 &0xf
prodLo, prodHi = XMM(), XMM()
MOVUPS(table[0].Lo, prodLo)
MOVUPS(table[0].Hi, prodHi)
PSHUFB(data0, prodLo)
PSHUFB(data0, prodHi)
tmpLo, tmpHi := XMM(), XMM()
MOVUPS(table[1].Lo, tmpLo)
MOVUPS(table[1].Hi, tmpHi)
PSHUFB(data1, tmpLo)
PSHUFB(data1, tmpHi)
PXOR(tmpLo, prodLo)
PXOR(tmpHi, prodHi)
// Now process high
data0, data1 = XMM(), XMM() // Realloc to break dep
MOVAPS(hi, data0)
MOVAPS(hi, data1)
PAND(ctx.clrMask128, data0)
PSRLQ(U8(4), data1)
PAND(ctx.clrMask128, data1)
tmpLo, tmpHi = XMM(), XMM() // Realloc to break dep
MOVUPS(table[2].Lo, tmpLo)
MOVUPS(table[2].Hi, tmpHi)
PSHUFB(data0, tmpLo)
PSHUFB(data0, tmpHi)
PXOR(tmpLo, prodLo)
PXOR(tmpHi, prodHi)
MOVUPS(table[3].Lo, tmpLo)
MOVUPS(table[3].Hi, tmpHi)
PSHUFB(data1, tmpLo)
PSHUFB(data1, tmpHi)
PXOR(tmpLo, prodLo)
PXOR(tmpHi, prodHi)
return
}
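The kernels above all emit the same nibble-split table lookup. As a rough scalar model of what they compute per 64-byte block (table layout as built by initMul16LUT later in this change: four 16-entry low-product tables followed by four 16-entry high-product tables; gf16MulAddScalar is an illustrative name, not part of the commit):

// gf16MulAddScalar computes x ^= y * m in GF(2^16), with each element stored
// split across a 64-byte block: low byte at offset i, high byte at offset i+32.
func gf16MulAddScalar(x, y []byte, tab *[8 * 16]byte) {
	for off := 0; off+64 <= len(x); off += 64 {
		for i := 0; i < 32; i++ {
			lo, hi := y[off+i], y[off+i+32]
			ln0, ln1 := int(lo&15), int(lo>>4)
			hn0, hn1 := int(hi&15), int(hi>>4)
			// One 16-entry table per input nibble, for each output byte.
			prodLo := tab[ln0] ^ tab[16+ln1] ^ tab[32+hn0] ^ tab[48+hn1]
			prodHi := tab[64+ln0] ^ tab[80+ln1] ^ tab[96+hn0] ^ tab[112+hn1]
			x[off+i] ^= prodLo
			x[off+i+32] ^= prodHi
		}
	}
}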


@@ -6,6 +6,8 @@
package reedsolomon
import "encoding/binary"
const (
// The number of elements in the field.
fieldSize = 256
@@ -929,3 +931,24 @@ func genAvx2Matrix(matrixRows [][]byte, inputs, inIdx, outputs int, dst []byte)
}
return dst
}
// xor slices writing to out.
func sliceXorGo(in, out []byte, _ *options) {
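// Process 32 bytes per iteration with four independent 64-bit XORs;
// the in[:32] reslice helps the compiler drop bounds checks in the body.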
for len(out) >= 32 {
inS := in[:32]
v0 := binary.LittleEndian.Uint64(out[:8]) ^ binary.LittleEndian.Uint64(inS[:8])
v1 := binary.LittleEndian.Uint64(out[8:16]) ^ binary.LittleEndian.Uint64(inS[8:16])
v2 := binary.LittleEndian.Uint64(out[16:24]) ^ binary.LittleEndian.Uint64(inS[16:24])
v3 := binary.LittleEndian.Uint64(out[24:32]) ^ binary.LittleEndian.Uint64(inS[24:32])
binary.LittleEndian.PutUint64(out[:8], v0)
binary.LittleEndian.PutUint64(out[8:16], v1)
binary.LittleEndian.PutUint64(out[16:24], v2)
binary.LittleEndian.PutUint64(out[24:32], v3)
out = out[32:]
in = in[32:]
}
out = out[:len(in)]
for n, input := range in {
out[n] ^= input
}
}


@@ -132,9 +132,121 @@ func sliceXor(in, out []byte, o *options) {
in = in[done:]
out = out[done:]
}
} else {
sliceXorGo(in, out, o)
return
}
out = out[:len(in)]
for i := range in {
out[i] ^= in[i]
}
}
// 4-way butterfly
func ifftDIT4(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe, o *options) {
if o.useAVX2 || o.useAVX512 {
if len(work[0]) > 0 {
var mask uint8
if log_m01 == modulus {
mask |= 1 << 0
}
if log_m23 == modulus {
mask |= 1 << 1
}
if log_m02 == modulus {
mask |= 1 << 2
}
t01 := &multiply256LUT[log_m01]
t23 := &multiply256LUT[log_m23]
t02 := &multiply256LUT[log_m02]
if o.useAVX512 {
ifftDIT4_avx512(work, dist*24, t01, t23, t02, mask)
} else {
ifftDIT4_avx2(work, dist*24, t01, t23, t02, mask)
}
}
return
}
ifftDIT4Ref(work, dist, log_m01, log_m23, log_m02, o)
}
func fftDIT4(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe, o *options) {
if o.useAVX2 || o.useAVX512 {
if len(work[0]) > 0 {
var mask uint8
if log_m02 == modulus {
mask |= 1 << 0
}
if log_m01 == modulus {
mask |= 1 << 1
}
if log_m23 == modulus {
mask |= 1 << 2
}
t01 := &multiply256LUT[log_m01]
t23 := &multiply256LUT[log_m23]
t02 := &multiply256LUT[log_m02]
if o.useAVX512 {
fftDIT4_avx512(work, dist*24, t01, t23, t02, mask)
} else {
fftDIT4_avx2(work, dist*24, t01, t23, t02, mask)
}
}
return
}
fftDIT4Ref(work, dist, log_m01, log_m23, log_m02, o)
}
// 2-way butterfly forward
func fftDIT2(x, y []byte, log_m ffe, o *options) {
if o.useAVX2 {
if len(x) > 0 {
tmp := &multiply256LUT[log_m]
fftDIT2_avx2(x, y, tmp)
}
} else if o.useSSSE3 {
if len(x) > 0 {
tmp := &multiply256LUT[log_m]
fftDIT2_ssse3(x, y, tmp)
}
} else {
// Reference version:
refMulAdd(x, y, log_m)
sliceXor(x, y, o)
}
}
// 2-way butterfly inverse
func ifftDIT2(x, y []byte, log_m ffe, o *options) {
if o.useAVX2 {
if len(x) > 0 {
tmp := &multiply256LUT[log_m]
ifftDIT2_avx2(x, y, tmp)
}
} else if o.useSSSE3 {
if len(x) > 0 {
tmp := &multiply256LUT[log_m]
ifftDIT2_ssse3(x, y, tmp)
}
} else {
// Reference version:
sliceXor(x, y, o)
refMulAdd(x, y, log_m)
}
}
func mulgf16(x, y []byte, log_m ffe, o *options) {
if o.useAVX2 {
if len(x) > 0 {
tmp := &multiply256LUT[log_m]
mulgf16_avx2(x, y, tmp)
}
} else if o.useSSSE3 {
if len(x) > 0 {
tmp := &multiply256LUT[log_m]
mulgf16_ssse3(x, y, tmp)
}
} else {
refMul(x, y, log_m)
}
}
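The dist*24 scaling above exists because the assembly walks the [][]byte backing array by raw slice headers, which are 24 bytes (pointer, len, cap) on 64-bit platforms. A quick check of that assumption:

package main

import (
	"fmt"
	"unsafe"
)

func main() {
	var s []byte
	// 8-byte pointer + 8-byte len + 8-byte cap = 24 on amd64,
	// hence the dist*24 passed to the fftDIT4/ifftDIT4 assembly above.
	fmt.Println(unsafe.Sizeof(s))
}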


@@ -64,3 +64,33 @@ func sliceXor(in, out []byte, o *options) {
}
}
}
// 4-way butterfly
func ifftDIT4(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe, o *options) {
ifftDIT4Ref(work, dist, log_m01, log_m23, log_m02, o)
}
// 4-way butterfly
func fftDIT4(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe, o *options) {
fftDIT4Ref(work, dist, log_m01, log_m23, log_m02, o)
}
// 2-way butterfly forward
func fftDIT2(x, y []byte, log_m ffe, o *options) {
// Reference version:
refMulAdd(x, y, log_m)
// 64 byte aligned, always full.
galXorNEON(x, y)
}
// 2-way butterfly inverse
func ifftDIT2(x, y []byte, log_m ffe, o *options) {
// 64 byte aligned, always full.
galXorNEON(x, y)
// Reference version:
refMulAdd(x, y, log_m)
}
func mulgf16(x, y []byte, log_m ffe, o *options) {
refMul(x, y, log_m)
}


@@ -1176,3 +1176,33 @@ func mulAvxTwo_10x10(matrix []byte, in [][]byte, out [][]byte, start int, n int)
// mulAvxTwo_10x10Xor takes 10 inputs and produces 10 outputs.
//go:noescape
func mulAvxTwo_10x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int)
//go:noescape
func ifftDIT2_avx2(x []byte, y []byte, table *[128]uint8)
//go:noescape
func fftDIT2_avx2(x []byte, y []byte, table *[128]uint8)
//go:noescape
func mulgf16_avx2(x []byte, y []byte, table *[128]uint8)
//go:noescape
func ifftDIT4_avx512(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8, logMask uint8)
//go:noescape
func fftDIT4_avx512(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8, logMask uint8)
//go:noescape
func ifftDIT4_avx2(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8, logMask uint8)
//go:noescape
func fftDIT4_avx2(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8, logMask uint8)
//go:noescape
func ifftDIT2_ssse3(x []byte, y []byte, table *[128]uint8)
//go:noescape
func fftDIT2_ssse3(x []byte, y []byte, table *[128]uint8)
//go:noescape
func mulgf16_ssse3(x []byte, y []byte, table *[128]uint8)

File diff suppressed because it is too large.


@@ -7,8 +7,6 @@
package reedsolomon
import "encoding/binary"
func galMulSlice(c byte, in, out []byte, o *options) {
out = out[:len(in)]
if c == 1 {
@@ -34,25 +32,38 @@ func galMulSliceXor(c byte, in, out []byte, o *options) {
}
// simple slice xor
func sliceXor(in, out []byte, _ *options) {
for len(out) >= 32 {
inS := in[:32]
v0 := binary.LittleEndian.Uint64(out[:]) ^ binary.LittleEndian.Uint64(inS[:])
v1 := binary.LittleEndian.Uint64(out[8:]) ^ binary.LittleEndian.Uint64(inS[8:])
v2 := binary.LittleEndian.Uint64(out[16:]) ^ binary.LittleEndian.Uint64(inS[16:])
v3 := binary.LittleEndian.Uint64(out[24:]) ^ binary.LittleEndian.Uint64(inS[24:])
binary.LittleEndian.PutUint64(out[:], v0)
binary.LittleEndian.PutUint64(out[8:], v1)
binary.LittleEndian.PutUint64(out[16:], v2)
binary.LittleEndian.PutUint64(out[24:], v3)
out = out[32:]
in = in[32:]
}
for n, input := range in {
out[n] ^= input
}
func sliceXor(in, out []byte, o *options) {
sliceXorGo(in, out, o)
}
func init() {
defaultOptions.useAVX512 = false
}
// 4-way butterfly
func ifftDIT4(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe, o *options) {
ifftDIT4Ref(work, dist, log_m01, log_m23, log_m02, o)
}
// 4-way butterfly
func fftDIT4(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe, o *options) {
fftDIT4Ref(work, dist, log_m01, log_m23, log_m02, o)
}
// 2-way butterfly forward
func fftDIT2(x, y []byte, log_m ffe, o *options) {
// Reference version:
refMulAdd(x, y, log_m)
sliceXorGo(x, y, o)
}
// 2-way butterfly inverse
func ifftDIT2(x, y []byte, log_m ffe, o *options) {
// Reference version:
sliceXorGo(x, y, o)
refMulAdd(x, y, log_m)
}
func mulgf16(x, y []byte, log_m ffe, o *options) {
refMul(x, y, log_m)
}


@@ -72,3 +72,31 @@ func sliceXor(in, out []byte, o *options) {
out[n] ^= input
}
}
// 4-way butterfly
func ifftDIT4(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe, o *options) {
ifftDIT4Ref(work, dist, log_m01, log_m23, log_m02, o)
}
// 4-way butterfly
func fftDIT4(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe, o *options) {
fftDIT4Ref(work, dist, log_m01, log_m23, log_m02, o)
}
// 2-way butterfly forward
func fftDIT2(x, y []byte, log_m ffe, o *options) {
// Reference version:
refMulAdd(x, y, log_m)
sliceXor(x, y, o)
}
// 2-way butterfly inverse
func ifftDIT2(x, y []byte, log_m ffe, o *options) {
// Reference version:
sliceXor(x, y, o)
refMulAdd(x, y, log_m)
}
func mulgf16(x, y []byte, log_m ffe, o *options) {
refMul(x, y, log_m)
}


@@ -17,6 +17,8 @@ import (
"math/bits"
"sync"
"unsafe"
"github.com/klauspost/cpuid/v2"
)
// reedSolomonFF16 is like reedSolomon but for more than 256 total shards.
@@ -25,6 +27,8 @@ type reedSolomonFF16 struct {
ParityShards int // Number of parity shards, should not be modified.
Shards int // Total number of shards. Calculated, and should not be modified.
workPool sync.Pool
o options
}
@@ -77,9 +81,15 @@ var (
var mul16LUTs *[order]mul16LUT
type mul16LUT struct {
LUT [4 * 16]ffe
// Contains Lo product as a single lookup.
// Should be XORed with Hi lookup for result.
Lo [256]ffe
Hi [256]ffe
}
// multiply256LUT stores the PSHUFB lookup tables used by the SSSE3/AVX2/AVX512 kernels.
var multiply256LUT *[order][8 * 16]byte
func (r *reedSolomonFF16) Encode(shards [][]byte) error {
if len(shards) != r.Shards {
return ErrTooFewShards
@@ -98,11 +108,23 @@ func (r *reedSolomonFF16) encode(shards [][]byte) error {
}
m := ceilPow2(r.ParityShards)
work := make([][]byte, m*2)
for i := range work {
work[i] = make([]byte, shardSize)
var work [][]byte
if w, ok := r.workPool.Get().([][]byte); ok {
work = w
}
if cap(work) >= m*2 {
work = work[:m*2]
} else {
work = make([][]byte, m*2)
}
for i := range work {
if cap(work[i]) < shardSize {
work[i] = make([]byte, shardSize)
} else {
work[i] = work[i][:shardSize]
}
}
defer r.workPool.Put(work)
mtrunc := m
if r.DataShards < mtrunc {
@@ -245,7 +267,7 @@ func (r *reedSolomonFF16) reconstruct(shards [][]byte, recoverAll bool) error {
return err
}
shardSize := len(shards[0])
shardSize := shardSize(shards)
if shardSize%64 != 0 {
return ErrShardSize
}
@@ -278,16 +300,29 @@ func (r *reedSolomonFF16) reconstruct(shards [][]byte, recoverAll bool) error {
fwht(errLocs[:], order, order)
work := make([][]byte, n)
for i := range work {
work[i] = make([]byte, shardSize)
var work [][]byte
if w, ok := r.workPool.Get().([][]byte); ok {
work = w
}
if cap(work) >= n {
work = work[:n]
} else {
work = make([][]byte, n)
}
for i := range work {
if cap(work[i]) < shardSize {
work[i] = make([]byte, shardSize)
} else {
work[i] = work[i][:shardSize]
}
}
defer r.workPool.Put(work)
// work <- recovery data
for i := 0; i < r.ParityShards; i++ {
if len(shards[i+r.DataShards]) != 0 {
mul(work[i], shards[i+r.DataShards], errLocs[i])
mulgf16(work[i], shards[i+r.DataShards], errLocs[i], &r.o)
} else {
memclr(work[i])
}
@@ -300,7 +335,7 @@ func (r *reedSolomonFF16) reconstruct(shards [][]byte, recoverAll bool) error {
for i := 0; i < r.DataShards; i++ {
if len(shards[i]) != 0 {
mul(work[m+i], shards[i], errLocs[m+i])
mulgf16(work[m+i], shards[i], errLocs[m+i], &r.o)
} else {
memclr(work[m+i])
}
@@ -353,13 +388,12 @@ func (r *reedSolomonFF16) reconstruct(shards [][]byte, recoverAll bool) error {
}
if i >= r.DataShards {
// Parity shard.
mul(shards[i], work[i-r.DataShards], modulus-errLocs[i-r.DataShards])
mulgf16(shards[i], work[i-r.DataShards], modulus-errLocs[i-r.DataShards], &r.o)
} else {
// Data shard.
mul(shards[i], work[i+m], modulus-errLocs[i+m])
mulgf16(shards[i], work[i+m], modulus-errLocs[i+m], &r.o)
}
}
return nil
}
@@ -453,7 +487,7 @@ func fftDIT(work [][]byte, mtrunc, m int, skewLUT []ffe, o *options) {
}
// 4-way butterfly
func fftDIT4(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe, o *options) {
func fftDIT4Ref(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe, o *options) {
// First layer:
if log_m02 == modulus {
sliceXor(work[0], work[dist*2], o)
@@ -477,13 +511,6 @@ func fftDIT4(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe, o *options)
}
}
// 2-way butterfly
func fftDIT2(x, y []byte, log_m ffe, o *options) {
// Reference version:
refMulAdd(x, y, log_m)
sliceXor(x, y, o)
}
// Unrolled IFFT for encoder
func ifftDITEncoder(data [][]byte, mtrunc int, work [][]byte, xorRes [][]byte, m int, skewLUT []ffe, o *options) {
// I tried rolling the memcpy/memset into the first layer of the FFT and
@@ -556,8 +583,7 @@ func ifftDITEncoder(data [][]byte, mtrunc int, work [][]byte, xorRes [][]byte, m
}
}
// 4-way butterfly
func ifftDIT4(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe, o *options) {
func ifftDIT4Ref(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe, o *options) {
// First layer:
if log_m01 == modulus {
sliceXor(work[0], work[dist], o)
@@ -581,31 +607,24 @@ func ifftDIT4(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe, o *options
}
}
// 2-way butterfly
func ifftDIT2(x, y []byte, log_m ffe, o *options) {
// Reference version:
sliceXor(x, y, o)
refMulAdd(x, y, log_m)
}
// Reference version of muladd: x[] ^= y[] * log_m
func refMulAdd(x, y []byte, log_m ffe) {
lut := mul16LUTs[log_m].LUT
lut := &mul16LUTs[log_m]
for off := 0; off < len(x); off += 64 {
for i := 0; i < 32; i++ {
lo := y[off+i]
hi := y[off+i+32]
for len(x) >= 64 {
// Assert sizes for no bounds checks in loop
hiA := y[32:64]
loA := y[:32]
dst := x[:64] // Needed, but not checked...
for i, lo := range loA {
hi := hiA[i]
prod := lut.Lo[lo] ^ lut.Hi[hi]
prod :=
lut[(lo&15)] ^
lut[(lo>>4)+16] ^
lut[(hi&15)+32] ^
lut[(hi>>4)+48]
x[off+i] ^= byte(prod)
x[off+i+32] ^= byte(prod >> 8)
dst[i] ^= byte(prod)
dst[i+32] ^= byte(prod >> 8)
}
x = x[64:]
y = y[64:]
}
}
@@ -622,24 +641,17 @@ func slicesXor(v1, v2 [][]byte, o *options) {
}
}
func mul(x, y []byte, log_m ffe) {
refMul(x, y, log_m)
}
// Reference version of mul: x[] = y[] * log_m
func refMul(x, y []byte, log_m ffe) {
lut := mul16LUTs[log_m].LUT
lut := &mul16LUTs[log_m]
for off := 0; off < len(x); off += 64 {
for i := 0; i < 32; i++ {
lo := y[off+i]
hi := y[off+i+32]
prod :=
lut[(lo&15)] ^
lut[(lo>>4)+16] ^
lut[(hi&15)+32] ^
lut[(hi>>4)+48]
loA := y[off : off+32]
hiA := y[off+32:]
hiA = hiA[:len(loA)]
for i, lo := range loA {
hi := hiA[i]
prod := lut.Lo[lo] ^ lut.Hi[hi]
x[off+i] = byte(prod)
x[off+i+32] = byte(prod >> 8)
@@ -843,10 +855,9 @@ func initMul16LUT() {
// For each log_m multiplicand:
for log_m := 0; log_m < order; log_m++ {
lut := &mul16LUTs[log_m]
var tmp [64]ffe
for nibble, shift := 0, 0; nibble < 4; {
nibble_lut := lut.LUT[nibble*16:]
nibble_lut := tmp[nibble*16:]
for xnibble := 0; xnibble < 16; xnibble++ {
prod := mulLog(ffe(xnibble<<shift), ffe(log_m))
@@ -855,5 +866,29 @@
nibble++
shift += 4
}
lut := &mul16LUTs[log_m]
for i := range lut.Lo[:] {
lut.Lo[i] = tmp[i&15] ^ tmp[((i>>4)+16)]
lut.Hi[i] = tmp[((i&15)+32)] ^ tmp[((i>>4)+48)]
}
}
if cpuid.CPU.Has(cpuid.SSSE3) || cpuid.CPU.Has(cpuid.AVX2) || cpuid.CPU.Has(cpuid.AVX512F) {
multiply256LUT = &[order][16 * 8]byte{}
for logM := range multiply256LUT[:] {
// For each 4-bit nibble of the field element:
shift := 0
for i := 0; i < 4; i++ {
// Construct 16 entry LUT for PSHUFB
prodLo := multiply256LUT[logM][i*16 : i*16+16]
prodHi := multiply256LUT[logM][4*16+i*16 : 4*16+i*16+16]
for x := range prodLo[:] {
prod := mulLog(ffe(x<<shift), ffe(logM))
prodLo[x] = byte(prod)
prodHi[x] = byte(prod >> 8)
}
shift += 4
}
}
}
}
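The combined Lo/Hi tables built above fold the four nibble lookups into the two byte-indexed lookups used by the reworked refMulAdd and refMul. A sketch of the identity they rely on, written against the package's own types (checkCombinedLUT is hypothetical, not part of the commit; tmp is the 64-entry nibble table from the loop above):

func checkCombinedLUT(tmp *[64]ffe, lut *mul16LUT) bool {
	for lo := 0; lo < 256; lo++ {
		for hi := 0; hi < 256; hi++ {
			// Original four-nibble lookup...
			want := tmp[lo&15] ^ tmp[(lo>>4)+16] ^ tmp[(hi&15)+32] ^ tmp[(hi>>4)+48]
			// ...must equal the combined two-load form.
			if lut.Lo[lo]^lut.Hi[hi] != want {
				return false
			}
		}
	}
	return true
}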


@@ -14,6 +14,7 @@ import (
"math/rand"
"os"
"runtime"
"strconv"
"testing"
)
@@ -192,7 +193,7 @@ func TestEncoding(t *testing.T) {
var testSizes = [][2]int{
{1, 0}, {3, 0}, {5, 0}, {8, 0}, {10, 0}, {12, 0}, {14, 0}, {41, 0}, {49, 0},
{1, 1}, {1, 2}, {3, 3}, {3, 1}, {5, 3}, {8, 4}, {10, 30}, {12, 10}, {14, 7}, {41, 17}, {49, 1}, {5, 20},
{256, 1},
{256, 20}, {500, 300}, {2945, 129},
}
var testDataSizes = []int{10, 100, 1000, 10001, 100003, 1000055}
var testDataSizesShort = []int{10, 10001, 100003}
@@ -208,6 +209,9 @@ func testEncoding(t *testing.T, o ...Option) {
}
for _, perShard := range sz {
if data+parity > 256 {
if perShard > 1000 {
t.Skip("long tests not needed. Not length sensitive")
}
// Round up to 64 bytes.
perShard = (perShard + 63) &^ 63
}
@@ -1004,6 +1008,22 @@ func BenchmarkEncode2x1x1M(b *testing.B) {
benchmarkEncode(b, 2, 1, 1024*1024)
}
// Benchmark 800 data slices with 200 parity slices
func BenchmarkEncode800x200(b *testing.B) {
for size := 64; size <= 1<<20; size *= 4 {
b.Run(fmt.Sprintf("%v", size), func(b *testing.B) {
benchmarkEncode(b, 800, 200, size)
})
}
}
func BenchmarkEncodeLeopard(b *testing.B) {
size := (64 << 20) / 800 / 64 * 64
b.Run(strconv.Itoa(size), func(b *testing.B) {
benchmarkEncode(b, 800, 200, size)
})
}
func BenchmarkEncode10x2x10000(b *testing.B) {
benchmarkEncode(b, 10, 2, 10000)
}
@@ -1097,6 +1117,15 @@ func benchmarkVerify(b *testing.B, dataShards, parityShards, shardSize int) {
}
}
// Benchmark 800 data slices with 200 parity slices
func BenchmarkVerify800x200(b *testing.B) {
for size := 64; size <= 1<<20; size *= 4 {
b.Run(fmt.Sprintf("%v", size), func(b *testing.B) {
benchmarkVerify(b, 800, 200, size)
})
}
}
// Benchmark 10 data slices with 2 parity slices holding 10000 bytes each
func BenchmarkVerify10x2x10000(b *testing.B) {
benchmarkVerify(b, 10, 2, 10000)
@@ -1177,6 +1206,15 @@ func BenchmarkReconstruct10x2x10000(b *testing.B) {
benchmarkReconstruct(b, 10, 2, 10000)
}
// Benchmark 800 data slices with 200 parity slices
func BenchmarkReconstruct800x200(b *testing.B) {
for size := 64; size <= 1<<20; size *= 4 {
b.Run(fmt.Sprintf("%v", size), func(b *testing.B) {
benchmarkReconstruct(b, 800, 200, size)
})
}
}
// Benchmark 50 data slices with 5 parity slices holding 100000 bytes each
func BenchmarkReconstruct50x5x50000(b *testing.B) {
benchmarkReconstruct(b, 50, 5, 100000)
@@ -1252,6 +1290,15 @@ func BenchmarkReconstructData10x2x10000(b *testing.B) {
benchmarkReconstructData(b, 10, 2, 10000)
}
// Benchmark 800 data slices with 200 parity slices
func BenchmarkReconstructData800x200(b *testing.B) {
for size := 64; size <= 1<<20; size *= 4 {
b.Run(fmt.Sprintf("%v", size), func(b *testing.B) {
benchmarkReconstructData(b, 800, 200, size)
})
}
}
// Benchmark 50 data slices with 5 parity slices holding 100000 bytes each
func BenchmarkReconstructData50x5x50000(b *testing.B) {
benchmarkReconstructData(b, 50, 5, 100000)
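The new size-varied benchmarks run with the standard Go tooling, for example:

go test -run '^$' -bench 'Encode800x200|EncodeLeopard' .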