Improve fwht speed (#198)

Improve fwht speed

* Pass `*[65536]ffe` instead of a slice, so lookups with 16-bit indices can be done without bounds checks.
* Unroll fwht4
* Move `s2` out of loop.
* Load values instead of modifying pointers.

Before:
```
BenchmarkDecode1K/16x16-32                  1029           1175899 ns/op          27.87 MB/s       16410 B/op         17 allocs/op
BenchmarkDecode1K/32x32-32                  1023           1184744 ns/op          55.32 MB/s       32794 B/op         33 allocs/op
BenchmarkDecode1K/64x64-32                   979           1240467 ns/op         105.66 MB/s       65701 B/op         65 allocs/op
BenchmarkDecode1K/128x128-32                 922           1314928 ns/op         199.36 MB/s      131703 B/op        129 allocs/op
BenchmarkDecode1K/256x256-32                 792           1530508 ns/op         342.56 MB/s      263548 B/op        258 allocs/op
```

After:
```
BenchmarkDecode1K/16x16-32                  1503            798172 ns/op          41.05 MB/s       16408 B/op         17 allocs/op
BenchmarkDecode1K/32x32-32                  1483            804507 ns/op          81.46 MB/s       32792 B/op         33 allocs/op
BenchmarkDecode1K/64x64-32                  1408            852737 ns/op         153.71 MB/s       65658 B/op         65 allocs/op
BenchmarkDecode1K/128x128-32                1315            917534 ns/op         285.70 MB/s      131513 B/op        129 allocs/op
BenchmarkDecode1K/256x256-32                1069           1115760 ns/op         469.89 MB/s      263689 B/op        258 allocs/op
```
master
Klaus Post 2022-07-29 07:26:51 -07:00 committed by GitHub
parent 9c824807d6
commit 7b7dbe6919
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed files with 33 additions and 7 deletions

View File

@ -390,13 +390,13 @@ func (r *reedSolomonFF16) reconstruct(shards [][]byte, recoverAll bool) error {
}
// Evaluate error locator polynomial
fwht(errLocs[:], order, m+r.DataShards)
fwht(&errLocs, order, m+r.DataShards)
for i := 0; i < order; i++ {
errLocs[i] = ffe((uint(errLocs[i]) * uint(logWalsh[i])) % modulus)
}
fwht(errLocs[:], order, order)
fwht(&errLocs, order, order)
var work [][]byte
if w, ok := r.workPool.Get().([][]byte); ok {
@ -798,7 +798,7 @@ func ceilPow2(n int) int {
// Decimation in time (DIT) Fast Walsh-Hadamard Transform
// Unrolls pairs of layers to perform cross-layer operations in registers
// mtrunc: Number of elements that are non-zero at the front of data
func fwht(data []ffe, m, mtrunc int) {
func fwht(data *[order]ffe, m, mtrunc int) {
// Decimation in time: Unroll 2 layers at a time
dist := 1
dist4 := 4
@ -806,8 +806,28 @@ func fwht(data []ffe, m, mtrunc int) {
// For each set of dist*4 elements:
for r := 0; r < mtrunc; r += dist4 {
// For each set of dist elements:
for i := r; i < r+dist; i++ {
fwht4(data[i:], dist)
// Use 16 bit indices to avoid bounds check on [65536]ffe.
dist := uint16(dist)
off := uint16(r)
for i := uint16(0); i < dist; i++ {
// fwht4(data[i:], dist) inlined...
// Reading values appears faster than updating pointers.
// Casting to uint is not faster.
t0 := data[off]
t1 := data[off+dist]
t2 := data[off+dist*2]
t3 := data[off+dist*3]
t0, t1 = fwht2alt(t0, t1)
t2, t3 = fwht2alt(t2, t3)
t0, t2 = fwht2alt(t0, t2)
t1, t3 = fwht2alt(t1, t3)
data[off] = t0
data[off+dist] = t1
data[off+dist*2] = t2
data[off+dist*3] = t3
off++
}
}
dist = dist4
@ -816,7 +836,8 @@ func fwht(data []ffe, m, mtrunc int) {
// If there is one layer left:
if dist < m {
for i := 0; i < dist; i++ {
dist := uint16(dist)
for i := uint16(0); i < dist; i++ {
fwht2(&data[i], &data[i+dist])
}
}
@ -844,6 +865,11 @@ func fwht2(a, b *ffe) {
*b = dif
}
// fwht2alt is the by-value counterpart of fwht2: rather than writing
// through pointers, it hands back the pair (addMod(a, b), subMod(a, b)).
func fwht2alt(a, b ffe) (ffe, ffe) {
	sum := addMod(a, b)
	dif := subMod(a, b)
	return sum, dif
}
var initOnce sync.Once
func initConstants() {
@ -945,7 +971,7 @@ func initFFTSkew() {
}
logWalsh[0] = 0
fwht(logWalsh[:], order, order)
fwht(logWalsh, order, order)
}
func initMul16LUT() {