Improve fwht speed (#198)
Improve fwht speed

* Send `*[65536]ffe` instead of a slice, so 16-bit lookups can be done without bounds checks.
* Unroll fwht4.
* Move `s2` out of the loop.
* Load values instead of modifying pointers.

Before:
```
BenchmarkDecode1K/16x16-32      1029   1175899 ns/op    27.87 MB/s    16410 B/op    17 allocs/op
BenchmarkDecode1K/32x32-32      1023   1184744 ns/op    55.32 MB/s    32794 B/op    33 allocs/op
BenchmarkDecode1K/64x64-32       979   1240467 ns/op   105.66 MB/s    65701 B/op    65 allocs/op
BenchmarkDecode1K/128x128-32     922   1314928 ns/op   199.36 MB/s   131703 B/op   129 allocs/op
BenchmarkDecode1K/256x256-32     792   1530508 ns/op   342.56 MB/s   263548 B/op   258 allocs/op
```

After:
```
BenchmarkDecode1K/16x16-32      1503    798172 ns/op    41.05 MB/s    16408 B/op    17 allocs/op
BenchmarkDecode1K/32x32-32      1483    804507 ns/op    81.46 MB/s    32792 B/op    33 allocs/op
BenchmarkDecode1K/64x64-32      1408    852737 ns/op   153.71 MB/s    65658 B/op    65 allocs/op
BenchmarkDecode1K/128x128-32    1315    917534 ns/op   285.70 MB/s   131513 B/op   129 allocs/op
BenchmarkDecode1K/256x256-32    1069   1115760 ns/op   469.89 MB/s   263689 B/op   258 allocs/op
```

master
parent
9c824807d6
commit
7b7dbe6919
40
leopard.go
40
leopard.go
|
@ -390,13 +390,13 @@ func (r *reedSolomonFF16) reconstruct(shards [][]byte, recoverAll bool) error {
|
|||
}
|
||||
// Evaluate error locator polynomial
|
||||
|
||||
fwht(errLocs[:], order, m+r.DataShards)
|
||||
fwht(&errLocs, order, m+r.DataShards)
|
||||
|
||||
for i := 0; i < order; i++ {
|
||||
errLocs[i] = ffe((uint(errLocs[i]) * uint(logWalsh[i])) % modulus)
|
||||
}
|
||||
|
||||
fwht(errLocs[:], order, order)
|
||||
fwht(&errLocs, order, order)
|
||||
|
||||
var work [][]byte
|
||||
if w, ok := r.workPool.Get().([][]byte); ok {
|
||||
|
@ -798,7 +798,7 @@ func ceilPow2(n int) int {
|
|||
// Decimation in time (DIT) Fast Walsh-Hadamard Transform
|
||||
// Unrolls pairs of layers to perform cross-layer operations in registers
|
||||
// mtrunc: Number of elements that are non-zero at the front of data
|
||||
func fwht(data []ffe, m, mtrunc int) {
|
||||
func fwht(data *[order]ffe, m, mtrunc int) {
|
||||
// Decimation in time: Unroll 2 layers at a time
|
||||
dist := 1
|
||||
dist4 := 4
|
||||
|
@ -806,8 +806,28 @@ func fwht(data []ffe, m, mtrunc int) {
|
|||
// For each set of dist*4 elements:
|
||||
for r := 0; r < mtrunc; r += dist4 {
|
||||
// For each set of dist elements:
|
||||
for i := r; i < r+dist; i++ {
|
||||
fwht4(data[i:], dist)
|
||||
// Use 16 bit indices to avoid bounds check on [65536]ffe.
|
||||
dist := uint16(dist)
|
||||
off := uint16(r)
|
||||
for i := uint16(0); i < dist; i++ {
|
||||
// fwht4(data[i:], dist) inlined...
|
||||
// Reading values appears faster than updating pointers.
|
||||
// Casting to uint is not faster.
|
||||
t0 := data[off]
|
||||
t1 := data[off+dist]
|
||||
t2 := data[off+dist*2]
|
||||
t3 := data[off+dist*3]
|
||||
|
||||
t0, t1 = fwht2alt(t0, t1)
|
||||
t2, t3 = fwht2alt(t2, t3)
|
||||
t0, t2 = fwht2alt(t0, t2)
|
||||
t1, t3 = fwht2alt(t1, t3)
|
||||
|
||||
data[off] = t0
|
||||
data[off+dist] = t1
|
||||
data[off+dist*2] = t2
|
||||
data[off+dist*3] = t3
|
||||
off++
|
||||
}
|
||||
}
|
||||
dist = dist4
|
||||
|
@ -816,7 +836,8 @@ func fwht(data []ffe, m, mtrunc int) {
|
|||
|
||||
// If there is one layer left:
|
||||
if dist < m {
|
||||
for i := 0; i < dist; i++ {
|
||||
dist := uint16(dist)
|
||||
for i := uint16(0); i < dist; i++ {
|
||||
fwht2(&data[i], &data[i+dist])
|
||||
}
|
||||
}
|
||||
|
@ -844,6 +865,11 @@ func fwht2(a, b *ffe) {
|
|||
*b = dif
|
||||
}
|
||||
|
||||
// fwht2alt is like fwht2, but takes values and returns the results instead of writing through pointers.
|
||||
func fwht2alt(a, b ffe) (ffe, ffe) {
|
||||
return addMod(a, b), subMod(a, b)
|
||||
}
|
||||
|
||||
var initOnce sync.Once
|
||||
|
||||
func initConstants() {
|
||||
|
@ -945,7 +971,7 @@ func initFFTSkew() {
|
|||
}
|
||||
logWalsh[0] = 0
|
||||
|
||||
fwht(logWalsh[:], order, order)
|
||||
fwht(logWalsh, order, order)
|
||||
}
|
||||
|
||||
func initMul16LUT() {
|
||||
|
|
Loading…
Reference in New Issue