Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

use simd masking for amd64&arm64#326

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub?Sign in to your account

Merged
nhooyr merged 26 commits into coder:dev from wdvxdr1123:patch-simd-mask
Feb 22, 2024
Merged
Show file tree
Hide file tree
Changes from1 commit
Commits
Show all changes
26 commits
Select commitHold shift + click to select a range
5df0303
mask.go: Use SIMD masking for amd64 and arm64
wdvxdr1123Jan 24, 2022
cda2170
Refactor and compile masking code again
nhooyrOct 19, 2023
f5397ae
mask_asm.go: Disable AVX2
nhooyrOct 19, 2023
14172e5
Benchmark pure go masking algorithm separately from assembly
nhooyrOct 19, 2023
685a56e
Update README.md to indicate assembly websocket masking
nhooyrOct 19, 2023
cb7509a
mask_amd64.s: Remove AVX2 fully
nhooyrOct 19, 2023
3f8c9e0
mask_amd64.s: Minor improvements
nhooyrOct 19, 2023
367743d
mask_amd64.sh: Cleanup
nhooyrOct 19, 2023
27f80cb
mask.go: Cleanup assembly and add nbio benchmark
nhooyrOct 19, 2023
369d641
mask_arm64.s: Cleanup
nhooyrOct 20, 2023
fb13df2
ci/bench.sh: Benchmark masking on arm64 with QEMU
nhooyrOct 20, 2023
ecf7dec
ci/bench.sh: Install QEMU on CI
nhooyrOct 20, 2023
d34e5d4
wsjson: Add json.Encoder vs json.Marshal benchmark
nhooyrOct 20, 2023
e25d968
ci/bench.sh: Don't profile by default
nhooyrOct 20, 2023
640e3c2
ci/bench.sh: Try function instead of alias
nhooyrOct 20, 2023
0596e7a
wsjson: Extend benchmark with multiple sizes
nhooyrOct 20, 2023
30447a3
ci/bench.sh: Just symlink the expected qemu-aarch64 binary name
nhooyrOct 20, 2023
f4e61e5
ci/fmt.sh: Error if changes on CI
nhooyrOct 21, 2023
f533f43
mask.go: Reorganize
nhooyrOct 21, 2023
a1bb441
ci: Fix dev coverage output
nhooyrFeb 7, 2024
fee3739
mask_asm: Note implementation may not be perfect
nhooyrFeb 7, 2024
68fc887
mask.go: Revert my changes
nhooyrFeb 22, 2024
f62cef3
test.sh: Test assembly masking on arm64
nhooyrFeb 22, 2024
92acb74
internal/xcpu: Vendor golang.org/x/sys/cpu
nhooyrFeb 22, 2024
17e1b86
mask_asm: Disable AVX2
nhooyrFeb 22, 2024
2cd18b3
README.md: Link to assembly benchmark results
nhooyrFeb 22, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
PrevPrevious commit
NextNext commit
mask.go: Reorganize
  • Loading branch information
@nhooyr
nhooyr committedFeb 22, 2024
commitf533f430c7d63e9e0bceb2dcbbd5d75602803b82
125 changes: 0 additions & 125 deletionsframe.go
View file
Open in desktop
Original file line numberDiff line numberDiff line change
Expand Up@@ -8,7 +8,6 @@ import (
"fmt"
"io"
"math"
"math/bits"

"nhooyr.io/websocket/internal/errd"
)
Expand DownExpand Up@@ -172,127 +171,3 @@ func writeFrameHeader(h header, w *bufio.Writer, buf []byte) (err error) {

return nil
}

// maskGo XORs b in place with the WebSocket masking key, as described in
// https://tools.ietf.org/html/rfc6455#section-5.3, and returns the key
// rotated so that a following call can continue masking or unmasking the
// same message where this one stopped.
//
// The key is interpreted in little-endian byte order: its lowest byte is
// applied to b[0]. The code is tuned for little-endian machines.
//
// See https://github.com/golang/go/issues/31586
//
//lint:ignore U1000 mask.go
func maskGo(b []byte, key uint32) uint32 {
	le := binary.LittleEndian

	if len(b) >= 8 {
		// Repeat the 32-bit key in both halves of a 64-bit word so that
		// eight bytes are masked per XOR.
		wide := uint64(key) | uint64(key)<<32

		// The strides below are unrolled by hand; see
		// https://github.com/golang/go/issues/31586#issuecomment-487436401

		// 128 bytes at a time for as long as that many remain.
		for ; len(b) >= 128; b = b[128:] {
			le.PutUint64(b[:8], le.Uint64(b[:8])^wide)
			le.PutUint64(b[8:16], le.Uint64(b[8:16])^wide)
			le.PutUint64(b[16:24], le.Uint64(b[16:24])^wide)
			le.PutUint64(b[24:32], le.Uint64(b[24:32])^wide)
			le.PutUint64(b[32:40], le.Uint64(b[32:40])^wide)
			le.PutUint64(b[40:48], le.Uint64(b[40:48])^wide)
			le.PutUint64(b[48:56], le.Uint64(b[48:56])^wide)
			le.PutUint64(b[56:64], le.Uint64(b[56:64])^wide)
			le.PutUint64(b[64:72], le.Uint64(b[64:72])^wide)
			le.PutUint64(b[72:80], le.Uint64(b[72:80])^wide)
			le.PutUint64(b[80:88], le.Uint64(b[80:88])^wide)
			le.PutUint64(b[88:96], le.Uint64(b[88:96])^wide)
			le.PutUint64(b[96:104], le.Uint64(b[96:104])^wide)
			le.PutUint64(b[104:112], le.Uint64(b[104:112])^wide)
			le.PutUint64(b[112:120], le.Uint64(b[112:120])^wide)
			le.PutUint64(b[120:128], le.Uint64(b[120:128])^wide)
		}

		// Fewer than 128 bytes remain from here on, so every smaller
		// stride can apply at most once.

		// One 64-byte stride.
		if len(b) >= 64 {
			le.PutUint64(b[:8], le.Uint64(b[:8])^wide)
			le.PutUint64(b[8:16], le.Uint64(b[8:16])^wide)
			le.PutUint64(b[16:24], le.Uint64(b[16:24])^wide)
			le.PutUint64(b[24:32], le.Uint64(b[24:32])^wide)
			le.PutUint64(b[32:40], le.Uint64(b[32:40])^wide)
			le.PutUint64(b[40:48], le.Uint64(b[40:48])^wide)
			le.PutUint64(b[48:56], le.Uint64(b[48:56])^wide)
			le.PutUint64(b[56:64], le.Uint64(b[56:64])^wide)
			b = b[64:]
		}

		// One 32-byte stride.
		if len(b) >= 32 {
			le.PutUint64(b[:8], le.Uint64(b[:8])^wide)
			le.PutUint64(b[8:16], le.Uint64(b[8:16])^wide)
			le.PutUint64(b[16:24], le.Uint64(b[16:24])^wide)
			le.PutUint64(b[24:32], le.Uint64(b[24:32])^wide)
			b = b[32:]
		}

		// One 16-byte stride.
		if len(b) >= 16 {
			le.PutUint64(b[:8], le.Uint64(b[:8])^wide)
			le.PutUint64(b[8:16], le.Uint64(b[8:16])^wide)
			b = b[16:]
		}

		// One 8-byte word.
		if len(b) >= 8 {
			le.PutUint64(b[:8], le.Uint64(b[:8])^wide)
			b = b[8:]
		}
	}

	// At most seven bytes are left here, so a single 32-bit word suffices.
	if len(b) >= 4 {
		le.PutUint32(b[:4], le.Uint32(b[:4])^key)
		b = b[4:]
	}

	// Mask the final zero to three bytes one at a time, rotating the key
	// so that its low byte is always the next one to apply.
	for i := 0; i < len(b); i++ {
		b[i] ^= byte(key)
		key = bits.RotateLeft32(key, -8)
	}

	return key
}
131 changes: 127 additions & 4 deletionsmask.go
View file
Open in desktop
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,130 @@
//go:build !amd64 && !arm64 && !js

package websocket

func mask(b []byte, key uint32) uint32 {
return maskGo(b, key)
import (
"encoding/binary"
"math/bits"
)

// maskGo XORs b in place with the WebSocket masking key, as described in
// https://tools.ietf.org/html/rfc6455#section-5.3, and returns the key
// rotated so that a following call can continue masking or unmasking the
// same message where this one stopped.
//
// The key is interpreted in little-endian byte order: its lowest byte is
// applied to b[0]. The code is tuned for little-endian machines.
//
// See https://github.com/golang/go/issues/31586
//
//lint:ignore U1000 mask.go
func maskGo(b []byte, key uint32) uint32 {
	le := binary.LittleEndian

	if len(b) >= 8 {
		// Repeat the 32-bit key in both halves of a 64-bit word so that
		// eight bytes are masked per XOR.
		wide := uint64(key) | uint64(key)<<32

		// The strides below are unrolled by hand; see
		// https://github.com/golang/go/issues/31586#issuecomment-487436401

		// 128 bytes at a time for as long as that many remain.
		for ; len(b) >= 128; b = b[128:] {
			le.PutUint64(b[:8], le.Uint64(b[:8])^wide)
			le.PutUint64(b[8:16], le.Uint64(b[8:16])^wide)
			le.PutUint64(b[16:24], le.Uint64(b[16:24])^wide)
			le.PutUint64(b[24:32], le.Uint64(b[24:32])^wide)
			le.PutUint64(b[32:40], le.Uint64(b[32:40])^wide)
			le.PutUint64(b[40:48], le.Uint64(b[40:48])^wide)
			le.PutUint64(b[48:56], le.Uint64(b[48:56])^wide)
			le.PutUint64(b[56:64], le.Uint64(b[56:64])^wide)
			le.PutUint64(b[64:72], le.Uint64(b[64:72])^wide)
			le.PutUint64(b[72:80], le.Uint64(b[72:80])^wide)
			le.PutUint64(b[80:88], le.Uint64(b[80:88])^wide)
			le.PutUint64(b[88:96], le.Uint64(b[88:96])^wide)
			le.PutUint64(b[96:104], le.Uint64(b[96:104])^wide)
			le.PutUint64(b[104:112], le.Uint64(b[104:112])^wide)
			le.PutUint64(b[112:120], le.Uint64(b[112:120])^wide)
			le.PutUint64(b[120:128], le.Uint64(b[120:128])^wide)
		}

		// Fewer than 128 bytes remain from here on, so every smaller
		// stride can apply at most once.

		// One 64-byte stride.
		if len(b) >= 64 {
			le.PutUint64(b[:8], le.Uint64(b[:8])^wide)
			le.PutUint64(b[8:16], le.Uint64(b[8:16])^wide)
			le.PutUint64(b[16:24], le.Uint64(b[16:24])^wide)
			le.PutUint64(b[24:32], le.Uint64(b[24:32])^wide)
			le.PutUint64(b[32:40], le.Uint64(b[32:40])^wide)
			le.PutUint64(b[40:48], le.Uint64(b[40:48])^wide)
			le.PutUint64(b[48:56], le.Uint64(b[48:56])^wide)
			le.PutUint64(b[56:64], le.Uint64(b[56:64])^wide)
			b = b[64:]
		}

		// One 32-byte stride.
		if len(b) >= 32 {
			le.PutUint64(b[:8], le.Uint64(b[:8])^wide)
			le.PutUint64(b[8:16], le.Uint64(b[8:16])^wide)
			le.PutUint64(b[16:24], le.Uint64(b[16:24])^wide)
			le.PutUint64(b[24:32], le.Uint64(b[24:32])^wide)
			b = b[32:]
		}

		// One 16-byte stride.
		if len(b) >= 16 {
			le.PutUint64(b[:8], le.Uint64(b[:8])^wide)
			le.PutUint64(b[8:16], le.Uint64(b[8:16])^wide)
			b = b[16:]
		}

		// One 8-byte word.
		if len(b) >= 8 {
			le.PutUint64(b[:8], le.Uint64(b[:8])^wide)
			b = b[8:]
		}
	}

	// At most seven bytes are left here, so a single 32-bit word suffices.
	if len(b) >= 4 {
		le.PutUint32(b[:4], le.Uint32(b[:4])^key)
		b = b[4:]
	}

	// Mask the final zero to three bytes one at a time, rotating the key
	// so that its low byte is always the next one to apply.
	for i := 0; i < len(b); i++ {
		b[i] ^= byte(key)
		key = bits.RotateLeft32(key, -8)
	}

	return key
}
36 changes: 30 additions & 6 deletionsmask_amd64.s
View file
Open in desktop
Original file line numberDiff line numberDiff line change
Expand Up@@ -19,11 +19,16 @@ TEXT ·maskAsm(SB), NOSPLIT, $0-28

CMPQ CX, $8
JL less_than_8
CMPQ CX, $512
CMPQ CX, $128
JLE sse
TESTQ $31, AX
JNZ unaligned

aligned:
CMPB ·useAVX2(SB), $1
JE avx2
JMP sse

unaligned_loop_1byte:
XORB SI, (AX)
INCQ AX
Expand All@@ -40,7 +45,7 @@ unaligned_loop_1byte:
ORQ DX, DI

TESTQ $31, AX
JZsse
JZaligned

unaligned:
// $7 & len, if not zero jump to loop_1b.
Expand All@@ -54,17 +59,36 @@ unaligned_loop:
SUBQ $8, CX
TESTQ $31, AX
JNZ unaligned_loop
JMP sse

JMP aligned

avx2:
CMPQ CX, $128
JL sse
VMOVQ DI, X0
VPBROADCASTQ X0, Y0

// TODO: shouldn't these be aligned movs now?
// TODO: should be 256?
avx2_loop:
VMOVDQU (AX), Y1
VPXOR Y0, Y1, Y2
VMOVDQU Y2, (AX)
ADDQ $128, AX
SUBQ $128, CX
CMPQ CX, $128
// Loop if CX >= 128.
JAE avx2_loop

// TODO: should be 128?
sse:
CMPQ CX, $64
JL less_than_64
MOVQ DI, X0
PUNPCKLQDQ X0, X0

sse_loop:
MOVOU0*16(AX), X1
MOVOU1*16(AX), X2
MOVOU (AX), X1
MOVOU 16(AX), X2
MOVOU 2*16(AX), X3
MOVOU 3*16(AX), X4
PXOR X0, X1
Expand Down
2 changes: 2 additions & 0 deletionsmask_asm.go
View file
Open in desktop
Original file line numberDiff line numberDiff line change
Expand Up@@ -9,5 +9,7 @@ func mask(b []byte, key uint32) uint32 {
return key
}

// useAVX2 is read by the assembly in mask_amd64.s, which compares it
// against 1 and jumps to its avx2 label when they are equal.
var useAVX2 = true

// maskAsm has no Go body; mask_amd64.s defines the ·maskAsm symbol.
// NOTE(review): presumably b points at the first of len bytes to mask in
// place and the result is the rotated key, mirroring maskGo — confirm
// against the assembly.
//
//go:noescape
func maskAsm(b *byte, len int, key uint32) uint32
7 changes: 7 additions & 0 deletionsmask_go.go
View file
Open in desktop
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
//go:build !amd64 && !arm64 && !js

package websocket

// mask forwards to maskGo, the pure Go masking implementation, and returns
// whatever key maskGo reports.
func mask(b []byte, key uint32) uint32 {
	next := maskGo(b, key)
	return next
}

[8]ページ先頭

©2009-2025 Movatter.jp