Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit cfca343

Browse files
wdvxdr1123 authored and nhooyr committed
mask.go: Use SIMD masking for amd64 and arm64
goos: windows
goarch: amd64
pkg: nhooyr.io/websocket
cpu: Intel(R) Core(TM) i5-9300H CPU @ 2.40GHz
Benchmark_mask/2/basic-8 425339004 2.795 ns/op 715.66 MB/s
Benchmark_mask/2/nhooyr-8 379937766 3.186 ns/op 627.78 MB/s
Benchmark_mask/2/gorilla-8 392164167 3.071 ns/op 651.24 MB/s
Benchmark_mask/2/gobwas-8 310037222 3.880 ns/op 515.46 MB/s
Benchmark_mask/3/basic-8 321408024 3.806 ns/op 788.32 MB/s
Benchmark_mask/3/nhooyr-8 350726338 3.478 ns/op 862.58 MB/s
Benchmark_mask/3/gorilla-8 332217727 3.634 ns/op 825.43 MB/s
Benchmark_mask/3/gobwas-8 247376214 4.886 ns/op 614.01 MB/s
Benchmark_mask/4/basic-8 261182472 4.582 ns/op 872.91 MB/s
Benchmark_mask/4/nhooyr-8 381830712 3.262 ns/op 1226.05 MB/s
Benchmark_mask/4/gorilla-8 272616304 4.395 ns/op 910.04 MB/s
Benchmark_mask/4/gobwas-8 204574558 5.855 ns/op 683.19 MB/s
Benchmark_mask/8/basic-8 191330037 6.162 ns/op 1298.24 MB/s
Benchmark_mask/8/nhooyr-8 369694992 3.285 ns/op 2435.65 MB/s
Benchmark_mask/8/gorilla-8 175388466 6.743 ns/op 1186.48 MB/s
Benchmark_mask/8/gobwas-8 241719933 4.886 ns/op 1637.45 MB/s
Benchmark_mask/16/basic-8 100000000 10.92 ns/op 1464.83 MB/s
Benchmark_mask/16/nhooyr-8 272565096 4.436 ns/op 3606.98 MB/s
Benchmark_mask/16/gorilla-8 100000000 11.20 ns/op 1428.53 MB/s
Benchmark_mask/16/gobwas-8 221356798 5.405 ns/op 2960.45 MB/s
Benchmark_mask/32/basic-8 61476984 20.40 ns/op 1568.80 MB/s
Benchmark_mask/32/nhooyr-8 238665572 5.050 ns/op 6337.22 MB/s
Benchmark_mask/32/gorilla-8 100000000 12.09 ns/op 2647.28 MB/s
Benchmark_mask/32/gobwas-8 186077235 6.477 ns/op 4940.36 MB/s
Benchmark_mask/128/basic-8 14629720 80.90 ns/op 1582.19 MB/s
Benchmark_mask/128/nhooyr-8 181241968 6.565 ns/op 19497.98 MB/s
Benchmark_mask/128/gorilla-8 68308342 16.76 ns/op 7639.37 MB/s
Benchmark_mask/128/gobwas-8 94582026 12.97 ns/op 9872.11 MB/s
Benchmark_mask/512/basic-8 3921001 305.6 ns/op 1675.55 MB/s
Benchmark_mask/512/nhooyr-8 123102199 9.721 ns/op 52669.11 MB/s
Benchmark_mask/512/gorilla-8 32355914 38.18 ns/op 13411.43 MB/s
Benchmark_mask/512/gobwas-8 31528501 37.80 ns/op 13544.37 MB/s
Benchmark_mask/4096/basic-8 491804 2381 ns/op 1720.39 MB/s
Benchmark_mask/4096/nhooyr-8 26159691 46.98 ns/op 87187.73 MB/s
Benchmark_mask/4096/gorilla-8 4898440 243.6 ns/op 16817.89 MB/s
Benchmark_mask/4096/gobwas-8 4336398 277.2 ns/op 14776.40 MB/s
Benchmark_mask/16384/basic-8 113842 9623 ns/op 1702.66 MB/s
Benchmark_mask/16384/nhooyr-8 8088847 154.5 ns/op 106058.18 MB/s
Benchmark_mask/16384/gorilla-8 1282993 933.6 ns/op 17549.90 MB/s
Benchmark_mask/16384/gobwas-8 997347 1086 ns/op 15093.49 MB/s

We're about 4-5x faster than gorilla now.
1 parent 535fd2c commit cfca343

File tree

7 files changed

+257
-1
lines changed

7 files changed

+257
-1
lines changed

‎frame.go‎

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -184,7 +184,7 @@ func writeFrameHeader(h header, w *bufio.Writer, buf []byte) (err error) {
184184
// to be in little endian.
185185
//
186186
// See https://github.com/golang/go/issues/31586
187-
funcmask(keyuint32,b []byte)uint32 {
187+
funcmaskGo(keyuint32,b []byte)uint32 {
188188
iflen(b)>=8 {
189189
key64:=uint64(key)<<32|uint64(key)
190190

‎go.mod‎

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
11
modulenhooyr.io/websocket
22

33
go1.19
4+
5+
requiregolang.org/x/sysv0.13.0

‎go.sum‎

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
golang.org/x/sysv0.13.0 h1:Af8nKPmuFypiUBjVoU9V20FiaFXOcuZI21p0ycVYYGE=
2+
golang.org/x/sysv0.13.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=

‎mask_amd64.s‎

Lines changed: 152 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,152 @@
1+
#include"textflag.h"
2+
3+
// func maskAsm(b *byte, len int, key uint32) uint32
4+
TEXT ·maskAsm(SB), NOSPLIT,$0-28
5+
// AX = b
6+
// CX = len (left length)
7+
// SI = key (uint32)
8+
// DI = uint64(SI) | uint64(SI)<<32
9+
MOVQ b+0(FP), AX
10+
MOVQ len+8(FP), CX
11+
MOVL key+16(FP), SI
12+
13+
// calculate the DI
14+
// DI = SI<<32 | SI
15+
MOVL SI, DI
16+
MOVQ DI, DX
17+
SHLQ$32, DI
18+
ORQ DX, DI
19+
20+
CMPQ CX,$15
21+
JLE less_than_16
22+
CMPQ CX,$63
23+
JLE less_than_64
24+
CMPQ CX,$128
25+
JLE sse
26+
TESTQ$31, AX
27+
JNZ unaligned
28+
29+
aligned:
30+
CMPB ·useAVX2(SB),$1
31+
JE avx2
32+
JMP sse
33+
34+
unaligned_loop_1byte:
35+
XORB SI, (AX)
36+
INCQ AX
37+
DECQ CX
38+
ROLL$24, SI
39+
TESTQ$7, AX
40+
JNZ unaligned_loop_1byte
41+
42+
// calculate DI again since SI was modified
43+
// DI = SI<<32 | SI
44+
MOVL SI, DI
45+
MOVQ DI, DX
46+
SHLQ$32, DI
47+
ORQ DX, DI
48+
49+
TESTQ$31, AX
50+
JZ aligned
51+
52+
unaligned:
53+
TESTQ$7, AX // AND $7 with the address in AX; if not zero (not 8-byte aligned), jump to unaligned_loop_1byte.
54+
JNZ unaligned_loop_1byte
55+
56+
unaligned_loop:
57+
// we don't need to check the CX since we know it's above 128
58+
XORQ DI, (AX)
59+
ADDQ$8, AX
60+
SUBQ$8, CX
61+
TESTQ$31, AX
62+
JNZ unaligned_loop
63+
JMP aligned
64+
65+
avx2:
66+
CMPQ CX,$0x80
67+
JL sse
68+
VMOVQ DI, X0
69+
VPBROADCASTQ X0, Y0
70+
71+
avx2_loop:
72+
VPXOR (AX), Y0, Y1
73+
VPXOR32(AX), Y0, Y2
74+
VPXOR64(AX), Y0, Y3
75+
VPXOR96(AX), Y0, Y4
76+
VMOVDQU Y1, (AX)
77+
VMOVDQU Y2,32(AX)
78+
VMOVDQU Y3,64(AX)
79+
VMOVDQU Y4,96(AX)
80+
ADDQ$0x80, AX
81+
SUBQ$0x80, CX
82+
CMPQ CX,$0x80
83+
JAE avx2_loop// loop if CX >= 0x80
84+
85+
sse:
86+
CMPQ CX,$0x40
87+
JL less_than_64
88+
MOVQ DI, X0
89+
PUNPCKLQDQ X0, X0
90+
91+
sse_loop:
92+
MOVOU0*16(AX), X1
93+
MOVOU 1*16(AX), X2
94+
MOVOU 2*16(AX), X3
95+
MOVOU 3*16(AX), X4
96+
PXOR X0, X1
97+
PXOR X0, X2
98+
PXOR X0, X3
99+
PXOR X0, X4
100+
MOVOU X1,0*16(AX)
101+
MOVOU X2, 1*16(AX)
102+
MOVOU X3, 2*16(AX)
103+
MOVOU X4, 3*16(AX)
104+
ADDQ$0x40, AX
105+
SUBQ$0x40, CX
106+
CMPQ CX,$0x40
107+
JAE sse_loop
108+
109+
less_than_64:
110+
TESTQ$32, CX
111+
JZ less_than_32
112+
XORQ DI, (AX)
113+
XORQ DI,8(AX)
114+
XORQ DI,16(AX)
115+
XORQ DI,24(AX)
116+
ADDQ$32, AX
117+
118+
less_than_32:
119+
TESTQ$16, CX
120+
JZ less_than_16
121+
XORQ DI, (AX)
122+
XORQ DI,8(AX)
123+
ADDQ$16, AX
124+
125+
less_than_16:
126+
TESTQ$8, CX
127+
JZ less_than_8
128+
XORQ DI, (AX)
129+
ADDQ$8, AX
130+
131+
less_than_8:
132+
TESTQ$4, CX
133+
JZ less_than_4
134+
XORL SI, (AX)
135+
ADDQ$4, AX
136+
137+
less_than_4:
138+
TESTQ$2, CX
139+
JZ less_than_2
140+
XORW SI, (AX)
141+
ROLL$16, SI
142+
ADDQ$2, AX
143+
144+
less_than_2:
145+
TESTQ$1, CX
146+
JZ done
147+
XORB SI, (AX)
148+
ROLL$24, SI
149+
150+
done:
151+
MOVL SI,ret+24(FP)
152+
RET

‎mask_arm64.s‎

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
#include"textflag.h"
2+
3+
// func maskAsm(b *byte, len int, key uint32) uint32
4+
TEXT ·maskAsm(SB), NOSPLIT,$0-28
5+
// R0 = b
6+
// R1 = len
7+
// R2 = uint64(key)<<32 | uint64(key)
8+
// R3 = key (uint32)
9+
MOVD b_ptr+0(FP), R0
10+
MOVD b_len+8(FP), R1
11+
MOVWU key+16(FP), R3
12+
MOVD R3, R2
13+
ORR R2<<32, R2, R2
14+
VDUP R2, V0.D2
15+
CMP$64, R1
16+
BLT less_than_64
17+
18+
// todo: optimize unaligned case
19+
loop_64:
20+
VLD1 (R0), [V1.B16, V2.B16, V3.B16, V4.B16]
21+
VEOR V1.B16, V0.B16, V1.B16
22+
VEOR V2.B16, V0.B16, V2.B16
23+
VEOR V3.B16, V0.B16, V3.B16
24+
VEOR V4.B16, V0.B16, V4.B16
25+
VST1.P [V1.B16, V2.B16, V3.B16, V4.B16],64(R0)
26+
SUBS$64, R1
27+
CMP$64, R1
28+
BGE loop_64
29+
30+
less_than_64:
31+
// quick end
32+
CBZ R1, end
33+
TBZ$5, R1, less_than32
34+
VLD1 (R0), [V1.B16, V2.B16]
35+
VEOR V1.B16, V0.B16, V1.B16
36+
VEOR V2.B16, V0.B16, V2.B16
37+
VST1.P [V1.B16, V2.B16],32(R0)
38+
39+
less_than32:
40+
TBZ$4, R1, less_than16
41+
LDP (R0), (R11, R12)
42+
EOR R11, R2, R11
43+
EOR R12, R2, R12
44+
STP.P (R11, R12),16(R0)
45+
46+
less_than16:
47+
TBZ$3, R1, less_than8
48+
MOVD (R0), R11
49+
EOR R2, R11, R11
50+
MOVD.P R11,8(R0)
51+
52+
less_than8:
53+
TBZ$2, R1, less_than4
54+
MOVWU (R0), R11
55+
EORW R2, R11, R11
56+
MOVWU.P R11,4(R0)
57+
58+
less_than4:
59+
TBZ$1, R1, less_than2
60+
MOVHU (R0), R11
61+
EORW R3, R11, R11
62+
MOVHU.P R11,2(R0)
63+
RORW$16, R3
64+
65+
less_than2:
66+
TBZ$0, R1, end
67+
MOVBU (R0), R11
68+
EORW R3, R11, R11
69+
MOVBU.P R11,1(R0)
70+
RORW$8, R3
71+
72+
end:
73+
MOVWU R3,ret+24(FP)
74+
RET

‎mask_asm.go‎

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
//go:build !appengine && (amd64 || arm64)
2+
// +build !appengine
3+
// +build amd64 arm64
4+
5+
package websocket
6+
7+
import"golang.org/x/sys/cpu"
8+
9+
funcmask(keyuint32,b []byte)uint32 {
10+
iflen(b)>0 {
11+
returnmaskAsm(&b[0],len(b),key)
12+
}
13+
returnkey
14+
}
15+
16+
varuseAVX2=cpu.X86.HasAVX2
17+
18+
//go:noescape
19+
funcmaskAsm(b*byte,lenint,keyuint32)uint32

‎mask_generic.go‎

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
//go:build appengine || (!amd64 && !arm64 && !js)
2+
3+
package websocket
4+
5+
funcmask(keyuint32,b []byte)uint32 {
6+
returnmaskGo(key,b)
7+
}

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp