Skip to content

Commit

Permalink
Merge pull request #21 from Jille/vzeroall
Browse files Browse the repository at this point in the history
Call VZEROALL for a 21% benchmark improvement
  • Loading branch information
bwesterb authored Oct 15, 2024
2 parents 344ccb8 + d184c57 commit cd9bb24
Show file tree
Hide file tree
Showing 3 changed files with 28 additions and 14 deletions.
30 changes: 16 additions & 14 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,23 +36,25 @@ goos: linux
goarch: amd64
pkg: github.com/bwesterb/go-and
cpu: 13th Gen Intel(R) Core(TM) i9-13900
- │ naive │ purego │ asm │
- │ sec/op │ sec/op vs base │ sec/op vs base │
- And-32 264.51µ ± 6% 64.69µ ± 1% -75.54% (p=0.000 n=10) 24.43µ ± 3% -90.77% (p=0.000 n=10)
- Or-32 274.06µ ± 5% 64.89µ ± 2% -76.32% (p=0.000 n=10) 24.30µ ± 1% -91.13% (p=0.000 n=10)
- AndNot-32 309.01µ ± 0% 73.10µ ± 1% -76.34% (p=0.000 n=10) 24.52µ ± 2% -92.07% (p=0.000 n=10)
- Memset-32 225.74µ ± 4% 56.64µ ± 1% -74.91% (p=0.000 n=10) 15.77µ ± 1% -93.01% (p=0.000 n=10)
- Popcnt-32 128.45µ ± 1% 69.35µ ± 0% -46.01% (p=0.000 n=10) 31.80µ ± 3% -75.24% (p=0.000 n=10)
- geomean 230.4µ 65.50µ -71.58% 23.59µ -89.76%
+ │ naive │ purego │ asm │
+ │ sec/op │ sec/op vs base │ sec/op vs base │
+ And-32 8162.0n ± 5% 2034.5n ± 1% -75.07% (p=0.000 n=10) 518.4n ± 1% -93.65% (p=0.000 n=10)
+ Or-32 9751.5n ± 8% 2104.5n ± 3% -78.42% (p=0.000 n=10) 515.0n ± 1% -94.72% (p=0.000 n=10)
+ Xor-32 8112.5n ± 3% 2029.0n ± 0% -74.99% (p=0.000 n=10) 518.6n ± 1% -93.61% (p=0.000 n=10)
+ AndNot-32 10685.5n ± 4% 2292.0n ± 2% -78.55% (p=0.000 n=10) 517.8n ± 1% -95.15% (p=0.000 n=10)
+ Memset-32 167.96µ ± 0% 57.54µ ± 1% -65.74% (p=0.000 n=10) 15.65µ ± 1% -90.68% (p=0.000 n=10)
+ Popcnt-32 132.15µ ± 1% 71.63µ ± 1% -45.80% (p=0.000 n=10) 36.51µ ± 2% -72.37% (p=0.000 n=10)
+ geomean 23.13µ 6.592µ -71.50% 1.857µ -91.97%
│ naive │ purego │ asm │
│ B/s │ B/s vs base │ B/s vs base │
- And-32 3.521Gi ± 6% 14.397Gi ± 2% +308.89% (p=0.000 n=10) 38.129Gi ± 3% +982.90% (p=0.000 n=10)
- Or-32 3.398Gi ± 6% 14.353Gi ± 2% +322.36% (p=0.000 n=10) 38.319Gi ± 1% +1027.60% (p=0.000 n=10)
- AndNot-32 3.014Gi ± 0% 12.740Gi ± 1% +322.71% (p=0.000 n=10) 37.988Gi ± 2% +1160.45% (p=0.000 n=10)
- Memset-32 4.126Gi ± 3% 16.444Gi ± 1% +298.59% (p=0.000 n=10) 59.051Gi ± 1% +1331.33% (p=0.000 n=10)
- Popcnt-32 7.251Gi ± 1% 13.428Gi ± 0% +85.20% (p=0.000 n=10) 29.288Gi ± 3% +303.94% (p=0.000 n=10)
- geomean 4.042Gi 14.22Gi +251.80% 39.49Gi +876.93%
+ And-32 3.651Gi ± 5% 14.649Gi ± 1% +301.20% (p=0.000 n=10) 57.488Gi ± 1% +1474.44% (p=0.000 n=10)
+ Or-32 3.057Gi ± 8% 14.163Gi ± 3% +363.37% (p=0.000 n=10) 57.872Gi ± 1% +1793.33% (p=0.000 n=10)
+ Xor-32 3.674Gi ± 3% 14.690Gi ± 0% +299.88% (p=0.000 n=10) 57.469Gi ± 1% +1464.38% (p=0.000 n=10)
+ AndNot-32 2.789Gi ± 4% 13.003Gi ± 2% +366.21% (p=0.000 n=10) 57.558Gi ± 1% +1963.74% (p=0.000 n=10)
+ Memset-32 5.545Gi ± 0% 16.187Gi ± 1% +191.91% (p=0.000 n=10) 59.504Gi ± 1% +973.11% (p=0.000 n=10)
+ Popcnt-32 7.048Gi ± 1% 13.002Gi ± 1% +84.48% (p=0.000 n=10) 25.507Gi ± 2% +261.92% (p=0.000 n=10)
+ geomean 4.058Gi 14.24Gi +250.89% 50.56Gi +1145.76%
```

### Apple M2 Pro
Expand Down
10 changes: 10 additions & 0 deletions and_amd64.s
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ loop:
ADDQ $0x00000100, DX
SUBQ $0x00000001, BX
JNZ loop
VZEROALL
RET

// func andAVX(dst *byte, a *byte, b *byte, l uint64)
Expand Down Expand Up @@ -98,6 +99,7 @@ loop:
ADDQ $0x00000080, DX
SUBQ $0x00000001, BX
JNZ loop
VZEROALL
RET

// func orAVX2(dst *byte, a *byte, b *byte, l uint64)
Expand Down Expand Up @@ -146,6 +148,7 @@ loop:
ADDQ $0x00000100, DX
SUBQ $0x00000001, BX
JNZ loop
VZEROALL
RET

// func orAVX(dst *byte, a *byte, b *byte, l uint64)
Expand Down Expand Up @@ -194,6 +197,7 @@ loop:
ADDQ $0x00000080, DX
SUBQ $0x00000001, BX
JNZ loop
VZEROALL
RET

// func xorAVX2(dst *byte, a *byte, b *byte, l uint64)
Expand Down Expand Up @@ -242,6 +246,7 @@ loop:
ADDQ $0x00000100, DX
SUBQ $0x00000001, BX
JNZ loop
VZEROALL
RET

// func xorAVX(dst *byte, a *byte, b *byte, l uint64)
Expand Down Expand Up @@ -290,6 +295,7 @@ loop:
ADDQ $0x00000080, DX
SUBQ $0x00000001, BX
JNZ loop
VZEROALL
RET

// func andNotAVX2(dst *byte, a *byte, b *byte, l uint64)
Expand Down Expand Up @@ -338,6 +344,7 @@ loop:
ADDQ $0x00000100, DX
SUBQ $0x00000001, BX
JNZ loop
VZEROALL
RET

// func andNotAVX(dst *byte, a *byte, b *byte, l uint64)
Expand Down Expand Up @@ -386,6 +393,7 @@ loop:
ADDQ $0x00000080, DX
SUBQ $0x00000001, BX
JNZ loop
VZEROALL
RET

// func popcntAsm(a *byte, l uint64) int
Expand Down Expand Up @@ -438,6 +446,7 @@ loop:
ADDQ $0x00000020, AX
SUBQ $0x00000001, CX
JNZ loop
VZEROALL
RET

// func memsetAVX(dst *byte, l uint64, b byte)
Expand All @@ -454,6 +463,7 @@ loop:
ADDQ $0x00000010, AX
SUBQ $0x00000001, CX
JNZ loop
VZEROALL
RET

DATA zeroes<>+0(SB)/4, $0x00000000
Expand Down
2 changes: 2 additions & 0 deletions internal/asm/src.go
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ func gen(name string, op func(Op, Op, Op), avxLevel AVXLevel, doc string) {
SUBQ(U32(1), l)
JNZ(LabelRef("loop"))

VZEROALL()
RET()
}

Expand Down Expand Up @@ -175,5 +176,6 @@ func genMemset(avxLevel AVXLevel) {
SUBQ(U32(1), l)
JNZ(LabelRef("loop"))

VZEROALL()
RET()
}

0 comments on commit cd9bb24

Please sign in to comment.