-
Notifications
You must be signed in to change notification settings - Fork 4
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
add dense matmul benchmark #256
Changes from all commits
8631594
abaff5d
00e9e5a
77be0d8
1271189
9a4e288
a22e4cf
8a02eee
57471ba
fd12ec7
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
name: Run benchmarks | ||
|
||
on: | ||
push: | ||
branches: | ||
- main | ||
|
||
jobs: | ||
run-benchmarks: | ||
runs-on: ubuntu-latest | ||
container: | ||
image: ghcr.io/kuleuven-micas/snax:v0.1.14 | ||
steps: | ||
- uses: actions/checkout@v3 | ||
- name: Install snax compiler | ||
run: python3 -m pip install -e . | ||
- name: Reinstall pip modules from requirements | ||
run: python3 -m pip install -r requirements.txt | ||
- name: Run benchmarks | ||
run: python3 genbenchmark.py | ||
working-directory: benchmarks/${{ matrix.kernel }} | ||
- uses: actions/upload-artifact@v4 | ||
with: | ||
name: output_report | ||
path: benchmarks/${{ matrix.kernel }}/output_report.txt | ||
strategy: | ||
matrix: | ||
kernel: [dense_matmul] |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
results | ||
generated* | ||
output_report.txt |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
# Courtesy of Federico Ficarelli | ||
|
||
.DEFAULT_GOAL := all | ||
|
||
include ../../runtime/snax-gemmx.rules | ||
include ../../runtime/Makefile.rules | ||
|
||
TESTS += generated.x | ||
|
||
SNAXOPTFLAGS = -p convert-linalg-to-kernel,insert-accfg-op{accelerator=snax_gemmx},dispatch-kernels,set-memory-space,set-memory-layout,realize-memref-casts,insert-sync-barrier,dispatch-regions{nb_cores=3},guarded-linalg-to-memref-stream,schedule-memref-linalg,stream-snaxify,convert-linalg-to-accfg,test-add-mcycle-around-launch,convert-accfg-to-csr,snax-copy-to-dma,memref-to-snax,snax-to-func,snax-lower-mcycle,clear-memory-space | ||
|
||
GEN_DATA_OPTS += --m=${SIZE_M} | ||
GEN_DATA_OPTS += --n=${SIZE_N} | ||
GEN_DATA_OPTS += --k=${SIZE_K} | ||
|
||
|
||
CFLAGS += -std=gnu11 | ||
CFLAGS += -Wall -Wextra | ||
# Needed for perfetto script | ||
CFLAGS += -g | ||
ifdef NO_CHECK | ||
CFLAGS += -DNO_CHECK | ||
endif | ||
|
||
data.c data.h: | ||
$(PYTHON) gendata.py ${GEN_DATA_OPTS} | ||
|
||
%.x: %.o main.o data.o | ||
$(LD) $(LDFLAGS) $^ -o $@ | ||
|
||
sim_%: % | ||
rm -fr ./logs/ | ||
$(VLTSIM) $< | ||
|
||
RUN = $(addprefix run_, $(TESTS)) | ||
$(RUN): run_%: sim_% | ||
mv logs $(subst sim_,,$<).logs | ||
|
||
all: $(TESTS) | ||
|
||
allrun: $(RUN) | ||
|
||
clean: | ||
rm -fr *.ll12 *.x *.o *.logs/ logs/ data.h data.c |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,135 @@ | ||
import itertools | ||
import json | ||
import pathlib | ||
from io import StringIO | ||
|
||
from xdsl.builder import ImplicitBuilder | ||
from xdsl.dialects import arith, builtin, func, linalg | ||
from xdsl.dialects.builtin import i8, i32 | ||
from xdsl.ir import Block, Region | ||
from xdsl.printer import Printer | ||
|
||
from util.snax_benchmark import SNAXBenchmark | ||
|
||
|
||
def create_matrix_multiply(k, m, n): | ||
""" | ||
Generate IR in the form of: | ||
``` | ||
builtin.module { | ||
func.func @streamer_matmul(%arg0 : memref<16x16xi8>, %arg1 : memref<16x16xi8, | ||
strided<[1, 16]>>, %arg2 : memref<16x16xi32>) { | ||
%0 = arith.constant 0 : i32 | ||
linalg.quantized_matmul ins(%arg0, %arg1, %0, %0 : memref<16x16xi8>, | ||
memref<16x16xi8, strided<[1, 16]>>, i32, i32) | ||
outs(%arg2 : memref<16x16xi32>) | ||
func.return | ||
} | ||
} | ||
``` | ||
""" | ||
|
||
def get_2d_memref_type(typ, dim_one, dim_two, transpose=False): | ||
layout = ( | ||
builtin.StridedLayoutAttr([1, dim_one]) if transpose else builtin.NoneAttr() | ||
) | ||
return builtin.MemRefType(typ, [dim_one, dim_two], layout=layout) | ||
|
||
input_types = [ | ||
get_2d_memref_type(i8, k, m), | ||
get_2d_memref_type(i8, m, n, transpose=True), | ||
get_2d_memref_type(i32, k, n), | ||
] | ||
|
||
b = Block(arg_types=(input_types)) | ||
|
||
with ImplicitBuilder(b) as (arg0, arg1, arg2): | ||
c0 = arith.Constant.from_int_and_width(0, 32) | ||
linalg.QuantizedMatmulOp([arg0, arg1, c0.result, c0.result], [arg2]) | ||
func.Return() | ||
|
||
region = Region(b) | ||
|
||
function = func.FuncOp.from_region("streamer_matmul", input_types, [], region) | ||
|
||
module = builtin.ModuleOp([function]) | ||
|
||
return module | ||
|
||
|
||
def write_module_to_file(module, file): | ||
output = StringIO() | ||
printer = Printer(stream=output) | ||
printer.print(module) | ||
with open(file, "w") as output_file: | ||
output_file.write(output.getvalue()) | ||
|
||
|
||
def generate_tiled_benchmark(m, n, k) -> SNAXBenchmark: | ||
module = create_matrix_multiply(k, m, n) | ||
write_module_to_file(module, "generated.mlir") | ||
binary = "generated.x" | ||
bm = SNAXBenchmark( | ||
kernel=f"tiled_matmul_generated_{k}x{n}x{m}", | ||
binary=binary, | ||
src_dir=str(pathlib.Path.cwd()), | ||
export_dir=str(pathlib.Path.cwd()), | ||
) | ||
return bm | ||
|
||
|
||
if __name__ == "__main__": | ||
"""Runs the gendata.py script with specified arguments.""" | ||
selected_dims = [32, 48, 64] | ||
|
||
sizes = list(itertools.product(selected_dims, repeat=3)) | ||
|
||
output_report: dict[str, dict] = {} | ||
|
||
for size in sizes: | ||
k, m, n = size | ||
folder = f"test_generated_{k}x{m}x{m}" | ||
bm = generate_tiled_benchmark(k, m, n) | ||
bm.clean() | ||
bm.build( | ||
build_opts=[ | ||
"NO_CHECK=1", | ||
f"SIZE_M={m}", | ||
f"SIZE_N={n}", | ||
f"SIZE_K={k}", | ||
] | ||
) | ||
bm.run() | ||
bm.trace() | ||
bm.process_traces(folder) | ||
bm.copy_binary(folder) | ||
bm.copy_logs(folder) | ||
|
||
# add to output report | ||
trace = bm.log_dir.joinpath(bm.input_file.format(hart="00000")) | ||
with open(trace) as file: | ||
data = json.load(file) | ||
cycles = data[1]["cycles"] | ||
ideal = round((k / 8 + 1) * (m / 8) * (n / 8)) | ||
utilization = ideal / cycles | ||
output_report[bm.benchmark] = { | ||
"cycles": cycles, | ||
"ideal": ideal, | ||
"utilization": utilization, | ||
} | ||
|
||
with open("output_report.txt", "w") as file: | ||
file.write("benchmark\tcycles\tideal\tutilization\t\n") | ||
avg_utilization = 0 | ||
avg_n = 0 | ||
for benchmark in output_report: | ||
file.write(f"{benchmark}\t") | ||
file.write(f"{output_report[benchmark]['cycles']}\t") | ||
file.write(f"{output_report[benchmark]['ideal']}\t") | ||
file.write(f"{output_report[benchmark]['utilization']}\t") | ||
file.write("\n") | ||
avg_utilization += output_report[benchmark]["utilization"] | ||
avg_n += 1 | ||
file.write("--------------------------\n") | ||
file.write("average\t\t\t") | ||
file.write(f"{avg_utilization/avg_n}\t\n") |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
# simple script to generate inputs and expected outputs for simple_matmult | ||
|
||
import argparse | ||
|
||
import numpy as np | ||
|
||
from util.gendata import create_data, create_header | ||
|
||
|
||
def create_test_data(n, m, k): | ||
print(f"Creating test data with n={n}, m={m}, k={k}") | ||
# Reset random seed for reproducible behavior | ||
|
||
np.random.seed(0) | ||
|
||
A_size = [n, k] | ||
B_size = [k, m] | ||
|
||
# C = A.B | ||
low_bound = -128 | ||
high_bound = 127 | ||
|
||
A = np.random.randint(low_bound, high_bound, size=A_size, dtype=np.dtype("int8")) | ||
B = np.random.randint(low_bound, high_bound, size=B_size, dtype=np.dtype("int8")) | ||
|
||
# Make sure the product is possible! | ||
assert A.shape[1] == B.shape[0] | ||
C_golden = np.matmul(A.astype(np.dtype("int32")), B.astype(np.dtype("int32"))) | ||
C = np.zeros(C_golden.shape, np.dtype("int32")) | ||
|
||
# Perform layout transformations before writing to memory | ||
|
||
# only thing necessary: transform B from row-major to column-major | ||
B_new_layout = np.transpose(B) | ||
|
||
# C are just all zeros, so layout not important | ||
sizes = { | ||
"N_size": A.shape[0], | ||
"K_size": A.shape[1], | ||
"M_size": B.shape[1], | ||
} | ||
variables = { | ||
"A": A, | ||
"B": B_new_layout, | ||
"C_golden": C_golden, | ||
"C": C, | ||
} | ||
|
||
create_header("data.h", sizes, variables) | ||
create_data("data.c", variables) | ||
Comment on lines
+10
to
+50
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We can probably move this to a more common folder :) |
||
|
||
|
||
if __name__ == "__main__": | ||
# Set up the argument parser | ||
parser = argparse.ArgumentParser( | ||
description="Generate test data with specified parameters." | ||
) | ||
# Adding arguments | ||
parser.add_argument("--n", type=int, default=16, help="Value for n (default: 16)") | ||
parser.add_argument("--m", type=int, default=16, help="Value for m (default: 16)") | ||
parser.add_argument("--k", type=int, default=16, help="Value for k (default: 16)") | ||
|
||
# Parse the arguments | ||
args = parser.parse_args() | ||
|
||
# Call the function with parsed arguments | ||
create_test_data(n=args.n, m=args.m, k=args.k) |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,85 @@ | ||
#include "stdint.h" | ||
|
||
#include "data.h" | ||
#include "memref.h" | ||
#include "snax_rt.h" | ||
|
||
/* | ||
* These libraries are included from github.com/KULeuven-MICAS/snitch_cluster | ||
* Interested users, might want to look at: | ||
* | ||
* /sw/snRuntime/api | ||
* /target/snitch_cluster/sw/runtime/rtl/src | ||
* /target/snitch_cluster/sw/runtime/common | ||
* */ | ||
#include <snrt.h> | ||
|
||
// Kernel provided via external definition | ||
void _mlir_ciface_streamer_matmul(TwoDMemrefI8_t *a, TwoDMemrefI8_t *b, | ||
TwoDMemrefI32_t *c); | ||
|
||
int main() { | ||
{ | ||
|
||
// Create memref objects for data stored in L3 | ||
TwoDMemrefI8_t memrefA; | ||
memrefA.data = &A; | ||
memrefA.aligned_data = memrefA.data; | ||
memrefA.shape[0] = N_size; | ||
memrefA.shape[1] = K_size; | ||
memrefA.stride[0] = K_size; | ||
memrefA.stride[1] = 1; | ||
memrefA.offset = 0; | ||
|
||
TwoDMemrefI8_t memrefB; | ||
memrefB.data = &B; | ||
memrefB.aligned_data = memrefB.data; | ||
memrefB.shape[0] = K_size; | ||
memrefB.shape[1] = M_size; | ||
memrefB.stride[0] = 1; | ||
memrefB.stride[1] = K_size; | ||
memrefB.offset = 0; | ||
|
||
TwoDMemrefI32_t memrefC; | ||
memrefC.data = &C; | ||
memrefC.aligned_data = memrefC.data; | ||
memrefC.shape[0] = N_size; | ||
memrefC.shape[1] = M_size; | ||
memrefC.stride[0] = M_size; | ||
memrefC.stride[1] = 1; | ||
memrefC.offset = 0; | ||
|
||
_mlir_ciface_streamer_matmul(&memrefA, &memrefB, &memrefC); | ||
|
||
snrt_cluster_hw_barrier(); | ||
|
||
// Correctness check - | ||
// from this point on only core 0 is required to be alive. | ||
int thiscore = snrt_cluster_core_idx(); | ||
if (thiscore != 0) | ||
return 0; | ||
|
||
#ifdef NO_CHECK | ||
// No correctness check = | ||
// Always finish as if nothing happened | ||
return 0; | ||
#endif | ||
int nerr = 0; | ||
|
||
for (int i = 0; i < M_size * N_size; i++) { | ||
{ | ||
int32_t error = memrefC.aligned_data[i] - C_golden[i]; | ||
// printf("%d) %d -> %d\n", i, (int32_t)memrefC.aligned_data[i], | ||
// (int32_t)C_golden[i]); | ||
if (error != 0) | ||
nerr += 1; | ||
} | ||
} | ||
|
||
// insert mcycle to show fault in trace | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What do you mean by this? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. just a quick way to check if an operation succeeds by looking at the trace. if there are 3 mcycles, it was successful. if there are 4, an error has occured |
||
if (nerr != 0) | ||
snrt_mcycle(); | ||
|
||
return nerr; | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I want to move this to the tools folder at some point