From 95fb6cb1b833b9567d15b205f3960edff7e4b575 Mon Sep 17 00:00:00 2001
From: Prashant Kumar
Date: Tue, 6 Aug 2024 18:27:08 +0530
Subject: [PATCH] [LLVMCPU] Fix test (#18113)

Updated the tile-root-fuse-producer-consumer test.
---
 .../tile-root-fuse-consumer-producer.mlir | 61 ++++++++-----------
 1 file changed, 24 insertions(+), 37 deletions(-)

diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/tile-root-fuse-consumer-producer.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/tile-root-fuse-consumer-producer.mlir
index 9f7a7653d8d6..8710da6e4b08 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/tile-root-fuse-consumer-producer.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/tile-root-fuse-consumer-producer.mlir
@@ -30,9 +30,10 @@ func.func @mmt4d_bias_relu(%arg0: tensor, %arg1: tensor
-func.func @quantized_matmul() {
+#config = #iree_codegen.lowering_config
+#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3, d4)>
+#map3 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d3)>
+func.func @quantized_matmul(%arg0: tensor<2x4x128x16x1xi8>, %arg1: tensor<2x4x16xf32>, %arg2: tensor<2x4x16xf32>, %arg3: tensor<2x688x128x16x1xi8>, %arg4: tensor<2x688x16xf32>, %arg5: tensor<2x688x16xf32>) -> tensor<2x11008x64xf32> {
   %c2995200 = arith.constant 2995200 : index
   %c2994688 = arith.constant 2994688 : index
   %c2994176 = arith.constant 2994176 : index
@@ -40,44 +41,30 @@ func.func @quantized_matmul() {
   %c88064 = arith.constant 88064 : index
   %c0 = arith.constant 0 : index
   %cst = arith.constant 0.000000e+00 : f32
-  %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c2995200) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x4x128x16x1xi8>>
-  %1 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c2994688) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x4x16xf32>>
-  %2 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c2994176) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x4x16xf32>>
-  %3 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c176128) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x688x128x16x1xi8>>
-  %4 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c88064) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x688x16xf32>>
-  %5 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2x688x16xf32>>
-  %6 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2x11008x64xf32>>
-  %7 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0, 0], sizes = [2, 4, 128, 16, 1], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x4x128x16x1xi8>> -> tensor<2x4x128x16x1xi8>
-  %8 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [2, 4, 16], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x4x16xf32>> -> tensor<2x4x16xf32>
-  %9 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0], sizes = [2, 4, 16], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x4x16xf32>> -> tensor<2x4x16xf32>
-  %10 = flow.dispatch.tensor.load %3, offsets = [0, 0, 0, 0, 0], sizes = [2, 688, 128, 16, 1], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x688x128x16x1xi8>> -> tensor<2x688x128x16x1xi8>
-  %11 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0], sizes = [2, 688, 16], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x688x16xf32>> -> tensor<2x688x16xf32>
-  %12 = flow.dispatch.tensor.load %5, offsets = [0, 0, 0], sizes = [2, 688, 16], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x688x16xf32>> -> tensor<2x688x16xf32>
-  %13 = tensor.empty() : tensor<2x4x128x16x1xf32>
-  %14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3, d4)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%7, %8, %9 : tensor<2x4x128x16x1xi8>, tensor<2x4x16xf32>, tensor<2x4x16xf32>) outs(%13 : tensor<2x4x128x16x1xf32>) {
+  %0 = tensor.empty() : tensor<2x4x128x16x1xf32>
+  %1 = linalg.generic {indexing_maps = [#map2, #map3, #map3, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1, %arg2 : tensor<2x4x128x16x1xi8>, tensor<2x4x16xf32>, tensor<2x4x16xf32>) outs(%0 : tensor<2x4x128x16x1xf32>) {
   ^bb0(%in: i8, %in_0: f32, %in_1: f32, %out: f32):
-    %21 = arith.extui %in : i8 to i32
-    %22 = arith.uitofp %21 : i32 to f32
-    %23 = arith.subf %22, %in_1 : f32
-    %24 = arith.mulf %23, %in_0 : f32
-    linalg.yield %24 : f32
+    %8 = arith.extui %in : i8 to i32
+    %9 = arith.uitofp %8 : i32 to f32
+    %10 = arith.subf %9, %in_1 : f32
+    %11 = arith.mulf %10, %in_0 : f32
+    linalg.yield %11 : f32
   } -> tensor<2x4x128x16x1xf32>
-  %15 = tensor.empty() : tensor<2x688x128x16x1xf32>
-  %16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d3)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3, d4)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%10, %11, %12 : tensor<2x688x128x16x1xi8>, tensor<2x688x16xf32>, tensor<2x688x16xf32>) outs(%15 : tensor<2x688x128x16x1xf32>) {
+  %2 = tensor.empty() : tensor<2x688x128x16x1xf32>
+  %3 = linalg.generic {indexing_maps = [#map2, #map3, #map3, #map2], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%arg3, %arg4, %arg5 : tensor<2x688x128x16x1xi8>, tensor<2x688x16xf32>, tensor<2x688x16xf32>) outs(%2 : tensor<2x688x128x16x1xf32>) {
   ^bb0(%in: i8, %in_0: f32, %in_1: f32, %out: f32):
-    %21 = arith.extui %in : i8 to i32
-    %22 = arith.uitofp %21 : i32 to f32
-    %23 = arith.subf %22, %in_1 : f32
-    %24 = arith.mulf %23, %in_0 : f32
-    linalg.yield %24 : f32
+    %8 = arith.extui %in : i8 to i32
+    %9 = arith.uitofp %8 : i32 to f32
+    %10 = arith.subf %9, %in_1 : f32
+    %11 = arith.mulf %10, %in_0 : f32
+    linalg.yield %11 : f32
   } -> tensor<2x688x128x16x1xf32>
-  %17 = tensor.empty() : tensor<2x4x688x16x16xf32>
-  %18 = linalg.fill ins(%cst : f32) outs(%17 : tensor<2x4x688x16x16xf32>) -> tensor<2x4x688x16x16xf32>
-  %19 = linalg.batch_mmt4d {lowering_config = #config2} ins(%14, %16 : tensor<2x4x128x16x1xf32>, tensor<2x688x128x16x1xf32>) outs(%18 : tensor<2x4x688x16x16xf32>) -> tensor<2x4x688x16x16xf32>
-  %20 = tensor.empty() : tensor<2x11008x64xf32>
-  %unpack = tensor.unpack %19 outer_dims_perm = [0, 2, 1] inner_dims_pos = [2, 1] inner_tiles = [16, 16] into %20 : tensor<2x4x688x16x16xf32> -> tensor<2x11008x64xf32>
-  flow.dispatch.tensor.store %unpack, %6, offsets = [0, 0, 0], sizes = [2, 11008, 64], strides = [1, 1, 1] : tensor<2x11008x64xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x11008x64xf32>>
-  return
+  %4 = tensor.empty() : tensor<2x4x688x16x16xf32>
+  %5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<2x4x688x16x16xf32>) -> tensor<2x4x688x16x16xf32>
+  %6 = linalg.batch_mmt4d {lowering_config = #config} ins(%1, %3 : tensor<2x4x128x16x1xf32>, tensor<2x688x128x16x1xf32>) outs(%5 : tensor<2x4x688x16x16xf32>) -> tensor<2x4x688x16x16xf32>
+  %7 = tensor.empty() : tensor<2x11008x64xf32>
+  %unpack = tensor.unpack %6 outer_dims_perm = [0, 2, 1] inner_dims_pos = [2, 1] inner_tiles = [16, 16] into %7 : tensor<2x4x688x16x16xf32> -> tensor<2x11008x64xf32>
+  return %unpack : tensor<2x11008x64xf32>
 }
 // CHECK: func.func @quantized_matmul(
 // CHECK:   scf.for