diff --git a/compiler/plugins/target/LLVMCPU/test/materialize_homogeneous_encodings.mlir b/compiler/plugins/target/LLVMCPU/test/materialize_homogeneous_encodings.mlir index 711837f97eeb..0d16e3b9b4fc 100644 --- a/compiler/plugins/target/LLVMCPU/test/materialize_homogeneous_encodings.mlir +++ b/compiler/plugins/target/LLVMCPU/test/materialize_homogeneous_encodings.mlir @@ -5,11 +5,12 @@ #map1 = affine_map<(d0, d1, d2) -> (d0, d2)> #map2 = affine_map<(d0, d1, d2) -> (d2, d1)> #map3 = affine_map<(d0, d1, d2) -> (d0, d1)> +#encoding = #iree_encoding.encoding> #device_target_llvm_cpu = #hal.device.target<"llvm-cpu", [#executable_target_embedded_elf_x86_64_]> : !hal.device module attributes {hal.device.targets = [#device_target_llvm_cpu]} { util.func public @lhs_encoding(%arg0: tensor) -> tensor { - %3 = iree_encoding.set_encoding %arg0 : tensor -> tensor>> - %4 = iree_encoding.unset_encoding %3 : tensor>> -> tensor + %3 = iree_encoding.set_encoding %arg0 : tensor -> tensor + %4 = iree_encoding.unset_encoding %3 : tensor -> tensor util.return %4 : tensor } } diff --git a/compiler/plugins/target/VulkanSPIRV/test/materialize_homogeneous_encodings.mlir b/compiler/plugins/target/VulkanSPIRV/test/materialize_homogeneous_encodings.mlir index aa728269a5a5..303326473e8c 100644 --- a/compiler/plugins/target/VulkanSPIRV/test/materialize_homogeneous_encodings.mlir +++ b/compiler/plugins/target/VulkanSPIRV/test/materialize_homogeneous_encodings.mlir @@ -4,11 +4,12 @@ #map1 = affine_map<(d0, d1, d2) -> (d0, d2)> #map2 = affine_map<(d0, d1, d2) -> (d2, d1)> #map3 = affine_map<(d0, d1, d2) -> (d0, d1)> +#encoding = #iree_encoding.encoding> #device_target_vulkan = #hal.device.target<"vulkan", [#executable_target_vulkan_spirv_fb]> : !hal.device module attributes {hal.device.targets = [#device_target_vulkan]} { util.func public @lhs_encoding(%arg0: tensor) -> tensor { - %3 = iree_encoding.set_encoding %arg0 : tensor -> tensor>> - %4 = iree_encoding.unset_encoding %3 : tensor>> -> tensor + %3 = iree_encoding.set_encoding %arg0 : tensor -> tensor + %4 = iree_encoding.unset_encoding %3 : tensor -> tensor util.return %4 : tensor } } @@ -23,14 +24,15 @@ module attributes {hal.device.targets = [#device_target_vulkan]} { #map1 = affine_map<(d0, d1, d2) -> (d0, d2)> #map2 = affine_map<(d0, d1, d2) -> (d2, d1)> #map3 = affine_map<(d0, d1, d2) -> (d0, d1)> +#encoding = #iree_encoding.encoding> #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {target_triple = "x86_64-none-elf", cpu_features = "+avx512f"}> #device_target_llvm_cpu = #hal.device.target<"llvm-cpu", [#executable_target_embedded_elf_x86_64_]> : !hal.device #executable_target_vulkan_spirv_fb = #hal.executable.target<"vulkan-spirv", "vulkan-spirv-fb"> #device_target_vulkan = #hal.device.target<"vulkan", [#executable_target_vulkan_spirv_fb]> : !hal.device module attributes {hal.device.targets = [#hal.device.select<[#device_target_vulkan, #device_target_llvm_cpu]> : !hal.device]} { util.func public @lhs_encoding(%arg0: tensor) -> tensor { - %3 = iree_encoding.set_encoding %arg0 : tensor -> tensor>> - %4 = iree_encoding.unset_encoding %3 : tensor>> -> tensor + %3 = iree_encoding.set_encoding %arg0 : tensor -> tensor + %4 = iree_encoding.unset_encoding %3 : tensor -> tensor util.return %4 : tensor } } diff --git a/compiler/src/iree/compiler/Codegen/Common/CPU/test/llvmcpu_materialize_encoding.mlir b/compiler/src/iree/compiler/Codegen/Common/CPU/test/llvmcpu_materialize_encoding.mlir index 71ae9826ad84..e9c07d2541e8 100644 --- a/compiler/src/iree/compiler/Codegen/Common/CPU/test/llvmcpu_materialize_encoding.mlir +++ b/compiler/src/iree/compiler/Codegen/Common/CPU/test/llvmcpu_materialize_encoding.mlir @@ -6,15 +6,16 @@ #hal.descriptor_set.binding<1, storage_buffer> ]> ]> +#encoding = #iree_encoding.encoding, matmul_narrow_M = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], round_dims_to = array> func.func @set_encoding_with_padding_semantics_bf16_x86_64_avx512f() attributes { hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="x86_64-xyz-xyz", cpu_features="+avx512f"}> }{ %c0 = arith.constant 0 : index %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor, matmul_narrow_M = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], round_dims_to = array>>> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 1000], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<1x1000xbf16> - %3 = iree_encoding.set_encoding %2 : tensor<1x1000xbf16> -> tensor<1x1000xbf16, #iree_encoding.encoding, matmul_narrow_M = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], round_dims_to = array>> - flow.dispatch.tensor.store %3, %1, offsets = [0, 0], sizes = [1, 1000], strides = [1, 1] : tensor<1x1000xbf16, #iree_encoding.encoding, matmul_narrow_M = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], round_dims_to = array>> -> !flow.dispatch.tensor, matmul_narrow_M = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], round_dims_to = array>>> + %3 = iree_encoding.set_encoding %2 : tensor<1x1000xbf16> -> tensor<1x1000xbf16, #encoding> + flow.dispatch.tensor.store %3, %1, offsets = [0, 0], sizes = [1, 1000], strides = [1, 1] : tensor<1x1000xbf16, #encoding> -> !flow.dispatch.tensor> return } // This tests that @@ -45,15 +46,16 @@ func.func @set_encoding_with_padding_semantics_bf16_x86_64_avx512f() attributes #map = affine_map<(d0, d1, d2) -> (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> +#encoding = #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array> func.func @set_encoding_7x7x7_matmul_LHS() attributes { hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="x86_64-xyz-xyz", cpu_features="+avx,+avx2,+fma"}> } { %c0 = arith.constant 0 : index %8 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %11 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>> + %11 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> %14 = flow.dispatch.tensor.load %8, offsets = [0, 0], sizes = [7, 7], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<7x7xf32> - %17 = iree_encoding.set_encoding %14 : tensor<7x7xf32> -> tensor<7x7xf32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> - flow.dispatch.tensor.store %17, %11, offsets = [0, 0], sizes = [7, 7], strides = [1, 1] : tensor<7x7xf32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> -> !flow.dispatch.tensor, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>> + %17 = iree_encoding.set_encoding %14 : tensor<7x7xf32> -> tensor<7x7xf32, #encoding> + flow.dispatch.tensor.store %17, %11, offsets = [0, 0], sizes = [7, 7], strides = [1, 1] : tensor<7x7xf32, #encoding> -> !flow.dispatch.tensor> return } // CHECK-LABEL: func @set_encoding_7x7x7_matmul_LHS( @@ -76,17 +78,18 @@ func.func @set_encoding_7x7x7_matmul_LHS() attributes { #map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)> #map1 = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)> #map2 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)> +#encoding = #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array> func.func @set_encoding_128x80x32_batch_matmul_LHS() attributes { hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="x86_64-xyz-xyz", cpu_features="+avx,+avx2,+fma"}> } { %c0 = arith.constant 0 : index %8 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %11 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>> + %11 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> %14 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0], sizes = [128, 80, 32], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<128x80x32xf32> - %17 = iree_encoding.set_encoding %14 : tensor<128x80x32xf32> -> tensor<128x80x32xf32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> + %17 = iree_encoding.set_encoding %14 : tensor<128x80x32xf32> -> tensor<128x80x32xf32, #encoding> flow.dispatch.tensor.store %17, %11, offsets = [0, 0, 0], sizes = [128, 80, 32], strides = [1, 1, 1] - : tensor<128x80x32xf32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> - -> !flow.dispatch.tensor, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>> + : tensor<128x80x32xf32, #encoding> + -> !flow.dispatch.tensor> return } // CHECK-LABEL: func @set_encoding_128x80x32_batch_matmul_LHS( @@ -108,6 +111,7 @@ func.func @set_encoding_128x80x32_batch_matmul_LHS() attributes { #map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)> #map1 = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)> #map2 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)> +#encoding = #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array> func.func @set_encoding_128x32x320_batch_matmul_RHS() attributes { hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="x86_64-xyz-xyz", cpu_features="+avx,+avx2,+fma"}> } { @@ -115,12 +119,12 @@ func.func @set_encoding_128x32x320_batch_matmul_RHS() attributes { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %5 = arith.index_castui %0 {stream.alignment = 64 : index} : i32 to index %10 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %13 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%5) : !flow.dispatch.tensor, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>> + %13 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%5) : !flow.dispatch.tensor> %16 = flow.dispatch.tensor.load %10, offsets = [0, 0, 0], sizes = [128, 32, 320], strides = [1, 1, 1] : !flow.dispatch.tensor> -> tensor<128x32x320xf32> - %19 = iree_encoding.set_encoding %16 : tensor<128x32x320xf32> -> tensor<128x32x320xf32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> + %19 = iree_encoding.set_encoding %16 : tensor<128x32x320xf32> -> tensor<128x32x320xf32, #encoding> flow.dispatch.tensor.store %19, %13, offsets = [0, 0, 0], sizes = [128, 32, 320], strides = [1, 1, 1] - : tensor<128x32x320xf32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> - -> !flow.dispatch.tensor, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>> + : tensor<128x32x320xf32, #encoding> + -> !flow.dispatch.tensor> return } // CHECK-LABEL: func @set_encoding_128x32x320_batch_matmul_RHS( @@ -142,6 +146,7 @@ func.func @set_encoding_128x32x320_batch_matmul_RHS() attributes { #map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)> #map1 = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)> #map2 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)> +#encoding = #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array> func.func @unset_encoding_128x80x320_batch_matmul_RESULT() attributes { hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="x86_64-xyz-xyz", cpu_features="+avx,+avx2,+fma"}> } { @@ -149,11 +154,11 @@ func.func @unset_encoding_128x80x320_batch_matmul_RESULT() attributes { %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32 %3 = arith.index_castui %0 : i32 to index %6 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> - %9 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%3) flags(ReadOnly) : !flow.dispatch.tensor, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>> + %9 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%3) flags(ReadOnly) : !flow.dispatch.tensor> %10 = flow.dispatch.tensor.load %9, offsets = [0, 0, 0], sizes = [128, 80, 320], strides = [1, 1, 1] - : !flow.dispatch.tensor, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>> - -> tensor<128x80x320xf32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> - %11 = iree_encoding.unset_encoding %10 : tensor<128x80x320xf32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> -> tensor<128x80x320xf32> + : !flow.dispatch.tensor> + -> tensor<128x80x320xf32, #encoding> + %11 = iree_encoding.unset_encoding %10 : tensor<128x80x320xf32, #encoding> -> tensor<128x80x320xf32> flow.dispatch.tensor.store %11, %6, offsets = [0, 0, 0], sizes = [128, 80, 320], strides = [1, 1, 1] : tensor<128x80x320xf32> -> !flow.dispatch.tensor> return } @@ -177,6 +182,9 @@ func.func @unset_encoding_128x80x320_batch_matmul_RESULT() attributes { #map = affine_map<(d0, d1, d2) -> (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> +#encoding_lhs = #iree_encoding.encoding> +#encoding_rhs = #iree_encoding.encoding> +#encoding_result = #iree_encoding.encoding> func.func @pack_gemm_fill_dynamic(%arg0 : tensor, %arg1 : tensor) -> tensor attributes { hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="x86_64-xyz-xyz", cpu_features="+avx,+avx2,+fma"}> } { @@ -185,14 +193,14 @@ func.func @pack_gemm_fill_dynamic(%arg0 : tensor, %arg1 : tensor %d1 = tensor.dim %arg1, %c1 : tensor - %0 = iree_encoding.set_encoding %arg0 : tensor -> tensor>> - %1 = iree_encoding.set_encoding %arg1 : tensor -> tensor>> - %2 = tensor.empty(%d0, %d1) : tensor>> - %3 = linalg.fill ins(%cst : f32) outs(%2 : tensor>>) - -> tensor>> - %4 = linalg.matmul ins(%0, %1 : tensor>>, tensor>>) - outs(%3 : tensor>>) -> tensor>> - %5 = iree_encoding.unset_encoding %4 : tensor>> -> tensor + %0 = iree_encoding.set_encoding %arg0 : tensor -> tensor + %1 = iree_encoding.set_encoding %arg1 : tensor -> tensor + %2 = tensor.empty(%d0, %d1) : tensor + %3 = linalg.fill ins(%cst : f32) outs(%2 : tensor) + -> tensor + %4 = linalg.matmul ins(%0, %1 : tensor, tensor) + outs(%3 : tensor) -> tensor + %5 = iree_encoding.unset_encoding %4 : tensor -> tensor return %5 : tensor } // CHECK-DAG: #[[$MAP0:.+]] = affine_map<()[s0] -> (s0 ceildiv 8)> @@ -222,6 +230,9 @@ func.func @pack_gemm_fill_dynamic(%arg0 : tensor, %arg1 : tensor (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> +#encoding_lhs = #iree_encoding.encoding, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], round_dims_to = array> +#encoding_rhs = #iree_encoding.encoding, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], round_dims_to = array> +#encoding_result = #iree_encoding.encoding, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], round_dims_to = array> func.func @matvec_shaped_matmul_lowering_f32f32f32_aarch64(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes { hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="aarch64-xyz-xyz"}> } { @@ -229,11 +240,11 @@ func.func @matvec_shaped_matmul_lowering_f32f32f32_aarch64(%arg0: !hal.buffer_vi %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<16x16xf32> %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<16x1xf32> %2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<16x1xf32> - %3 = iree_encoding.set_encoding %0 : tensor<16x16xf32> -> tensor<16x16xf32, #iree_encoding.encoding, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], round_dims_to = array>> - %4 = iree_encoding.set_encoding %1 : tensor<16x1xf32> -> tensor<16x1xf32, #iree_encoding.encoding, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], round_dims_to = array>> - %5 = iree_encoding.set_encoding %2 : tensor<16x1xf32> -> tensor<16x1xf32, #iree_encoding.encoding, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], round_dims_to = array>> - %6 = linalg.matmul ins(%3, %4 : tensor<16x16xf32, #iree_encoding.encoding, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], round_dims_to = array>>, tensor<16x1xf32, #iree_encoding.encoding, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], round_dims_to = array>>) outs(%5 : tensor<16x1xf32, #iree_encoding.encoding, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], round_dims_to = array>>) -> tensor<16x1xf32, #iree_encoding.encoding, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], round_dims_to = array>> - %7 = iree_encoding.unset_encoding %6 : tensor<16x1xf32, #iree_encoding.encoding, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], round_dims_to = array>> -> tensor<16x1xf32> + %3 = iree_encoding.set_encoding %0 : tensor<16x16xf32> -> tensor<16x16xf32, #encoding_lhs> + %4 = iree_encoding.set_encoding %1 : tensor<16x1xf32> -> tensor<16x1xf32, #encoding_rhs> + %5 = iree_encoding.set_encoding %2 : tensor<16x1xf32> -> tensor<16x1xf32, #encoding_result> + %6 = linalg.matmul ins(%3, %4 : tensor<16x16xf32, #encoding_lhs>, tensor<16x1xf32, #encoding_rhs>) outs(%5 : tensor<16x1xf32, #encoding_result>) -> tensor<16x1xf32, #encoding_result> + %7 = iree_encoding.unset_encoding %6 : tensor<16x1xf32, #encoding_result> -> tensor<16x1xf32> %8 = hal.tensor.export %7 "output0" : tensor<16x1xf32> -> !hal.buffer_view func.return %8 : !hal.buffer_view } @@ -254,6 +265,9 @@ func.func @matvec_shaped_matmul_lowering_f32f32f32_aarch64(%arg0: !hal.buffer_vi #map = affine_map<(d0, d1, d2) -> (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> +#encoding_lhs = #iree_encoding.encoding> +#encoding_rhs = #iree_encoding.encoding> +#encoding_result = #iree_encoding.encoding> func.func @matmul_lowering_f32f32f32_aarch64() attributes { hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="aarch64-xyz-xyz", ukernels = "all"}> } { @@ -262,28 +276,28 @@ func.func @matmul_lowering_f32f32f32_aarch64() attributes { %N = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index %K = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>>{%M, %K} + : !flow.dispatch.tensor>{%M, %K} %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>>{%K, %N} + : !flow.dispatch.tensor>{%K, %N} %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>>{%M, %N} + : !flow.dispatch.tensor>{%M, %N} %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1] - : !flow.dispatch.tensor>>>{%M, %K} - -> tensor>> + : !flow.dispatch.tensor>{%M, %K} + -> tensor %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [%K, %N], strides = [1, 1] - : !flow.dispatch.tensor>>>{%K, %N} - -> tensor>> + : !flow.dispatch.tensor>{%K, %N} + -> tensor %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : !flow.dispatch.tensor>>>{%M, %N} - -> tensor>> + : !flow.dispatch.tensor>{%M, %N} + -> tensor %6 = linalg.matmul - ins(%3, %4 : tensor>>, - tensor>>) - outs(%5 : tensor>>) - -> tensor>> + ins(%3, %4 : tensor, + tensor) + outs(%5 : tensor) + -> tensor flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : tensor>> - -> !flow.dispatch.tensor>>>{%M, %N} + : tensor + -> !flow.dispatch.tensor>{%M, %N} return } // CHECK-DAG: #[[$MAP0:.+]] = affine_map<()[s0] -> (s0 ceildiv 8)> @@ -317,15 +331,18 @@ func.func @matmul_lowering_f32f32f32_aarch64() attributes { #map = affine_map<(d0, d1, d2) -> (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> +#encoding_lhs = #iree_encoding.encoding, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>], round_dims_to = array> +#encoding_rhs = #iree_encoding.encoding, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>], round_dims_to = array> +#encoding_result = #iree_encoding.encoding, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>], round_dims_to = array> func.func @matvec_lowering_f32f32f32_aarch64(%arg0: tensor<16x16xf32>, %arg1: tensor<16xf32>, %arg2: tensor<16xf32>) -> tensor<16xf32> attributes { hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="aarch64-xyz-xyz"}> } { %c0 = arith.constant 0 : index - %3 = iree_encoding.set_encoding %arg0 : tensor<16x16xf32> -> tensor<16x16xf32, #iree_encoding.encoding, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>], round_dims_to = array>> - %4 = iree_encoding.set_encoding %arg1 : tensor<16xf32> -> tensor<16xf32, #iree_encoding.encoding, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>], round_dims_to = array>> - %5 = iree_encoding.set_encoding %arg2 : tensor<16xf32> -> tensor<16xf32, #iree_encoding.encoding, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>], round_dims_to = array>> - %6 = linalg.matvec ins(%3, %4 : tensor<16x16xf32, #iree_encoding.encoding, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>], round_dims_to = array>>, tensor<16xf32, #iree_encoding.encoding, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>], round_dims_to = array>>) outs(%5 : tensor<16xf32, #iree_encoding.encoding, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>], round_dims_to = array>>) -> tensor<16xf32, #iree_encoding.encoding, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>], round_dims_to = array>> - %7 = iree_encoding.unset_encoding %6 : tensor<16xf32, #iree_encoding.encoding, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>], round_dims_to = array>> -> tensor<16xf32> + %3 = iree_encoding.set_encoding %arg0 : tensor<16x16xf32> -> tensor<16x16xf32, #encoding_lhs> + %4 = iree_encoding.set_encoding %arg1 : tensor<16xf32> -> tensor<16xf32, #encoding_rhs> + %5 = iree_encoding.set_encoding %arg2 : tensor<16xf32> -> tensor<16xf32, #encoding_result> + %6 = linalg.matvec ins(%3, %4 : tensor<16x16xf32, #encoding_lhs>, tensor<16xf32, #encoding_rhs>) outs(%5 : tensor<16xf32, #encoding_result>) -> tensor<16xf32, #encoding_result> + %7 = iree_encoding.unset_encoding %6 : tensor<16xf32, #encoding_result> -> tensor<16xf32> func.return %7 : tensor<16xf32> } // CHECK-LABEL: func @matvec_lowering_f32f32f32_aarch64( @@ -345,33 +362,36 @@ func.func @matvec_lowering_f32f32f32_aarch64(%arg0: tensor<16x16xf32>, %arg1: te #map = affine_map<(d0, d1, d2) -> (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> +#encoding_lhs = #iree_encoding.encoding> +#encoding_rhs = #iree_encoding.encoding> +#encoding_result = #iree_encoding.encoding> func.func @matvec_lowering_f32f32f32_aarch64() attributes { hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="aarch64-xyz-xyz"}> } { %c0 = arith.constant 0 : index %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>> + : !flow.dispatch.tensor> %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>> + : !flow.dispatch.tensor> %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>> + : !flow.dispatch.tensor> %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [16, 16], strides = [1, 1] - : !flow.dispatch.tensor>>> - -> tensor<16x16xf32, #iree_encoding.encoding>> + : !flow.dispatch.tensor> + -> tensor<16x16xf32, #encoding_lhs> %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [16, 1], strides = [1, 1] - : !flow.dispatch.tensor>>> - -> tensor<16x1xf32, #iree_encoding.encoding>> + : !flow.dispatch.tensor> + -> tensor<16x1xf32, #encoding_rhs> %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [16, 1], strides = [1, 1] - : !flow.dispatch.tensor>>> - -> tensor<16x1xf32, #iree_encoding.encoding>> + : !flow.dispatch.tensor> + -> tensor<16x1xf32, #encoding_result> %6 = linalg.matmul - ins(%3, %4 : tensor<16x16xf32, #iree_encoding.encoding>>, - tensor<16x1xf32, #iree_encoding.encoding>>) - outs(%5 : tensor<16x1xf32, #iree_encoding.encoding>>) - -> tensor<16x1xf32, #iree_encoding.encoding>> + ins(%3, %4 : tensor<16x16xf32, #encoding_lhs>, + tensor<16x1xf32, #encoding_rhs>) + outs(%5 : tensor<16x1xf32, #encoding_result>) + -> tensor<16x1xf32, #encoding_result> flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [16, 1], strides = [1, 1] - : tensor<16x1xf32, #iree_encoding.encoding>> - -> !flow.dispatch.tensor>>> + : tensor<16x1xf32, #encoding_result> + -> !flow.dispatch.tensor> return } // CHECK-LABEL: func @matvec_lowering_f32f32f32_aarch64() @@ -406,6 +426,9 @@ func.func @matvec_lowering_f32f32f32_aarch64() attributes { #map = affine_map<(d0, d1, d2) -> (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> +#encoding_lhs = #iree_encoding.encoding> +#encoding_rhs = #iree_encoding.encoding> +#encoding_result = #iree_encoding.encoding> func.func @matmul_lowering_f16f16f16_aarch64() attributes { hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="aarch64-xyz-xyz", ukernels = "all"}> } { @@ -414,28 +437,28 @@ func.func @matmul_lowering_f16f16f16_aarch64() attributes { %N = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index %K = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>>{%M, %K} + : !flow.dispatch.tensor>{%M, %K} %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>>{%K, %N} + : !flow.dispatch.tensor>{%K, %N} %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>>{%M, %N} + : !flow.dispatch.tensor>{%M, %N} %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1] - : !flow.dispatch.tensor>>>{%M, %K} - -> tensor>> + : !flow.dispatch.tensor>{%M, %K} + -> tensor %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [%K, %N], strides = [1, 1] - : !flow.dispatch.tensor>>>{%K, %N} - -> tensor>> + : !flow.dispatch.tensor>{%K, %N} + -> tensor %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : !flow.dispatch.tensor>>>{%M, %N} - -> tensor>> + : !flow.dispatch.tensor>{%M, %N} + -> tensor %6 = linalg.matmul - ins(%3, %4 : tensor>>, - tensor>>) - outs(%5 : tensor>>) - -> tensor>> + ins(%3, %4 : tensor, + tensor) + outs(%5 : tensor) + -> tensor flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : tensor>> - -> !flow.dispatch.tensor>>>{%M, %N} + : tensor + -> !flow.dispatch.tensor>{%M, %N} return } // CHECK-DAG: #[[$MAP0:.+]] = affine_map<()[s0] -> (s0 ceildiv 8)> @@ -476,6 +499,9 @@ func.func @matmul_lowering_f16f16f16_aarch64() attributes { #map = affine_map<(d0, d1, d2) -> (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> +#encoding_lhs = #iree_encoding.encoding> +#encoding_rhs = #iree_encoding.encoding> +#encoding_result = #iree_encoding.encoding> func.func @matmul_lowering_f32f32f32_x86_64() attributes { hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="x86_64-xyz-xyz"}> } { @@ -484,28 +510,28 @@ func.func @matmul_lowering_f32f32f32_x86_64() attributes { %N = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index %K = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>>{%M, %K} + : !flow.dispatch.tensor>{%M, %K} %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>>{%K, %N} + : !flow.dispatch.tensor>{%K, %N} %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>>{%M, %N} + : !flow.dispatch.tensor>{%M, %N} %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1] - : !flow.dispatch.tensor>>>{%M, %K} - -> tensor>> + : !flow.dispatch.tensor>{%M, %K} + -> tensor %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [%K, %N], strides = [1, 1] - : !flow.dispatch.tensor>>>{%K, %N} - -> tensor>> + : !flow.dispatch.tensor>{%K, %N} + -> tensor %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : !flow.dispatch.tensor>>>{%M, %N} - -> tensor>> + : !flow.dispatch.tensor>{%M, %N} + -> tensor %6 = linalg.matmul - ins(%3, %4 : tensor>>, - tensor>>) - outs(%5 : tensor>>) - -> tensor>> + ins(%3, %4 : tensor, + tensor) + outs(%5 : tensor) + -> tensor flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : tensor>> - -> !flow.dispatch.tensor>>>{%M, %N} + : tensor + -> !flow.dispatch.tensor>{%M, %N} return } // CHECK-DAG: #[[$MAP0:.+]] = affine_map<()[s0] -> (s0 ceildiv 8)> @@ -547,6 +573,9 @@ func.func @matmul_lowering_f32f32f32_x86_64() attributes { #map = affine_map<(d0, d1, d2) -> (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> +#encoding_lhs = #iree_encoding.encoding> +#encoding_rhs = #iree_encoding.encoding> +#encoding_result = #iree_encoding.encoding> func.func @matmul_lowering_f32f32f32_x86_64_avx2() attributes { hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="x86_64-xyz-xyz", cpu_features="+avx"}> } { @@ -555,28 +584,28 @@ func.func @matmul_lowering_f32f32f32_x86_64_avx2() attributes { %N = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index %K = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>>{%M, %K} + : !flow.dispatch.tensor>{%M, %K} %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>>{%K, %N} + : !flow.dispatch.tensor>{%K, %N} %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>>{%M, %N} + : !flow.dispatch.tensor>{%M, %N} %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1] - : !flow.dispatch.tensor>>>{%M, %K} - -> tensor>> + : !flow.dispatch.tensor>{%M, %K} + -> tensor %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [%K, %N], strides = [1, 1] - : !flow.dispatch.tensor>>>{%K, %N} - -> tensor>> + : !flow.dispatch.tensor>{%K, %N} + -> tensor %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : !flow.dispatch.tensor>>>{%M, %N} - -> tensor>> + : !flow.dispatch.tensor>{%M, %N} + -> tensor %6 = linalg.matmul - ins(%3, %4 : tensor>>, - tensor>>) - outs(%5 : tensor>>) - -> tensor>> + ins(%3, %4 : tensor, + tensor) + outs(%5 : tensor) + -> tensor flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : tensor>> - -> !flow.dispatch.tensor>>>{%M, %N} + : tensor + -> !flow.dispatch.tensor>{%M, %N} return } // CHECK-DAG: #[[$MAP0:.+]] = affine_map<()[s0] -> (s0 ceildiv 8)> @@ -617,6 +646,9 @@ func.func @matmul_lowering_f32f32f32_x86_64_avx2() attributes { #map = affine_map<(d0, d1, d2) -> (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> +#encoding_lhs = #iree_encoding.encoding> +#encoding_rhs = #iree_encoding.encoding> +#encoding_result = #iree_encoding.encoding> func.func @matmul_lowering_f32f32f32_x86_64_avx512f() attributes { hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="x86_64-xyz-xyz", cpu_features="+avx512f"}> } { @@ -625,28 +657,28 @@ func.func @matmul_lowering_f32f32f32_x86_64_avx512f() attributes { %N = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index %K = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>>{%M, %K} + : !flow.dispatch.tensor>{%M, %K} %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>>{%K, %N} + : !flow.dispatch.tensor>{%K, %N} %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>>{%M, %N} + : !flow.dispatch.tensor>{%M, %N} %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1] - : !flow.dispatch.tensor>>>{%M, %K} - -> tensor>> + : !flow.dispatch.tensor>{%M, %K} + -> tensor %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [%K, %N], strides = [1, 1] - : !flow.dispatch.tensor>>>{%K, %N} - -> tensor>> + : !flow.dispatch.tensor>{%K, %N} + -> tensor %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : !flow.dispatch.tensor>>>{%M, %N} - -> tensor>> + : !flow.dispatch.tensor>{%M, %N} + -> tensor %6 = linalg.matmul - ins(%3, %4 : tensor>>, - tensor>>) - outs(%5 : tensor>>) - -> tensor>> + ins(%3, %4 : tensor, + tensor) + outs(%5 : tensor) + -> tensor flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : tensor>> - -> !flow.dispatch.tensor>>>{%M, %N} + : tensor + -> !flow.dispatch.tensor>{%M, %N} return } // CHECK-DAG: #[[$MAP0:.+]] = affine_map<()[s0] -> (s0 ceildiv 16)> @@ -687,6 +719,9 @@ func.func @matmul_lowering_f32f32f32_x86_64_avx512f() attributes { #map = affine_map<(d0, d1, d2) -> (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> +#encoding_lhs = #iree_encoding.encoding> +#encoding_rhs = #iree_encoding.encoding> +#encoding_result = #iree_encoding.encoding> func.func @matmul_lowering_f16f16f32_x86_64_avx512f() attributes { hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="x86_64-xyz-xyz", cpu_features="+avx512f"}> } { @@ -695,28 +730,28 @@ func.func @matmul_lowering_f16f16f32_x86_64_avx512f() attributes { %N = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index %K = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>>{%M, %K} + : !flow.dispatch.tensor>{%M, %K} %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>>{%K, %N} + : !flow.dispatch.tensor>{%K, %N} %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>>{%M, %N} + : !flow.dispatch.tensor>{%M, %N} %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1] - : !flow.dispatch.tensor>>>{%M, %K} - -> tensor>> + : !flow.dispatch.tensor>{%M, %K} + -> tensor %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [%K, %N], strides = [1, 1] - : !flow.dispatch.tensor>>>{%K, %N} - -> tensor>> + : !flow.dispatch.tensor>{%K, %N} + -> tensor %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : !flow.dispatch.tensor>>>{%M, %N} - -> tensor>> + : !flow.dispatch.tensor>{%M, %N} + -> tensor %6 = linalg.matmul - ins(%3, %4 : tensor>>, - tensor>>) - outs(%5 : tensor>>) - -> tensor>> + ins(%3, %4 : tensor, + tensor) + outs(%5 : tensor) + -> tensor flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : tensor>> - -> !flow.dispatch.tensor>>>{%M, %N} + : tensor + -> !flow.dispatch.tensor>{%M, %N} return } // CHECK-DAG: #[[$MAP0:.+]] = affine_map<()[s0] -> (s0 ceildiv 16)> @@ -757,6 +792,9 @@ func.func @matmul_lowering_f16f16f32_x86_64_avx512f() attributes { #map = affine_map<(d0, d1, d2) -> (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> +#encoding_lhs = #iree_encoding.encoding> +#encoding_rhs = #iree_encoding.encoding> +#encoding_result = #iree_encoding.encoding> func.func @matmul_lowering_f16f16f16_x86_64_avx512f() attributes { hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="x86_64-xyz-xyz", cpu_features="+avx512f"}> } { @@ -765,28 +803,28 @@ func.func @matmul_lowering_f16f16f16_x86_64_avx512f() attributes { %N = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index %K = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>>{%M, %K} + : !flow.dispatch.tensor>{%M, %K} %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>>{%K, %N} + : !flow.dispatch.tensor>{%K, %N} %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>>{%M, %N} + : !flow.dispatch.tensor>{%M, %N} %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1] - : !flow.dispatch.tensor>>>{%M, %K} - -> tensor>> + : !flow.dispatch.tensor>{%M, %K} + -> tensor %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [%K, %N], strides = [1, 1] - : !flow.dispatch.tensor>>>{%K, %N} - -> tensor>> + : !flow.dispatch.tensor>{%K, %N} + -> tensor %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : !flow.dispatch.tensor>>>{%M, %N} - -> tensor>> + : !flow.dispatch.tensor>{%M, %N} + -> tensor %6 = linalg.matmul - ins(%3, %4 : tensor>>, - tensor>>) - outs(%5 : tensor>>) - -> tensor>> + ins(%3, %4 : tensor, + tensor) + outs(%5 : tensor) + -> tensor flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : tensor>> - -> !flow.dispatch.tensor>>>{%M, %N} + : tensor + -> !flow.dispatch.tensor>{%M, %N} return } // CHECK-DAG: #[[$MAP0:.+]] = affine_map<()[s0] -> (s0 ceildiv 16)> @@ -827,6 +865,9 @@ func.func @matmul_lowering_f16f16f16_x86_64_avx512f() attributes { #map = affine_map<(d0, d1, d2) -> (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> +#encoding_lhs = #iree_encoding.encoding> +#encoding_rhs = #iree_encoding.encoding> +#encoding_result = #iree_encoding.encoding> func.func @matmul_lowering_bf16bf16f32_x86_64_avx512f() attributes { hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="x86_64-xyz-xyz", cpu_features="+avx512f"}> } { @@ -835,28 +876,28 @@ func.func @matmul_lowering_bf16bf16f32_x86_64_avx512f() attributes { %N = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index %K = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>>{%M, %K} + : !flow.dispatch.tensor>{%M, %K} %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>>{%K, %N} + : !flow.dispatch.tensor>{%K, %N} %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>>{%M, %N} + : !flow.dispatch.tensor>{%M, %N} %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1] - : !flow.dispatch.tensor>>>{%M, %K} - -> tensor>> + : !flow.dispatch.tensor>{%M, %K} + -> tensor %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [%K, %N], strides = [1, 1] - : !flow.dispatch.tensor>>>{%K, %N} - -> tensor>> + : !flow.dispatch.tensor>{%K, %N} + -> tensor %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : !flow.dispatch.tensor>>>{%M, %N} - -> tensor>> + : !flow.dispatch.tensor>{%M, %N} + -> tensor %6 = linalg.matmul - ins(%3, %4 : tensor>>, - tensor>>) - outs(%5 : tensor>>) - -> tensor>> + ins(%3, %4 : tensor, + tensor) + outs(%5 : tensor) + -> tensor flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : tensor>> - -> !flow.dispatch.tensor>>>{%M, %N} + : tensor + -> !flow.dispatch.tensor>{%M, %N} return } // CHECK-DAG: #[[$MAP0:.+]] = affine_map<()[s0] -> (s0 ceildiv 16)> @@ -897,6 +938,9 @@ func.func @matmul_lowering_bf16bf16f32_x86_64_avx512f() attributes { #map = affine_map<(d0, d1, d2) -> (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> +#encoding_lhs = #iree_encoding.encoding> +#encoding_rhs = #iree_encoding.encoding> +#encoding_result = #iree_encoding.encoding> func.func @matmul_lowering_bf16bf16bf16_x86_64_avx512f() attributes { hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="x86_64-xyz-xyz", cpu_features="+avx512f"}> } { @@ -905,28 +949,28 @@ func.func @matmul_lowering_bf16bf16bf16_x86_64_avx512f() attributes { %N = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index %K = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>>{%M, %K} + : !flow.dispatch.tensor>{%M, %K} %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>>{%K, %N} + : !flow.dispatch.tensor>{%K, %N} %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>>{%M, %N} + : !flow.dispatch.tensor>{%M, %N} %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1] - : !flow.dispatch.tensor>>>{%M, %K} - -> tensor>> + : !flow.dispatch.tensor>{%M, %K} + -> tensor %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [%K, %N], strides = [1, 1] - : !flow.dispatch.tensor>>>{%K, %N} - -> tensor>> + : !flow.dispatch.tensor>{%K, %N} + -> tensor %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : !flow.dispatch.tensor>>>{%M, %N} - -> tensor>> + : !flow.dispatch.tensor>{%M, %N} + -> tensor %6 = linalg.matmul - ins(%3, %4 : tensor>>, - tensor>>) - outs(%5 : tensor>>) - -> tensor>> + ins(%3, %4 : tensor, + tensor) + outs(%5 : tensor) + -> tensor flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : tensor>> - -> !flow.dispatch.tensor>>>{%M, %N} + : tensor + -> !flow.dispatch.tensor>{%M, %N} return } // CHECK-DAG: #[[$MAP0:.+]] = affine_map<()[s0] -> (s0 ceildiv 16)> @@ -967,6 +1011,9 @@ func.func @matmul_lowering_bf16bf16bf16_x86_64_avx512f() attributes { #map = affine_map<(d0, d1, d2) -> (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> +#encoding_lhs = #iree_encoding.encoding> +#encoding_rhs = #iree_encoding.encoding> +#encoding_result = #iree_encoding.encoding> func.func @matmul_lowering_bf16bf16f32_x86_64_avx512bf16() attributes { hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="x86_64-xyz-xyz", cpu_features="+avx512f,+avx512bf16"}> } { @@ -975,28 +1022,28 @@ func.func @matmul_lowering_bf16bf16f32_x86_64_avx512bf16() attributes { %N = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index %K = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>>{%M, %K} + : !flow.dispatch.tensor>{%M, %K} %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>>{%K, %N} + : !flow.dispatch.tensor>{%K, %N} %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>>{%M, %N} + : !flow.dispatch.tensor>{%M, %N} %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1] - : !flow.dispatch.tensor>>>{%M, %K} - -> tensor>> + : !flow.dispatch.tensor>{%M, %K} + -> tensor %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [%K, %N], strides = [1, 1] - : !flow.dispatch.tensor>>>{%K, %N} - -> tensor>> + : !flow.dispatch.tensor>{%K, %N} + -> tensor %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : !flow.dispatch.tensor>>>{%M, %N} - -> tensor>> + : !flow.dispatch.tensor>{%M, %N} + -> tensor %6 = linalg.matmul - ins(%3, %4 : tensor>>, - tensor>>) - outs(%5 : tensor>>) - -> tensor>> + ins(%3, %4 : tensor, + tensor) + outs(%5 : tensor) + -> tensor flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : tensor>> - -> !flow.dispatch.tensor>>>{%M, %N} + : tensor + -> !flow.dispatch.tensor>{%M, %N} return } // CHECK-DAG: #[[$MAP0:.+]] = affine_map<()[s0] -> (s0 ceildiv 16)> @@ -1039,6 +1086,9 @@ func.func @matmul_lowering_bf16bf16f32_x86_64_avx512bf16() attributes { #map = affine_map<(d0, d1, d2) -> (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> +#encoding_lhs = #iree_encoding.encoding> +#encoding_rhs = #iree_encoding.encoding> +#encoding_result = #iree_encoding.encoding> func.func @matmul_lowering_bf16bf16bf16_x86_64_avx512bf16() attributes { hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="x86_64-xyz-xyz", cpu_features="+avx512f,+avx512bf16"}> } { @@ -1047,28 +1097,28 @@ func.func @matmul_lowering_bf16bf16bf16_x86_64_avx512bf16() attributes { %N = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index %K = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>>{%M, %K} + : !flow.dispatch.tensor>{%M, %K} %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>>{%K, %N} + : !flow.dispatch.tensor>{%K, %N} %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>>{%M, %N} + : !flow.dispatch.tensor>{%M, %N} %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1] - : !flow.dispatch.tensor>>>{%M, %K} - -> tensor>> + : !flow.dispatch.tensor>{%M, %K} + -> tensor %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [%K, %N], strides = [1, 1] - : !flow.dispatch.tensor>>>{%K, %N} - -> tensor>> + : !flow.dispatch.tensor>{%K, %N} + -> tensor %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : !flow.dispatch.tensor>>>{%M, %N} - -> tensor>> + : !flow.dispatch.tensor>{%M, %N} + -> tensor %6 = linalg.matmul - ins(%3, %4 : tensor>>, - tensor>>) - outs(%5 : tensor>>) - -> tensor>> + ins(%3, %4 : tensor, + tensor) + outs(%5 : tensor) + -> tensor flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : tensor>> - -> !flow.dispatch.tensor>>>{%M, %N} + : tensor + -> !flow.dispatch.tensor>{%M, %N} return } // CHECK-DAG: #[[$MAP0:.+]] = affine_map<()[s0] -> (s0 ceildiv 16)> @@ -1111,6 +1161,9 @@ func.func @matmul_lowering_bf16bf16bf16_x86_64_avx512bf16() attributes { #map = affine_map<(d0, d1, d2) -> (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> +#encoding_lhs = #iree_encoding.encoding> +#encoding_rhs = #iree_encoding.encoding> +#encoding_result = #iree_encoding.encoding> func.func @matmul_lowering_f32f16f16_aarch64() attributes { hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="aarch64-xyz-xyz", ukernels = "all"}> } { @@ -1119,37 +1172,37 @@ func.func @matmul_lowering_f32f16f16_aarch64() attributes { %N = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index %K = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>>{%M, %K} + : !flow.dispatch.tensor>{%M, %K} %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>>{%K, %N} + : !flow.dispatch.tensor>{%K, %N} %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>>{%M, %N} + : !flow.dispatch.tensor>{%M, %N} %lhs_f32 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1] - : !flow.dispatch.tensor>>>{%M, %K} - -> tensor>> + : !flow.dispatch.tensor>{%M, %K} + -> tensor %rhs = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [%K, %N], strides = [1, 1] - : !flow.dispatch.tensor>>>{%K, %N} - -> tensor>> + : !flow.dispatch.tensor>{%K, %N} + -> tensor %dest = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : !flow.dispatch.tensor>>>{%M, %N} - -> tensor>> + : !flow.dispatch.tensor>{%M, %N} + -> tensor - %empty = tensor.empty(%M, %K) : tensor>> + %empty = tensor.empty(%M, %K) : tensor %lhs_f16 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} - ins(%lhs_f32 : tensor>>) - outs(%empty : tensor>>) { + ins(%lhs_f32 : tensor) + outs(%empty : tensor) { ^bb0(%in: f32, %out: f16): %17 = arith.truncf %in : f32 to f16 linalg.yield %17 : f16 - } -> tensor>> + } -> tensor %6 = linalg.matmul - ins(%lhs_f16, %rhs : tensor>>, - tensor>>) - outs(%dest : tensor>>) - -> tensor>> + ins(%lhs_f16, %rhs : tensor, + tensor) + outs(%dest : tensor) + -> tensor flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : tensor>> - -> !flow.dispatch.tensor>>>{%M, %N} + : tensor + -> !flow.dispatch.tensor>{%M, %N} return } // CHECK-DAG: #[[$MAP_CEILDIV_8:.+]] = affine_map<()[s0] -> (s0 ceildiv 8)> @@ -1183,6 +1236,9 @@ func.func @matmul_lowering_f32f16f16_aarch64() attributes { #map = affine_map<(d0, d1, d2) -> (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> +#encoding_lhs = #iree_encoding.encoding> +#encoding_rhs = #iree_encoding.encoding> +#encoding_result = #iree_encoding.encoding> func.func @matmul_lowering_f32f16f16_x86_64_avx512f() attributes { hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="x86_64-xyz-xyz", cpu_features="+avx512f,+avx512bf16"}> } { @@ -1191,37 +1247,37 @@ func.func @matmul_lowering_f32f16f16_x86_64_avx512f() attributes { %N = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index %K = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>>{%M, %K} + : !flow.dispatch.tensor>{%M, %K} %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>>{%K, %N} + : !flow.dispatch.tensor>{%K, %N} %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>>{%M, %N} + : !flow.dispatch.tensor>{%M, %N} %lhs_f32 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1] - : !flow.dispatch.tensor>>>{%M, %K} - -> tensor>> + : !flow.dispatch.tensor>{%M, %K} + -> tensor %rhs = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [%K, %N], strides = [1, 1] - : !flow.dispatch.tensor>>>{%K, %N} - -> tensor>> + : !flow.dispatch.tensor>{%K, %N} + -> tensor %dest = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : !flow.dispatch.tensor>>>{%M, %N} - -> tensor>> + : !flow.dispatch.tensor>{%M, %N} + -> tensor - %empty = tensor.empty(%M, %K) : tensor>> + %empty = tensor.empty(%M, %K) : tensor %lhs_f16 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} - ins(%lhs_f32 : tensor>>) - outs(%empty : tensor>>) { + ins(%lhs_f32 : tensor) + outs(%empty : tensor) { ^bb0(%in: f32, %out: f16): %17 = arith.truncf %in : f32 to f16 linalg.yield %17 : f16 - } -> tensor>> + } -> tensor %6 = linalg.matmul - ins(%lhs_f16, %rhs : tensor>>, - tensor>>) - outs(%dest : tensor>>) - -> tensor>> + ins(%lhs_f16, %rhs : tensor, + tensor) + outs(%dest : tensor) + -> tensor flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : tensor>> - -> !flow.dispatch.tensor>>>{%M, %N} + : tensor + -> !flow.dispatch.tensor>{%M, %N} return } @@ -1256,6 +1312,9 @@ func.func @matmul_lowering_f32f16f16_x86_64_avx512f() attributes { #map = affine_map<(d0, d1, d2) -> (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> +#encoding_lhs = #iree_encoding.encoding> +#encoding_rhs = #iree_encoding.encoding> +#encoding_result = #iree_encoding.encoding> func.func @matmul_lowering_i8i8i32_aarch64() attributes { hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="aarch64-xyz-xyz"}> } { @@ -1264,28 +1323,28 @@ func.func @matmul_lowering_i8i8i32_aarch64() attributes { %N = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index %K = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>>{%M, %K} + : !flow.dispatch.tensor>{%M, %K} %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>>{%K, %N} + : !flow.dispatch.tensor>{%K, %N} %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>>{%M, %N} + : !flow.dispatch.tensor>{%M, %N} %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1] - : !flow.dispatch.tensor>>>{%M, %K} - -> tensor>> + : !flow.dispatch.tensor>{%M, %K} + -> tensor %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [%K, %N], strides = [1, 1] - : !flow.dispatch.tensor>>>{%K, %N} - -> tensor>> + : !flow.dispatch.tensor>{%K, %N} + -> tensor %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : !flow.dispatch.tensor>>>{%M, %N} - -> tensor>> + : !flow.dispatch.tensor>{%M, %N} + -> tensor %6 = linalg.matmul - ins(%3, %4 : tensor>>, - tensor>>) - outs(%5 : tensor>>) - -> tensor>> + ins(%3, %4 : tensor, + tensor) + outs(%5 : tensor) + -> tensor flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : tensor>> - -> !flow.dispatch.tensor>>>{%M, %N} + : tensor + -> !flow.dispatch.tensor>{%M, %N} return } // CHECK-LABEL: func @matmul_lowering_i8i8i32_aarch64() @@ -1323,6 +1382,9 @@ func.func @matmul_lowering_i8i8i32_aarch64() attributes { #map = affine_map<(d0, d1, d2) -> (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> +#encoding_lhs = #iree_encoding.encoding> +#encoding_rhs = #iree_encoding.encoding> +#encoding_result = #iree_encoding.encoding> func.func @matmul_lowering_i8i8i32_aarch64_dotprod() attributes { hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="aarch64-xyz-xyz", cpu_features="+dotprod", ukernels = "all"}> } { @@ -1331,28 +1393,28 @@ func.func @matmul_lowering_i8i8i32_aarch64_dotprod() attributes { %N = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index %K = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>>{%M, %K} + : !flow.dispatch.tensor>{%M, %K} %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>>{%K, %N} + : !flow.dispatch.tensor>{%K, %N} %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>>{%M, %N} + : !flow.dispatch.tensor>{%M, %N} %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1] - : !flow.dispatch.tensor>>>{%M, %K} - -> tensor>> + : !flow.dispatch.tensor>{%M, %K} + -> tensor %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [%K, %N], strides = [1, 1] - : !flow.dispatch.tensor>>>{%K, %N} - -> tensor>> + : !flow.dispatch.tensor>{%K, %N} + -> tensor %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : !flow.dispatch.tensor>>>{%M, %N} - -> tensor>> + : !flow.dispatch.tensor>{%M, %N} + -> tensor %6 = linalg.matmul - ins(%3, %4 : tensor>>, - tensor>>) - outs(%5 : tensor>>) - -> tensor>> + ins(%3, %4 : tensor, + tensor) + outs(%5 : tensor) + -> tensor flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : tensor>> - -> !flow.dispatch.tensor>>>{%M, %N} + : tensor + -> !flow.dispatch.tensor>{%M, %N} return } // CHECK-DAG: #[[$MAP0:.+]] = affine_map<()[s0] -> (s0 ceildiv 8)> @@ -1395,6 +1457,9 @@ func.func @matmul_lowering_i8i8i32_aarch64_dotprod() attributes { #map = affine_map<(d0, d1, d2) -> (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> +#encoding_lhs = #iree_encoding.encoding> +#encoding_rhs = #iree_encoding.encoding> +#encoding_result = #iree_encoding.encoding> func.func @matmul_lowering_i8i8i32_aarch64_i8mm() attributes { hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="aarch64-xyz-xyz", cpu_features="+dotprod,+i8mm", ukernels = "all"}> } { @@ -1403,28 +1468,28 @@ func.func @matmul_lowering_i8i8i32_aarch64_i8mm() attributes { %N = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index %K = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>>{%M, %K} + : !flow.dispatch.tensor>{%M, %K} %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>>{%K, %N} + : !flow.dispatch.tensor>{%K, %N} %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>>{%M, %N} + : !flow.dispatch.tensor>{%M, %N} %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1] - : !flow.dispatch.tensor>>>{%M, %K} - -> tensor>> + : !flow.dispatch.tensor>{%M, %K} + -> tensor %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [%K, %N], strides = [1, 1] - : !flow.dispatch.tensor>>>{%K, %N} - -> tensor>> + : !flow.dispatch.tensor>{%K, %N} + -> tensor %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : !flow.dispatch.tensor>>>{%M, %N} - -> tensor>> + : !flow.dispatch.tensor>{%M, %N} + -> tensor %6 = linalg.matmul - ins(%3, %4 : tensor>>, - tensor>>) - outs(%5 : tensor>>) - -> tensor>> + ins(%3, %4 : tensor, + tensor) + outs(%5 : tensor) + -> tensor flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : tensor>> - -> !flow.dispatch.tensor>>>{%M, %N} + : tensor + -> !flow.dispatch.tensor>{%M, %N} return } // CHECK-DAG: #[[$MAP0:.+]] = affine_map<()[s0] -> (s0 ceildiv 8)> @@ -1466,6 +1531,9 @@ func.func @matmul_lowering_i8i8i32_aarch64_i8mm() attributes { #map = affine_map<(d0, d1, d2) -> (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> +#encoding_lhs = #iree_encoding.encoding> +#encoding_rhs = #iree_encoding.encoding> +#encoding_result = #iree_encoding.encoding> func.func @matmul_lowering_i8i4i32_aarch64() attributes { hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="aarch64-xyz-xyz"}> } { @@ -1474,28 +1542,28 @@ func.func @matmul_lowering_i8i4i32_aarch64() attributes { %N = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index %K = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>>{%M, %K} + : !flow.dispatch.tensor>{%M, %K} %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>>{%K, %N} + : !flow.dispatch.tensor>{%K, %N} %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>>{%M, %N} + : !flow.dispatch.tensor>{%M, %N} %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1] - : !flow.dispatch.tensor>>>{%M, %K} - -> tensor>> + : !flow.dispatch.tensor>{%M, %K} + -> tensor %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [%K, %N], strides = [1, 1] - : !flow.dispatch.tensor>>>{%K, %N} - -> tensor>> + : !flow.dispatch.tensor>{%K, %N} + -> tensor %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : !flow.dispatch.tensor>>>{%M, %N} - -> tensor>> + : !flow.dispatch.tensor>{%M, %N} + -> tensor %6 = linalg.matmul - ins(%3, %4 : tensor>>, - tensor>>) - outs(%5 : tensor>>) - -> tensor>> + ins(%3, %4 : tensor, + tensor) + outs(%5 : tensor) + -> tensor flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : tensor>> - -> !flow.dispatch.tensor>>>{%M, %N} + : tensor + -> !flow.dispatch.tensor>{%M, %N} return } // CHECK-DAG: #[[$MAP0:.+]] = affine_map<()[s0] -> (s0 ceildiv 4)> @@ -1539,6 +1607,9 @@ func.func @matmul_lowering_i8i4i32_aarch64() attributes { #map = affine_map<(d0, d1, d2) -> (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> +#encoding_lhs = #iree_encoding.encoding> +#encoding_rhs = #iree_encoding.encoding> +#encoding_result = #iree_encoding.encoding> func.func @matmul_lowering_i8i4i32_aarch64_dotprod() attributes { hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="aarch64-xyz-xyz", cpu_features="+dotprod", ukernels = "all"}> } { @@ -1547,28 +1618,28 @@ func.func @matmul_lowering_i8i4i32_aarch64_dotprod() attributes { %N = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index %K = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>>{%M, %K} + : !flow.dispatch.tensor>{%M, %K} %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>>{%K, %N} + : !flow.dispatch.tensor>{%K, %N} %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>>{%M, %N} + : !flow.dispatch.tensor>{%M, %N} %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1] - : !flow.dispatch.tensor>>>{%M, %K} - -> tensor>> + : !flow.dispatch.tensor>{%M, %K} + -> tensor %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [%K, %N], strides = [1, 1] - : !flow.dispatch.tensor>>>{%K, %N} - -> tensor>> + : !flow.dispatch.tensor>{%K, %N} + -> tensor %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : !flow.dispatch.tensor>>>{%M, %N} - -> tensor>> + : !flow.dispatch.tensor>{%M, %N} + -> tensor %6 = linalg.matmul - ins(%3, %4 : tensor>>, - tensor>>) - outs(%5 : tensor>>) - -> tensor>> + ins(%3, %4 : tensor, + tensor) + outs(%5 : tensor) + -> tensor flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : tensor>> - -> !flow.dispatch.tensor>>>{%M, %N} + : tensor + -> !flow.dispatch.tensor>{%M, %N} return } // CHECK-DAG: #[[$MAP0:.+]] = affine_map<()[s0] -> (s0 ceildiv 8)> @@ -1610,6 +1681,9 @@ func.func @matmul_lowering_i8i4i32_aarch64_dotprod() attributes { #map = affine_map<(d0, d1, d2) -> (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> +#encoding_lhs = #iree_encoding.encoding> +#encoding_rhs = #iree_encoding.encoding> +#encoding_result = #iree_encoding.encoding> func.func @matmul_lowering_i8i4i32_aarch64_i8mm() attributes { hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="aarch64-xyz-xyz", cpu_features="+dotprod,+i8mm", ukernels = "all"}> } { @@ -1618,28 +1692,28 @@ func.func @matmul_lowering_i8i4i32_aarch64_i8mm() attributes { %N = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index %K = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>>{%M, %K} + : !flow.dispatch.tensor>{%M, %K} %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>>{%K, %N} + : !flow.dispatch.tensor>{%K, %N} %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>>{%M, %N} + : !flow.dispatch.tensor>{%M, %N} %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1] - : !flow.dispatch.tensor>>>{%M, %K} - -> tensor>> + : !flow.dispatch.tensor>{%M, %K} + -> tensor %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [%K, %N], strides = [1, 1] - : !flow.dispatch.tensor>>>{%K, %N} - -> tensor>> + : !flow.dispatch.tensor>{%K, %N} + -> tensor %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : !flow.dispatch.tensor>>>{%M, %N} - -> tensor>> + : !flow.dispatch.tensor>{%M, %N} + -> tensor %6 = linalg.matmul - ins(%3, %4 : tensor>>, - tensor>>) - outs(%5 : tensor>>) - -> tensor>> + ins(%3, %4 : tensor, + tensor) + outs(%5 : tensor) + -> tensor flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : tensor>> - -> !flow.dispatch.tensor>>>{%M, %N} + : tensor + -> !flow.dispatch.tensor>{%M, %N} return } // CHECK-DAG: #[[$MAP0:.+]] = affine_map<()[s0] -> (s0 ceildiv 4)> @@ -1676,18 +1750,21 @@ func.func @matmul_lowering_i8i4i32_aarch64_i8mm() attributes { #map = affine_map<(d0, d1, d2) -> (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> +#encoding_lhs = #iree_encoding.encoding> +#encoding_rhs = #iree_encoding.encoding> +#encoding_result = #iree_encoding.encoding> func.func @matmul_lowering_f32f32f32_aarch64_sve(%lhs: tensor, %rhs: tensor, %acc: tensor) -> tensor attributes { hal.executable.target = #hal.executable.target<"xyz", "xyz", {cpu_features = "+sve", target_triple="aarch64-xyz-xyz"}> } { - %0 = iree_encoding.set_encoding %lhs : tensor -> tensor>> - %1 = iree_encoding.set_encoding %rhs : tensor -> tensor>> - %2 = iree_encoding.set_encoding %acc : tensor -> tensor>> + %0 = iree_encoding.set_encoding %lhs : tensor -> tensor + %1 = iree_encoding.set_encoding %rhs : tensor -> tensor + %2 = iree_encoding.set_encoding %acc : tensor -> tensor %3 = linalg.matmul - ins(%0, %1 : tensor>>, - tensor>>) - outs(%2 : tensor>>) - -> tensor>> - %4 = iree_encoding.unset_encoding %3 : tensor>> -> tensor + ins(%0, %1 : tensor, + tensor) + outs(%2 : tensor) + -> tensor + %4 = iree_encoding.unset_encoding %3 : tensor -> tensor return %4 : tensor } @@ -1701,18 +1778,21 @@ func.func @matmul_lowering_f32f32f32_aarch64_sve(%lhs: tensor, %rhs: te #map = affine_map<(d0, d1, d2) -> (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> +#encoding_lhs = #iree_encoding.encoding> +#encoding_rhs = #iree_encoding.encoding> +#encoding_result = #iree_encoding.encoding> func.func @matmul_lowering_f32f32f32_riscv(%lhs: tensor, %rhs: tensor, %acc: tensor) -> tensor attributes { hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="riscv32-xyz-xyz"}> } { - %0 = iree_encoding.set_encoding %lhs : tensor -> tensor>> - %1 = iree_encoding.set_encoding %rhs : tensor -> tensor>> - %2 = iree_encoding.set_encoding %acc : tensor -> tensor>> + %0 = iree_encoding.set_encoding %lhs : tensor -> tensor + %1 = iree_encoding.set_encoding %rhs : tensor -> tensor + %2 = iree_encoding.set_encoding %acc : tensor -> tensor %3 = linalg.matmul - ins(%0, %1 : tensor>>, - tensor>>) - outs(%2 : tensor>>) - -> tensor>> - %4 = iree_encoding.unset_encoding %3 : tensor>> -> tensor + ins(%0, %1 : tensor, + tensor) + outs(%2 : tensor) + -> tensor + %4 = iree_encoding.unset_encoding %3 : tensor -> tensor return %4 : tensor } // RISC-V targets does not implement data-tiling yet. @@ -1732,6 +1812,9 @@ func.func @matmul_lowering_f32f32f32_riscv(%lhs: tensor, %rhs: tensor (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> +#encoding_lhs = #iree_encoding.encoding> +#encoding_rhs = #iree_encoding.encoding> +#encoding_result = #iree_encoding.encoding> func.func @matmul_lowering_i8i8i32_riscv32_ukernel() attributes { hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="riscv32-xyz-xyz", ukernels = "all"}> } { @@ -1740,28 +1823,28 @@ func.func @matmul_lowering_i8i8i32_riscv32_ukernel() attributes { %N = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index %K = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>>{%M, %K} + : !flow.dispatch.tensor>{%M, %K} %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>>{%K, %N} + : !flow.dispatch.tensor>{%K, %N} %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>>{%M, %N} + : !flow.dispatch.tensor>{%M, %N} %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1] - : !flow.dispatch.tensor>>>{%M, %K} - -> tensor>> + : !flow.dispatch.tensor>{%M, %K} + -> tensor %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [%K, %N], strides = [1, 1] - : !flow.dispatch.tensor>>>{%K, %N} - -> tensor>> + : !flow.dispatch.tensor>{%K, %N} + -> tensor %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : !flow.dispatch.tensor>>>{%M, %N} - -> tensor>> + : !flow.dispatch.tensor>{%M, %N} + -> tensor %6 = linalg.matmul - ins(%3, %4 : tensor>>, - tensor>>) - outs(%5 : tensor>>) - -> tensor>> + ins(%3, %4 : tensor, + tensor) + outs(%5 : tensor) + -> tensor flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : tensor>> - -> !flow.dispatch.tensor>>>{%M, %N} + : tensor + -> !flow.dispatch.tensor>{%M, %N} return } // CHECK-DAG: #[[$MAP0:.+]] = affine_map<()[s0] -> (s0 ceildiv 8)> @@ -1804,6 +1887,9 @@ func.func @matmul_lowering_i8i8i32_riscv32_ukernel() attributes { #map = affine_map<(d0, d1, d2) -> (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> +#encoding_lhs = #iree_encoding.encoding> +#encoding_rhs = #iree_encoding.encoding> +#encoding_result = #iree_encoding.encoding> func.func @matmul_lowering_i8i8i32_x86_64_avx2() attributes { hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="x86_64-xyz-xyz", cpu_features="+avx2"}> } { @@ -1812,28 +1898,28 @@ func.func @matmul_lowering_i8i8i32_x86_64_avx2() attributes { %N = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index %K = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>>{%M, %K} + : !flow.dispatch.tensor>{%M, %K} %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>>{%K, %N} + : !flow.dispatch.tensor>{%K, %N} %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>>{%M, %N} + : !flow.dispatch.tensor>{%M, %N} %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1] - : !flow.dispatch.tensor>>>{%M, %K} - -> tensor>> + : !flow.dispatch.tensor>{%M, %K} + -> tensor %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [%K, %N], strides = [1, 1] - : !flow.dispatch.tensor>>>{%K, %N} - -> tensor>> + : !flow.dispatch.tensor>{%K, %N} + -> tensor %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : !flow.dispatch.tensor>>>{%M, %N} - -> tensor>> + : !flow.dispatch.tensor>{%M, %N} + -> tensor %6 = linalg.matmul - ins(%3, %4 : tensor>>, - tensor>>) - outs(%5 : tensor>>) - -> tensor>> + ins(%3, %4 : tensor, + tensor) + outs(%5 : tensor) + -> tensor flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : tensor>> - -> !flow.dispatch.tensor>>>{%M, %N} + : tensor + -> !flow.dispatch.tensor>{%M, %N} return } // CHECK-DAG: #[[$MAP0:.+]] = affine_map<()[s0] -> (s0 ceildiv 8)> @@ -1876,6 +1962,9 @@ func.func @matmul_lowering_i8i8i32_x86_64_avx2() attributes { #map = affine_map<(d0, d1, d2) -> (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> +#encoding_lhs = #iree_encoding.encoding> +#encoding_rhs = #iree_encoding.encoding> +#encoding_result = #iree_encoding.encoding> func.func @matmul_lowering_i8i8i32_x86_64_avx512bw() attributes { hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="x86_64-xyz-xyz", cpu_features="+avx512bw"}> } { @@ -1884,28 +1973,28 @@ func.func @matmul_lowering_i8i8i32_x86_64_avx512bw() attributes { %N = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index %K = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>>{%M, %K} + : !flow.dispatch.tensor>{%M, %K} %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>>{%K, %N} + : !flow.dispatch.tensor>{%K, %N} %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>>{%M, %N} + : !flow.dispatch.tensor>{%M, %N} %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1] - : !flow.dispatch.tensor>>>{%M, %K} - -> tensor>> + : !flow.dispatch.tensor>{%M, %K} + -> tensor %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [%K, %N], strides = [1, 1] - : !flow.dispatch.tensor>>>{%K, %N} - -> tensor>> + : !flow.dispatch.tensor>{%K, %N} + -> tensor %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : !flow.dispatch.tensor>>>{%M, %N} - -> tensor>> + : !flow.dispatch.tensor>{%M, %N} + -> tensor %6 = linalg.matmul - ins(%3, %4 : tensor>>, - tensor>>) - outs(%5 : tensor>>) - -> tensor>> + ins(%3, %4 : tensor, + tensor) + outs(%5 : tensor) + -> tensor flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : tensor>> - -> !flow.dispatch.tensor>>>{%M, %N} + : tensor + -> !flow.dispatch.tensor>{%M, %N} return } // CHECK-DAG: #[[$MAP0:.+]] = affine_map<()[s0] -> (s0 ceildiv 16)> @@ -1948,6 +2037,9 @@ func.func @matmul_lowering_i8i8i32_x86_64_avx512bw() attributes { #map = affine_map<(d0, d1, d2) -> (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> +#encoding_lhs = #iree_encoding.encoding> +#encoding_rhs = #iree_encoding.encoding> +#encoding_result = #iree_encoding.encoding> func.func @matmul_lowering_i8i8i32_x86_64_avx512vnni() attributes { hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="x86_64-xyz-xyz", cpu_features="+avx512vnni"}> } { @@ -1956,28 +2048,28 @@ func.func @matmul_lowering_i8i8i32_x86_64_avx512vnni() attributes { %N = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index %K = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>>{%M, %K} + : !flow.dispatch.tensor>{%M, %K} %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>>{%K, %N} + : !flow.dispatch.tensor>{%K, %N} %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>>{%M, %N} + : !flow.dispatch.tensor>{%M, %N} %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1] - : !flow.dispatch.tensor>>>{%M, %K} - -> tensor>> + : !flow.dispatch.tensor>{%M, %K} + -> tensor %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [%K, %N], strides = [1, 1] - : !flow.dispatch.tensor>>>{%K, %N} - -> tensor>> + : !flow.dispatch.tensor>{%K, %N} + -> tensor %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : !flow.dispatch.tensor>>>{%M, %N} - -> tensor>> + : !flow.dispatch.tensor>{%M, %N} + -> tensor %6 = linalg.matmul - ins(%3, %4 : tensor>>, - tensor>>) - outs(%5 : tensor>>) - -> tensor>> + ins(%3, %4 : tensor, + tensor) + outs(%5 : tensor) + -> tensor flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : tensor>> - -> !flow.dispatch.tensor>>>{%M, %N} + : tensor + -> !flow.dispatch.tensor>{%M, %N} return } // CHECK-DAG: #[[$MAP0:.+]] = affine_map<()[s0] -> (s0 ceildiv 16)> @@ -2013,28 +2105,31 @@ func.func @matmul_lowering_i8i8i32_x86_64_avx512vnni() attributes { #map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)> #map1 = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)> #map2 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)> +#encoding_lhs = #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array> +#encoding_rhs = #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array> +#encoding_result = #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array> func.func @extend_batch_vecmat_explicit_unit_dim(%arg0: tensor<32x1x128xi8>, %arg1: tensor<32x128x11008xi8>) -> tensor<32x1x11008xi32> attributes { hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="x86_64-xyz-xyz", cpu_features="+avx512vnni"}> } { %c0_i32 = arith.constant 0 : i32 - %4 = iree_encoding.set_encoding %arg0 : tensor<32x1x128xi8> -> tensor<32x1x128xi8, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> - %5 = tensor.empty() : tensor<32x1x128xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> - %6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%4 : tensor<32x1x128xi8, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>) outs(%5 : tensor<32x1x128xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>) { + %4 = iree_encoding.set_encoding %arg0 : tensor<32x1x128xi8> -> tensor<32x1x128xi8, #encoding_lhs> + %5 = tensor.empty() : tensor<32x1x128xi32, #encoding_lhs> + %6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%4 : tensor<32x1x128xi8, #encoding_lhs>) outs(%5 : tensor<32x1x128xi32, #encoding_lhs>) { ^bb0(%in: i8, %out: i32): %17 = arith.extsi %in : i8 to i32 linalg.yield %17 : i32 - } -> tensor<32x1x128xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> - %7 = iree_encoding.set_encoding %arg1 : tensor<32x128x11008xi8> -> tensor<32x128x11008xi8, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> - %8 = tensor.empty() : tensor<32x128x11008xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> - %9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%7 : tensor<32x128x11008xi8, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>) outs(%8 : tensor<32x128x11008xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>) { + } -> tensor<32x1x128xi32, #encoding_lhs> + %7 = iree_encoding.set_encoding %arg1 : tensor<32x128x11008xi8> -> tensor<32x128x11008xi8, #encoding_rhs> + %8 = tensor.empty() : tensor<32x128x11008xi32, #encoding_rhs> + %9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%7 : tensor<32x128x11008xi8, #encoding_rhs>) outs(%8 : tensor<32x128x11008xi32, #encoding_rhs>) { ^bb0(%in: i8, %out: i32): %17 = arith.extsi %in : i8 to i32 linalg.yield %17 : i32 - } -> tensor<32x128x11008xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> - %10 = tensor.empty() : tensor<32x1x11008xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> - %11 = linalg.fill ins(%c0_i32 : i32) outs(%10 : tensor<32x1x11008xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>) -> tensor<32x1x11008xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> - %12 = linalg.batch_matmul ins(%6, %9 : tensor<32x1x128xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>, tensor<32x128x11008xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>) outs(%11 : tensor<32x1x11008xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>) -> tensor<32x1x11008xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> - %13 = iree_encoding.unset_encoding %12 : tensor<32x1x11008xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> -> tensor<32x1x11008xi32> + } -> tensor<32x128x11008xi32, #encoding_rhs> + %10 = tensor.empty() : tensor<32x1x11008xi32, #encoding_result> + %11 = linalg.fill ins(%c0_i32 : i32) outs(%10 : tensor<32x1x11008xi32, #encoding_result>) -> tensor<32x1x11008xi32, #encoding_result> + %12 = linalg.batch_matmul ins(%6, %9 : tensor<32x1x128xi32, #encoding_lhs>, tensor<32x128x11008xi32, #encoding_rhs>) outs(%11 : tensor<32x1x11008xi32, #encoding_result>) -> tensor<32x1x11008xi32, #encoding_result> + %13 = iree_encoding.unset_encoding %12 : tensor<32x1x11008xi32, #encoding_result> -> tensor<32x1x11008xi32> return %13 : tensor<32x1x11008xi32> } @@ -2075,6 +2170,9 @@ func.func @extend_batch_vecmat_explicit_unit_dim(%arg0: tensor<32x1x128xi8>, %ar #map = affine_map<(d0, d1, d2) -> (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> +#encoding_lhs = #iree_encoding.encoding> +#encoding_rhs = #iree_encoding.encoding> +#encoding_result = #iree_encoding.encoding> func.func @matmul_lowering_i16i16i32_x86_64_avx2() attributes { hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="x86_64-xyz-xyz", cpu_features="+avx2"}> } { @@ -2083,28 +2181,28 @@ func.func @matmul_lowering_i16i16i32_x86_64_avx2() attributes { %N = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index %K = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>>{%M, %K} + : !flow.dispatch.tensor>{%M, %K} %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>>{%K, %N} + : !flow.dispatch.tensor>{%K, %N} %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>>{%M, %N} + : !flow.dispatch.tensor>{%M, %N} %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1] - : !flow.dispatch.tensor>>>{%M, %K} - -> tensor>> + : !flow.dispatch.tensor>{%M, %K} + -> tensor %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [%K, %N], strides = [1, 1] - : !flow.dispatch.tensor>>>{%K, %N} - -> tensor>> + : !flow.dispatch.tensor>{%K, %N} + -> tensor %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : !flow.dispatch.tensor>>>{%M, %N} - -> tensor>> + : !flow.dispatch.tensor>{%M, %N} + -> tensor %6 = linalg.matmul - ins(%3, %4 : tensor>>, - tensor>>) - outs(%5 : tensor>>) - -> tensor>> + ins(%3, %4 : tensor, + tensor) + outs(%5 : tensor) + -> tensor flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : tensor>> - -> !flow.dispatch.tensor>>>{%M, %N} + : tensor + -> !flow.dispatch.tensor>{%M, %N} return } // CHECK-DAG: #[[$MAP0:.+]] = affine_map<()[s0] -> (s0 ceildiv 8)> @@ -2147,6 +2245,9 @@ func.func @matmul_lowering_i16i16i32_x86_64_avx2() attributes { #map = affine_map<(d0, d1, d2) -> (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> +#encoding_lhs = #iree_encoding.encoding> +#encoding_rhs = #iree_encoding.encoding> +#encoding_result = #iree_encoding.encoding> func.func @matmul_lowering_i16ui4i32_x86_64_avx512vnni() attributes { hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="x86_64-xyz-xyz", cpu_features="+avx512vnni"}> } { @@ -2155,35 +2256,35 @@ func.func @matmul_lowering_i16ui4i32_x86_64_avx512vnni() attributes { %N = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index %K = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index %lhs_binding = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>>{%M, %K} + : !flow.dispatch.tensor>{%M, %K} %rhs_binding = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>>{%K, %N} + : !flow.dispatch.tensor>{%K, %N} %out_binding = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>>{%M, %N} + : !flow.dispatch.tensor>{%M, %N} %lhs = flow.dispatch.tensor.load %lhs_binding, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1] - : !flow.dispatch.tensor>>>{%M, %K} - -> tensor>> + : !flow.dispatch.tensor>{%M, %K} + -> tensor %rhs_i4 = flow.dispatch.tensor.load %rhs_binding, offsets = [0, 0], sizes = [%K, %N], strides = [1, 1] - : !flow.dispatch.tensor>>>{%K, %N} - -> tensor>> - %empty = tensor.empty(%K, %N) : tensor>> + : !flow.dispatch.tensor>{%K, %N} + -> tensor + %empty = tensor.empty(%K, %N) : tensor %rhs_i32 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} - ins(%rhs_i4 : tensor>>) outs(%empty : tensor>>) { + ins(%rhs_i4 : tensor) outs(%empty : tensor) { ^bb0(%in: i4, %out: i32): %17 = arith.extui %in : i4 to i32 linalg.yield %17 : i32 - } -> tensor>> + } -> tensor %out = flow.dispatch.tensor.load %out_binding, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : !flow.dispatch.tensor>>>{%M, %N} - -> tensor>> + : !flow.dispatch.tensor>{%M, %N} + -> tensor %result = linalg.matmul - ins(%lhs, %rhs_i32 : tensor>>, - tensor>>) - outs(%out : tensor>>) - -> tensor>> + ins(%lhs, %rhs_i32 : tensor, + tensor) + outs(%out : tensor) + -> tensor flow.dispatch.tensor.store %result, %out_binding, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : tensor>> - -> !flow.dispatch.tensor>>>{%M, %N} + : tensor + -> !flow.dispatch.tensor>{%M, %N} return } @@ -2212,28 +2313,31 @@ func.func @matmul_lowering_i16ui4i32_x86_64_avx512vnni() attributes { #map = affine_map<(d0, d1) -> (d1)> #map1 = affine_map<(d0, d1) -> (d1, d0)> #map2 = affine_map<(d0, d1) -> (d0)> +#encoding_lhs = #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array> +#encoding_rhs = #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array> +#encoding_result = #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array> func.func @vecmat(%arg0: tensor<128xi8>, %arg1: tensor<128x11008xi8>) -> tensor<11008xi32> attributes { hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="x86_64-xyz-xyz", cpu_features="+avx512vnni"}> } { %c0_i32 = arith.constant 0 : i32 - %4 = iree_encoding.set_encoding %arg0 : tensor<128xi8> -> tensor<128xi8, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> - %5 = tensor.empty() : tensor<128xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> - %6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%4 : tensor<128xi8, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>) outs(%5 : tensor<128xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>) { + %4 = iree_encoding.set_encoding %arg0 : tensor<128xi8> -> tensor<128xi8, #encoding_lhs> + %5 = tensor.empty() : tensor<128xi32, #encoding_lhs> + %6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%4 : tensor<128xi8, #encoding_lhs>) outs(%5 : tensor<128xi32, #encoding_lhs>) { ^bb0(%in: i8, %out: i32): %17 = arith.extsi %in : i8 to i32 linalg.yield %17 : i32 - } -> tensor<128xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> - %7 = iree_encoding.set_encoding %arg1 : tensor<128x11008xi8> -> tensor<128x11008xi8, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> - %8 = tensor.empty() : tensor<128x11008xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> - %9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%7 : tensor<128x11008xi8, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>) outs(%8 : tensor<128x11008xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>) { + } -> tensor<128xi32, #encoding_lhs> + %7 = iree_encoding.set_encoding %arg1 : tensor<128x11008xi8> -> tensor<128x11008xi8, #encoding_rhs> + %8 = tensor.empty() : tensor<128x11008xi32, #encoding_rhs> + %9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%7 : tensor<128x11008xi8, #encoding_rhs>) outs(%8 : tensor<128x11008xi32, #encoding_rhs>) { ^bb0(%in: i8, %out: i32): %17 = arith.extsi %in : i8 to i32 linalg.yield %17 : i32 - } -> tensor<128x11008xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> - %10 = tensor.empty() : tensor<11008xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> - %11 = linalg.fill ins(%c0_i32 : i32) outs(%10 : tensor<11008xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>) -> tensor<11008xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> - %12 = linalg.vecmat ins(%6, %9 : tensor<128xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>, tensor<128x11008xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>) outs(%11 : tensor<11008xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>) -> tensor<11008xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> - %13 = iree_encoding.unset_encoding %12 : tensor<11008xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> -> tensor<11008xi32> + } -> tensor<128x11008xi32, #encoding_rhs> + %10 = tensor.empty() : tensor<11008xi32, #encoding_result> + %11 = linalg.fill ins(%c0_i32 : i32) outs(%10 : tensor<11008xi32, #encoding_result>) -> tensor<11008xi32, #encoding_result> + %12 = linalg.vecmat ins(%6, %9 : tensor<128xi32, #encoding_lhs>, tensor<128x11008xi32, #encoding_rhs>) outs(%11 : tensor<11008xi32, #encoding_result>) -> tensor<11008xi32, #encoding_result> + %13 = iree_encoding.unset_encoding %12 : tensor<11008xi32, #encoding_result> -> tensor<11008xi32> return %13 : tensor<11008xi32> } @@ -2271,28 +2375,31 @@ func.func @vecmat(%arg0: tensor<128xi8>, %arg1: tensor<128x11008xi8>) -> tensor< #map = affine_map<(d0, d1) -> (d0, d1)> #map1 = affine_map<(d0, d1) -> (d1)> #map2 = affine_map<(d0, d1) -> (d0)> +#encoding_lhs = #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array> +#encoding_rhs = #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array> +#encoding_result = #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array> func.func @matvec(%arg0: tensor<11008x128xi8>, %arg1: tensor<128xi8>) -> tensor<11008xi32> attributes { hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="x86_64-xyz-xyz", cpu_features="+avx512vnni"}> } { %c0_i32 = arith.constant 0 : i32 - %4 = iree_encoding.set_encoding %arg0 : tensor<11008x128xi8> -> tensor<11008x128xi8, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> - %5 = tensor.empty() : tensor<11008x128xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> - %6 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%4 : tensor<11008x128xi8, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>) outs(%5 : tensor<11008x128xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>) { + %4 = iree_encoding.set_encoding %arg0 : tensor<11008x128xi8> -> tensor<11008x128xi8, #encoding_lhs> + %5 = tensor.empty() : tensor<11008x128xi32, #encoding_lhs> + %6 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%4 : tensor<11008x128xi8, #encoding_lhs>) outs(%5 : tensor<11008x128xi32, #encoding_lhs>) { ^bb0(%in: i8, %out: i32): %17 = arith.extsi %in : i8 to i32 linalg.yield %17 : i32 - } -> tensor<11008x128xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> - %7 = iree_encoding.set_encoding %arg1 : tensor<128xi8> -> tensor<128xi8, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> - %8 = tensor.empty() : tensor<128xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> - %9 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%7 : tensor<128xi8, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>) outs(%8 : tensor<128xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>) { + } -> tensor<11008x128xi32, #encoding_lhs> + %7 = iree_encoding.set_encoding %arg1 : tensor<128xi8> -> tensor<128xi8, #encoding_rhs> + %8 = tensor.empty() : tensor<128xi32, #encoding_rhs> + %9 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%7 : tensor<128xi8, #encoding_rhs>) outs(%8 : tensor<128xi32, #encoding_rhs>) { ^bb0(%in: i8, %out: i32): %17 = arith.extsi %in : i8 to i32 linalg.yield %17 : i32 - } -> tensor<128xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> - %10 = tensor.empty() : tensor<11008xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> - %11 = linalg.fill ins(%c0_i32 : i32) outs(%10 : tensor<11008xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>) -> tensor<11008xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> - %12 = linalg.matvec ins(%6, %9 : tensor<11008x128xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>, tensor<128xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>) outs(%11 : tensor<11008xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>) -> tensor<11008xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> - %13 = iree_encoding.unset_encoding %12 : tensor<11008xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> -> tensor<11008xi32> + } -> tensor<128xi32, #encoding_rhs> + %10 = tensor.empty() : tensor<11008xi32, #encoding_result> + %11 = linalg.fill ins(%c0_i32 : i32) outs(%10 : tensor<11008xi32, #encoding_result>) -> tensor<11008xi32, #encoding_result> + %12 = linalg.matvec ins(%6, %9 : tensor<11008x128xi32, #encoding_lhs>, tensor<128xi32, #encoding_rhs>) outs(%11 : tensor<11008xi32, #encoding_result>) -> tensor<11008xi32, #encoding_result> + %13 = iree_encoding.unset_encoding %12 : tensor<11008xi32, #encoding_result> -> tensor<11008xi32> return %13 : tensor<11008xi32> } @@ -2330,28 +2437,31 @@ func.func @matvec(%arg0: tensor<11008x128xi8>, %arg1: tensor<128xi8>) -> tensor< #map = affine_map<(d0, d1) -> (d0, d1)> #map1 = affine_map<(d0, d1) -> (d1)> #map2 = affine_map<(d0, d1) -> (d0)> +#encoding_lhs = #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array> +#encoding_rhs = #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array> +#encoding_result = #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array> func.func @matvec_with_narrow_M(%arg0: tensor<15x128xi8>, %arg1: tensor<128xi8>) -> tensor<15xi32> attributes { hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="x86_64-xyz-xyz", cpu_features="+avx512vnni"}> } { %c0_i32 = arith.constant 0 : i32 - %4 = iree_encoding.set_encoding %arg0 : tensor<15x128xi8> -> tensor<15x128xi8, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> - %5 = tensor.empty() : tensor<15x128xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> - %6 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%4 : tensor<15x128xi8, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>) outs(%5 : tensor<15x128xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>) { + %4 = iree_encoding.set_encoding %arg0 : tensor<15x128xi8> -> tensor<15x128xi8, #encoding_lhs> + %5 = tensor.empty() : tensor<15x128xi32, #encoding_lhs> + %6 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%4 : tensor<15x128xi8, #encoding_lhs>) outs(%5 : tensor<15x128xi32, #encoding_lhs>) { ^bb0(%in: i8, %out: i32): %17 = arith.extsi %in : i8 to i32 linalg.yield %17 : i32 - } -> tensor<15x128xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> - %7 = iree_encoding.set_encoding %arg1 : tensor<128xi8> -> tensor<128xi8, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> - %8 = tensor.empty() : tensor<128xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> - %9 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%7 : tensor<128xi8, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>) outs(%8 : tensor<128xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>) { + } -> tensor<15x128xi32, #encoding_lhs> + %7 = iree_encoding.set_encoding %arg1 : tensor<128xi8> -> tensor<128xi8, #encoding_rhs> + %8 = tensor.empty() : tensor<128xi32, #encoding_rhs> + %9 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%7 : tensor<128xi8, #encoding_rhs>) outs(%8 : tensor<128xi32, #encoding_rhs>) { ^bb0(%in: i8, %out: i32): %17 = arith.extsi %in : i8 to i32 linalg.yield %17 : i32 - } -> tensor<128xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> - %10 = tensor.empty() : tensor<15xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> - %11 = linalg.fill ins(%c0_i32 : i32) outs(%10 : tensor<15xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>) -> tensor<15xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> - %12 = linalg.matvec ins(%6, %9 : tensor<15x128xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>, tensor<128xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>) outs(%11 : tensor<15xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>) -> tensor<15xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> - %13 = iree_encoding.unset_encoding %12 : tensor<15xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> -> tensor<15xi32> + } -> tensor<128xi32, #encoding_rhs> + %10 = tensor.empty() : tensor<15xi32, #encoding_result> + %11 = linalg.fill ins(%c0_i32 : i32) outs(%10 : tensor<15xi32, #encoding_result>) -> tensor<15xi32, #encoding_result> + %12 = linalg.matvec ins(%6, %9 : tensor<15x128xi32, #encoding_lhs>, tensor<128xi32, #encoding_rhs>) outs(%11 : tensor<15xi32, #encoding_result>) -> tensor<15xi32, #encoding_result> + %13 = iree_encoding.unset_encoding %12 : tensor<15xi32, #encoding_result> -> tensor<15xi32> return %13 : tensor<15xi32> } @@ -2390,28 +2500,31 @@ func.func @matvec_with_narrow_M(%arg0: tensor<15x128xi8>, %arg1: tensor<128xi8>) #map = affine_map<(d0, d1, d2) -> (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d0, d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> +#encoding_lhs = #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array> +#encoding_rhs = #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array> +#encoding_result = #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array> func.func @batch_vecmat(%arg0: tensor<32x128xi8>, %arg1: tensor<32x128x11008xi8>) -> tensor<32x11008xi32> attributes { hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="x86_64-xyz-xyz", cpu_features="+avx512vnni"}> } { %c0_i32 = arith.constant 0 : i32 - %4 = iree_encoding.set_encoding %arg0 : tensor<32x128xi8> -> tensor<32x128xi8, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> - %5 = tensor.empty() : tensor<32x128xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> - %6 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%4 : tensor<32x128xi8, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>) outs(%5 : tensor<32x128xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>) { + %4 = iree_encoding.set_encoding %arg0 : tensor<32x128xi8> -> tensor<32x128xi8, #encoding_lhs> + %5 = tensor.empty() : tensor<32x128xi32, #encoding_lhs> + %6 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%4 : tensor<32x128xi8, #encoding_lhs>) outs(%5 : tensor<32x128xi32, #encoding_lhs>) { ^bb0(%in: i8, %out: i32): %17 = arith.extsi %in : i8 to i32 linalg.yield %17 : i32 - } -> tensor<32x128xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> - %7 = iree_encoding.set_encoding %arg1 : tensor<32x128x11008xi8> -> tensor<32x128x11008xi8, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> - %8 = tensor.empty() : tensor<32x128x11008xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> - %9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%7 : tensor<32x128x11008xi8, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>) outs(%8 : tensor<32x128x11008xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>) { + } -> tensor<32x128xi32, #encoding_lhs> + %7 = iree_encoding.set_encoding %arg1 : tensor<32x128x11008xi8> -> tensor<32x128x11008xi8, #encoding_rhs> + %8 = tensor.empty() : tensor<32x128x11008xi32, #encoding_rhs> + %9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%7 : tensor<32x128x11008xi8, #encoding_rhs>) outs(%8 : tensor<32x128x11008xi32, #encoding_rhs>) { ^bb0(%in: i8, %out: i32): %17 = arith.extsi %in : i8 to i32 linalg.yield %17 : i32 - } -> tensor<32x128x11008xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> - %10 = tensor.empty() : tensor<32x11008xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> - %11 = linalg.fill ins(%c0_i32 : i32) outs(%10 : tensor<32x11008xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>) -> tensor<32x11008xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> - %12 = linalg.batch_vecmat ins(%6, %9 : tensor<32x128xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>, tensor<32x128x11008xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>) outs(%11 : tensor<32x11008xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>) -> tensor<32x11008xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> - %13 = iree_encoding.unset_encoding %12 : tensor<32x11008xi32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> -> tensor<32x11008xi32> + } -> tensor<32x128x11008xi32, #encoding_rhs> + %10 = tensor.empty() : tensor<32x11008xi32, #encoding_result> + %11 = linalg.fill ins(%c0_i32 : i32) outs(%10 : tensor<32x11008xi32, #encoding_result>) -> tensor<32x11008xi32, #encoding_result> + %12 = linalg.batch_vecmat ins(%6, %9 : tensor<32x128xi32, #encoding_lhs>, tensor<32x128x11008xi32, #encoding_rhs>) outs(%11 : tensor<32x11008xi32, #encoding_result>) -> tensor<32x11008xi32, #encoding_result> + %13 = iree_encoding.unset_encoding %12 : tensor<32x11008xi32, #encoding_result> -> tensor<32x11008xi32> return %13 : tensor<32x11008xi32> } @@ -2446,17 +2559,20 @@ func.func @batch_vecmat(%arg0: tensor<32x128xi8>, %arg1: tensor<32x128x11008xi8> // ----- +#encoding_lhs = #iree_encoding.encoding, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], round_dims_to = array> +#encoding_rhs = #iree_encoding.encoding, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], round_dims_to = array> +#encoding_result = #iree_encoding.encoding, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], round_dims_to = array> func.func @batch_matvec(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes { hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="x86_64-xyz-xyz", cpu_features="+avx512vnni"}> } { %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<32x11008x128xi8> %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<32x128xi8> %2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<32x11008xi32> - %3 = iree_encoding.set_encoding %0 : tensor<32x11008x128xi8> -> tensor<32x11008x128xi8, #iree_encoding.encoding, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], round_dims_to = array>> - %4 = iree_encoding.set_encoding %1 : tensor<32x128xi8> -> tensor<32x128xi8, #iree_encoding.encoding, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], round_dims_to = array>> - %5 = iree_encoding.set_encoding %2 : tensor<32x11008xi32> -> tensor<32x11008xi32, #iree_encoding.encoding, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], round_dims_to = array>> - %6 = linalg.batch_matvec ins(%3, %4 : tensor<32x11008x128xi8, #iree_encoding.encoding, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], round_dims_to = array>>, tensor<32x128xi8, #iree_encoding.encoding, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], round_dims_to = array>>) outs(%5 : tensor<32x11008xi32, #iree_encoding.encoding, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], round_dims_to = array>>) -> tensor<32x11008xi32, #iree_encoding.encoding, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], round_dims_to = array>> - %7 = iree_encoding.unset_encoding %6 : tensor<32x11008xi32, #iree_encoding.encoding, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], round_dims_to = array>> -> tensor<32x11008xi32> + %3 = iree_encoding.set_encoding %0 : tensor<32x11008x128xi8> -> tensor<32x11008x128xi8, #encoding_lhs> + %4 = iree_encoding.set_encoding %1 : tensor<32x128xi8> -> tensor<32x128xi8, #encoding_rhs> + %5 = iree_encoding.set_encoding %2 : tensor<32x11008xi32> -> tensor<32x11008xi32, #encoding_result> + %6 = linalg.batch_matvec ins(%3, %4 : tensor<32x11008x128xi8, #encoding_lhs>, tensor<32x128xi8, #encoding_rhs>) outs(%5 : tensor<32x11008xi32, #encoding_result>) -> tensor<32x11008xi32, #encoding_result> + %7 = iree_encoding.unset_encoding %6 : tensor<32x11008xi32, #encoding_result> -> tensor<32x11008xi32> %8 = hal.tensor.export %7 "output0" : tensor<32x11008xi32> -> !hal.buffer_view func.return %8 : !hal.buffer_view } @@ -2469,6 +2585,9 @@ func.func @batch_matvec(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> #map3 = affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)> +#encoding_lhs = #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array> +#encoding_rhs = #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array> +#encoding_result = #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array> func.func @matmul_transpose_a_f32f32f32(%arg0: tensor<256x128xf32>, %arg1: tensor<256x512xf32>, %arg2: tensor<128x512xf32>) -> tensor<128x512xf32> attributes { hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="x86_64-xyz-xyz", cpu_features="+avx512vnni"}> } { @@ -2476,11 +2595,11 @@ func.func @matmul_transpose_a_f32f32f32(%arg0: tensor<256x128xf32>, %arg1: tenso %c128 = arith.constant 128 : index %cst = arith.constant 0.000000e+00 : f32 %c512 = arith.constant 512 : index - %6 = iree_encoding.set_encoding %arg0 : tensor<256x128xf32> -> tensor<256x128xf32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> - %10 = iree_encoding.set_encoding %arg1 : tensor<256x512xf32> -> tensor<256x512xf32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> - %14 = iree_encoding.set_encoding %arg2 : tensor<128x512xf32> -> tensor<128x512xf32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> - %15 = linalg.matmul_transpose_a ins(%6, %10 : tensor<256x128xf32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>, tensor<256x512xf32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>) outs(%14 : tensor<128x512xf32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>) -> tensor<128x512xf32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> - %16 = iree_encoding.unset_encoding %15 : tensor<128x512xf32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> -> tensor<128x512xf32> + %6 = iree_encoding.set_encoding %arg0 : tensor<256x128xf32> -> tensor<256x128xf32, #encoding_lhs> + %10 = iree_encoding.set_encoding %arg1 : tensor<256x512xf32> -> tensor<256x512xf32, #encoding_rhs> + %14 = iree_encoding.set_encoding %arg2 : tensor<128x512xf32> -> tensor<128x512xf32, #encoding_result> + %15 = linalg.matmul_transpose_a ins(%6, %10 : tensor<256x128xf32, #encoding_lhs>, tensor<256x512xf32, #encoding_rhs>) outs(%14 : tensor<128x512xf32, #encoding_result>) -> tensor<128x512xf32, #encoding_result> + %16 = iree_encoding.unset_encoding %15 : tensor<128x512xf32, #encoding_result> -> tensor<128x512xf32> return %16 : tensor<128x512xf32> } @@ -2505,6 +2624,9 @@ func.func @matmul_transpose_a_f32f32f32(%arg0: tensor<256x128xf32>, %arg1: tenso #map1 = affine_map<(d0, d1, d2) -> (d1, d2)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> #map3 = affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)> +#encoding_lhs = #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array> +#encoding_rhs = #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array> +#encoding_result = #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array> func.func @matmul_transpose_b_f32f32f32(%arg0: tensor<128x256xf32>, %arg1: tensor<512x256xf32>, %arg2: tensor<128x512xf32>) -> tensor<128x512xf32> attributes { hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="x86_64-xyz-xyz", cpu_features="+avx512vnni"}> } { @@ -2512,11 +2634,11 @@ func.func @matmul_transpose_b_f32f32f32(%arg0: tensor<128x256xf32>, %arg1: tenso %c256 = arith.constant 256 : index %cst = arith.constant 0.000000e+00 : f32 %c512 = arith.constant 512 : index - %6 = iree_encoding.set_encoding %arg0 : tensor<128x256xf32> -> tensor<128x256xf32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> - %10 = iree_encoding.set_encoding %arg1 : tensor<512x256xf32> -> tensor<512x256xf32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> - %14 = iree_encoding.set_encoding %arg2 : tensor<128x512xf32> -> tensor<128x512xf32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> - %15 = linalg.matmul_transpose_b ins(%6, %10 : tensor<128x256xf32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>, tensor<512x256xf32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>) outs(%14 : tensor<128x512xf32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>) -> tensor<128x512xf32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> - %16 = iree_encoding.unset_encoding %15 : tensor<128x512xf32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> -> tensor<128x512xf32> + %6 = iree_encoding.set_encoding %arg0 : tensor<128x256xf32> -> tensor<128x256xf32, #encoding_lhs> + %10 = iree_encoding.set_encoding %arg1 : tensor<512x256xf32> -> tensor<512x256xf32, #encoding_rhs> + %14 = iree_encoding.set_encoding %arg2 : tensor<128x512xf32> -> tensor<128x512xf32, #encoding_result> + %15 = linalg.matmul_transpose_b ins(%6, %10 : tensor<128x256xf32, #encoding_lhs>, tensor<512x256xf32, #encoding_rhs>) outs(%14 : tensor<128x512xf32, #encoding_result>) -> tensor<128x512xf32, #encoding_result> + %16 = iree_encoding.unset_encoding %15 : tensor<128x512xf32, #encoding_result> -> tensor<128x512xf32> return %16 : tensor<128x512xf32> } @@ -2540,6 +2662,9 @@ func.func @matmul_transpose_b_f32f32f32(%arg0: tensor<128x256xf32>, %arg1: tenso #map1 = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)> #map2 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)> #map3 = affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)> +#encoding_lhs = #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array> +#encoding_rhs = #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array> +#encoding_result = #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array> func.func @batch_matmul_transpose_a_f32f32f32(%arg0: tensor<2x256x128xf32>, %arg1: tensor<2x256x512xf32>, %arg2: tensor<2x128x512xf32>) -> tensor<2x128x512xf32> attributes { hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="x86_64-xyz-xyz", cpu_features="+avx512vnni"}> } { @@ -2548,11 +2673,11 @@ func.func @batch_matmul_transpose_a_f32f32f32(%arg0: tensor<2x256x128xf32>, %arg %c128 = arith.constant 128 : index %cst = arith.constant 0.000000e+00 : f32 %c512 = arith.constant 512 : index - %7 = iree_encoding.set_encoding %arg0 : tensor<2x256x128xf32> -> tensor<2x256x128xf32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> - %12 = iree_encoding.set_encoding %arg1 : tensor<2x256x512xf32> -> tensor<2x256x512xf32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> - %17 = iree_encoding.set_encoding %arg2 : tensor<2x128x512xf32> -> tensor<2x128x512xf32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> - %18 = linalg.batch_matmul_transpose_a ins(%7, %12 : tensor<2x256x128xf32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>, tensor<2x256x512xf32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>) outs(%17 : tensor<2x128x512xf32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>) -> tensor<2x128x512xf32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> - %19 = iree_encoding.unset_encoding %18 : tensor<2x128x512xf32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> -> tensor<2x128x512xf32> + %7 = iree_encoding.set_encoding %arg0 : tensor<2x256x128xf32> -> tensor<2x256x128xf32, #encoding_lhs> + %12 = iree_encoding.set_encoding %arg1 : tensor<2x256x512xf32> -> tensor<2x256x512xf32, #encoding_rhs> + %17 = iree_encoding.set_encoding %arg2 : tensor<2x128x512xf32> -> tensor<2x128x512xf32, #encoding_result> + %18 = linalg.batch_matmul_transpose_a ins(%7, %12 : tensor<2x256x128xf32, #encoding_lhs>, tensor<2x256x512xf32, #encoding_rhs>) outs(%17 : tensor<2x128x512xf32, #encoding_result>) -> tensor<2x128x512xf32, #encoding_result> + %19 = iree_encoding.unset_encoding %18 : tensor<2x128x512xf32, #encoding_result> -> tensor<2x128x512xf32> return %19 : tensor<2x128x512xf32> } @@ -2576,6 +2701,9 @@ func.func @batch_matmul_transpose_a_f32f32f32(%arg0: tensor<2x256x128xf32>, %arg #map1 = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3)> #map2 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)> #map3 = affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)> +#encoding_lhs = #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array> +#encoding_rhs = #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array> +#encoding_result = #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array> func.func @batch_matmul_transpose_b_f32f32f32(%arg0: tensor<2x128x256xf32>, %arg1: tensor<2x512x256xf32>, %arg2: tensor<2x128x512xf32>) -> tensor<2x128x512xf32> attributes { hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="x86_64-xyz-xyz", cpu_features="+avx512vnni"}> } { @@ -2584,11 +2712,11 @@ func.func @batch_matmul_transpose_b_f32f32f32(%arg0: tensor<2x128x256xf32>, %arg %c256 = arith.constant 256 : index %cst = arith.constant 0.000000e+00 : f32 %c512 = arith.constant 512 : index - %7 = iree_encoding.set_encoding %arg0 : tensor<2x128x256xf32> -> tensor<2x128x256xf32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> - %12 = iree_encoding.set_encoding %arg1 : tensor<2x512x256xf32> -> tensor<2x512x256xf32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> - %17 = iree_encoding.set_encoding %arg2 : tensor<2x128x512xf32> -> tensor<2x128x512xf32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> - %18 = linalg.batch_matmul_transpose_b ins(%7, %12 : tensor<2x128x256xf32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>, tensor<2x512x256xf32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>) outs(%17 : tensor<2x128x512xf32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>) -> tensor<2x128x512xf32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> - %19 = iree_encoding.unset_encoding %18 : tensor<2x128x512xf32, #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> -> tensor<2x128x512xf32> + %7 = iree_encoding.set_encoding %arg0 : tensor<2x128x256xf32> -> tensor<2x128x256xf32, #encoding_lhs> + %12 = iree_encoding.set_encoding %arg1 : tensor<2x512x256xf32> -> tensor<2x512x256xf32, #encoding_rhs> + %17 = iree_encoding.set_encoding %arg2 : tensor<2x128x512xf32> -> tensor<2x128x512xf32, #encoding_result> + %18 = linalg.batch_matmul_transpose_b ins(%7, %12 : tensor<2x128x256xf32, #encoding_lhs>, tensor<2x512x256xf32, #encoding_rhs>) outs(%17 : tensor<2x128x512xf32, #encoding_result>) -> tensor<2x128x512xf32, #encoding_result> + %19 = iree_encoding.unset_encoding %18 : tensor<2x128x512xf32, #encoding_result> -> tensor<2x128x512xf32> return %19 : tensor<2x128x512xf32> } @@ -2612,6 +2740,9 @@ func.func @batch_matmul_transpose_b_f32f32f32(%arg0: tensor<2x128x256xf32>, %arg #map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> #map3 = affine_map<()[s0, s1] -> (-s1 + (s1 ceildiv s0) * s0)> +#encoding_lhs = #iree_encoding.encoding, matmul_narrow_M = 1 : index, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array> +#encoding_rhs = #iree_encoding.encoding, matmul_narrow_M = 1 : index, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array> +#encoding_result = #iree_encoding.encoding, matmul_narrow_M = 1 : index, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array> func.func @generic_batch_vecmat_transposed_i16u4i32(%arg0: tensor<32x128xi16>, %arg1: tensor<4096x32x128xi4>, %arg2: tensor<4096x32xi32>) -> tensor<4096x32xi32> attributes { hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="x86_64-xyz-xyz", cpu_features="+avx512vnni"}> } { @@ -2621,18 +2752,18 @@ func.func @generic_batch_vecmat_transposed_i16u4i32(%arg0: tensor<32x128xi16>, % %c0_i16 = arith.constant 0 : i16 %c128 = arith.constant 128 : index %c32 = arith.constant 32 : index - %3 = iree_encoding.set_encoding %arg0 : tensor<32x128xi16> -> tensor<32x128xi16, #iree_encoding.encoding, matmul_narrow_M = 1 : index, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> - %8 = iree_encoding.set_encoding %arg1 : tensor<4096x32x128xi4> -> tensor<4096x32x128xi4, #iree_encoding.encoding, matmul_narrow_M = 1 : index, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> - %12 = iree_encoding.set_encoding %arg2 : tensor<4096x32xi32> -> tensor<4096x32xi32, #iree_encoding.encoding, matmul_narrow_M = 1 : index, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> - %13 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %8 : tensor<32x128xi16, #iree_encoding.encoding, matmul_narrow_M = 1 : index, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>, tensor<4096x32x128xi4, #iree_encoding.encoding, matmul_narrow_M = 1 : index, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>) outs(%12 : tensor<4096x32xi32, #iree_encoding.encoding, matmul_narrow_M = 1 : index, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>) { + %3 = iree_encoding.set_encoding %arg0 : tensor<32x128xi16> -> tensor<32x128xi16, #encoding_lhs> + %8 = iree_encoding.set_encoding %arg1 : tensor<4096x32x128xi4> -> tensor<4096x32x128xi4, #encoding_rhs> + %12 = iree_encoding.set_encoding %arg2 : tensor<4096x32xi32> -> tensor<4096x32xi32, #encoding_result> + %13 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]} ins(%3, %8 : tensor<32x128xi16, #encoding_lhs>, tensor<4096x32x128xi4, #encoding_rhs>) outs(%12 : tensor<4096x32xi32, #encoding_result>) { ^bb0(%in: i16, %in_2: i4, %out: i32): %15 = arith.extsi %in : i16 to i32 %16 = arith.extui %in_2 : i4 to i32 %17 = arith.muli %15, %16 : i32 %18 = arith.addi %17, %out : i32 linalg.yield %18 : i32 - } -> tensor<4096x32xi32, #iree_encoding.encoding, matmul_narrow_M = 1 : index, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> - %14 = iree_encoding.unset_encoding %13 : tensor<4096x32xi32, #iree_encoding.encoding, matmul_narrow_M = 1 : index, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>> -> tensor<4096x32xi32> + } -> tensor<4096x32xi32, #encoding_result> + %14 = iree_encoding.unset_encoding %13 : tensor<4096x32xi32, #encoding_result> -> tensor<4096x32xi32> return %14 : tensor<4096x32xi32> } diff --git a/compiler/src/iree/compiler/Codegen/Common/CPU/test/vmvx_materialize_encoding.mlir b/compiler/src/iree/compiler/Codegen/Common/CPU/test/vmvx_materialize_encoding.mlir index 1e8dcba241e3..0464a42a26c6 100644 --- a/compiler/src/iree/compiler/Codegen/Common/CPU/test/vmvx_materialize_encoding.mlir +++ b/compiler/src/iree/compiler/Codegen/Common/CPU/test/vmvx_materialize_encoding.mlir @@ -10,6 +10,9 @@ #map = affine_map<(d0, d1, d2) -> (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> +#encoding_lhs = #iree_encoding.encoding> +#encoding_rhs = #iree_encoding.encoding> +#encoding_result = #iree_encoding.encoding> func.func @matmul_lowering_i8i8i32_vmvx_ukernel() attributes { hal.executable.target = #hal.executable.target<"vmvx", "vmvx-bytecode-fb", {ukernels = "all"}> } { @@ -18,28 +21,28 @@ func.func @matmul_lowering_i8i8i32_vmvx_ukernel() attributes { %N = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index %K = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>>{%M, %K} + : !flow.dispatch.tensor>{%M, %K} %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>>{%K, %N} + : !flow.dispatch.tensor>{%K, %N} %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>>{%M, %N} + : !flow.dispatch.tensor>{%M, %N} %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1] - : !flow.dispatch.tensor>>>{%M, %K} - -> tensor>> + : !flow.dispatch.tensor>{%M, %K} + -> tensor %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [%K, %N], strides = [1, 1] - : !flow.dispatch.tensor>>>{%K, %N} - -> tensor>> + : !flow.dispatch.tensor>{%K, %N} + -> tensor %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : !flow.dispatch.tensor>>>{%M, %N} - -> tensor>> + : !flow.dispatch.tensor>{%M, %N} + -> tensor %6 = linalg.matmul - ins(%3, %4 : tensor>>, - tensor>>) - outs(%5 : tensor>>) - -> tensor>> + ins(%3, %4 : tensor, + tensor) + outs(%5 : tensor) + -> tensor flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : tensor>> - -> !flow.dispatch.tensor>>>{%M, %N} + : tensor + -> !flow.dispatch.tensor>{%M, %N} return } @@ -90,21 +93,24 @@ func.func @matmul_lowering_i8i8i32_vmvx_ukernel() attributes { #map2 = affine_map<(d0, d1, d2) -> (d0, d2)> #map3 = affine_map<(d0, d1, d2) -> (d2, d1)> #map4 = affine_map<(d0, d1, d2) -> (d0, d1)> +#encoding_lhs = #iree_encoding.encoding, user_indexing_maps = [#map2, #map3, #map4], round_dims_to = array> +#encoding_rhs = #iree_encoding.encoding, user_indexing_maps = [#map2, #map3, #map4], round_dims_to = array> +#encoding_result = #iree_encoding.encoding, user_indexing_maps = [#map2, #map3, #map4], round_dims_to = array> func.func @fill_matmul(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index) attributes { hal.executable.target = #hal.executable.target<"vmvx", "vmvx-bytecode-fb"> } { %c32_i64 = arith.constant 32 : i64 %cst = arith.constant 0.000000e+00 : f32 %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor, user_indexing_maps = [#map2, #map3, #map4], round_dims_to = array>>> - %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor, user_indexing_maps = [#map2, #map3, #map4], round_dims_to = array>>> - %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor, user_indexing_maps = [#map2, #map3, #map4], round_dims_to = array>>>{%arg4, %arg5} - %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 2], strides = [1, 1] : !flow.dispatch.tensor, user_indexing_maps = [#map2, #map3, #map4], round_dims_to = array>>> -> tensor<1x2xf32, #iree_encoding.encoding, user_indexing_maps = [#map2, #map3, #map4], round_dims_to = array>> - %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor, user_indexing_maps = [#map2, #map3, #map4], round_dims_to = array>>> -> tensor<2x3xf32, #iree_encoding.encoding, user_indexing_maps = [#map2, #map3, #map4], round_dims_to = array>> - %7 = tensor.empty() : tensor<1x3xf32, #iree_encoding.encoding, user_indexing_maps = [#map2, #map3, #map4], round_dims_to = array>> - %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<1x3xf32, #iree_encoding.encoding, user_indexing_maps = [#map2, #map3, #map4], round_dims_to = array>>) -> tensor<1x3xf32, #iree_encoding.encoding, user_indexing_maps = [#map2, #map3, #map4], round_dims_to = array>> - %9 = linalg.matmul ins(%3, %4 : tensor<1x2xf32, #iree_encoding.encoding, user_indexing_maps = [#map2, #map3, #map4], round_dims_to = array>>, tensor<2x3xf32, #iree_encoding.encoding, user_indexing_maps = [#map2, #map3, #map4], round_dims_to = array>>) outs(%8 : tensor<1x3xf32, #iree_encoding.encoding, user_indexing_maps = [#map2, #map3, #map4], round_dims_to = array>>) -> tensor<1x3xf32, #iree_encoding.encoding, user_indexing_maps = [#map2, #map3, #map4], round_dims_to = array>> - flow.dispatch.tensor.store %9, %2, offsets = [0, 0], sizes = [1, 3], strides = [1, 1] : tensor<1x3xf32, #iree_encoding.encoding, user_indexing_maps = [#map2, #map3, #map4], round_dims_to = array>> -> !flow.dispatch.tensor, user_indexing_maps = [#map2, #map3, #map4], round_dims_to = array>>> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor>{%arg4, %arg5} + %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 2], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<1x2xf32, #encoding_lhs> + %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2, 3], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<2x3xf32, #encoding_rhs> + %7 = tensor.empty() : tensor<1x3xf32, #encoding_result> + %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<1x3xf32, #encoding_result>) -> tensor<1x3xf32, #encoding_result> + %9 = linalg.matmul ins(%3, %4 : tensor<1x2xf32, #encoding_lhs>, tensor<2x3xf32, #encoding_rhs>) outs(%8 : tensor<1x3xf32, #encoding_result>) -> tensor<1x3xf32, #encoding_result> + flow.dispatch.tensor.store %9, %2, offsets = [0, 0], sizes = [1, 3], strides = [1, 1] : tensor<1x3xf32, #encoding_result> -> !flow.dispatch.tensor> return } // CHECK: func.func @fill_matmul @@ -140,6 +146,7 @@ func.func @fill_matmul(%arg0: index, %arg1: index, %arg2: index, %arg3: index, % #map = affine_map<(d0, d1, d2) -> (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> +#encoding_lhs = #iree_encoding.encoding> func.func @set_encoding_dynamic() attributes { hal.executable.target = #hal.executable.target<"vmvx", "vmvx-bytecode-fb"> } { @@ -149,13 +156,13 @@ func.func @set_encoding_dynamic() attributes { %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor>{%d0, %d1} %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>>{%d0, %d1} + : !flow.dispatch.tensor>{%d0, %d1} %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%d0, %d1], strides = [1, 1] : !flow.dispatch.tensor>{%d0, %d1} -> tensor - %3 = iree_encoding.set_encoding %2 : tensor -> tensor>> + %3 = iree_encoding.set_encoding %2 : tensor -> tensor flow.dispatch.tensor.store %3, %1, offsets = [0, 0], sizes = [%d0, %d1], strides = [1, 1] - : tensor>> - -> !flow.dispatch.tensor>>>{%d0, %d1} + : tensor + -> !flow.dispatch.tensor>{%d0, %d1} return } // CHECK-DAG: #[[MAP0:.+]] = affine_map<()[s0] -> (s0 ceildiv 8)> @@ -189,6 +196,7 @@ func.func @set_encoding_dynamic() attributes { #map = affine_map<(d0, d1, d2) -> (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> +#encoding_lhs = #iree_encoding.encoding> func.func @unset_encoding_dynamic() attributes { hal.executable.target = #hal.executable.target<"vmvx", "vmvx-bytecode-fb"> } { @@ -197,14 +205,14 @@ func.func @unset_encoding_dynamic() attributes { %d0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index %d1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>>{%d0, %d1} + : !flow.dispatch.tensor>{%d0, %d1} %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor>{%d0, %d1} %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%d0, %d1], strides = [1, 1] - : !flow.dispatch.tensor>>>{%d0, %d1} - -> tensor>> + : !flow.dispatch.tensor>{%d0, %d1} + -> tensor %3 = iree_encoding.unset_encoding %2 - : tensor>> -> tensor + : tensor -> tensor %4 = tensor.extract_slice %3[0, 0] [%d0, %d1] [1, 1] : tensor to tensor flow.dispatch.tensor.store %4, %1, offsets = [0, 0], sizes = [%d0, %d1], strides = [1, 1] : tensor -> !flow.dispatch.tensor>{%d0, %d1} @@ -240,6 +248,9 @@ func.func @unset_encoding_dynamic() attributes { #map = affine_map<(d0, d1, d2) -> (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> +#encoding_lhs = #iree_encoding.encoding> +#encoding_rhs = #iree_encoding.encoding> +#encoding_result = #iree_encoding.encoding> func.func @matmul_lowering_f32f32f32_generic() attributes { hal.executable.target = #hal.executable.target<"vmvx", "vmvx-bytecode-fb"> } { @@ -248,28 +259,28 @@ func.func @matmul_lowering_f32f32f32_generic() attributes { %N = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index %K = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : index %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>>{%M, %K} + : !flow.dispatch.tensor>{%M, %K} %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>>{%K, %N} + : !flow.dispatch.tensor>{%K, %N} %2 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(2) alignment(64) offset(%c0) - : !flow.dispatch.tensor>>>{%M, %N} + : !flow.dispatch.tensor>{%M, %N} %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1] - : !flow.dispatch.tensor>>>{%M, %K} - -> tensor>> + : !flow.dispatch.tensor>{%M, %K} + -> tensor %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [%K, %N], strides = [1, 1] - : !flow.dispatch.tensor>>>{%K, %N} - -> tensor>> + : !flow.dispatch.tensor>{%K, %N} + -> tensor %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : !flow.dispatch.tensor>>>{%M, %N} - -> tensor>> + : !flow.dispatch.tensor>{%M, %N} + -> tensor %6 = linalg.matmul - ins(%3, %4 : tensor>>, - tensor>>) - outs(%5 : tensor>>) - -> tensor>> + ins(%3, %4 : tensor, + tensor) + outs(%5 : tensor) + -> tensor flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1] - : tensor>> - -> !flow.dispatch.tensor>>>{%M, %N} + : tensor + -> !flow.dispatch.tensor>{%M, %N} return } // CHECK-DAG: #[[MAP0:.+]] = affine_map<()[s0] -> (s0 ceildiv 8)> diff --git a/compiler/src/iree/compiler/Codegen/Common/test/materialize_encoding_into_nop.mlir b/compiler/src/iree/compiler/Codegen/Common/test/materialize_encoding_into_nop.mlir index 2e4a0ce32793..83c7dc7f3b35 100644 --- a/compiler/src/iree/compiler/Codegen/Common/test/materialize_encoding_into_nop.mlir +++ b/compiler/src/iree/compiler/Codegen/Common/test/materialize_encoding_into_nop.mlir @@ -3,9 +3,10 @@ #map = affine_map<(d0, d1, d2) -> (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> +#encoding_lhs = #iree_encoding.encoding func.func @pack_unpack_gemm_lhs(%arg0 : tensor) -> tensor { - %0 = iree_encoding.set_encoding %arg0 : tensor -> tensor> - %1 = iree_encoding.unset_encoding %0 : tensor> -> tensor + %0 = iree_encoding.set_encoding %arg0 : tensor -> tensor + %1 = iree_encoding.unset_encoding %0 : tensor -> tensor return %1 : tensor } // CHECK: func @pack_unpack_gemm_lhs( @@ -17,13 +18,16 @@ func.func @pack_unpack_gemm_lhs(%arg0 : tensor) -> tensor { #map = affine_map<(d0, d1, d2) -> (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> +#encoding_lhs = #iree_encoding.encoding +#encoding_rhs = #iree_encoding.encoding +#encoding_result = #iree_encoding.encoding func.func @gemm_dynamic(%arg0 : tensor, %arg1 : tensor, %arg2 : tensor) -> tensor { - %0 = iree_encoding.set_encoding %arg0 : tensor -> tensor> - %1 = iree_encoding.set_encoding %arg1 : tensor -> tensor> - %2 = iree_encoding.set_encoding %arg2 : tensor -> tensor> - %3 = linalg.matmul ins(%0, %1 : tensor>, tensor>) - outs(%2 : tensor>) -> tensor> - %4 = iree_encoding.unset_encoding %3 : tensor> -> tensor + %0 = iree_encoding.set_encoding %arg0 : tensor -> tensor + %1 = iree_encoding.set_encoding %arg1 : tensor -> tensor + %2 = iree_encoding.set_encoding %arg2 : tensor -> tensor + %3 = linalg.matmul ins(%0, %1 : tensor, tensor) + outs(%2 : tensor) -> tensor + %4 = iree_encoding.unset_encoding %3 : tensor -> tensor return %4 : tensor } // CHECK: func @gemm_dynamic( @@ -40,20 +44,23 @@ func.func @gemm_dynamic(%arg0 : tensor, %arg1 : tensor, %arg2 #map = affine_map<(d0, d1, d2) -> (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> +#encoding_lhs = #iree_encoding.encoding +#encoding_rhs = #iree_encoding.encoding +#encoding_result = #iree_encoding.encoding func.func @gemm_fill_dynamic(%arg0 : tensor, %arg1 : tensor) -> tensor { %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %cst = arith.constant 0.0 : f32 %d0 = tensor.dim %arg0, %c0 : tensor %d1 = tensor.dim %arg1, %c1 : tensor - %0 = iree_encoding.set_encoding %arg0 : tensor -> tensor> - %1 = iree_encoding.set_encoding %arg1 : tensor -> tensor> - %2 = tensor.empty(%d0, %d1) : tensor> - %3 = linalg.fill ins(%cst : f32) outs(%2 : tensor>) - -> tensor> - %4 = linalg.matmul ins(%0, %1 : tensor>, tensor>) - outs(%3 : tensor>) -> tensor> - %5 = iree_encoding.unset_encoding %4 : tensor> -> tensor + %0 = iree_encoding.set_encoding %arg0 : tensor -> tensor + %1 = iree_encoding.set_encoding %arg1 : tensor -> tensor + %2 = tensor.empty(%d0, %d1) : tensor + %3 = linalg.fill ins(%cst : f32) outs(%2 : tensor) + -> tensor + %4 = linalg.matmul ins(%0, %1 : tensor, tensor) + outs(%3 : tensor) -> tensor + %5 = iree_encoding.unset_encoding %4 : tensor -> tensor return %5 : tensor } // CHECK: func @gemm_fill_dynamic( @@ -76,13 +83,16 @@ func.func @gemm_fill_dynamic(%arg0 : tensor, %arg1 : tensor) - #map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)> #map1 = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)> #map2 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)> +#encoding_lhs = #iree_encoding.encoding +#encoding_rhs = #iree_encoding.encoding +#encoding_result = #iree_encoding.encoding func.func @batch_matmul(%arg0 : tensor<128x80x32xf32>, %arg1 : tensor<128x32x320xf32>, %arg2 : tensor<128x80x320xf32>) -> tensor<128x80x320xf32> { - %0 = iree_encoding.set_encoding %arg0 : tensor<128x80x32xf32> -> tensor<128x80x32xf32, #iree_encoding.encoding> - %1 = iree_encoding.set_encoding %arg1 : tensor<128x32x320xf32> -> tensor<128x32x320xf32, #iree_encoding.encoding> - %2 = iree_encoding.set_encoding %arg2 : tensor<128x80x320xf32> -> tensor<128x80x320xf32, #iree_encoding.encoding> - %3 = linalg.batch_matmul ins(%0, %1 : tensor<128x80x32xf32, #iree_encoding.encoding>, tensor<128x32x320xf32, #iree_encoding.encoding>) - outs(%2 : tensor<128x80x320xf32, #iree_encoding.encoding>) -> tensor<128x80x320xf32, #iree_encoding.encoding> - %4 = iree_encoding.unset_encoding %3 : tensor<128x80x320xf32, #iree_encoding.encoding> -> tensor<128x80x320xf32> + %0 = iree_encoding.set_encoding %arg0 : tensor<128x80x32xf32> -> tensor<128x80x32xf32, #encoding_lhs> + %1 = iree_encoding.set_encoding %arg1 : tensor<128x32x320xf32> -> tensor<128x32x320xf32, #encoding_rhs> + %2 = iree_encoding.set_encoding %arg2 : tensor<128x80x320xf32> -> tensor<128x80x320xf32, #encoding_result> + %3 = linalg.batch_matmul ins(%0, %1 : tensor<128x80x32xf32, #encoding_lhs>, tensor<128x32x320xf32, #encoding_rhs>) + outs(%2 : tensor<128x80x320xf32, #encoding_result>) -> tensor<128x80x320xf32, #encoding_result> + %4 = iree_encoding.unset_encoding %3 : tensor<128x80x320xf32, #encoding_result> -> tensor<128x80x320xf32> return %4 : tensor<128x80x320xf32> } // CHECK: func @batch_matmul( @@ -99,13 +109,16 @@ func.func @batch_matmul(%arg0 : tensor<128x80x32xf32>, %arg1 : tensor<128x32x320 #map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)> #map1 = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)> #map2 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)> +#encoding_lhs = #iree_encoding.encoding +#encoding_rhs = #iree_encoding.encoding +#encoding_result = #iree_encoding.encoding func.func @batch_matmul_dynamic(%arg0 : tensor, %arg1 : tensor, %arg2 : tensor) -> tensor { - %0 = iree_encoding.set_encoding %arg0 : tensor -> tensor> - %1 = iree_encoding.set_encoding %arg1 : tensor -> tensor> - %2 = iree_encoding.set_encoding %arg2 : tensor -> tensor> - %3 = linalg.batch_matmul ins(%0, %1 : tensor>, tensor>) - outs(%2 : tensor>) -> tensor> - %4 = iree_encoding.unset_encoding %3 : tensor> -> tensor + %0 = iree_encoding.set_encoding %arg0 : tensor -> tensor + %1 = iree_encoding.set_encoding %arg1 : tensor -> tensor + %2 = iree_encoding.set_encoding %arg2 : tensor -> tensor + %3 = linalg.batch_matmul ins(%0, %1 : tensor, tensor) + outs(%2 : tensor) -> tensor + %4 = iree_encoding.unset_encoding %3 : tensor -> tensor return %4 : tensor } // CHECK: func @batch_matmul_dynamic( @@ -122,6 +135,9 @@ func.func @batch_matmul_dynamic(%arg0 : tensor, %arg1 : tensor (d0, d1, d3)> #map1 = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)> #map2 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)> +#encoding_lhs = #iree_encoding.encoding +#encoding_rhs = #iree_encoding.encoding +#encoding_result = #iree_encoding.encoding func.func @batch_matmul_fill_dynamic(%arg0 : tensor, %arg1 : tensor) -> tensor { %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index @@ -130,14 +146,14 @@ func.func @batch_matmul_fill_dynamic(%arg0 : tensor, %arg1 : tensor %d1 = tensor.dim %arg0, %c1 : tensor %d2 = tensor.dim %arg1, %c2 : tensor - %0 = iree_encoding.set_encoding %arg0 : tensor -> tensor> - %1 = iree_encoding.set_encoding %arg1 : tensor -> tensor> - %2 = tensor.empty(%d0, %d1, %d2) : tensor> - %3 = linalg.fill ins(%cst : f32) outs(%2 : tensor>) - -> tensor> - %4 = linalg.batch_matmul ins(%0, %1 : tensor>, tensor>) - outs(%3 : tensor>) -> tensor> - %5 = iree_encoding.unset_encoding %4 : tensor> -> tensor + %0 = iree_encoding.set_encoding %arg0 : tensor -> tensor + %1 = iree_encoding.set_encoding %arg1 : tensor -> tensor + %2 = tensor.empty(%d0, %d1, %d2) : tensor + %3 = linalg.fill ins(%cst : f32) outs(%2 : tensor) + -> tensor + %4 = linalg.batch_matmul ins(%0, %1 : tensor, tensor) + outs(%3 : tensor) -> tensor + %5 = iree_encoding.unset_encoding %4 : tensor -> tensor return %5 : tensor } // CHECK: func @batch_matmul_fill_dynamic( @@ -165,13 +181,14 @@ func.func @batch_matmul_fill_dynamic(%arg0 : tensor, %arg1 : tensor ]> ]> +#encoding_lhs = #iree_encoding.encoding, matmul_narrow_M = 1 : index, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]> func.func @drop_encoding_for_hal_flow_ops_static() { %c0 = arith.constant 0 : index %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> - %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor, matmul_narrow_M = 1 : index, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor> %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1, 1], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<1x1xf32> - %3 = iree_encoding.set_encoding %2 : tensor<1x1xf32> -> tensor<1x1xf32, #iree_encoding.encoding, matmul_narrow_M = 1 : index, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> - flow.dispatch.tensor.store %3, %1, offsets = [0, 0], sizes = [1, 1], strides = [1, 1] : tensor<1x1xf32, #iree_encoding.encoding, matmul_narrow_M = 1 : index, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> -> !flow.dispatch.tensor, matmul_narrow_M = 1 : index, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>> + %3 = iree_encoding.set_encoding %2 : tensor<1x1xf32> -> tensor<1x1xf32, #encoding_lhs> + flow.dispatch.tensor.store %3, %1, offsets = [0, 0], sizes = [1, 1], strides = [1, 1] : tensor<1x1xf32, #encoding_lhs> -> !flow.dispatch.tensor> return } // CHECK-LABEL: func.func @drop_encoding_for_hal_flow_ops_static @@ -188,6 +205,7 @@ func.func @drop_encoding_for_hal_flow_ops_static() { #hal.descriptor_set.binding<1, storage_buffer> ]> ]> +#encoding_lhs = #iree_encoding.encoding, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]> func.func @drop_encoding_for_hal_flow_ops_dynamic() { %c0 = arith.constant 0 : index %c32_i64 = arith.constant 32 : i64 @@ -208,10 +226,10 @@ func.func @drop_encoding_for_hal_flow_ops_dynamic() { %14 = flow.dispatch.workload.ordinal %8, 0 : index %15 = flow.dispatch.workload.ordinal %13, 1 : index %16 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor>{%14, %15} - %17 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%14, %15} + %17 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor>{%14, %15} %18 = flow.dispatch.tensor.load %16, offsets = [0, 0], sizes = [%14, %15], strides = [1, 1] : !flow.dispatch.tensor>{%14, %15} -> tensor - %19 = iree_encoding.set_encoding %18 : tensor -> tensor, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> - flow.dispatch.tensor.store %19, %17, offsets = [0, 0], sizes = [%14, %15], strides = [1, 1] : tensor, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> -> !flow.dispatch.tensor, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>>{%14, %15} + %19 = iree_encoding.set_encoding %18 : tensor -> tensor + flow.dispatch.tensor.store %19, %17, offsets = [0, 0], sizes = [%14, %15], strides = [1, 1] : tensor -> !flow.dispatch.tensor>{%14, %15} return } // CHECK-LABEL: func.func @drop_encoding_for_hal_flow_ops_dynamic diff --git a/compiler/src/iree/compiler/Dialect/Encoding/IR/test/invalid.mlir b/compiler/src/iree/compiler/Dialect/Encoding/IR/test/invalid.mlir index b2f23a4c6f84..163de9712b89 100644 --- a/compiler/src/iree/compiler/Dialect/Encoding/IR/test/invalid.mlir +++ b/compiler/src/iree/compiler/Dialect/Encoding/IR/test/invalid.mlir @@ -8,9 +8,10 @@ func.func @illegal_set_encoding_op_with_no_result_encoding(%arg0 : tensor>) -> tensor { +#encoding = #iree_encoding.encoding +func.func @illegal_set_encoding_op_with_source_encoding(%arg0 : tensor) -> tensor { // expected-error @+1 {{source of set_encoding op cannot have a tensor encoding}} - %0 = iree_encoding.set_encoding %arg0: tensor> -> tensor + %0 = iree_encoding.set_encoding %arg0: tensor -> tensor return %0 : tensor } @@ -24,18 +25,20 @@ func.func @illegal_set_encoding_op_with_unknown_encoding(%arg0 : tensor // ----- -func.func @illegal_set_encoding_op_with_rank_change(%arg0 : tensor) -> tensor> { +#encoding = #iree_encoding.encoding +func.func @illegal_set_encoding_op_with_rank_change(%arg0 : tensor) -> tensor { // expected-error @+1 {{cannot change the rank of the tensor}} - %0 = iree_encoding.set_encoding %arg0: tensor -> tensor> - return %0 : tensor> + %0 = iree_encoding.set_encoding %arg0: tensor -> tensor + return %0 : tensor } // ----- -func.func @illegal_set_encoding_op_with_shape_change(%arg0 : tensor<10x20xf32>) -> tensor<20x30xf32, #iree_encoding.encoding> { +#encoding = #iree_encoding.encoding +func.func @illegal_set_encoding_op_with_shape_change(%arg0 : tensor<10x20xf32>) -> tensor<20x30xf32, #encoding> { // expected-error @+1 {{expected to preserve the logical shape of the tensor}} - %0 = iree_encoding.set_encoding %arg0: tensor<10x20xf32> -> tensor<20x30xf32, #iree_encoding.encoding> - return %0 : tensor<20x30xf32, #iree_encoding.encoding> + %0 = iree_encoding.set_encoding %arg0: tensor<10x20xf32> -> tensor<20x30xf32, #encoding> + return %0 : tensor<20x30xf32, #encoding> } // ----- @@ -48,10 +51,11 @@ func.func @illegal_unset_encoding_op_with_no_source_encoding(%arg0 : tensor) -> tensor> { +#encoding = #iree_encoding.encoding +func.func @illegal_unset_encoding_op_with_result_encoding(%arg0 : tensor) -> tensor { // expected-error @+1 {{result of unset_encoding op cannot have a tensor encoding}} - %0 = iree_encoding.unset_encoding %arg0: tensor -> tensor> - return %0 : tensor> + %0 = iree_encoding.unset_encoding %arg0: tensor -> tensor + return %0 : tensor } // ----- @@ -64,16 +68,18 @@ func.func @illegal_unset_encoding_op_with_unknown_encoding(%arg0 : tensor>) -> tensor { +#encoding = #iree_encoding.encoding +func.func @illegal_unset_encoding_op_with_rank_change(%arg0 : tensor) -> tensor { // expected-error @+1 {{cannot change the rank of the tensor}} - %0 = iree_encoding.unset_encoding %arg0: tensor> -> tensor + %0 = iree_encoding.unset_encoding %arg0: tensor -> tensor return %0 : tensor } // ----- -func.func @illegal_unset_encoding_op_with_shape_change(%arg0 : tensor<20x30xf32, #iree_encoding.encoding>) -> tensor<10x20xf32> { +#encoding = #iree_encoding.encoding +func.func @illegal_unset_encoding_op_with_shape_change(%arg0 : tensor<20x30xf32, #encoding>) -> tensor<10x20xf32> { // expected-error @+1 {{expected to preserve the logical shape of the tensor}} - %0 = iree_encoding.unset_encoding %arg0: tensor<20x30xf32, #iree_encoding.encoding> -> tensor<10x20xf32> + %0 = iree_encoding.unset_encoding %arg0: tensor<20x30xf32, #encoding> -> tensor<10x20xf32> return %0 : tensor<10x20xf32> } diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/annotate_dispatches.mlir b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/annotate_dispatches.mlir index 70b35762026e..dee885fefa4c 100644 --- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/annotate_dispatches.mlir +++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/annotate_dispatches.mlir @@ -111,23 +111,24 @@ flow.executable private @ex { // Dispatches set_encoding and unset_encoding ops get a heuristics-driven // summary in their name. +#encoding = #iree_encoding.encoding flow.executable private @ex0 { // CHECK: flow.executable.export public @dispatch0_map_DxD_f32 flow.executable.export public @dispatch0 builtin.module { - func.func @dispatch0(%arg0: !flow.dispatch.tensor>, %arg1: !flow.dispatch.tensor>, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: !flow.dispatch.tensor>>) { + func.func @dispatch0(%arg0: !flow.dispatch.tensor>, %arg1: !flow.dispatch.tensor>, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: !flow.dispatch.tensor>) { %0 = flow.dispatch.workload.ordinal %arg2, 0 : index %1 = flow.dispatch.workload.ordinal %arg3, 1 : index %2 = flow.dispatch.workload.ordinal %arg4, 2 : index %3 = flow.dispatch.workload.ordinal %arg5, 3 : index %4 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor>{%0, %1} %5 = flow.dispatch.tie_shape %arg1 : !flow.dispatch.tensor>{%2, %3} - %6 = flow.dispatch.tie_shape %arg6 : !flow.dispatch.tensor>>{%2, %3} + %6 = flow.dispatch.tie_shape %arg6 : !flow.dispatch.tensor>{%2, %3} %7 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor>{%0, %1} -> tensor %8 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : !flow.dispatch.tensor>{%2, %3} -> tensor %mapped = linalg.map { math.absf } ins(%7 : tensor) outs(%8 : tensor) - %9 = iree_encoding.set_encoding %mapped : tensor -> tensor> - flow.dispatch.tensor.store %9, %6, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor> -> !flow.dispatch.tensor>>{%arg4, %arg5} + %9 = iree_encoding.set_encoding %mapped : tensor -> tensor + flow.dispatch.tensor.store %9, %6, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor -> !flow.dispatch.tensor>{%arg4, %arg5} return } } @@ -136,11 +137,11 @@ flow.executable private @ex1 { // CHECK: flow.executable.export public @dispatch1_unset_encoding_LHS_DxD flow.executable.export public @dispatch1 builtin.module { - func.func @dispatch1(%arg0: !flow.dispatch.tensor>>, %arg1: index, %arg2: index, %arg3: !flow.dispatch.tensor>) { - %0 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor>>{%arg1, %arg2} + func.func @dispatch1(%arg0: !flow.dispatch.tensor>, %arg1: index, %arg2: index, %arg3: !flow.dispatch.tensor>) { + %0 = flow.dispatch.tie_shape %arg0 : !flow.dispatch.tensor>{%arg1, %arg2} %1 = flow.dispatch.tie_shape %arg3 : !flow.dispatch.tensor>{%arg1, %arg2} - %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%arg1, %arg2], strides = [1, 1] : !flow.dispatch.tensor>>{%arg1, %arg2} -> tensor> - %3 = iree_encoding.unset_encoding %2 : tensor> -> tensor + %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%arg1, %arg2], strides = [1, 1] : !flow.dispatch.tensor>{%arg1, %arg2} -> tensor + %3 = iree_encoding.unset_encoding %2 : tensor -> tensor flow.dispatch.tensor.store %3, %1, offsets = [0, 0], sizes = [%arg1, %arg2], strides = [1, 1] : tensor -> !flow.dispatch.tensor>{%arg1, %arg2} return } diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/dispatch_linalg_on_tensors.mlir b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/dispatch_linalg_on_tensors.mlir index 5da4e957b921..ea22db1bf38e 100644 --- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/dispatch_linalg_on_tensors.mlir +++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/dispatch_linalg_on_tensors.mlir @@ -1874,11 +1874,12 @@ util.func public @batchnorm_training(%arg0: tensor<12xf32>, %arg1: tensor<12x12x // ----- +#encoding = #iree_encoding.encoding util.func public @set_encoding_op(%arg0 : tensor) - -> tensor> { + -> tensor { %0 = iree_encoding.set_encoding %arg0 - : tensor -> tensor> - util.return %0 : tensor> + : tensor -> tensor + util.return %0 : tensor } // CHECK: util.func public @set_encoding_op // CHECK-SAME: %[[ARG0:.+]]: tensor @@ -1907,10 +1908,11 @@ util.func public @set_encoding_op(%arg0 : tensor) // ----- -util.func public @unset_encoding_op(%arg0 : tensor>) +#encoding = #iree_encoding.encoding +util.func public @unset_encoding_op(%arg0 : tensor) -> tensor { %0 = iree_encoding.unset_encoding %arg0 - : tensor> -> tensor + : tensor -> tensor util.return %0 : tensor } // CHECK: util.func public @unset_encoding_op @@ -1941,8 +1943,9 @@ util.func public @unset_encoding_op(%arg0 : tensor (-s0 + (s0 ceildiv 16) * 16)> +#encoding = #iree_encoding.encoding util.func public @pad_and_set_encoding_op(%arg0 : tensor) - -> tensor> { + -> tensor { %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %cst = arith.constant 0.0 : f32 @@ -1955,8 +1958,8 @@ util.func public @pad_and_set_encoding_op(%arg0 : tensor) tensor.yield %cst : f32 } : tensor to tensor %encoding = iree_encoding.set_encoding %pad - : tensor -> tensor> - util.return %encoding : tensor> + : tensor -> tensor + util.return %encoding : tensor } // CHECK-DAG: #[[MAP0:.+]] = affine_map<()[s0] -> ((s0 ceildiv 16) * 16)> // CHECK-DAG: #[[MAP1:.+]] = affine_map<()[s0] -> (-s0 + (s0 ceildiv 16) * 16)> @@ -1997,11 +2000,12 @@ util.func public @pad_and_set_encoding_op(%arg0 : tensor) // ----- +#encoding = #iree_encoding.encoding util.func public @unset_encoding_and_slice( - %arg0: tensor>, + %arg0: tensor, %arg1 : index, %arg2 : index) -> tensor { %0 = iree_encoding.unset_encoding %arg0 - : tensor> -> tensor + : tensor -> tensor %1 = tensor.extract_slice %0[0, 0] [%arg1, %arg2] [1, 1] : tensor to tensor util.return %1 : tensor @@ -2039,10 +2043,11 @@ util.func public @unset_encoding_and_slice( #map = affine_map<(d0, d1) -> (d1)> #map1 = affine_map<(d0, d1) -> (d0, d1)> +#encoding = #iree_encoding.encoding util.func public @root_on_unset_encoding( - %arg0: tensor<784x96xf32, #iree_encoding.encoding>, + %arg0: tensor<784x96xf32, #encoding>, %arg1: tensor<96xf32>) -> tensor<784x96xf32> { - %0 = iree_encoding.unset_encoding %arg0 : tensor<784x96xf32, #iree_encoding.encoding> -> tensor<784x96xf32> + %0 = iree_encoding.unset_encoding %arg0 : tensor<784x96xf32, #encoding> -> tensor<784x96xf32> %1 = tensor.empty() : tensor<784x96xf32> %cst = arith.constant 0.000000e+00 : f32 %2 = linalg.fill ins(%cst : f32) outs(%1 : tensor<784x96xf32>) -> tensor<784x96xf32> @@ -2084,14 +2089,15 @@ util.func public @root_on_unset_encoding( // ----- +#encoding = #iree_encoding.encoding util.func public @gemm_encoded( - %arg0 : tensor>, + %arg0 : tensor, %arg1 : tensor>, %arg2 : tensor>) -> tensor> { %0 = linalg.matmul ins(%arg0, %arg1 - : tensor>, + : tensor, tensor>) outs(%arg2 : tensor>) -> tensor> @@ -2115,21 +2121,22 @@ util.func public @gemm_encoded( // ----- +#encoding = #iree_encoding.encoding util.func public @gemm_fill_encoded( - %arg0 : tensor>, + %arg0 : tensor, %arg1 : tensor>) -> tensor> { %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %cst = arith.constant 0.0 : f32 - %d0 = tensor.dim %arg0, %c0 : tensor> + %d0 = tensor.dim %arg0, %c0 : tensor %d1 = tensor.dim %arg1, %c1 : tensor> %empty = tensor.empty(%d0, %d1) : tensor> %fill = linalg.fill ins(%cst : f32) outs(%empty : tensor>) -> tensor> %0 = linalg.matmul ins(%arg0, %arg1 - : tensor>, + : tensor, tensor>) outs(%fill : tensor>) -> tensor> diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/dispatch_linalg_on_tensors_default.mlir b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/dispatch_linalg_on_tensors_default.mlir index ccd1692ccea7..897fbcc5a25d 100644 --- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/dispatch_linalg_on_tensors_default.mlir +++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/dispatch_linalg_on_tensors_default.mlir @@ -32,8 +32,9 @@ util.func public @no_fuse_quantized(%arg0 : tensor, %arg1 : ten #map = affine_map<(d0, d1) -> (d1)> #map1 = affine_map<(d0, d1) -> (d0, d1)> +#encoding = #iree_encoding.encoding util.func public @elem_set_encoding(%arg0: tensor<512xf32>, %arg1: tensor<384x512xf32>, - %arg2: tensor<384x512xf32>) -> tensor<384x512xf32, #iree_encoding.encoding> { + %arg2: tensor<384x512xf32>) -> tensor<384x512xf32, #encoding> { %0 = tensor.empty() : tensor<384x512xf32> %1 = linalg.generic {indexing_maps = [#map, #map1, #map1, #map1], iterator_types = ["parallel", "parallel"]} @@ -44,8 +45,8 @@ util.func public @elem_set_encoding(%arg0: tensor<512xf32>, %arg1: tensor<384x51 %4 = arith.addf %3, %in_1 : f32 linalg.yield %4 : f32 } -> tensor<384x512xf32> - %2 = iree_encoding.set_encoding %1 : tensor<384x512xf32> -> tensor<384x512xf32, #iree_encoding.encoding> - util.return %2 : tensor<384x512xf32, #iree_encoding.encoding> + %2 = iree_encoding.set_encoding %1 : tensor<384x512xf32> -> tensor<384x512xf32, #encoding> + util.return %2 : tensor<384x512xf32, #encoding> } // CHECK-LABEL: util.func public @elem_set_encoding // CHECK: flow.dispatch.workgroups diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/form_dispatch_regions.mlir b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/form_dispatch_regions.mlir index 022315cafc69..b59e4a2732ab 100644 --- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/form_dispatch_regions.mlir +++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/form_dispatch_regions.mlir @@ -137,8 +137,9 @@ util.func public @tranpose_pack_fusion(%arg0: tensor) -> tensor util.func public @set_encoding_fusion(%arg0 : tensor, %arg1 : tensor, - %arg2 : index, %arg3 : index) -> tensor> { + %arg2 : index, %arg3 : index) -> tensor { %cst = arith.constant 0.0 : f32 %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index @@ -168,8 +169,8 @@ util.func public @set_encoding_fusion(%arg0 : tensor, %arg1 : tensor tensor %6 = iree_encoding.set_encoding %5 - : tensor -> tensor> - util.return %6 : tensor> + : tensor -> tensor + util.return %6 : tensor } // CHECK-LABEL: util.func public @set_encoding_fusion( // CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: tensor @@ -187,16 +188,17 @@ util.func public @set_encoding_fusion(%arg0 : tensor, %arg1 : tensor util.func public @set_encoding_pad_fusion(%arg0 : tensor, - %arg1 : index, %arg2 : index) -> tensor> { + %arg1 : index, %arg2 : index) -> tensor { %cst = arith.constant 0.0 : f32 %0 = tensor.pad %arg0 low[0, 0] high[%arg1, %arg2] { ^bb0(%b0: index, %b1 : index): tensor.yield %cst : f32 } : tensor to tensor %1 = iree_encoding.set_encoding %0 - : tensor -> tensor> - util.return %1 : tensor> + : tensor -> tensor + util.return %1 : tensor } // CHECK-LABEL: util.func public @set_encoding_pad_fusion( // CHECK-SAME: %[[ARG0:.+]]: tensor @@ -208,8 +210,9 @@ util.func public @set_encoding_pad_fusion(%arg0 : tensor, // ----- +#encoding = #iree_encoding.encoding util.func public @set_encoding_pad_elementwise_fusion(%arg0 : tensor, %arg1 : tensor, - %arg2 : index, %arg3 : index) -> tensor> { + %arg2 : index, %arg3 : index) -> tensor { %cst = arith.constant 0.0 : f32 %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index @@ -243,8 +246,8 @@ util.func public @set_encoding_pad_elementwise_fusion(%arg0 : tensor, % tensor.yield %cst : f32 } : tensor to tensor %7 = iree_encoding.set_encoding %6 - : tensor -> tensor> - util.return %7 : tensor> + : tensor -> tensor + util.return %7 : tensor } // CHECK-LABEL: util.func public @set_encoding_pad_elementwise_fusion( // CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: tensor @@ -263,13 +266,14 @@ util.func public @set_encoding_pad_elementwise_fusion(%arg0 : tensor, % // ----- +#encoding = #iree_encoding.encoding util.func public @unset_encoding_elementwise_fusion( - %arg0: tensor>, + %arg0: tensor, %arg1: tensor) -> tensor { %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %0 = iree_encoding.unset_encoding %arg0 - : tensor> -> tensor + : tensor -> tensor %1 = tensor.dim %0, %c0 : tensor %2 = tensor.dim %0, %c1 : tensor %3 = tensor.empty(%1, %2) : tensor @@ -298,8 +302,9 @@ util.func public @unset_encoding_elementwise_fusion( // ----- +#encoding = #iree_encoding.encoding util.func public @unset_encoding_slice_elementwise_fusion( - %arg0: tensor>, + %arg0: tensor, %arg1: tensor, %arg2 : index, %arg3 : index) -> tensor { %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/tensor_pad_to_tensor_insert_slice.mlir b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/tensor_pad_to_tensor_insert_slice.mlir index a1f0682483eb..357a2b5ed581 100644 --- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/tensor_pad_to_tensor_insert_slice.mlir +++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/test/tensor_pad_to_tensor_insert_slice.mlir @@ -79,14 +79,15 @@ util.func public @_main(%arg0: tensor<1x33x33x480xf32>, %arg1: tensor<3x3x480x1x // ---- +#encoding = #iree_encoding.encoding> util.func public @dispatch_dispatch_0_generic_512x1024_f32( %arg0: !flow.dispatch.tensor>, %arg1: index, %arg2: index, %arg3: index, %arg4: index, - %arg5: !flow.dispatch.tensor>>>) { + %arg5: !flow.dispatch.tensor>) { %cst = arith.constant 0.000000e+00 : f32 %0 = flow.dispatch.workload.ordinal %arg3, 2 : index %1 = flow.dispatch.workload.ordinal %arg4, 3 : index - %2 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor>>>{%0, %1} + %2 = flow.dispatch.tie_shape %arg5 : !flow.dispatch.tensor>{%0, %1} %3 = flow.dispatch.workload.ordinal %arg1, 0 : index %4 = flow.dispatch.workload.ordinal %arg2, 1 : index %5 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [512, 1024], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<512x1024xf32> @@ -94,8 +95,8 @@ util.func public @dispatch_dispatch_0_generic_512x1024_f32( ^bb0(%arg6: index, %arg7: index): tensor.yield %cst : f32 } : tensor<512x1024xf32> to tensor - %11 = iree_encoding.set_encoding %padded : tensor -> tensor>> - flow.dispatch.tensor.store %11, %2, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : tensor>> -> !flow.dispatch.tensor>>>{%0, %1} + %11 = iree_encoding.set_encoding %padded : tensor -> tensor + flow.dispatch.tensor.store %11, %2, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : tensor -> !flow.dispatch.tensor>{%0, %1} util.return } diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/encode_host_tensors.mlir b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/encode_host_tensors.mlir index 77b09a34cdc8..83c95604bb05 100644 --- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/encode_host_tensors.mlir +++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/encode_host_tensors.mlir @@ -24,8 +24,9 @@ util.func public @denseTensorSizeOfEmpty(%arg0: index) -> index { #map = affine_map<(d0, d1, d2) -> (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> +#encoding = #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array> util.func public @sizeof_lhs_encoding_dynamic(%arg0: index, %arg1: index) -> index { - %0 = stream.tensor.sizeof tensor, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>{%arg0, %arg1} : index + %0 = stream.tensor.sizeof tensor{%arg0, %arg1} : index util.return %0 : index } // CHECK-LABEL: @sizeof_lhs_encoding_dynamic @@ -44,8 +45,9 @@ util.func public @sizeof_lhs_encoding_dynamic(%arg0: index, %arg1: index) -> ind #map = affine_map<(d0, d1, d2) -> (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> +#encoding = #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array> util.func public @sizeof_rhs_encoding_dynamic(%arg0: index, %arg1: index) -> index { - %0 = stream.tensor.sizeof tensor, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>{%arg0, %arg1} : index + %0 = stream.tensor.sizeof tensor{%arg0, %arg1} : index util.return %0 : index } // CHECK-LABEL: @sizeof_rhs_encoding_dynamic @@ -65,8 +67,9 @@ util.func public @sizeof_rhs_encoding_dynamic(%arg0: index, %arg1: index) -> ind #map = affine_map<(d0, d1, d2) -> (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> +#encoding = #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array> util.func public @sizeof_result_encoding_dynamic(%arg0: index, %arg1: index) -> index { - %0 = stream.tensor.sizeof tensor, user_indexing_maps = [#map, #map1, #map2], round_dims_to = array>>{%arg0, %arg1} : index + %0 = stream.tensor.sizeof tensor{%arg0, %arg1} : index util.return %0 : index } // CHECK-LABEL: @sizeof_result_encoding_dynamic @@ -86,8 +89,9 @@ util.func public @sizeof_result_encoding_dynamic(%arg0: index, %arg1: index) -> #map1 = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)> #map2 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)> #map3 = affine_map<(d0, d1, d2) -> (d1, d2)> +#encoding = #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], bcast_map = #map3, round_dims_to = array> util.func public @sizeof_lhs_encoding_with_bcast_across_batch_dim_dynamic(%arg0: index, %arg1: index) -> index { - %0 = stream.tensor.sizeof tensor, user_indexing_maps = [#map, #map1, #map2], bcast_map = #map3, round_dims_to = array>>{%arg0, %arg1} : index + %0 = stream.tensor.sizeof tensor{%arg0, %arg1} : index util.return %0 : index } // CHECK-LABEL: @sizeof_lhs_encoding_with_bcast_across_batch_dim_dynamic @@ -107,8 +111,9 @@ util.func public @sizeof_lhs_encoding_with_bcast_across_batch_dim_dynamic(%arg0: #map1 = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)> #map2 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)> #map3 = affine_map<(d0, d1, d2) -> (d0, d2)> +#encoding = #iree_encoding.encoding, user_indexing_maps = [#map, #map1, #map2], bcast_map = #map3, round_dims_to = array> util.func public @sizeof_lhs_encoding_with_bcast_across_m_dim_dynamic(%arg0: index, %arg1: index) -> index { - %0 = stream.tensor.sizeof tensor, user_indexing_maps = [#map, #map1, #map2], bcast_map = #map3, round_dims_to = array>>{%arg0, %arg1} : index + %0 = stream.tensor.sizeof tensor{%arg0, %arg1} : index util.return %0 : index } // CHECK-LABEL: @sizeof_lhs_encoding_with_bcast_across_m_dim_dynamic