diff --git a/tensorflow/lite/micro/compression.h b/tensorflow/lite/micro/compression.h index 43965c2bf48..cfc8a1c8e85 100644 --- a/tensorflow/lite/micro/compression.h +++ b/tensorflow/lite/micro/compression.h @@ -61,7 +61,7 @@ struct CompressedTensorList { // Sparsely populated array with the same number of elements as there are // tensors in the Subgraph. An alternative would include a tensor index in // the struct for each and walk the list on look up. This could be slow. - CompressionTensorData** tensors; + const CompressionTensorData** tensors; }; } // namespace tflite diff --git a/tensorflow/lite/micro/compression/BUILD b/tensorflow/lite/micro/compression/BUILD new file mode 100644 index 00000000000..cde1b55bb15 --- /dev/null +++ b/tensorflow/lite/micro/compression/BUILD @@ -0,0 +1,94 @@ +load("@flatbuffers//:build_defs.bzl", "flatbuffer_cc_library", "flatbuffer_py_library") +load("@rules_python//python:defs.bzl", "py_test") +load("@tflm_pip_deps//:requirements.bzl", "requirement") + +package( + default_visibility = [ + "//visibility:public", + ], +) + +flatbuffer_cc_library( + name = "metadata_flatbuffer_cc", + srcs = ["metadata.fbs"], +) + +flatbuffer_py_library( + name = "original_flatbuffer_py", + srcs = ["original.fbs"], +) + +flatbuffer_py_library( + name = "metadata_flatbuffer_py", + srcs = ["metadata.fbs"], +) + +cc_test( + name = "metadata_test_cc", + srcs = ["metadata_test.cc"], + deps = [ + "metadata_flatbuffer_cc", + "//tensorflow/lite/micro:hexdump", + "@flatbuffers//:runtime_cc", + ], + size = "small", +) + +py_binary( + name = "compress", + srcs = ["compress.py"], + deps = [ + "@absl_py//absl:app", + "@absl_py//absl/flags", + "@absl_py//absl/logging", + "@flatbuffers//:runtime_py", + "metadata_flatbuffer_py", + "//tensorflow/lite/python:schema_py", + requirement("bitarray"), + requirement("numpy"), + requirement("scikit-learn"), + ], +) + +py_binary( + name = "view", + srcs = [ + "view.py", + ], + deps = [ + "metadata_flatbuffer_py", + 
"//tensorflow/lite/python:schema_py", + ], +) + +py_test( + name = "metadata_test_py", + main = "metadata_test.py", + srcs = ["metadata_test.py"], + deps = [ + "metadata_flatbuffer_py", + "@flatbuffers//:runtime_py", + requirement("hexdump"), + ], + size = "small", +) + +py_test( + name = "original_test_py", + main = "original_test.py", + srcs = ["original_test.py"], + deps = [ + "original_flatbuffer_py", + "@flatbuffers//:runtime_py", + requirement("hexdump"), + ], + size = "small", +) + +genrule( + name = "hello_world_int8.compressed", + srcs = ["//tensorflow/lite/micro/examples/hello_world/models:hello_world_int8.tflite"], + outs = ["hello_world_int8.compressed.tflite"], + cmd = "$(location :compress) --input_model_path $< --output_model_path $@", + tools = [":compress"], +) diff --git a/tensorflow/lite/micro/compression/compress.py b/tensorflow/lite/micro/compression/compress.py new file mode 100644 index 00000000000..18834982f24 --- /dev/null +++ b/tensorflow/lite/micro/compression/compress.py @@ -0,0 +1,244 @@ +# Copyright 2024 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Reduces the number of weights in a .tflite model using various strategies.""" + +# Usage information: +# Default: +# `bazel run tensorflow/lite/micro/compression:compress -- \ +# --input_model_path=<input.tflite> \ +# --output_model_path=<output.tflite>` + + +from tensorflow.lite.micro.compression import metadata_flatbuffer_py_generated as compression_schema +from tensorflow.lite.python import schema_py_generated as tflite_schema + +from absl import app +from absl import flags +from absl import logging +import bitarray +import bitarray.util +import numpy as np +import flatbuffers +import sklearn.cluster +import struct + + +_INPUT_MODEL_PATH = flags.DEFINE_string( + "input_model_path", + None, + ".tflite input model path", + required=True, +) + +_TEST_COMPRESSED_MODEL = flags.DEFINE_bool( + "test_compressed_model", + False, + "optional config to test models with random data and" + " report on the differences in output.", +) + +_OUTPUT_MODEL_PATH = flags.DEFINE_string( + "output_model_path", + None, + ".tflite output path. Leave blank if same as input+.compressed.tflite", +) + + +def read_model(path): + with open(path, 'rb') as file: + buffer = bytearray(file.read()) + return tflite_schema.ModelT.InitFromPackedBuf(buffer, 0) + + +def write_model(model, path): + builder = flatbuffers.Builder(32) + root = model.Pack(builder) + builder.Finish(root) + buffer: bytearray = builder.Output() + + with open(path, 'wb') as file: + file.write(buffer) + + +def pack_compression_metadata(m): + builder = flatbuffers.Builder(32) + root = m.Pack(builder) + builder.Finish(root) + buffer: bytearray = builder.Output() + return buffer + + +def pack_lut_indexes(indexes, bitwidth): + """Pack the sequence of integers given in `indexes` into bitwidth-wide fields + in a buffer, and return the buffer. Raise an OverflowError if any element + does not fit into a bitwidth-wide field.
""" + ba = bitarray.bitarray(endian="big") + for i in indexes: + field = bitarray.util.int2ba(i, length=bitwidth, endian="big") + ba.extend(field) + return ba.tobytes() + + +def pack_lut_values(values, struct_format): + """Pack the `values` into a buffer of bytes, using a `struct_format` + character from the standard module `struct` to determine the type of values + and corresponding encoding into bytes. Always little-endian byte order. + """ + buffer = bytearray() + little_endian = "<" + packer = struct.Struct(little_endian + struct_format) + for v in values: + buffer.extend(packer.pack(v)) + return buffer + + +def unpack_buffer_values(data, struct_format): + little_endian = "<" + unpacker = struct.Struct(little_endian + struct_format) + values = [v[0] for v in unpacker.iter_unpack(bytes(data))] + return values + + +def tensor_type_to_struct_format(type): + m = { + tflite_schema.TensorType.INT8: "b", + tflite_schema.TensorType.INT16: "h", + tflite_schema.TensorType.FLOAT32: "f", + } + return m[type] + + +def bq(sequence, num_values): + """Quantize a sequence of integers, minimizing the total error using k-means + clustering. 
+ + Parameters: + sequence :list - a sequence of integers to be quantized + num_values :int - the number of quantization levels + + Returns: + (indexes, values): a tuple with the list of indexes and list of values + """ + sequence = np.array(sequence).reshape(-1, 1) + kmeans = sklearn.cluster.KMeans(n_clusters=num_values, + random_state=0).fit(sequence) + values = kmeans.cluster_centers_.flatten() + values = np.round(values).astype(int).tolist() + indexes = kmeans.predict(sequence).tolist() + return (indexes, values) + + +def compress_tensor(subgraph_id, tensor_id, model): + subgraph = model.subgraphs[subgraph_id] + tensor = subgraph.tensors[tensor_id] + struct_format = tensor_type_to_struct_format(tensor.type) + buffer_id = tensor.buffer + buffer = model.buffers[buffer_id] + sequence = unpack_buffer_values(buffer.data, struct_format) + bitwidth = 2 + indexes, values = bq(sequence, 2 ** bitwidth) + + # append index buffer + buffer = tflite_schema.BufferT() + buffer.data = pack_lut_indexes(indexes, bitwidth) + model.buffers.append(buffer) + index_id = len(model.buffers) - 1 + + # append value buffer + buffer = tflite_schema.BufferT() + buffer.data = pack_lut_values(values, struct_format) + model.buffers.append(buffer) + value_id = len(model.buffers) - 1 + + # create metadata + lut_tensor = compression_schema.LutTensorT() + lut_tensor.subgraph = subgraph_id + lut_tensor.tensor = tensor_id + lut_tensor.indexBitwidth = bitwidth + lut_tensor.indexBuffer = index_id + lut_tensor.valueBuffer = value_id + + return lut_tensor + + +def compress_fully_connected(subgraph_id, operator_id, model): + # On a fully_connected operator, we compress the 2nd input tensor + subgraph = model.subgraphs[subgraph_id] + operator = subgraph.operators[operator_id] + tensor_id_2 = operator.inputs[1] + # tensor_id_3 = operator.inputs[2] + lut_tensor_2 = compress_tensor(subgraph_id, tensor_id_2, model) + # lut_tensor_3 = compress_tensor(subgraph_id, tensor_id_3, model) + return (lut_tensor_2,) + + +def
get_opcode_compressions(model): + """Return a map of operator_code indexes to compression functions, for those + operators we wish to and know how to compress. + """ + compressable = {tflite_schema.BuiltinOperator.FULLY_CONNECTED: compress_fully_connected} + compressions = {} + for index, code in enumerate(model.operatorCodes): + if code.builtinCode in compressable: + compressions[index] = compressable[code.builtinCode] + return compressions + + +def compress(model): + # Walk op codes, identify those we compress, note index + # Walk operators, match op code indexes, note tensors to compress + # Walk those tensors, creating LUTs in buffers and metadata + + compressions = get_opcode_compressions(model) + + lut_tensors = [] + + for subgraph_id, subgraph in enumerate(model.subgraphs): + for operator_id, operator in enumerate(subgraph.operators): + fn = compressions.get(operator.opcodeIndex) + if fn is not None: + result = fn(subgraph_id, operator_id, model) + if result is not None: + lut_tensors.extend(result) + + compression_metadata = compression_schema.MetadataT() + compression_metadata.lutTensors = lut_tensors + + return compression_metadata + + +def main(_) -> None: + output_model_path = _OUTPUT_MODEL_PATH.value or ( + _INPUT_MODEL_PATH.value.split(".tflite")[0] + ".compressed.tflite") + logging.info("compressing %s to %s", _INPUT_MODEL_PATH.value, output_model_path) + + model = read_model(_INPUT_MODEL_PATH.value) + + compression_metadata = compress(model) + + buffer = tflite_schema.BufferT() + buffer.data = pack_compression_metadata(compression_metadata) + model.buffers.append(buffer) + + metadata = tflite_schema.MetadataT() + metadata.name = "COMPRESSION_METADATA" + metadata.buffer = len(model.buffers) - 1 + model.metadata = [*(model.metadata or []), metadata] # metadata is None when the input model has none + + write_model(model, output_model_path) + + +if __name__ == "__main__": + app.run(main) diff --git a/tensorflow/lite/micro/compression/metadata.fbs b/tensorflow/lite/micro/compression/metadata.fbs new file mode 100644 index
00000000000..dcfb1ccafb9 --- /dev/null +++ b/tensorflow/lite/micro/compression/metadata.fbs @@ -0,0 +1,38 @@ +// Copyright 2024 The TensorFlow Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Flatbuffer schema describing a TFLM compressed model. Use as the value for +// the key "COMPRESSION_METADATA" in the metadata table in a .tflite flatbuffer. + +namespace tflite.micro.compression; + +table Metadata { + lut_tensors:[LutTensor]; // list of tensors that are compressed by LUT +} + +struct LutTensor { + subgraph:uint16; // the index of the subgraph + tensor:uint16; // the index of the tensor in its subgraph + index_bitwidth:uint8; // the bit-width of LUT indexes + index_buffer:uint16; // the index of the buffer containing LUT indexes + value_buffer:uint16; // the index of the buffer containing LUT values +} +// Look-Up-Table tensors are encoded in two buffers: an index buffer and a +// value buffer. The indexes are unsigned integers packed into the index buffer +// in bitwidth-wide bit fields with a big-endian bit order. The data in the +// value buffer is encoded as usual according to the type of the tensor. +// Tensors with multiple channels have distinct value tables for each channel, +// concatenated into one value buffer. (Will elaborate this comment.)
+ +root_type Metadata; diff --git a/tensorflow/lite/micro/compression/metadata_generated.h b/tensorflow/lite/micro/compression/metadata_generated.h new file mode 100644 index 00000000000..eaa03cb21e8 --- /dev/null +++ b/tensorflow/lite/micro/compression/metadata_generated.h @@ -0,0 +1,148 @@ +// automatically generated by the FlatBuffers compiler, do not modify + + +#ifndef FLATBUFFERS_GENERATED_METADATA_TFLITE_MICRO_COMPRESSION_H_ +#define FLATBUFFERS_GENERATED_METADATA_TFLITE_MICRO_COMPRESSION_H_ + +#include "flatbuffers/flatbuffers.h" + +namespace tflite { +namespace micro { +namespace compression { + +struct Metadata; +struct MetadataBuilder; + +struct LutTensor; + +FLATBUFFERS_MANUALLY_ALIGNED_STRUCT(2) LutTensor FLATBUFFERS_FINAL_CLASS { + private: + uint16_t subgraph_; + uint16_t tensor_; + uint8_t index_bitwidth_; + int8_t padding0__; + uint16_t index_buffer_; + uint16_t value_buffer_; + + public: + LutTensor() + : subgraph_(0), + tensor_(0), + index_bitwidth_(0), + padding0__(0), + index_buffer_(0), + value_buffer_(0) { + (void)padding0__; + } + LutTensor(uint16_t _subgraph, uint16_t _tensor, uint8_t _index_bitwidth, uint16_t _index_buffer, uint16_t _value_buffer) + : subgraph_(flatbuffers::EndianScalar(_subgraph)), + tensor_(flatbuffers::EndianScalar(_tensor)), + index_bitwidth_(flatbuffers::EndianScalar(_index_bitwidth)), + padding0__(0), + index_buffer_(flatbuffers::EndianScalar(_index_buffer)), + value_buffer_(flatbuffers::EndianScalar(_value_buffer)) { + } + uint16_t subgraph() const { + return flatbuffers::EndianScalar(subgraph_); + } + uint16_t tensor() const { + return flatbuffers::EndianScalar(tensor_); + } + uint8_t index_bitwidth() const { + return flatbuffers::EndianScalar(index_bitwidth_); + } + uint16_t index_buffer() const { + return flatbuffers::EndianScalar(index_buffer_); + } + uint16_t value_buffer() const { + return flatbuffers::EndianScalar(value_buffer_); + } +}; +FLATBUFFERS_STRUCT_END(LutTensor, 10); + +struct Metadata 
FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { + typedef MetadataBuilder Builder; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_LUT_TENSORS = 4 + }; + const flatbuffers::Vector *lut_tensors() const { + return GetPointer *>(VT_LUT_TENSORS); + } + bool Verify(flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyOffset(verifier, VT_LUT_TENSORS) && + verifier.VerifyVector(lut_tensors()) && + verifier.EndTable(); + } +}; + +struct MetadataBuilder { + typedef Metadata Table; + flatbuffers::FlatBufferBuilder &fbb_; + flatbuffers::uoffset_t start_; + void add_lut_tensors(flatbuffers::Offset> lut_tensors) { + fbb_.AddOffset(Metadata::VT_LUT_TENSORS, lut_tensors); + } + explicit MetadataBuilder(flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = flatbuffers::Offset(end); + return o; + } +}; + +inline flatbuffers::Offset CreateMetadata( + flatbuffers::FlatBufferBuilder &_fbb, + flatbuffers::Offset> lut_tensors = 0) { + MetadataBuilder builder_(_fbb); + builder_.add_lut_tensors(lut_tensors); + return builder_.Finish(); +} + +inline flatbuffers::Offset CreateMetadataDirect( + flatbuffers::FlatBufferBuilder &_fbb, + const std::vector *lut_tensors = nullptr) { + auto lut_tensors__ = lut_tensors ? 
_fbb.CreateVectorOfStructs(*lut_tensors) : 0; + return tflite::micro::compression::CreateMetadata( + _fbb, + lut_tensors__); +} + +inline const tflite::micro::compression::Metadata *GetMetadata(const void *buf) { + return flatbuffers::GetRoot(buf); +} + +inline const tflite::micro::compression::Metadata *GetSizePrefixedMetadata(const void *buf) { + return flatbuffers::GetSizePrefixedRoot(buf); +} + +inline bool VerifyMetadataBuffer( + flatbuffers::Verifier &verifier) { + return verifier.VerifyBuffer(nullptr); +} + +inline bool VerifySizePrefixedMetadataBuffer( + flatbuffers::Verifier &verifier) { + return verifier.VerifySizePrefixedBuffer(nullptr); +} + +inline void FinishMetadataBuffer( + flatbuffers::FlatBufferBuilder &fbb, + flatbuffers::Offset root) { + fbb.Finish(root); +} + +inline void FinishSizePrefixedMetadataBuffer( + flatbuffers::FlatBufferBuilder &fbb, + flatbuffers::Offset root) { + fbb.FinishSizePrefixed(root); +} + +} // namespace compression +} // namespace micro +} // namespace tflite + +#endif // FLATBUFFERS_GENERATED_METADATA_TFLITE_MICRO_COMPRESSION_H_ diff --git a/tensorflow/lite/micro/compression/metadata_test.cc b/tensorflow/lite/micro/compression/metadata_test.cc new file mode 100644 index 00000000000..74b567c7d14 --- /dev/null +++ b/tensorflow/lite/micro/compression/metadata_test.cc @@ -0,0 +1,71 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +// Test validity of the flatbuffer schema and illustrate use of the flatbuffer +// machinery with C++. + +#include +#include + +#include "metadata_generated.h" +#include "tensorflow/lite/micro/hexdump.h" + +using tflite::micro::compression::LutTensor; +using tflite::micro::compression::Metadata; +using tflite::micro::compression::MetadataT; + +bool operator==(const LutTensor& a, const LutTensor& b) { + return + a.subgraph() == b.subgraph() && + a.tensor() == b.tensor() && + a.index_bitwidth() == b.index_bitwidth() && + a.index_buffer() == b.index_buffer() && + a.value_buffer() == b.value_buffer(); +} + +int main(int argc, char* argv[]) { + const LutTensor lut_tensor0 { + 0, // subgraph + 127, // tensor + 2, // index_bitwidth + 128, // index_buffer + 129, // value_buffer + }; + const LutTensor lut_tensor1 { + 1, // subgraph + 164, // tensor + 2, // index_bitwidth + 136, // index_buffer + 129, // value_buffer + }; + MetadataT metadata; + metadata.lut_tensors = {lut_tensor0, lut_tensor1}; + + flatbuffers::FlatBufferBuilder builder; + auto root = Metadata::Pack(builder, &metadata); + builder.Finish(root); + const uint8_t* buffer = builder.GetBufferPointer(); + + tflite::hexdump( + {reinterpret_cast(buffer), builder.GetSize()}); + std::cout << "length: " << builder.GetSize() << "\n"; + + auto readback = tflite::micro::compression::GetMetadata(buffer); + auto& read_lut_tensor0 = *readback->lut_tensors()->Get(0); + auto& read_lut_tensor1 = *readback->lut_tensors()->Get(1); + assert(read_lut_tensor0 == lut_tensor0); + assert(read_lut_tensor1 == lut_tensor1); + + return 0; +} diff --git a/tensorflow/lite/micro/compression/metadata_test.py b/tensorflow/lite/micro/compression/metadata_test.py new file mode 100644 index 00000000000..3d954154b8a --- /dev/null +++ b/tensorflow/lite/micro/compression/metadata_test.py @@ -0,0 +1,67 @@ +# Copyright 2024 The TensorFlow Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Test validity of the flatbuffer schema and illustrate use of the flatbuffer +# machinery with Python + +import sys +import hexdump +import flatbuffers + +# `.*_generated` is the name of the module created by the Bazel rule +# `flatbuffer_py_library' based on the schema. +from tensorflow.lite.micro.compression import metadata_flatbuffer_py_generated as schema + + +def main(): + # The classes with a `T` suffix provide an object-oriented representation of + # the object tree in the flatbuffer using native data structures. + lut_tensor0 = schema.LutTensorT() + lut_tensor0.subgraph = 1 + lut_tensor0.tensor = 127 + lut_tensor0.indexBitwidth = 2 + lut_tensor0.indexBuffer = 128 + lut_tensor0.valueBuffer = 129 + + lut_tensor1 = schema.LutTensorT() + lut_tensor1.subgraph = 1 + lut_tensor1.tensor = 164 + lut_tensor1.indexBitwidth = 2 + lut_tensor1.indexBuffer = 136 + lut_tensor1.valueBuffer = 129 + + metadata = schema.MetadataT() + metadata.lutTensors = [lut_tensor0, lut_tensor1] + + # Build the flatbuffer itself using the flatbuffers runtime module. 
+ builder = flatbuffers.Builder(32) + root = metadata.Pack(builder) + builder.Finish(root) + buffer: bytearray = builder.Output() + + print(hexdump.hexdump(buffer, result='return')) + print(f"length: {len(buffer)}") + + def attrs_equal(a, b): + return all(vars(a)[key] == vars(b)[key] for key in vars(a)) + + readback = schema.MetadataT.InitFromPackedBuf(buffer, 0) + assert attrs_equal(readback.lutTensors[0], lut_tensor0) + assert attrs_equal(readback.lutTensors[1], lut_tensor1) + + sys.exit() + + +if __name__ == "__main__": + main() diff --git a/tensorflow/lite/micro/compression/original.fbs b/tensorflow/lite/micro/compression/original.fbs new file mode 100644 index 00000000000..3a05a6cd4f2 --- /dev/null +++ b/tensorflow/lite/micro/compression/original.fbs @@ -0,0 +1,82 @@ +// Copyright 2024 The TensorFlow Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ + +namespace tflite.micro; + +table ValuesInt8 { + values:[int8]; +} + +table ValuesInt16 { + values:[int16]; +} + +table ValuesInt32 { + values:[int32]; +} + +table ValuesInt64 { + values:[int64]; +} + +table ValuesFloat32 { + values:[float32]; +} + +union ValuesUnion { + ValuesFloat32, + ValuesInt8, + ValuesInt16, + ValuesInt32, + ValuesInt64 +} + +table Values { + values:ValuesUnion; +} + +table BinQuantBufferOptions { + value_table_index:int; + compressed_bit_width:uint8; // Should be 2 or 4 +} + +union CompressedBufferOptions { + BinQuantBufferOptions, + // HuffmanBufferOptions, // Future +} + +table CompressedBuffer { + buffer_index:int; // Buffer index from the top-level Model buffer vector + options:CompressedBufferOptions; +} + +table BinQuantCompression { + version:uint8; + // For a given value table, if the corresponding buffer was per-tensor quantized, there should be 4 or 16 elements (2 bit or 4 bit indexes). + // If the buffer was per-channel quantized, there should be 4/16 x number of channels elements. These will be laid out in the table as: + // [c0v0, c0v1, c0v2, c0v3, c1v0, c1v1, ... cNv3] + value_tables:[Values]; +} + +table CompressionMetadata { + // List of compressed buffers + buffers:[CompressedBuffer]; + + // (Optional) Model-wide Bin & Quant compression parameters. Only needed if a + // CompressedBuffer contains BinQuantBufferOptions. + bin_quant_compression:BinQuantCompression; +} + +root_type CompressionMetadata; diff --git a/tensorflow/lite/micro/compression/original_test.py b/tensorflow/lite/micro/compression/original_test.py new file mode 100644 index 00000000000..edc8ad4d11f --- /dev/null +++ b/tensorflow/lite/micro/compression/original_test.py @@ -0,0 +1,76 @@ +# Copyright 2024 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Test validity of the flatbuffer schema and illustrate use of the flatbuffer +# machinery with Python + +import sys +import hexdump +import flatbuffers + +# `.*_generated` is the name of the module created by the Bazel rule +# `flatbuffer_py_library` based on the schema. +from tensorflow.lite.micro.compression import original_flatbuffer_py_generated as schema + + +def main(): + # The classes with a `T` suffix provide an object-oriented representation of + # the object tree in the flatbuffer using native data structures. + bq0_options = schema.BinQuantBufferOptionsT() + bq0_options.valueTableIndex = 0 + bq0_options.compressedBitWidth = 2 + + bq1_options = schema.BinQuantBufferOptionsT() + bq1_options.valueTableIndex = 1 + bq1_options.compressedBitWidth = 4 + + buffer0 = schema.CompressedBufferT() + buffer0.bufferIndex = 0 + buffer0.options = bq0_options + buffer0.optionsType = schema.CompressedBufferOptions.BinQuantBufferOptions + + buffer1 = schema.CompressedBufferT() + buffer1.bufferIndex = 1 + buffer1.options = bq1_options + buffer1.optionsType = schema.CompressedBufferOptions.BinQuantBufferOptions + + valuesInt8 = schema.ValuesInt8T() + valuesInt8.values = [65] + values0 = schema.ValuesT() + values0.values = valuesInt8 + values0.valuesType = schema.ValuesUnion.ValuesInt8 + + bq_compression = schema.BinQuantCompressionT() + bq_compression.valueTables = [values0] + + metadata = schema.CompressionMetadataT() + metadata.buffers = [buffer0, buffer1] + metadata.binQuantCompression = bq_compression + + # Build the flatbuffer itself using the flatbuffers
runtime module. + builder = flatbuffers.Builder(32) + root = metadata.Pack(builder) + builder.Finish(root) + buffer: bytearray = builder.Output() + + print(hexdump.hexdump(buffer, result='return')) + print(f"length: {len(buffer)}") + + readback = schema.CompressionMetadataT.InitFromPackedBuf(buffer, 0) + + sys.exit() + + +if __name__ == "__main__": + main() diff --git a/tensorflow/lite/micro/compression/view.py b/tensorflow/lite/micro/compression/view.py new file mode 100644 index 00000000000..55c4255ede1 --- /dev/null +++ b/tensorflow/lite/micro/compression/view.py @@ -0,0 +1,155 @@ +# Copyright 2024 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ + +import pprint + +from tensorflow.lite.micro.compression import metadata_flatbuffer_py_generated as compression_schema +from tensorflow.lite.python import schema_py_generated as tflite_schema + + +def read_model(path): + with open(path, 'rb') as file: + buffer = bytearray(file.read()) + return tflite_schema.ModelT.InitFromPackedBuf(buffer, 0) + + +def unpack_list(source): + result = [] + for index, s in enumerate(source): + d = {"_index": index} | vars(s) + result.append(d) + return result + + +def unpack_operators(operators): + result = [] + for index, o in enumerate(operators): + d = {"_index": index, + "opcode_index": o.opcodeIndex, + "inputs": unpack_array(o.inputs), + "outputs": unpack_array(o.outputs), + } + result.append(d) + return result + + +def unpack_TensorType(type): + attrs = [attr for attr in dir(tflite_schema.TensorType) if not + attr.startswith("__")] + lut = {getattr(tflite_schema.TensorType, attr): attr for attr in attrs} + return lut[type] + + +def unpack_tensors(tensors): + result = [] + for index, t in enumerate(tensors): + d = {"_index": index, + "name": t.name.decode("utf-8"), + "type": unpack_TensorType(t.type), + "shape": unpack_array(t.shape), + "quantization": [unpack_array(t.quantization.scale), unpack_array(t.quantization.zeroPoint)], + "buffer": t.buffer, + } + result.append(d) + return result + + +def unpack_subgraphs(subgraphs): + result = [] + for index, s in enumerate(subgraphs): + d = {"_index": index, + "name": s.name, + # "inputs": s.inputs, + # "outputs": s.outputs, + "operators": unpack_operators(s.operators), + "tensors": unpack_tensors(s.tensors), + } + result.append(d) + return result + + +def unpack_metadata(metadata): + return [{"name": m.name.decode("utf-8"), "buffer": m.buffer} for m in + metadata] + + +def unpack_compression_metadata(buffer): + metadata = compression_schema.MetadataT.InitFromPackedBuf(buffer, 0) + result = [] + for index, t in enumerate(metadata.lutTensors): + d = {"_index": index, + "subgraph": 
t.subgraph, + "tensor": t.tensor, + "indexBitwidth": t.indexBitwidth, + "indexBuffer": t.indexBuffer, + "valueBuffer": t.valueBuffer, + } + result.append(d) + return {"lut_tensors": result} + + +def unpack_array(a): + try: + # Avoid printing as numpy arrays if possible. The pprint module does not + # format them well. + a = a.tolist() + except AttributeError: + pass + return a + + +def unpack_buffers(buffers, compression_metadata=None): + result = [] + for index, b in enumerate(buffers): + d = {"_index": index} + d = d | {"data": unpack_array(b.data)} + if index == compression_metadata: d = d | {"_compression_metadata_decoded": + unpack_compression_metadata(bytes(b.data))} + result.append(d) + return result + + +def get_compression_metadata_buffer(model): + # Return the metadata buffer data or None + for item in model.metadata: + if item.name.decode("utf-8") == "COMPRESSION_METADATA": + return item.buffer + else: + return None + + +def print_model(model, format=None): + output = { + "description": model.description.decode("utf-8"), + "version": model.version, + "operator_codes": unpack_list(model.operatorCodes), + "metadata": unpack_metadata(model.metadata), + "subgraphs": unpack_subgraphs(model.subgraphs), + "buffers": unpack_buffers(model.buffers, + get_compression_metadata_buffer(model)), + } + + pprint.pprint(output, width=90, sort_dicts=False, compact=True) + + +def main(argv=None): + filename = argv[1] + model = read_model(filename) + print_model(model) + + +if __name__ == "__main__": + import sys + main(sys.argv) diff --git a/tensorflow/lite/micro/fake_micro_context.cc b/tensorflow/lite/micro/fake_micro_context.cc index 5787ffd0648..8874798896c 100644 --- a/tensorflow/lite/micro/fake_micro_context.cc +++ b/tensorflow/lite/micro/fake_micro_context.cc @@ -1,4 +1,4 @@ -/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. 
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -23,10 +23,23 @@ limitations under the License. namespace tflite { -FakeMicroContext::FakeMicroContext(TfLiteTensor* tensors, - SingleArenaBufferAllocator* allocator, - MicroGraph* micro_graph) - : graph_(*micro_graph), tensors_(tensors), allocator_(allocator) {} +FakeMicroContext::FakeMicroContext( + TfLiteTensor* tensors, SingleArenaBufferAllocator* allocator, + MicroGraph* micro_graph +#ifdef USE_TFLM_COMPRESSION + , + const CompressedTensorList* compressed_tensors +#endif // USE_TFLM_COMPRESSION + ) + : graph_(*micro_graph), + tensors_(tensors), + allocator_(allocator) +#ifdef USE_TFLM_COMPRESSION + , + compressed_tensors_(compressed_tensors) +#endif // USE_TFLM_COMPRESSION +{ +} TfLiteTensor* FakeMicroContext::AllocateTempTfLiteTensor(int tensor_index) { allocated_temp_count_++; @@ -112,4 +125,60 @@ void* FakeMicroContext::external_context() { return nullptr; } MicroGraph& FakeMicroContext::graph() { return graph_; } +#ifdef USE_TFLM_COMPRESSION + +// Available during Prepare & Eval. Returns false if tensor is not +// compressed. +bool FakeMicroContext::IsTensorCompressed(const TfLiteNode* node, + int tensor_idx) { + if (compressed_tensors_ != nullptr && tensor_idx < node->inputs->size) { + int index = node->inputs->data[tensor_idx]; + if (index >= 0 && compressed_tensors_->tensors[index] != nullptr) { + return true; + } + } + + return false; +} + +// Only available during Prepare. The kernel is responsible for storing the +// scratch buffer handle. 
+int FakeMicroContext::AllocateDecompressionScratchBuffer(const TfLiteNode* node, + int tensor_idx) { + if (compressed_tensors_ == nullptr || tensor_idx >= node->inputs->size) { + return -1; + } + int index = node->inputs->data[tensor_idx]; + if (index < 0 || compressed_tensors_->tensors[index] == nullptr) { + return -1; + } + TfLiteTensor* tensor = &tensors_[index]; + int scratch_index = -1; + TfLiteStatus result = + RequestScratchBufferInArena(tensor->bytes, &scratch_index); + if (result != kTfLiteOk) { + return -1; + } + + return scratch_index; +} + +// Available during Prepare & Eval. Returns nullptr if tensor is not +// compressed. +const CompressionTensorData* FakeMicroContext::GetTensorCompressionData( + const TfLiteNode* node, int tensor_idx) { + if (compressed_tensors_ == nullptr || tensor_idx >= node->inputs->size) { + return nullptr; + } + + int index = node->inputs->data[tensor_idx]; + if (index < 0) { + return nullptr; + } + + return compressed_tensors_->tensors[index]; +} + +#endif // USE_TFLM_COMPRESSION + } // namespace tflite diff --git a/tensorflow/lite/micro/kernels/conv.cc b/tensorflow/lite/micro/kernels/conv.cc index 0df35fce4eb..221d560afa6 100644 --- a/tensorflow/lite/micro/kernels/conv.cc +++ b/tensorflow/lite/micro/kernels/conv.cc @@ -1,4 +1,4 @@ -/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -45,15 +45,35 @@ TfLiteStatus ConvEval(TfLiteContext* context, TfLiteNode* node) { TFLITE_DCHECK(node->user_data != nullptr); const auto& data = *(static_cast(node->user_data)); +#ifdef USE_TFLM_COMPRESSION + + MicroContext* micro_context = GetMicroContext(context); + + const CompressionTensorData* weights_comp_td = + micro_context->GetTensorCompressionData(node, kConvWeightsTensor); + const CompressionTensorData* bias_comp_td = + micro_context->GetTensorCompressionData(node, kConvBiasTensor); + +#endif // USE_TFLM_COMPRESSION + switch (input->type) { // Already know in/out types are same. case kTfLiteFloat32: { tflite::reference_ops::Conv( ConvParamsFloat(params, data), tflite::micro::GetTensorShape(input), tflite::micro::GetTensorData(input), tflite::micro::GetTensorShape(filter), +#ifdef USE_TFLM_COMPRESSION + tflite::micro::GetTensorData(micro_context, filter, + weights_comp_td, + data.weights_scratch_index), + tflite::micro::GetTensorShape(bias), + tflite::micro::GetTensorData(micro_context, bias, bias_comp_td, + data.bias_scratch_index), +#else // USE_TFLM_COMPRESSION tflite::micro::GetTensorData(filter), tflite::micro::GetTensorShape(bias), tflite::micro::GetOptionalTensorData(bias), +#endif // USE_TFLM_COMPRESSION tflite::micro::GetTensorShape(output), tflite::micro::GetTensorData(output), tflite::micro::GetTensorShape(nullptr), nullptr); @@ -67,9 +87,18 @@ TfLiteStatus ConvEval(TfLiteContext* context, TfLiteNode* node) { tflite::micro::GetTensorShape(input), tflite::micro::GetTensorData(input), tflite::micro::GetTensorShape(filter), +#ifdef USE_TFLM_COMPRESSION + tflite::micro::GetTensorData(micro_context, filter, + weights_comp_td, + data.weights_scratch_index), + tflite::micro::GetTensorShape(bias), + tflite::micro::GetTensorData( + micro_context, bias, bias_comp_td, data.bias_scratch_index), +#else // USE_TFLM_COMPRESSION tflite::micro::GetTensorData(filter), tflite::micro::GetTensorShape(bias), tflite::micro::GetOptionalTensorData(bias), +#endif // 
USE_TFLM_COMPRESSION tflite::micro::GetTensorShape(output), tflite::micro::GetTensorData(output)); } else if (bias->type == kTfLiteInt64) { @@ -79,9 +108,18 @@ TfLiteStatus ConvEval(TfLiteContext* context, TfLiteNode* node) { tflite::micro::GetTensorShape(input), tflite::micro::GetTensorData(input), tflite::micro::GetTensorShape(filter), +#ifdef USE_TFLM_COMPRESSION + tflite::micro::GetTensorData(micro_context, filter, + weights_comp_td, + data.weights_scratch_index), + tflite::micro::GetTensorShape(bias), + tflite::micro::GetTensorData( + micro_context, bias, bias_comp_td, data.bias_scratch_index), +#else // USE_TFLM_COMPRESSION tflite::micro::GetTensorData(filter), tflite::micro::GetTensorShape(bias), tflite::micro::GetOptionalTensorData(bias), +#endif // USE_TFLM_COMPRESSION tflite::micro::GetTensorShape(output), tflite::micro::GetTensorData(output)); } else { @@ -119,9 +157,19 @@ TfLiteStatus ConvEval(TfLiteContext* context, TfLiteNode* node) { tflite::micro::GetTensorShape(input), tflite::micro::GetTensorData(input), tflite::micro::GetTensorShape(filter), +#ifdef USE_TFLM_COMPRESSION + tflite::micro::GetTensorData(micro_context, filter, + weights_comp_td, + data.weights_scratch_index), + tflite::micro::GetTensorShape(bias), + tflite::micro::GetTensorData( + micro_context, bias, bias_comp_td, data.bias_scratch_index), +#else // USE_TFLM_COMPRESSION tflite::micro::GetTensorData(filter), tflite::micro::GetTensorShape(bias), tflite::micro::GetOptionalTensorData(bias), +#endif // USE_TFLM_COMPRESSION + tflite::micro::GetTensorShape(output), tflite::micro::GetTensorData(output)); break; diff --git a/tensorflow/lite/micro/kernels/conv.h b/tensorflow/lite/micro/kernels/conv.h index 0c8073f48f0..0090053e03c 100644 --- a/tensorflow/lite/micro/kernels/conv.h +++ b/tensorflow/lite/micro/kernels/conv.h @@ -1,4 +1,4 @@ -/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. 
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -49,6 +49,14 @@ struct OpDataConv { // A buffer used to store unpacked filter values. This is used if the source // tensor is of n-bit precision that cannot be easily processed by kernels. int filter_buffer_index; + +#ifdef USE_TFLM_COMPRESSION + + // scratch buffers for compressed tensors + int weights_scratch_index; + int bias_scratch_index; + +#endif // USE_TFLM_COMPRESSION }; extern const int kConvInputTensor; diff --git a/tensorflow/lite/micro/kernels/conv_common.cc b/tensorflow/lite/micro/kernels/conv_common.cc index 51c7a6ff2d6..9f0f2f79588 100644 --- a/tensorflow/lite/micro/kernels/conv_common.cc +++ b/tensorflow/lite/micro/kernels/conv_common.cc @@ -1,4 +1,4 @@ -/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -209,6 +209,23 @@ TfLiteStatus ConvPrepare(TfLiteContext* context, TfLiteNode* node) { &data->filter_buffer_index); } +#ifdef USE_TFLM_COMPRESSION + + // Compression scratch buffers. + // These will only be allocated if the tensor is compressed. 
+ if (micro_context->IsTensorCompressed(node, kConvWeightsTensor) && + filter->type == kTfLiteInt4) { + MicroPrintf("Compression not supported with INT4 tensors"); + return kTfLiteError; + } + data->weights_scratch_index = + micro_context->AllocateDecompressionScratchBuffer(node, + kConvWeightsTensor); + data->bias_scratch_index = + micro_context->AllocateDecompressionScratchBuffer(node, kConvBiasTensor); + +#endif // USE_TFLM_COMPRESSION + micro_context->DeallocateTempTfLiteTensor(filter); micro_context->DeallocateTempTfLiteTensor(input); micro_context->DeallocateTempTfLiteTensor(output); diff --git a/tensorflow/lite/micro/kernels/conv_test.cc b/tensorflow/lite/micro/kernels/conv_test.cc index 0fb9411a3f0..0c3e0f06937 100644 --- a/tensorflow/lite/micro/kernels/conv_test.cc +++ b/tensorflow/lite/micro/kernels/conv_test.cc @@ -1,4 +1,4 @@ -/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -15,6 +15,8 @@ limitations under the License. 
#include "tensorflow/lite/micro/kernels/conv_test.h" +#include + #include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/micro/kernels/kernel_runner.h" @@ -46,6 +48,83 @@ static int kOutputShape[] = {4, 2, 1, 2, 3}; static const float kGoldenData[kOutputElements] = {18, 2, 5, 18, 2, 5, 17, 4, 3, 37, 4, 3}; +#ifdef USE_TFLM_COMPRESSION + +// compressed filter data for kBinQuant scheme, matches kFilterData +constexpr uint8_t kBinQuantFilterData[] = { + 0x05, 0x38, 0x20, 0x90, 0x00, +}; +constexpr float kBinQuantFilterValueTable[] = { + 1, 2, 3, 4, -1, +}; +constexpr int kBinQuantFilterBitWidth = 3; +// compressed bias data for kBinQuant scheme, matches kBiasData +constexpr uint8_t kBinQuantBiasData[] = {0x18}; +constexpr int kBinQuantBiasBitWidth = 2; + +// Common inputs and outputs for quantized compressed tensor tests. +// Values from TfLite conv_test.cc SimplePerChannelTest. +static int kInputShapeQ1[] = {4, 1, 2, 3, 2}; +static const float kInputDataQ1[] = { + // [1 * 2 * 3 * 2] as [batch, y, x, input_channel] + 3, 2, // batch = 0, y = 0, x = 0 + 1, -1, // batch = 0, y = 0, x = 1 + -2, -3, // batch = 0, y = 0, x = 2 + 4, 3, // batch = 0, y = 1, x = 0 + 2, -2, // batch = 0, y = 1, x = 1 + -3, -4, // batch = 0, y = 1, x = 2 +}; +constexpr size_t kInputElementsQ1 = std::extent::value; + +constexpr int kFilterNumChannelsQ1 = 2; +static int kFilterShapeQ1[] = {4, 2, 2, 2, 2}; +static const float kFilterDataQ1[] = { + // [2 * 2 * 2 * 2] as [output_channel, y, x, input_channel] + 1, 2, // out channel = 0, y = 0, x = 0 + 3, 4, // out channel = 0, y = 0, x = 1 + 3, 4, // out channel = 0, y = 1, x = 0 + 5, 6, // out channel = 0, y = 1, x = 1 + 7, 8, // out channel = 1, y = 0, x = 0 + 5, 6, // out channel = 1, y = 0, x = 1 + 3, 4, // out channel = 1, y = 1, x = 0 + 1, 2, // out channel = 1, y = 1, x = 1 +}; +constexpr size_t kFilterElementsQ1 = + std::extent::value; + +static int kBiasShapeQ1[] = {1, 2}; +static const 
float kBiasDataQ1[] = {3, -2}; +constexpr size_t kBiasElementsQ1 = std::extent::value; + +static int kOutputShapeQ1[] = {4, 1, 1, 2, 2}; +static const float kGoldenDataQ1[] = {31, 64, -57, -46}; +constexpr int kOutputElementsQ1 = std::extent::value; +static const float kGoldenDataQ1_16[] = {31, 63.99804688, -57, -46}; + +// compressed filter data for kBinQuant scheme, matches kFilterDataQ1 +constexpr uint8_t kBinQuantFilterDataQ1[] = { + 0x05, 0x34, 0xE5, 0xDE, 0x54, 0xC1, +}; +constexpr float kBinQuantFilterValueTableQ1[] = { + 1, 2, 3, 4, 5, 6, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, +}; +constexpr int kBinQuantFilterBitWidthQ1 = 3; +// compressed bias data for kBinQuant scheme, matches kBiasDataQ1 +constexpr uint8_t kBinQuantBiasDataQ1[] = {0x00}; +constexpr int kBinQuantBiasBitWidthQ1 = 1; + +static TfLiteConvParams common_conv_params_q1 = { + kTfLitePaddingValid, // padding + 1, // stride_width + 1, // stride_height + kTfLiteActNone, // activation + 1, // dilation_width_factor + 1, // dilation_height_factor + kTfLiteNoType // quantized_bias_type +}; + +#endif // USE_TFLM_COMPRESSION + static TfLiteConvParams common_conv_params = { kTfLitePaddingValid, // padding 2, // stride_width @@ -122,6 +201,66 @@ TF_LITE_MICRO_TEST(SimpleTestQuantizedPerChannel) { output_data)); } +#ifdef USE_TFLM_COMPRESSION + +TF_LITE_MICRO_TEST(SimpleTestQuantizedPerChannelCompressed) { + const float input_scale = 0.5f; + const float output_scale = 0.5f; + const int input_zero_point = -1; + const int output_zero_point = -1; + constexpr float filter_scales[] = {tflite::testing::kFilterNumChannelsQ1, + 1.0f, 2.0f}; + constexpr int filter_zero_points[] = {tflite::testing::kFilterNumChannelsQ1, + 0, 0}; + // bias scales and zero points will be computed + float bias_scales[std::extent::value] = {}; + int bias_zero_points[std::extent::value] = {}; + + int8_t input_quantized[tflite::testing::kInputElementsQ1]; + int8_t filter_quantized[tflite::testing::kFilterElementsQ1]; + int32_t 
bias_quantized[tflite::testing::kBiasElementsQ1]; + int8_t golden_quantized[tflite::testing::kOutputElementsQ1]; + int8_t output_quantized[tflite::testing::kOutputElementsQ1]; + + tflite::testing::TestCompressionQuantizedInfo comp_info = {}; + comp_info.scheme = tflite::CompressionScheme::kBinQuant; + + comp_info.filter_value_table = filter_quantized; + comp_info.filter_value_table_stride = + std::extent< + decltype(tflite::testing::kBinQuantFilterValueTableQ1)>::value / + tflite::testing::kFilterNumChannelsQ1; + comp_info.filter_bit_width = tflite::testing::kBinQuantFilterBitWidthQ1; + comp_info.filter_compressed = tflite::testing::kBinQuantFilterDataQ1; + comp_info.filter_data = tflite::testing::kBinQuantFilterValueTableQ1; + comp_info.filter_dims_data = tflite::testing::kFilterShapeQ1; + comp_info.filter_scales = filter_scales; + comp_info.filter_zero_points = filter_zero_points; + + comp_info.bias_value_table = bias_quantized; + comp_info.bias_value_table_stride = + std::extent::value / + tflite::testing::kFilterNumChannelsQ1; + comp_info.bias_bit_width = tflite::testing::kBinQuantBiasBitWidthQ1; + comp_info.bias_compressed = tflite::testing::kBinQuantBiasDataQ1; + comp_info.bias_data = tflite::testing::kBiasDataQ1; + comp_info.bias_dims_data = tflite::testing::kBiasShapeQ1; + comp_info.bias_scales = bias_scales; + comp_info.bias_zero_points = bias_zero_points; + + TF_LITE_MICRO_EXPECT_EQ( + kTfLiteOk, + tflite::testing::TestConvQuantizedPerChannelCompressed( + tflite::testing::kInputShapeQ1, tflite::testing::kInputDataQ1, + input_quantized, input_scale, input_zero_point, + tflite::testing::kOutputShapeQ1, tflite::testing::kGoldenDataQ1, + golden_quantized, output_quantized, output_scale, output_zero_point, + &tflite::testing::common_conv_params_q1, tflite::Register_CONV_2D(), + &comp_info)); +} + +#endif // USE_TFLM_COMPRESSION + TF_LITE_MICRO_TEST(SimpleTestFloat) { float output_data[tflite::testing::kOutputElements]; @@ -136,6 +275,37 @@ 
TF_LITE_MICRO_TEST(SimpleTestFloat) { output_data)); } +#ifdef USE_TFLM_COMPRESSION + +TF_LITE_MICRO_TEST(SimpleTestFloatCompressed) { + tflite::testing::TestCompressionInfo comp_info = {}; + comp_info.scheme = tflite::CompressionScheme::kBinQuant; + comp_info.filter_value_table = tflite::testing::kBinQuantFilterValueTable; + comp_info.filter_value_table_stride = + std::extent::value; + comp_info.filter_bit_width = tflite::testing::kBinQuantFilterBitWidth; + comp_info.bias_value_table = tflite::testing::kBiasData; + comp_info.bias_value_table_stride = + std::extent::value; + comp_info.bias_bit_width = tflite::testing::kBinQuantBiasBitWidth; + + float output_data[tflite::testing::kOutputElements]; + + TF_LITE_MICRO_EXPECT_EQ( + kTfLiteOk, + tflite::testing::TestConvFloat( + tflite::testing::kInputShape, tflite::testing::kInputData, + tflite::testing::kFilterShape, + reinterpret_cast(tflite::testing::kBinQuantFilterData), + tflite::testing::kBiasShape, + reinterpret_cast(tflite::testing::kBinQuantBiasData), + tflite::testing::kOutputShape, tflite::testing::kGoldenData, + &tflite::testing::common_conv_params, tflite::Register_CONV_2D(), + output_data, &comp_info)); +} + +#endif + TF_LITE_MICRO_TEST(InputAndFilterSameWidthHeight) { const int output_dims_count = 2; float output_data[output_dims_count]; @@ -246,6 +416,66 @@ TF_LITE_MICRO_TEST(SimpleTestQuantized16x8PerChannel64bBias) { output_data)); } +#ifdef USE_TFLM_COMPRESSION + +TF_LITE_MICRO_TEST(SimpleTestQuantized16x8PerChannel64bBiasCompressed) { + const float input_scale = 128.0f / 65536; + const float output_scale = 128.0f / 65536; + const int input_zero_point = 0; + const int output_zero_point = 0; + constexpr float filter_scales[] = {tflite::testing::kFilterNumChannelsQ1, + 1.0f, 2.0f}; + constexpr int filter_zero_points[] = {tflite::testing::kFilterNumChannelsQ1, + 0, 0}; + // bias scales and zero points will be computed + float bias_scales[std::extent::value] = {}; + int 
bias_zero_points[std::extent::value] = {}; + + int16_t input_quantized[tflite::testing::kInputElementsQ1]; + int8_t filter_quantized[tflite::testing::kFilterElementsQ1]; + int64_t bias_quantized[tflite::testing::kBiasElementsQ1]; + int16_t golden_quantized[tflite::testing::kOutputElementsQ1]; + int16_t output_quantized[tflite::testing::kOutputElementsQ1]; + + tflite::testing::TestCompressionQuantizedInfo comp_info = {}; + comp_info.scheme = tflite::CompressionScheme::kBinQuant; + + comp_info.filter_value_table = filter_quantized; + comp_info.filter_value_table_stride = + std::extent< + decltype(tflite::testing::kBinQuantFilterValueTableQ1)>::value / + tflite::testing::kFilterNumChannelsQ1; + comp_info.filter_bit_width = tflite::testing::kBinQuantFilterBitWidthQ1; + comp_info.filter_compressed = tflite::testing::kBinQuantFilterDataQ1; + comp_info.filter_data = tflite::testing::kBinQuantFilterValueTableQ1; + comp_info.filter_dims_data = tflite::testing::kFilterShapeQ1; + comp_info.filter_scales = filter_scales; + comp_info.filter_zero_points = filter_zero_points; + + comp_info.bias_value_table = bias_quantized; + comp_info.bias_value_table_stride = + std::extent::value / + tflite::testing::kFilterNumChannelsQ1; + comp_info.bias_bit_width = tflite::testing::kBinQuantBiasBitWidthQ1; + comp_info.bias_compressed = tflite::testing::kBinQuantBiasDataQ1; + comp_info.bias_data = tflite::testing::kBiasDataQ1; + comp_info.bias_dims_data = tflite::testing::kBiasShapeQ1; + comp_info.bias_scales = bias_scales; + comp_info.bias_zero_points = bias_zero_points; + + TF_LITE_MICRO_EXPECT_EQ( + kTfLiteOk, + tflite::testing::TestConvQuantizedPerChannelCompressed( + tflite::testing::kInputShapeQ1, tflite::testing::kInputDataQ1, + input_quantized, input_scale, input_zero_point, + tflite::testing::kOutputShapeQ1, tflite::testing::kGoldenDataQ1_16, + golden_quantized, output_quantized, output_scale, output_zero_point, + &tflite::testing::common_conv_params_q1, tflite::Register_CONV_2D(), + 
&comp_info)); +} + +#endif // USE_TFLM_COMPRESSION + TF_LITE_MICRO_TEST(SimpleTestQuantized16x8PerChannel32bBias) { const int output_dims_count = 12; int16_t output_data[output_dims_count]; @@ -276,6 +506,66 @@ TF_LITE_MICRO_TEST(SimpleTestQuantized16x8PerChannel32bBias) { output_data)); } +#ifdef USE_TFLM_COMPRESSION + +TF_LITE_MICRO_TEST(SimpleTestQuantized16x8PerChannel32bBiasCompressed) { + const float input_scale = 128.0f / 65536; + const float output_scale = 128.0f / 65536; + const int input_zero_point = 0; + const int output_zero_point = 0; + constexpr float filter_scales[] = {tflite::testing::kFilterNumChannelsQ1, + 1.0f, 2.0f}; + constexpr int filter_zero_points[] = {tflite::testing::kFilterNumChannelsQ1, + 0, 0}; + // bias scales and zero points will be computed + float bias_scales[std::extent::value] = {}; + int bias_zero_points[std::extent::value] = {}; + + int16_t input_quantized[tflite::testing::kInputElementsQ1]; + int8_t filter_quantized[tflite::testing::kFilterElementsQ1]; + int32_t bias_quantized[tflite::testing::kBiasElementsQ1]; + int16_t golden_quantized[tflite::testing::kOutputElementsQ1]; + int16_t output_quantized[tflite::testing::kOutputElementsQ1]; + + tflite::testing::TestCompressionQuantizedInfo comp_info = {}; + comp_info.scheme = tflite::CompressionScheme::kBinQuant; + + comp_info.filter_value_table = filter_quantized; + comp_info.filter_value_table_stride = + std::extent< + decltype(tflite::testing::kBinQuantFilterValueTableQ1)>::value / + tflite::testing::kFilterNumChannelsQ1; + comp_info.filter_bit_width = tflite::testing::kBinQuantFilterBitWidthQ1; + comp_info.filter_compressed = tflite::testing::kBinQuantFilterDataQ1; + comp_info.filter_data = tflite::testing::kBinQuantFilterValueTableQ1; + comp_info.filter_dims_data = tflite::testing::kFilterShapeQ1; + comp_info.filter_scales = filter_scales; + comp_info.filter_zero_points = filter_zero_points; + + comp_info.bias_value_table = bias_quantized; + comp_info.bias_value_table_stride = 
+ std::extent::value / + tflite::testing::kFilterNumChannelsQ1; + comp_info.bias_bit_width = tflite::testing::kBinQuantBiasBitWidthQ1; + comp_info.bias_compressed = tflite::testing::kBinQuantBiasDataQ1; + comp_info.bias_data = tflite::testing::kBiasDataQ1; + comp_info.bias_dims_data = tflite::testing::kBiasShapeQ1; + comp_info.bias_scales = bias_scales; + comp_info.bias_zero_points = bias_zero_points; + + TF_LITE_MICRO_EXPECT_EQ( + kTfLiteOk, + tflite::testing::TestConvQuantizedPerChannelCompressed( + tflite::testing::kInputShapeQ1, tflite::testing::kInputDataQ1, + input_quantized, input_scale, input_zero_point, + tflite::testing::kOutputShapeQ1, tflite::testing::kGoldenDataQ1_16, + golden_quantized, output_quantized, output_scale, output_zero_point, + &tflite::testing::common_conv_params_q1, tflite::Register_CONV_2D(), + &comp_info)); +} + +#endif // USE_TFLM_COMPRESSION + TF_LITE_MICRO_TEST(SimpleTestDilatedQuantizedPerChannel) { const int output_dims_count = 24; int8_t output_data[output_dims_count]; @@ -1190,3 +1480,60 @@ TF_LITE_MICRO_TEST(Int8Filter8x3x3x3PerChannelScaleRelu6ShouldMatchGolden) { } TF_LITE_MICRO_TESTS_END + +// {TensorType_INT8, {1, 2, 3, 2}, -63.5, 64, 0.5, -1}, +// {TensorType_INT8, +// // [2 * 2 * 2 * 2] as [output_channel, y, x, input_channel] +// {2, 2, 2, 2}, +// 0, +// 0, +// 0, +// 0, +// /*per_channel_quantization=*/true, +// /*per_channel_quantization_scales=*/{1, 2}, +// /*per_channel_quantization_offsets=*/{0, 0}, +// /*channel_index=*/0}, +// {TensorType_INT8, {}, -63.5, 64, 0.5, -1}, +// /*stride_width=*/1, /*stride_height=*/1); +// m.SetInput({ +// // [1 * 2 * 3 * 2] as [batch, y, x, input_channel] +// 3, 2, // batch = 0, y = 0, x = 0 +// 1, -1, // batch = 0, y = 0, x = 1 +// -2, -3, // batch = 0, y = 0, x = 2 +// 4, 3, // batch = 0, y = 1, x = 0 +// 2, -2, // batch = 0, y = 1, x = 1 +// -3, -4, // batch = 0, y = 1, x = 2 +// }); +// m.SetFilter( +// // [2 * 2 * 2 * 2] as [output_channel, y, x, input_channel] +// { +// 1, 2, // 
out channel = 0, y = 0, x = 0 +// 3, 4, // out channel = 0, y = 0, x = 1 +// 3, 4, // out channel = 0, y = 1, x = 0 +// 5, 6, // out channel = 0, y = 1, x = 1 +// 7, 8, // out channel = 1, y = 0, x = 0 +// 5, 6, // out channel = 1, y = 0, x = 1 +// 3, 4, // out channel = 1, y = 1, x = 0 +// 1, 2, // out channel = 1, y = 1, x = 1 +// }); +// m.SetBias({3, -2}); +// // output has dimension [1 * 1 * 2 * 2] as [batch, y, x, output_channel] +// EXPECT_THAT(m.GetDequantizedOutput(), +// ElementsAreArray(ArrayFloatNear({31, 64, -57, -46}))); +// EXPECT_THAT(m.GetOutput(), ElementsAreArray({61, 127, -115, -93})); + +// TEST_P(ConvolutionOpTest, SimplePerChannel16x8Bias32) { +// const float scale = 128.0 / 65536; +// // output has dimension [1 * 1 * 2 * 2] as [batch, y, x, output_channel] +// EXPECT_THAT(m.GetDequantizedOutput(), +// ElementsAreArray(ArrayFloatNear({31, 63.99804688, -57, -46}))); +// EXPECT_THAT(m.GetOutput(), +// ElementsAreArray({15872, 32767, -29184, -23552})); + +// TEST_P(ConvolutionOpTest, SimplePerChannel16x8Bias64) { +// const float scale = 128.0 / 65536; +// // output has dimension [1 * 1 * 2 * 2] as [batch, y, x, output_channel] +// EXPECT_THAT(m.GetDequantizedOutput(), +// ElementsAreArray(ArrayFloatNear({31, 63.99804688, -57, -46}))); +// EXPECT_THAT(m.GetOutput(), +// ElementsAreArray({15872, 32767, -29184, -23552})); \ No newline at end of file diff --git a/tensorflow/lite/micro/kernels/conv_test.h b/tensorflow/lite/micro/kernels/conv_test.h index c655f043bcc..7f6c55e2a9e 100644 --- a/tensorflow/lite/micro/kernels/conv_test.h +++ b/tensorflow/lite/micro/kernels/conv_test.h @@ -1,4 +1,4 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -18,6 +18,7 @@ limitations under the License. 
#include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/micro/kernels/conv.h" #include "tensorflow/lite/micro/kernels/kernel_runner.h" #include "tensorflow/lite/micro/kernels/micro_ops.h" #include "tensorflow/lite/micro/test_helpers.h" @@ -26,35 +27,123 @@ limitations under the License. namespace tflite { namespace testing { -TfLiteStatus InvokeConv(TfLiteTensor* tensors, int tensors_size, - int output_length, TfLiteConvParams* conv_params, - TFLMRegistration registration, float* output_data); +constexpr int kConvMaxTensors = 4; +constexpr int kConvMaxInputTensors = 3; +template TfLiteStatus InvokeConv(TfLiteTensor* tensors, int tensors_size, - int output_length, TfLiteConvParams* conv_params, - TFLMRegistration registration, int8_t* output_data); - -TfLiteStatus ValidateConvGoldens(TfLiteTensor* tensors, int tensors_size, - const float* expected_output_data, - int output_length, - TfLiteConvParams* conv_params, - TFLMRegistration registration, - float* output_data, float tolerance = 1e-5); - -TfLiteStatus ValidateConvGoldens(TfLiteTensor* tensors, int tensors_size, - const int8_t* expected_output_data, - int output_length, - TfLiteConvParams* conv_params, - TFLMRegistration registration, - int8_t* output_data, float tolerance = 1e-5); - -TfLiteStatus TestConvFloat(int* input_dims_data, const float* input_data, - int* filter_dims_data, const float* filter_data, - int* bias_dims_data, const float* bias_data, - int* output_dims_data, - const float* expected_output_data, - TfLiteConvParams* conv_params, - TFLMRegistration registration, float* output_data); + int output_length, const TfLiteConvParams* conv_params, + TFLMRegistration registration, T* output_data +#ifdef USE_TFLM_COMPRESSION + , + const CompressedTensorList* comp_list_p = nullptr +#endif // USE_TFLM_COMPRESSION +) { + // TODO(b/358165875): support optional bias tensor + int inputs_array_data[] = {3, 0, 1, 2}; + TfLiteIntArray* inputs_array = 
IntArrayFromInts(inputs_array_data); + int outputs_array_data[] = {1, 3}; + TfLiteIntArray* outputs_array = IntArrayFromInts(outputs_array_data); + + micro::KernelRunner runner(registration, tensors, tensors_size, inputs_array, + outputs_array, conv_params +#ifdef USE_TFLM_COMPRESSION + , + nullptr, comp_list_p +#endif // USE_TFLM_COMPRESSION + ); + + const char* init_data = reinterpret_cast(conv_params); + TfLiteStatus status = runner.InitAndPrepare(init_data); + if (status != kTfLiteOk) { + return status; + } + return runner.Invoke(); +} + +template +TfLiteStatus ValidateConvGoldens( + TfLiteTensor* tensors, int tensors_size, const T* expected_output_data, + int output_length, const TfLiteConvParams* conv_params, + TFLMRegistration registration, T* output_data, float tolerance = 1e-5 +#ifdef USE_TFLM_COMPRESSION + , + const TestCompressionInfo* comp_info = nullptr +#endif // USE_TFLM_COMPRESSION +) { +#ifdef USE_TFLM_COMPRESSION + + TestCompressedList tcl; + const CompressedTensorList* comp_list_p = nullptr; + + if (comp_info != nullptr) { + TF_LITE_MICRO_EXPECT_EQ( + tcl.AddWeight(*comp_info, tensors[kConvWeightsTensor], + kConvWeightsTensor), + kTfLiteOk); + TF_LITE_MICRO_CHECK_FAIL(); + TF_LITE_MICRO_EXPECT_EQ( + tcl.AddBias(*comp_info, tensors[kConvBiasTensor], kConvBiasTensor), + kTfLiteOk); + TF_LITE_MICRO_CHECK_FAIL(); + comp_list_p = tcl.GetCompressedTensorList(); + } + +#endif // USE_TFLM_COMPRESSION + + TfLiteStatus status = InvokeConv(tensors, tensors_size, output_length, + conv_params, registration, output_data +#ifdef USE_TFLM_COMPRESSION + , + comp_list_p +#endif // USE_TFLM_COMPRESSION + ); + if (status != kTfLiteOk) { + return status; + } + for (int i = 0; i < output_length; ++i) { + TF_LITE_MICRO_EXPECT_NEAR(expected_output_data[i], output_data[i], + tolerance); + } + return kTfLiteOk; +} + +template +TfLiteStatus TestConvFloat( + int* input_dims_data, const float* input_data, int* filter_dims_data, + const float* filter_data, int* 
bias_dims_data, const float* bias_data, + int* output_dims_data, const float* expected_output_data, + TfLiteConvParams* conv_params, TFLMRegistration registration, + float* output_data +#ifdef USE_TFLM_COMPRESSION + , + const TestCompressionInfo* comp_info = nullptr +#endif // USE_TFLM_COMPRESSION +) { + TfLiteIntArray* input_dims = IntArrayFromInts(input_dims_data); + TfLiteIntArray* filter_dims = IntArrayFromInts(filter_dims_data); + TfLiteIntArray* bias_dims = IntArrayFromInts(bias_dims_data); + TfLiteIntArray* output_dims = IntArrayFromInts(output_dims_data); + const int output_dims_count = ElementCount(*output_dims); + constexpr int inputs_size = 3; + constexpr int outputs_size = 1; + constexpr int tensors_size = inputs_size + outputs_size; + TfLiteTensor tensors[tensors_size] = { + CreateTensor(input_data, input_dims), + CreateTensor(filter_data, filter_dims), + CreateTensor(bias_data, bias_dims), + CreateTensor(output_data, output_dims), + }; + + return ValidateConvGoldens(tensors, tensors_size, expected_output_data, + output_dims_count, conv_params, registration, + output_data +#ifdef USE_TFLM_COMPRESSION + , + 1e-5f, comp_info +#endif // USE_TFLM_COMPRESSION + ); +} TfLiteStatus TestConvQuantizedPerChannel( int* input_dims_data, const float* input_data, int8_t* input_quantized, @@ -88,6 +177,71 @@ TfLiteStatus TestConvQuantizedPerChannel( float output_scale, int output_zero_point, TfLiteConvParams* conv_params, TFLMRegistration registration, int16_t* output_data); +#ifdef USE_TFLM_COMPRESSION + +template +TfLiteStatus TestConvQuantizedPerChannelCompressed( + int* input_dims_data, const float* input_data, TIO* input_quantized, + float input_scale, int input_zero_point, int* output_dims_data, + const float* expected_output_data, TIO* expected_output_quantized, + TIO* output_quantized, float output_scale, int output_zero_point, + const TfLiteConvParams* conv_params, TFLMRegistration registration, + const TestCompressionQuantizedInfo* comp_info) { + // 
TODO(b/358165875): account for optional bias tensor + // bool null_bias = comp_info->bias_data == nullptr ? true : false; + + TfLiteIntArray* input_dims = IntArrayFromInts(input_dims_data); + TfLiteIntArray* filter_dims = IntArrayFromInts(comp_info->filter_dims_data); + TfLiteIntArray* bias_dims = IntArrayFromInts(comp_info->bias_dims_data); + TfLiteIntArray* output_dims = IntArrayFromInts(output_dims_data); + + TfLiteFloatArray* filter_scales = + FloatArrayFromFloats(comp_info->filter_scales); + TfLiteIntArray* filter_zero_points = + IntArrayFromInts(comp_info->filter_zero_points); + TfLiteFloatArray* bias_scales = FloatArrayFromFloats(comp_info->bias_scales); + TfLiteIntArray* bias_zero_points = + IntArrayFromInts(comp_info->bias_zero_points); + + TfLiteAffineQuantization filter_quant = {}; + TfLiteTensor filter_tensor = CreatePerChannelQuantizedTensor( + comp_info->filter_compressed, filter_dims, filter_scales, + filter_zero_points, &filter_quant, kConvQuantizedDimension, + false /* is_variable */, kTfLiteInt8); + SymmetricPerChannelQuantize( + comp_info->filter_data, comp_info->filter_value_table, + ElementCount(*filter_dims), filter_scales->size, filter_scales->data); + + TfLiteAffineQuantization bias_quant = {}; + TfLiteTensor bias_tensor = CreatePerChannelQuantizedBiasTensor( + comp_info->bias_compressed, bias_dims, input_scale, filter_scales, + bias_scales, bias_zero_points, &bias_quant, kConvQuantizedDimension, + false /* is_variable */, typeToTfLiteType()); + SymmetricPerChannelQuantize(comp_info->bias_data, comp_info->bias_value_table, + ElementCount(*bias_dims), bias_scales->size, + bias_scales->data); + + constexpr int tensors_size = kConvMaxTensors; + TfLiteTensor tensors[tensors_size] = { + CreateQuantizedTensor(input_data, input_quantized, input_dims, + input_scale, input_zero_point), + filter_tensor, + bias_tensor, + CreateQuantizedTensor(output_quantized, output_dims, output_scale, + output_zero_point), + }; + + const int output_dims_count = 
ElementCount(*output_dims); + Quantize(expected_output_data, expected_output_quantized, output_dims_count, + output_scale, output_zero_point); + return ValidateConvGoldens(tensors, tensors_size, expected_output_quantized, + output_dims_count, conv_params, registration, + output_quantized, 1.0e-5f /* tolerance */, + comp_info); +} + +#endif // USE_TFLM_COMPRESSION + } // namespace testing } // namespace tflite diff --git a/tensorflow/lite/micro/kernels/conv_test_common.cc b/tensorflow/lite/micro/kernels/conv_test_common.cc index a0f733b8e42..7b6f71a8fc3 100644 --- a/tensorflow/lite/micro/kernels/conv_test_common.cc +++ b/tensorflow/lite/micro/kernels/conv_test_common.cc @@ -1,4 +1,4 @@ -/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -18,108 +18,6 @@ limitations under the License. 
namespace tflite { namespace testing { -template -TfLiteStatus InvokeConv(TfLiteTensor* tensors, int tensors_size, - int output_length, TfLiteConvParams* conv_params, - TFLMRegistration registration, T* output_data) { - int inputs_array_data[] = {3, 0, 1, 2}; - TfLiteIntArray* inputs_array = IntArrayFromInts(inputs_array_data); - int outputs_array_data[] = {1, 3}; - TfLiteIntArray* outputs_array = IntArrayFromInts(outputs_array_data); - - micro::KernelRunner runner(registration, tensors, tensors_size, inputs_array, - outputs_array, conv_params); - - const char* init_data = reinterpret_cast(conv_params); - TfLiteStatus status = runner.InitAndPrepare(init_data); - if (status != kTfLiteOk) { - return status; - } - return runner.Invoke(); -} - -template -TfLiteStatus ValidateConvGoldens(TfLiteTensor* tensors, int tensors_size, - const T* expected_output_data, - int output_length, - TfLiteConvParams* conv_params, - TFLMRegistration registration, T* output_data, - float tolerance) { - TfLiteStatus status = InvokeConv(tensors, tensors_size, output_length, - conv_params, registration, output_data); - if (status != kTfLiteOk) { - return status; - } - for (int i = 0; i < output_length; ++i) { - TF_LITE_MICRO_EXPECT_NEAR(expected_output_data[i], output_data[i], - tolerance); - } - return kTfLiteOk; -} - -TfLiteStatus InvokeConv(TfLiteTensor* tensors, int tensors_size, - int output_length, TfLiteConvParams* conv_params, - TFLMRegistration registration, float* output_data) { - return InvokeConv(tensors, tensors_size, output_length, conv_params, - registration, output_data); -} - -TfLiteStatus InvokeConv(TfLiteTensor* tensors, int tensors_size, - int output_length, TfLiteConvParams* conv_params, - TFLMRegistration registration, int8_t* output_data) { - return InvokeConv(tensors, tensors_size, output_length, conv_params, - registration, output_data); -} - -TfLiteStatus ValidateConvGoldens(TfLiteTensor* tensors, int tensors_size, - const float* expected_output_data, - int 
output_length, - TfLiteConvParams* conv_params, - TFLMRegistration registration, - float* output_data, float tolerance) { - return ValidateConvGoldens(tensors, tensors_size, expected_output_data, - output_length, conv_params, registration, - output_data, tolerance); -} - -TfLiteStatus ValidateConvGoldens(TfLiteTensor* tensors, int tensors_size, - const int8_t* expected_output_data, - int output_length, - TfLiteConvParams* conv_params, - TFLMRegistration registration, - int8_t* output_data, float tolerance) { - return ValidateConvGoldens( - tensors, tensors_size, expected_output_data, output_length, conv_params, - registration, output_data, tolerance); -} - -TfLiteStatus TestConvFloat(int* input_dims_data, const float* input_data, - int* filter_dims_data, const float* filter_data, - int* bias_dims_data, const float* bias_data, - int* output_dims_data, - const float* expected_output_data, - TfLiteConvParams* conv_params, - TFLMRegistration registration, float* output_data) { - TfLiteIntArray* input_dims = IntArrayFromInts(input_dims_data); - TfLiteIntArray* filter_dims = IntArrayFromInts(filter_dims_data); - TfLiteIntArray* bias_dims = IntArrayFromInts(bias_dims_data); - TfLiteIntArray* output_dims = IntArrayFromInts(output_dims_data); - const int output_dims_count = ElementCount(*output_dims); - constexpr int inputs_size = 3; - constexpr int outputs_size = 1; - constexpr int tensors_size = inputs_size + outputs_size; - TfLiteTensor tensors[tensors_size] = { - CreateTensor(input_data, input_dims), - CreateTensor(filter_data, filter_dims), - CreateTensor(bias_data, bias_dims), - CreateTensor(output_data, output_dims), - }; - - return ValidateConvGoldens(tensors, tensors_size, expected_output_data, - output_dims_count, conv_params, registration, - output_data); -} - template TfLiteStatus TestConvQuantizedPerChannel( int* input_dims_data, const float* input_data, T* input_quantized, diff --git a/tensorflow/lite/micro/kernels/fully_connected.cc 
b/tensorflow/lite/micro/kernels/fully_connected.cc index 65c83792e87..21d061ae430 100644 --- a/tensorflow/lite/micro/kernels/fully_connected.cc +++ b/tensorflow/lite/micro/kernels/fully_connected.cc @@ -1,4 +1,4 @@ -/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -60,7 +60,7 @@ TfLiteStatus FullyConnectedPrepare(TfLiteContext* context, TfLiteNode* node) { (input->type == kTfLiteInt8 && (filter->type != kTfLiteInt8 && filter->type != kTfLiteInt4)) || (input->type == kTfLiteInt16 && filter->type != kTfLiteInt8)) { - MicroPrintf("Input type: %s with filter type : %s not supported.", + MicroPrintf("Input type: %s with filter type: %s not supported.", TfLiteTypeGetName(input->type), TfLiteTypeGetName(filter->type)); return kTfLiteError; @@ -79,6 +79,23 @@ TfLiteStatus FullyConnectedPrepare(TfLiteContext* context, TfLiteNode* node) { context, params->activation, input->type, input, filter, bias, output, data)); +#ifdef USE_TFLM_COMPRESSION + + // Compression scratch buffers. + // These will only be allocated if the tensor is compressed. 
+ if (micro_context->IsTensorCompressed(node, kFullyConnectedWeightsTensor) && + filter->type == kTfLiteInt4) { + MicroPrintf("Compression not supported with INT4 tensors"); + return kTfLiteError; + } + data->weights_scratch_index = + micro_context->AllocateDecompressionScratchBuffer( + node, kFullyConnectedWeightsTensor); + data->bias_scratch_index = micro_context->AllocateDecompressionScratchBuffer( + node, kFullyConnectedBiasTensor); + +#endif // USE_TFLM_COMPRESSION + micro_context->DeallocateTempTfLiteTensor(input); micro_context->DeallocateTempTfLiteTensor(filter); if (bias != nullptr) { @@ -102,8 +119,19 @@ TfLiteStatus FullyConnectedEval(TfLiteContext* context, TfLiteNode* node) { TfLiteEvalTensor* output = tflite::micro::GetEvalOutput(context, node, kFullyConnectedOutputTensor); - TFLITE_DCHECK(node->user_data != nullptr); +#ifdef USE_TFLM_COMPRESSION + + MicroContext* micro_context = GetMicroContext(context); + + const CompressionTensorData* weights_comp_td = + micro_context->GetTensorCompressionData(node, + kFullyConnectedWeightsTensor); + const CompressionTensorData* bias_comp_td = + micro_context->GetTensorCompressionData(node, kFullyConnectedBiasTensor); +#endif // USE_TFLM_COMPRESSION + + TFLITE_DCHECK(node->user_data != nullptr); const auto& data = *(static_cast(node->user_data)); @@ -115,9 +143,18 @@ TfLiteStatus FullyConnectedEval(TfLiteContext* context, TfLiteNode* node) { tflite::micro::GetTensorShape(input), tflite::micro::GetTensorData(input), tflite::micro::GetTensorShape(filter), +#ifdef USE_TFLM_COMPRESSION + tflite::micro::GetTensorData(micro_context, filter, + weights_comp_td, + data.weights_scratch_index), + tflite::micro::GetTensorShape(bias), + tflite::micro::GetTensorData(micro_context, bias, bias_comp_td, + data.bias_scratch_index), +#else // USE_TFLM_COMPRESSION tflite::micro::GetTensorData(filter), tflite::micro::GetTensorShape(bias), tflite::micro::GetOptionalTensorData(bias), +#endif // USE_TFLM_COMPRESSION 
tflite::micro::GetTensorShape(output), tflite::micro::GetTensorData(output)); break; @@ -149,9 +186,18 @@ TfLiteStatus FullyConnectedEval(TfLiteContext* context, TfLiteNode* node) { tflite::micro::GetTensorShape(input), tflite::micro::GetTensorData(input), tflite::micro::GetTensorShape(filter), +#ifdef USE_TFLM_COMPRESSION + tflite::micro::GetTensorData(micro_context, filter, + weights_comp_td, + data.weights_scratch_index), + tflite::micro::GetTensorShape(bias), + tflite::micro::GetTensorData( + micro_context, bias, bias_comp_td, data.bias_scratch_index), +#else // USE_TFLM_COMPRESSION tflite::micro::GetTensorData(filter), tflite::micro::GetTensorShape(bias), tflite::micro::GetOptionalTensorData(bias), +#endif // USE_TFLM_COMPRESSION tflite::micro::GetTensorShape(output), tflite::micro::GetTensorData(output)); break; @@ -173,9 +219,18 @@ TfLiteStatus FullyConnectedEval(TfLiteContext* context, TfLiteNode* node) { tflite::micro::GetTensorShape(input), tflite::micro::GetTensorData(input), tflite::micro::GetTensorShape(filter), +#ifdef USE_TFLM_COMPRESSION + tflite::micro::GetTensorData(micro_context, filter, + weights_comp_td, + data.weights_scratch_index), + tflite::micro::GetTensorShape(bias), + tflite::micro::GetTensorData( + micro_context, bias, bias_comp_td, data.bias_scratch_index), +#else // USE_TFLM_COMPRESSION tflite::micro::GetTensorData(filter), tflite::micro::GetTensorShape(bias), tflite::micro::GetOptionalTensorData(bias), +#endif // USE_TFLM_COMPRESSION tflite::micro::GetTensorShape(output), tflite::micro::GetTensorData(output)); break; diff --git a/tensorflow/lite/micro/kernels/fully_connected.h b/tensorflow/lite/micro/kernels/fully_connected.h index 8308838ec6d..d7ea705964c 100644 --- a/tensorflow/lite/micro/kernels/fully_connected.h +++ b/tensorflow/lite/micro/kernels/fully_connected.h @@ -1,4 +1,4 @@ -/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. 
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -46,6 +46,14 @@ struct OpDataFullyConnected { // tensor is of n-bit precision that cannot be easily processed by kernels. int filter_buffer_index; #endif + +#ifdef USE_TFLM_COMPRESSION + + // scratch buffers for compressed tensors + int weights_scratch_index; + int bias_scratch_index; + +#endif // USE_TFLM_COMPRESSION }; extern const int kFullyConnectedInputTensor; diff --git a/tensorflow/lite/micro/kernels/fully_connected_test.cc b/tensorflow/lite/micro/kernels/fully_connected_test.cc index 2ad132055b8..8c605fefbcb 100644 --- a/tensorflow/lite/micro/kernels/fully_connected_test.cc +++ b/tensorflow/lite/micro/kernels/fully_connected_test.cc @@ -1,4 +1,4 @@ -/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -42,6 +42,20 @@ const float simple_weights_data[] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 2 }; +#ifdef USE_TFLM_COMPRESSION + +// compressed filter data for kBinQuant scheme +constexpr uint8_t kBinQuantFilterData[] = {0x01, 0x23, 0x45, 0x67, 0x89, + 0x01, 0x23, 0x45, 0x67, 0x89, + 0x01, 0x23, 0x45, 0x67, 0x89}; +constexpr float kBinQuantFilterValueTable[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; +constexpr int kBinQuantFilterBitWidth = 4; +// compressed bias data for kBinQuant scheme +constexpr uint8_t kBinQuantBiasData[] = {0x18}; +constexpr int kBinQuantBiasBitWidth = 2; + +#endif // USE_TFLM_COMPRESSION + // TODO(b/258710417): INT4 isn't currently supported on Hexagon. 
#if !defined(HEXAGON) const float simple_int4_weights_data[] = { @@ -241,11 +255,18 @@ const float representative_64x16_golden[] = { const int representative_64x16_output_size = 16; int representative_64x16_output_dims[] = {2, 1, 16}; -template +constexpr int kMaxTensors = 4; + +template TfLiteStatus ValidateFullyConnectedGoldens( TfLiteTensor* tensors, const int tensors_size, bool null_bias, const TfLiteFusedActivation activation, const float tolerance, - const int output_len, const T* golden, T* output_data) { + const int output_len, const T* golden, T* output_data +#ifdef USE_TFLM_COMPRESSION + , + const TestCompressionInfo* comp_info = nullptr +#endif // USE_TFLM_COMPRESSION +) { TfLiteFullyConnectedParams builtin_data = { activation, kTfLiteFullyConnectedWeightsFormatDefault, false, false, kTfLiteNoType}; @@ -272,10 +293,38 @@ TfLiteStatus ValidateFullyConnectedGoldens( TfLiteIntArray* inputs_array = IntArrayFromInts(inputs_array_data); TfLiteIntArray* outputs_array = IntArrayFromInts(outputs_array_data); +#ifdef USE_TFLM_COMPRESSION + + TestCompressedList tcl; + const CompressedTensorList* comp_list_p = nullptr; + + if (comp_info != nullptr) { + TF_LITE_MICRO_EXPECT_EQ( + tcl.AddWeight(*comp_info, tensors[kFullyConnectedWeightsTensor], + kFullyConnectedWeightsTensor), + kTfLiteOk); + TF_LITE_MICRO_CHECK_FAIL(); + if (!null_bias) { + TF_LITE_MICRO_EXPECT_EQ( + tcl.AddBias(*comp_info, tensors[kFullyConnectedBiasTensor], + kFullyConnectedBiasTensor), + kTfLiteOk); + TF_LITE_MICRO_CHECK_FAIL(); + } + comp_list_p = tcl.GetCompressedTensorList(); + } + +#endif // USE_TFLM_COMPRESSION + const TFLMRegistration registration = Register_FULLY_CONNECTED(); micro::KernelRunner runner(registration, tensors, tensors_size, inputs_array, outputs_array, - reinterpret_cast(&builtin_data)); + reinterpret_cast(&builtin_data), nullptr +#ifdef USE_TFLM_COMPRESSION + , + comp_list_p +#endif // USE_TFLM_COMPRESSION + ); TfLiteStatus status = runner.InitAndPrepare(); if (status != 
kTfLiteOk) { @@ -293,11 +342,17 @@ TfLiteStatus ValidateFullyConnectedGoldens( return kTfLiteOk; } +template TfLiteStatus TestFullyConnectedFloat( int* input_dims_data, const float* input_data, int* weights_dims_data, const float* weights_data, int* bias_dims_data, const float* bias_data, const float* golden, int* output_dims_data, - TfLiteFusedActivation activation, float* output_data) { + TfLiteFusedActivation activation, float* output_data +#ifdef USE_TFLM_COMPRESSION + , + const TestCompressionInfo* comp_info = nullptr +#endif // USE_TFLM_COMPRESSION +) { TfLiteIntArray* input_dims = IntArrayFromInts(input_dims_data); TfLiteIntArray* weights_dims = IntArrayFromInts(weights_dims_data); TfLiteIntArray* bias_dims = IntArrayFromInts(bias_dims_data); @@ -305,16 +360,15 @@ TfLiteStatus TestFullyConnectedFloat( const int output_dims_count = ElementCount(*output_dims); bool null_bias = bias_data == nullptr ? true : false; - constexpr int array_size = 4; // Avoid variable length array warning. - const int inputs_size = bias_data == nullptr ? 2 : 3; + const int inputs_size = null_bias ? 2 : 3; constexpr int outputs_size = 1; const int tensors_size = inputs_size + outputs_size; - TfLiteTensor tensors[array_size]; + TfLiteTensor tensors[kMaxTensors]; tensors[0] = CreateTensor(input_data, input_dims); tensors[1] = CreateTensor(weights_data, weights_dims); - if (bias_data == nullptr) { + if (null_bias) { tensors[2] = CreateTensor(output_data, output_dims); } else { tensors[2] = CreateTensor(bias_data, bias_dims); @@ -323,7 +377,12 @@ TfLiteStatus TestFullyConnectedFloat( return ValidateFullyConnectedGoldens(tensors, tensors_size, null_bias, activation, 1e-4f, output_dims_count, - golden, output_data); + golden, output_data +#ifdef USE_TFLM_COMPRESSION + , + comp_info +#endif // USE_TFLM_COMPRESSION + ); } template @@ -345,7 +404,7 @@ TfLiteStatus TestFullyConnectedQuantized( bool null_bias = bias_data == nullptr ? 
true : false; constexpr int array_size = 4; // Avoid variable length array warning. - const int inputs_size = bias_data == nullptr ? 2 : 3; + const int inputs_size = null_bias ? 2 : 3; constexpr int outputs_size = 1; const int tensors_size = inputs_size + outputs_size; TfLiteTensor tensors[array_size]; @@ -355,7 +414,7 @@ TfLiteStatus TestFullyConnectedQuantized( tensors[1] = CreateQuantizedTensor( weights_data, weights_quantized, weights_dims, weights_scale, weights_zero_point, false, weights_packed_type); - if (bias_data == nullptr) { + if (null_bias) { tensors[2] = CreateQuantizedTensor(output_data, output_dims, output_scale, output_zero_point); } else { @@ -373,6 +432,68 @@ TfLiteStatus TestFullyConnectedQuantized( golden_quantized, output_data); } +#ifdef USE_TFLM_COMPRESSION + +template +TfLiteStatus TestFullyConnectedQuantizedCompressed( + int* input_dims_data, const float* input_data, TIO* input_quantized, + float input_scale, int input_zero_point, int* output_dims_data, + const float* expected_output_data, TIO* expected_output_quantized, + TIO* output_quantized, float output_scale, int output_zero_point, + const TfLiteFusedActivation activation, + const TestCompressionQuantizedInfo* comp_info) { + bool null_bias = comp_info->bias_data == nullptr ? 
true : false; + + TfLiteIntArray* input_dims = IntArrayFromInts(input_dims_data); + TfLiteIntArray* filter_dims = IntArrayFromInts(comp_info->filter_dims_data); + TfLiteIntArray* bias_dims = IntArrayFromInts(comp_info->bias_dims_data); + TfLiteIntArray* output_dims = IntArrayFromInts(output_dims_data); + + TfLiteFloatArray* filter_scales = + FloatArrayFromFloats(comp_info->filter_scales); + TfLiteIntArray* filter_zero_points = + IntArrayFromInts(comp_info->filter_zero_points); + + TfLiteTensor filter_tensor = CreateQuantizedTensor( + comp_info->filter_compressed, filter_dims, filter_scales->data[0], + filter_zero_points->data[0], false, kTfLiteInt8); + SymmetricQuantize(comp_info->filter_data, comp_info->filter_value_table, + ElementCount(*filter_dims), filter_scales->data[0]); + + TfLiteTensor bias_tensor = {}; + if (!null_bias) { + bias_tensor = CreateQuantizedTensor(comp_info->bias_compressed, bias_dims, + input_scale * filter_scales->data[0], 0, + false, typeToTfLiteType()); + SymmetricQuantize(comp_info->bias_data, comp_info->bias_value_table, + ElementCount(*bias_dims), bias_tensor.params.scale); + } + + TfLiteTensor output_tensor = CreateQuantizedTensor( + output_quantized, output_dims, output_scale, output_zero_point); + + const int tensors_size = null_bias ? 
kMaxTensors - 1 : kMaxTensors; + TfLiteTensor tensors[kMaxTensors] = {}; + tensors[0] = CreateQuantizedTensor(input_data, input_quantized, input_dims, + input_scale, input_zero_point); + tensors[1] = filter_tensor; + if (null_bias) { + tensors[2] = output_tensor; + } else { + tensors[2] = bias_tensor; + tensors[3] = output_tensor; + } + + const int output_dims_count = ElementCount(*output_dims); + Quantize(expected_output_data, expected_output_quantized, output_dims_count, + output_scale, output_zero_point); + return ValidateFullyConnectedGoldens( + tensors, tensors_size, null_bias, activation, 0.0f, output_dims_count, + expected_output_quantized, output_quantized, comp_info); +} + +#endif // USE_TFLM_COMPRESSION + } // namespace } // namespace testing } // namespace tflite @@ -393,6 +514,37 @@ TF_LITE_MICRO_TEST(SimpleTest) { kTfLiteOk); } +#ifdef USE_TFLM_COMPRESSION + +TF_LITE_MICRO_TEST(SimpleTestCompressed) { + float output_data[tflite::testing::simple_output_size]; + + tflite::testing::TestCompressionInfo comp_info = {}; + comp_info.scheme = tflite::CompressionScheme::kBinQuant; + comp_info.filter_value_table = tflite::testing::kBinQuantFilterValueTable; + comp_info.filter_value_table_stride = + std::extent::value; + comp_info.filter_bit_width = tflite::testing::kBinQuantFilterBitWidth; + comp_info.bias_value_table = tflite::testing::simple_bias_data; + comp_info.bias_value_table_stride = + std::extent::value; + comp_info.bias_bit_width = tflite::testing::kBinQuantBiasBitWidth; + + TF_LITE_MICRO_EXPECT_EQ( + tflite::testing::TestFullyConnectedFloat( + tflite::testing::simple_input_dims, + tflite::testing::simple_input_data, + tflite::testing::simple_weights_dims, + reinterpret_cast(tflite::testing::kBinQuantFilterData), + tflite::testing::simple_bias_dims, + reinterpret_cast(tflite::testing::kBinQuantBiasData), + tflite::testing::simple_golden, tflite::testing::simple_output_dims, + kTfLiteActNone, output_data, &comp_info), + kTfLiteOk); +} + +#endif // 
USE_TFLM_COMPRESSION + TF_LITE_MICRO_TEST(SimpleTestNullBias) { float output_data[tflite::testing::simple_output_size]; TF_LITE_MICRO_EXPECT_EQ( @@ -434,6 +586,54 @@ TF_LITE_MICRO_TEST(SimpleTestQuantizedInt8) { kTfLiteOk); } +#ifdef USE_TFLM_COMPRESSION + +TF_LITE_MICRO_TEST(SimpleTestQuantizedInt8Compressed) { + const float input_scale = 1.0f; + const int input_zero_point = -1; + constexpr float weights_scale[] = {1, 1.0f}; + constexpr int weights_zero_point[] = {1, 0}; + const float output_scale = 0.5f; + const int output_zero_point = -1; + + int8_t input_quantized[tflite::testing::simple_input_size]; + int8_t weights_quantized[tflite::testing::simple_weights_size]; + int32_t bias_quantized[tflite::testing::simple_output_size]; + int8_t golden_quantized[tflite::testing::simple_output_size]; + int8_t output_data[tflite::testing::simple_output_size]; + + tflite::testing::TestCompressionQuantizedInfo comp_info = {}; + comp_info.scheme = tflite::CompressionScheme::kBinQuant; + comp_info.filter_value_table = weights_quantized; + comp_info.filter_value_table_stride = + std::extent::value; + comp_info.filter_bit_width = tflite::testing::kBinQuantFilterBitWidth; + comp_info.filter_compressed = tflite::testing::kBinQuantFilterData; + comp_info.filter_data = tflite::testing::kBinQuantFilterValueTable; + comp_info.filter_dims_data = tflite::testing::simple_weights_dims; + comp_info.filter_scales = weights_scale; + comp_info.filter_zero_points = weights_zero_point; + comp_info.bias_value_table = bias_quantized; + comp_info.bias_value_table_stride = + std::extent::value; + comp_info.bias_bit_width = tflite::testing::kBinQuantBiasBitWidth; + comp_info.bias_compressed = tflite::testing::kBinQuantBiasData; + comp_info.bias_data = tflite::testing::simple_bias_data; + comp_info.bias_dims_data = tflite::testing::simple_bias_dims; + // bias_scales and bias_zero_points are not used + + TF_LITE_MICRO_EXPECT_EQ( + tflite::testing::TestFullyConnectedQuantizedCompressed( + 
tflite::testing::simple_input_dims, + tflite::testing::simple_input_data, input_quantized, input_scale, + input_zero_point, tflite::testing::simple_output_dims, + tflite::testing::simple_golden, golden_quantized, output_data, + output_scale, output_zero_point, kTfLiteActNone, &comp_info), + kTfLiteOk); +} + +#endif // USE_TFLM_COMPRESSION + #if !defined(HEXAGON) TF_LITE_MICRO_TEST(SimpleTestQuantizedInt16) { const float input_scale = 128.0 / 65536; @@ -443,7 +643,6 @@ TF_LITE_MICRO_TEST(SimpleTestQuantizedInt16) { const float output_scale = 128.0 / 65536; const int output_zero_point = 0; - const float simple_golden[] = {24, 25, 26, 58, 59, 60}; int16_t input_quantized[tflite::testing::simple_input_size]; int8_t weights_quantized[tflite::testing::simple_weights_size]; int64_t bias_quantized[tflite::testing::simple_output_size]; @@ -457,12 +656,62 @@ TF_LITE_MICRO_TEST(SimpleTestQuantizedInt16) { input_zero_point, tflite::testing::simple_weights_dims, tflite::testing::simple_weights_data, weights_quantized, weights_scale, weights_zero_point, tflite::testing::simple_bias_dims, - tflite::testing::simple_bias_data, bias_quantized, simple_golden, - golden_quantized, tflite::testing::simple_output_dims, output_scale, - output_zero_point, kTfLiteActNone, output_data), + tflite::testing::simple_bias_data, bias_quantized, + tflite::testing::simple_golden, golden_quantized, + tflite::testing::simple_output_dims, output_scale, output_zero_point, + kTfLiteActNone, output_data), kTfLiteOk); } -#endif + +#ifdef USE_TFLM_COMPRESSION + +TF_LITE_MICRO_TEST(SimpleTestQuantizedInt16Compressed) { + const float input_scale = 128.0 / 65536; + const int input_zero_point = 0; + constexpr float weights_scale[] = {1, 1.0f}; + constexpr int weights_zero_point[] = {1, 0}; + const float output_scale = 128.0 / 65536; + const int output_zero_point = 0; + + int16_t input_quantized[tflite::testing::simple_input_size]; + int8_t weights_quantized[tflite::testing::simple_weights_size]; + int64_t 
bias_quantized[tflite::testing::simple_output_size]; + int16_t golden_quantized[tflite::testing::simple_output_size]; + int16_t output_data[tflite::testing::simple_output_size]; + + tflite::testing::TestCompressionQuantizedInfo comp_info = {}; + comp_info.scheme = tflite::CompressionScheme::kBinQuant; + comp_info.filter_value_table = weights_quantized; + comp_info.filter_value_table_stride = + std::extent::value; + comp_info.filter_bit_width = tflite::testing::kBinQuantFilterBitWidth; + comp_info.filter_compressed = tflite::testing::kBinQuantFilterData; + comp_info.filter_data = tflite::testing::kBinQuantFilterValueTable; + comp_info.filter_dims_data = tflite::testing::simple_weights_dims; + comp_info.filter_scales = weights_scale; + comp_info.filter_zero_points = weights_zero_point; + comp_info.bias_value_table = bias_quantized; + comp_info.bias_value_table_stride = + std::extent::value; + comp_info.bias_bit_width = tflite::testing::kBinQuantBiasBitWidth; + comp_info.bias_compressed = tflite::testing::kBinQuantBiasData; + comp_info.bias_data = tflite::testing::simple_bias_data; + comp_info.bias_dims_data = tflite::testing::simple_bias_dims; + // bias_scales and bias_zero_points are not used + + TF_LITE_MICRO_EXPECT_EQ( + tflite::testing::TestFullyConnectedQuantizedCompressed( + tflite::testing::simple_input_dims, + tflite::testing::simple_input_data, input_quantized, input_scale, + input_zero_point, tflite::testing::simple_output_dims, + tflite::testing::simple_golden, golden_quantized, output_data, + output_scale, output_zero_point, kTfLiteActNone, &comp_info), + kTfLiteOk); +} + +#endif // USE_TFLM_COMPRESSION + +#endif // !defined(HEXAGON) TF_LITE_MICRO_TEST(SimpleTest4DInputQuantizedInt8) { const float input_scale = 1.0f; diff --git a/tensorflow/lite/micro/kernels/kernel_runner.cc b/tensorflow/lite/micro/kernels/kernel_runner.cc index 602778d7c50..79824efe5de 100644 --- a/tensorflow/lite/micro/kernels/kernel_runner.cc +++ 
b/tensorflow/lite/micro/kernels/kernel_runner.cc @@ -1,4 +1,4 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -18,7 +18,6 @@ limitations under the License. #include "tensorflow/lite/micro/arena_allocator/single_arena_buffer_allocator.h" #include "tensorflow/lite/micro/micro_arena_constants.h" #include "tensorflow/lite/micro/micro_log.h" -#include "tensorflow/lite/micro/test_helpers.h" namespace tflite { namespace micro { @@ -38,12 +37,22 @@ KernelRunner::KernelRunner(const TFLMRegistration& registration, TfLiteTensor* tensors, int tensors_size, TfLiteIntArray* inputs, TfLiteIntArray* outputs, const void* builtin_data, - TfLiteIntArray* intermediates) + TfLiteIntArray* intermediates +#ifdef USE_TFLM_COMPRESSION + , + const CompressedTensorList* compressed_tensors +#endif // USE_TFLM_COMPRESSION + ) : registration_(registration), allocator_(SingleArenaBufferAllocator::Create(kKernelRunnerBuffer_, kKernelRunnerBufferSize_)), mock_micro_graph_(allocator_), - fake_micro_context_(tensors, allocator_, &mock_micro_graph_) { + fake_micro_context_(tensors, allocator_, &mock_micro_graph_ +#ifdef USE_TFLM_COMPRESSION + , + compressed_tensors +#endif // USE_TFLM_COMPRESSION + ) { // Prepare TfLiteContext: context_.impl_ = static_cast(&fake_micro_context_); context_.ReportError = MicroContextReportOpError; diff --git a/tensorflow/lite/micro/kernels/kernel_util.h b/tensorflow/lite/micro/kernels/kernel_util.h index 977ed9563e1..5ef4bac85c9 100644 --- a/tensorflow/lite/micro/kernels/kernel_util.h +++ b/tensorflow/lite/micro/kernels/kernel_util.h @@ -95,8 +95,6 @@ const T* GetOptionalTensorData(const TfLiteEvalTensor* tensor) { // Overloads existing GetTensorData. If not compressed, this will return // tensor->data. 
-// -// TODO(ddavis-2015): make micro_context a const pointer template const T* GetTensorData(MicroContext* micro_context, const TfLiteEvalTensor* tensor, diff --git a/tensorflow/lite/micro/kernels/transpose_conv.cc b/tensorflow/lite/micro/kernels/transpose_conv.cc index ea0efae0607..7d65dc3de7c 100644 --- a/tensorflow/lite/micro/kernels/transpose_conv.cc +++ b/tensorflow/lite/micro/kernels/transpose_conv.cc @@ -1,4 +1,4 @@ -/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -27,30 +27,26 @@ limitations under the License. #include "tensorflow/lite/kernels/kernel_util.h" #include "tensorflow/lite/kernels/padding.h" #include "tensorflow/lite/micro/kernels/kernel_util.h" +#include "tensorflow/lite/micro/kernels/transpose_conv.h" #include "tensorflow/lite/micro/micro_log.h" namespace tflite { namespace { -// For the TfLite transpose_conv implementation, input tensor 0 corresponds to -// the OutputShapeTensor. However, since TFLM does not support dynamic tensors, -// the TFLM implementation ignores input tensor 0 and the only inputs we care -// about are kFilterTensor, kInputTensor and kBiasTensor. -constexpr int kFilterTensor = 1; -constexpr int kInputTensor = 2; -constexpr int kBiasTensor = 3; -constexpr int kOutputTensor = 0; - -// Conv is quantized along dimension 0: -// https://www.tensorflow.org/lite/performance/quantization_spec -constexpr int kConvQuantizedDimension = 0; - struct OpData { ConvParams params; // A scratch buffer is required for quantized implementations. int scratch_buffer_index; +#ifdef USE_TFLM_COMPRESSION + + // scratch buffers for compressed tensors + int filter_scratch_index; + int bias_scratch_index; + +#endif // USE_TFLM_COMPRESSION + // Index to the converted 64-bit bias buffer from 16-bit bias. 
This is // required to handle 16x8 transpose convolutions where a 16-bit bias is // provided, whereas the kernel expects 64-bit biases. @@ -102,17 +98,17 @@ TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node, MicroContext* micro_context = GetMicroContext(context); TfLiteTensor* input = - micro_context->AllocateTempInputTensor(node, kInputTensor); + micro_context->AllocateTempInputTensor(node, kTransposeConvInputTensor); TF_LITE_ENSURE(context, input != nullptr); - TfLiteTensor* filter = - micro_context->AllocateTempInputTensor(node, kFilterTensor); + TfLiteTensor* filter = micro_context->AllocateTempInputTensor( + node, kTransposeConvFilterTensor); TF_LITE_ENSURE(context, filter != nullptr); TfLiteTensor* bias = - micro_context->AllocateTempInputTensor(node, kBiasTensor); - TfLiteTensor* output = - micro_context->AllocateTempOutputTensor(node, kOutputTensor); + micro_context->AllocateTempInputTensor(node, kTransposeConvBiasTensor); + TfLiteTensor* output = micro_context->AllocateTempOutputTensor( + node, kTransposeConvOutputTensor); TF_LITE_ENSURE(context, output != nullptr); - int output_channels = filter->dims->data[kConvQuantizedDimension]; + int output_channels = filter->dims->data[kTransposeConvQuantizedDimension]; TF_LITE_ENSURE_STATUS(tflite::PopulateConvolutionQuantizationParams( context, input, filter, bias, output, kTfLiteActNone, @@ -164,13 +160,13 @@ TfLiteStatus TransposeConvPrepare(TfLiteContext* context, TfLiteNode* node) { MicroContext* micro_context = GetMicroContext(context); TfLiteTensor* output = - micro_context->AllocateTempOutputTensor(node, kOutputTensor); + micro_context->AllocateTempOutputTensor(node, kTransposeConvOutputTensor); TF_LITE_ENSURE(context, output != nullptr); TfLiteTensor* input = - micro_context->AllocateTempInputTensor(node, kInputTensor); + micro_context->AllocateTempInputTensor(node, kTransposeConvInputTensor); TF_LITE_ENSURE(context, input != nullptr); TfLiteTensor* filter = - 
micro_context->AllocateTempInputTensor(node, kFilterTensor); + micro_context->AllocateTempInputTensor(node, kTransposeConvFilterTensor); TF_LITE_ENSURE(context, filter != nullptr); TF_LITE_ENSURE_MSG( @@ -186,7 +182,7 @@ TfLiteStatus TransposeConvPrepare(TfLiteContext* context, TfLiteNode* node) { const int filter_height = SizeOfDimension(filter, 1); // Dynamically allocate per-channel quantization parameters. - const int num_channels = filter->dims->data[kConvQuantizedDimension]; + const int num_channels = filter->dims->data[kTransposeConvQuantizedDimension]; data->per_channel_output_multiplier = static_cast(context->AllocatePersistentBuffer( context, num_channels * sizeof(int32_t))); @@ -223,10 +219,10 @@ TfLiteStatus TransposeConvPrepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE(context, affine_quantization->scale); TF_LITE_ENSURE(context, affine_quantization->zero_point); - TF_LITE_ENSURE(context, - affine_quantization->scale->size == 1 || - affine_quantization->scale->size == - filter->dims->data[kConvQuantizedDimension]); + TF_LITE_ENSURE( + context, affine_quantization->scale->size == 1 || + affine_quantization->scale->size == + filter->dims->data[kTransposeConvQuantizedDimension]); TF_LITE_ENSURE_EQ(context, affine_quantization->scale->size, affine_quantization->zero_point->size); } @@ -244,6 +240,18 @@ TfLiteStatus TransposeConvPrepare(TfLiteContext* context, TfLiteNode* node) { data->params.stride_width = params->stride_width; data->params.stride_height = params->stride_height; +#ifdef USE_TFLM_COMPRESSION + + // Compression scratch buffers. + // These will only be allocated if the tensor is compressed. 
+ data->filter_scratch_index = + micro_context->AllocateDecompressionScratchBuffer( + node, kTransposeConvFilterTensor); + data->bias_scratch_index = micro_context->AllocateDecompressionScratchBuffer( + node, kTransposeConvBiasTensor); + +#endif // USE_TFLM_COMPRESSION + micro_context->DeallocateTempTfLiteTensor(output); micro_context->DeallocateTempTfLiteTensor(input); micro_context->DeallocateTempTfLiteTensor(filter); @@ -252,15 +260,26 @@ TfLiteStatus TransposeConvPrepare(TfLiteContext* context, TfLiteNode* node) { TfLiteStatus TransposeConvEval(TfLiteContext* context, TfLiteNode* node) { const TfLiteEvalTensor* input = - tflite::micro::GetEvalInput(context, node, kInputTensor); + tflite::micro::GetEvalInput(context, node, kTransposeConvInputTensor); const TfLiteEvalTensor* filter = - tflite::micro::GetEvalInput(context, node, kFilterTensor); + tflite::micro::GetEvalInput(context, node, kTransposeConvFilterTensor); const TfLiteEvalTensor* bias = (NumInputs(node) == 4) - ? tflite::micro::GetEvalInput(context, node, kBiasTensor) + ? 
tflite::micro::GetEvalInput(context, node, kTransposeConvBiasTensor) : nullptr; TfLiteEvalTensor* output = - tflite::micro::GetEvalOutput(context, node, kOutputTensor); + tflite::micro::GetEvalOutput(context, node, kTransposeConvOutputTensor); + +#ifdef USE_TFLM_COMPRESSION + + MicroContext* micro_context = GetMicroContext(context); + + const CompressionTensorData* filter_comp_td = + micro_context->GetTensorCompressionData(node, kTransposeConvFilterTensor); + const CompressionTensorData* bias_comp_td = + micro_context->GetTensorCompressionData(node, kTransposeConvBiasTensor); + +#endif // USE_TFLM_COMPRESSION TFLITE_DCHECK(node->user_data != nullptr); const OpData& data = *(static_cast(node->user_data)); @@ -280,9 +299,17 @@ TfLiteStatus TransposeConvEval(TfLiteContext* context, TfLiteNode* node) { op_params, tflite::micro::GetTensorShape(input), tflite::micro::GetTensorData(input), tflite::micro::GetTensorShape(filter), +#ifdef USE_TFLM_COMPRESSION + tflite::micro::GetTensorData( + micro_context, filter, filter_comp_td, data.filter_scratch_index), + tflite::micro::GetTensorShape(bias), + tflite::micro::GetTensorData(micro_context, bias, bias_comp_td, + data.bias_scratch_index), +#else // USE_TFLM_COMPRESSION tflite::micro::GetTensorData(filter), tflite::micro::GetTensorShape(bias), tflite::micro::GetOptionalTensorData(bias), +#endif // USE_TFLM_COMPRESSION tflite::micro::GetTensorShape(output), tflite::micro::GetTensorData(output), tflite::micro::GetTensorShape(nullptr), nullptr); @@ -296,9 +323,17 @@ TfLiteStatus TransposeConvEval(TfLiteContext* context, TfLiteNode* node) { data.per_channel_output_shift, tflite::micro::GetTensorShape(input), tflite::micro::GetTensorData(input), tflite::micro::GetTensorShape(filter), +#ifdef USE_TFLM_COMPRESSION + tflite::micro::GetTensorData( + micro_context, filter, filter_comp_td, data.filter_scratch_index), + tflite::micro::GetTensorShape(bias), + tflite::micro::GetTensorData( + micro_context, bias, bias_comp_td, 
data.bias_scratch_index), +#else // USE_TFLM_COMPRESSION tflite::micro::GetTensorData(filter), tflite::micro::GetTensorShape(bias), tflite::micro::GetOptionalTensorData(bias), +#endif // USE_TFLM_COMPRESSION tflite::micro::GetTensorShape(output), tflite::micro::GetTensorData(output), tflite::micro::GetTensorShape(nullptr), nullptr, scratch_buffer); @@ -311,16 +346,29 @@ TfLiteStatus TransposeConvEval(TfLiteContext* context, TfLiteNode* node) { auto* bias_converted_buffer = static_cast(context->GetScratchBuffer( context, data.bias_converted_buffer_index)); + const int16_t* const bias_int16_data = +#ifdef USE_TFLM_COMPRESSION + tflite::micro::GetTensorData( + micro_context, bias, bias_comp_td, data.bias_scratch_index); +#else // USE_TFLM_COMPRESSION + static_cast(bias->data.data); +#endif // USE_TFLM_COMPRESSION for (int i = 0; i < tflite::micro::GetTensorShape(bias).FlatSize(); i++) { - bias_converted_buffer[i] = bias->data.i16[i]; + bias_converted_buffer[i] = bias_int16_data[i]; } reference_integer_ops::TransposeConv( data.params, data.per_channel_output_multiplier, data.per_channel_output_shift, tflite::micro::GetTensorShape(input), tflite::micro::GetTensorData(input), tflite::micro::GetTensorShape(filter), +#ifdef USE_TFLM_COMPRESSION + tflite::micro::GetTensorData(micro_context, filter, + filter_comp_td, + data.filter_scratch_index), +#else // USE_TFLM_COMPRESSION tflite::micro::GetTensorData(filter), +#endif // USE_TFLM_COMPRESSION tflite::micro::GetTensorShape(bias), bias_converted_buffer, tflite::micro::GetTensorShape(output), tflite::micro::GetTensorData(output), @@ -331,9 +379,18 @@ TfLiteStatus TransposeConvEval(TfLiteContext* context, TfLiteNode* node) { data.per_channel_output_shift, tflite::micro::GetTensorShape(input), tflite::micro::GetTensorData(input), tflite::micro::GetTensorShape(filter), +#ifdef USE_TFLM_COMPRESSION + tflite::micro::GetTensorData(micro_context, filter, + filter_comp_td, + data.filter_scratch_index), + 
+ tflite::micro::GetTensorShape(bias), + tflite::micro::GetTensorData( + micro_context, bias, bias_comp_td, data.bias_scratch_index), +#else // USE_TFLM_COMPRESSION tflite::micro::GetTensorData(filter), tflite::micro::GetTensorShape(bias), - tflite::micro::GetOptionalTensorData(bias), + tflite::micro::GetOptionalTensorData(bias), +#endif // USE_TFLM_COMPRESSION tflite::micro::GetTensorShape(output), tflite::micro::GetTensorData(output), tflite::micro::GetTensorShape(nullptr), nullptr, scratch_buffer); diff --git a/tensorflow/lite/micro/kernels/transpose_conv.h b/tensorflow/lite/micro/kernels/transpose_conv.h index 3a99ccbf847..ec0416e067f 100644 --- a/tensorflow/lite/micro/kernels/transpose_conv.h +++ b/tensorflow/lite/micro/kernels/transpose_conv.h @@ -1,4 +1,4 @@ -/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -23,6 +23,19 @@ limitations under the License. namespace tflite { +// For the TfLite transpose_conv implementation, input tensor 0 corresponds to +// the OutputShapeTensor. However, since TFLM does not support dynamic tensors, +// the TFLM implementation ignores input tensor 0 and the only inputs we care +// about are the filter, input and bias tensors (inputs 1, 2 and 3). +constexpr int kTransposeConvFilterTensor = 1; +constexpr int kTransposeConvInputTensor = 2; +constexpr int kTransposeConvBiasTensor = 3; +constexpr int kTransposeConvOutputTensor = 0; + +// Conv is quantized along dimension 0: +// https://www.tensorflow.org/lite/performance/quantization_spec +constexpr int kTransposeConvQuantizedDimension = 0; + // This is the most generic TFLMRegistration. The actual supported types // may still be target dependent. The only requirement is that every // implementation (reference or optimized) must define this function. 
diff --git a/tensorflow/lite/micro/kernels/transpose_conv_test.cc b/tensorflow/lite/micro/kernels/transpose_conv_test.cc index 49d2c90f439..e9716794229 100644 --- a/tensorflow/lite/micro/kernels/transpose_conv_test.cc +++ b/tensorflow/lite/micro/kernels/transpose_conv_test.cc @@ -1,4 +1,4 @@ -/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,9 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include "tensorflow/lite/micro/kernels/transpose_conv.h" + +#include + #include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/common.h" -#include "tensorflow/lite/micro/kernels/conv_test.h" #include "tensorflow/lite/micro/kernels/kernel_runner.h" #include "tensorflow/lite/micro/micro_utils.h" #include "tensorflow/lite/micro/test_helpers.h" @@ -47,20 +50,119 @@ static const float kGoldenData[kOutputElements] = { 184, 412, 568, 528, 678, 1347, 1689, 1434, 1494, 2715, 3057, 2442, 1968, 3352, 3652, 2760}; +#ifdef USE_TFLM_COMPRESSION + +constexpr size_t kTransposeConvMaxTensors = 5; +constexpr size_t kTransposeConvMaxInputTensors = 4; + +// compressed filter data for kBinQuant scheme, matches kFilterData +constexpr uint8_t kBinQuantFilterData[] = {0x00, 0x44, 0x32, 0x14, 0xC7, 0x42, + 0x54, 0xB6, 0x35, 0xCF, 0x84, 0x40}; +constexpr int kBinQuantFilterBitWidth = 5; +// compressed bias data for kBinQuant scheme, matches kBiasData +constexpr uint8_t kBinQuantBiasData[] = {0x00}; +constexpr int kBinQuantBiasBitWidth = 1; + +// Common inputs and outputs (quantized single channel). 
+// data from TfLite test: SimpleBiasTestQuantizedPerChannelSingleChannel +static int kInputShapeQ1[] = {4, 1, 4, 4, 1}; +static constexpr float kInputDataQ1[] = {1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16}; +constexpr size_t kInputElementsQ1 = std::extent::value; + +constexpr int kFilterNumChannelsQ1 = 1; +static int kFilterShapeQ1[] = {4, 1, 3, 3, 1}; +static constexpr float kFilterDataQ1[] = {1, 2, 3, 4, 5, 6, 7, 8, 9}; +constexpr size_t kFilterElementsQ1 = + std::extent::value; + +static int kBiasShapeQ1[] = {1, 1}; +static constexpr float kBiasDataQ1[] = {1}; +constexpr size_t kBiasElementsQ1 = std::extent::value; + +static int kOutputShapeQ1[] = {4, 1, 4, 4, 1}; +static constexpr float kGoldenDataQ1[] = { + 30, 62, 84, 76, 100, 194, 238, 200, 208, 372, 418, 330, 264, 446, 486, 366}; +constexpr int kOutputElementsQ1 = std::extent::value; + +// compressed filter data for kBinQuant scheme, matches kFilterDataQ1 +constexpr uint8_t kBinQuantFilterDataQ1[] = {0x01, 0x23, 0x45, 0x67, 0x80}; +constexpr int kBinQuantFilterBitWidthQ1 = 4; +// compressed bias data for kBinQuant scheme, matches kBiasDataQ1 +constexpr uint8_t kBinQuantBiasDataQ1[] = {0x00}; +constexpr int kBinQuantBiasBitWidthQ1 = 1; + +// Common inputs and outputs (quantized multi channel). 
+// data from TfLite test: SimpleBiasTestQuantizedPerChannel16x8Bias64 +static int kInputShapeQ2[] = {4, 1, 2, 3, 2}; +static constexpr float kInputDataQ2[] = { + // [1 * 2 * 3 * 2] as [batch, y, x, input_channel] + 3, 2, // batch = 0, y = 0, x = 0 + 1, -1, // batch = 0, y = 0, x = 1 + -2, -3, // batch = 0, y = 0, x = 2 + 4, 3, // batch = 0, y = 1, x = 0 + 2, -2, // batch = 0, y = 1, x = 1 + -3, -4, // batch = 0, y = 1, x = 2 +}; +constexpr size_t kInputElementsQ2 = std::extent::value; + +constexpr int kFilterNumChannelsQ2 = 2; +static int kFilterShapeQ2[] = {4, 2, 2, 2, 2}; +static constexpr float kFilterDataQ2[] = { + // [2 * 2 * 2 * 2] as [output_channel, y, x, input_channel] + 1, 2, // out channel = 0, y = 0, x = 0 + 3, 4, // out channel = 0, y = 0, x = 1 + 3, 4, // out channel = 0, y = 1, x = 0 + 5, 6, // out channel = 0, y = 1, x = 1 + 7, 8, // out channel = 1, y = 0, x = 0 + 5, 6, // out channel = 1, y = 0, x = 1 + 3, 4, // out channel = 1, y = 1, x = 0 + 1, 2, // out channel = 1, y = 1, x = 1 +}; +constexpr size_t kFilterElementsQ2 = + std::extent::value; + +static int kBiasShapeQ2[] = {1, 2}; +static constexpr float kBiasDataQ2[] = {3, -2}; +constexpr size_t kBiasElementsQ2 = std::extent::value; + +static int kOutputShapeQ2[] = {4, 1, 2, 3, 2}; +static constexpr float kGoldenDataQ2[] = {10, 35, 19, 24, -6, -41, + 30, 64, 51, 40, -29, -64}; +constexpr int kOutputElementsQ2 = std::extent::value; + +// compressed filter data for kBinQuant scheme, matches kFilterDataQ2 +constexpr uint8_t kBinQuantFilterDataQ2[] = {0x05, 0x34, 0xE5, + 0xDE, 0x54, 0xC1}; +constexpr float kBinQuantFilterValueTableQ2[] = {1, 2, 3, 4, 5, 6, 0, 0, + 1, 2, 3, 4, 5, 6, 7, 8}; +constexpr int kBinQuantFilterBitWidthQ2 = 3; +// compressed bias data for kBinQuant scheme, matches kBiasDataQ2 +constexpr uint8_t kBinQuantBiasDataQ2[] = {0x00}; +constexpr int kBinQuantBiasBitWidthQ2 = 1; + +#endif // USE_TFLM_COMPRESSION + // Transpose conv uses TfLiteConvParams. 
-static TfLiteConvParams common_conv_params = {kTfLitePaddingSame, // padding - 1, // stride_width - 1, // stride_height - kTfLiteActNone, - 1, - 1, - kTfLiteNoType}; +static const TfLiteConvParams common_conv_params = { + kTfLitePaddingSame, // padding + 1, // stride_width + 1, // stride_height + kTfLiteActNone, + 1, + 1, + kTfLiteNoType}; template -TfLiteStatus InvokeTransposeConv(TfLiteTensor* tensors, int tensors_size, - int output_length, - TfLiteConvParams* conv_params, - T* output_data) { +TfLiteStatus InvokeTransposeConv( + TfLiteTensor* tensors, int tensors_size, int output_length, + const TfLiteConvParams* conv_params, T* output_data +#ifdef USE_TFLM_COMPRESSION + , + const CompressedTensorList* comp_list_p = nullptr +#endif // USE_TFLM_COMPRESSION +) { + // TODO(b/358151309): support optional bias tensor int inputs_array_data[] = {4, 0, 1, 2, 3}; TfLiteIntArray* inputs_array = IntArrayFromInts(inputs_array_data); int outputs_array_data[] = {1, 4}; @@ -68,7 +170,12 @@ TfLiteStatus InvokeTransposeConv(TfLiteTensor* tensors, int tensors_size, const TFLMRegistration registration = tflite::Register_TRANSPOSE_CONV(); micro::KernelRunner runner(registration, tensors, tensors_size, inputs_array, - outputs_array, conv_params); + outputs_array, conv_params +#ifdef USE_TFLM_COMPRESSION + , + nullptr, comp_list_p +#endif // USE_TFLM_COMPRESSION + ); const char* init_data = reinterpret_cast(conv_params); TfLiteStatus status = runner.InitAndPrepare(init_data); @@ -78,15 +185,44 @@ TfLiteStatus InvokeTransposeConv(TfLiteTensor* tensors, int tensors_size, return runner.Invoke(); } -template -TfLiteStatus ValidateTransposeConvGoldens(TfLiteTensor* tensors, - int tensors_size, - const T* expected_output_data, - int output_length, - TfLiteConvParams* conv_params, - T* output_data, float tolerance) { +template +TfLiteStatus ValidateTransposeConvGoldens( + TfLiteTensor* tensors, int tensors_size, const T* expected_output_data, + int output_length, const TfLiteConvParams* 
conv_params, T* output_data, + float tolerance = 1e-5f +#ifdef USE_TFLM_COMPRESSION + , + const TestCompressionInfo* comp_info = nullptr +#endif // USE_TFLM_COMPRESSION +) { +#ifdef USE_TFLM_COMPRESSION + + TestCompressedList tcl; + const CompressedTensorList* comp_list_p = nullptr; + + if (comp_info != nullptr) { + TF_LITE_MICRO_EXPECT_EQ( + tcl.AddWeight(*comp_info, tensors[kTransposeConvFilterTensor], + kTransposeConvFilterTensor), + kTfLiteOk); + TF_LITE_MICRO_CHECK_FAIL(); + TF_LITE_MICRO_EXPECT_EQ( + tcl.AddBias(*comp_info, tensors[kTransposeConvBiasTensor], + kTransposeConvBiasTensor), + kTfLiteOk); + TF_LITE_MICRO_CHECK_FAIL(); + comp_list_p = tcl.GetCompressedTensorList(); + } + +#endif // USE_TFLM_COMPRESSION + TfLiteStatus status = InvokeTransposeConv( - tensors, tensors_size, output_length, conv_params, output_data); + tensors, tensors_size, output_length, conv_params, output_data +#ifdef USE_TFLM_COMPRESSION + , + comp_list_p +#endif // USE_TFLM_COMPRESSION + ); if (status != kTfLiteOk) { return status; } @@ -97,11 +233,17 @@ TfLiteStatus ValidateTransposeConvGoldens(TfLiteTensor* tensors, return kTfLiteOk; } +template TfLiteStatus TestTransposeConvFloat( int* input_dims_data, const float* input_data, int* filter_dims_data, const float* filter_data, int* bias_dims_data, const float* bias_data, int* output_dims_data, const float* expected_output_data, - TfLiteConvParams* conv_params, float* output_data) { + const TfLiteConvParams* conv_params, float* output_data +#ifdef USE_TFLM_COMPRESSION + , + const TestCompressionInfo* comp_info = nullptr +#endif // USE_TFLM_COMPRESSION +) { TfLiteIntArray* input_dims = IntArrayFromInts(input_dims_data); TfLiteIntArray* filter_dims = IntArrayFromInts(filter_dims_data); TfLiteIntArray* bias_dims = IntArrayFromInts(bias_dims_data); @@ -125,7 +267,12 @@ TfLiteStatus TestTransposeConvFloat( return ValidateTransposeConvGoldens(tensors, tensors_size, expected_output_data, output_dims_count, - conv_params, output_data, 
0.001f); + conv_params, output_data +#ifdef USE_TFLM_COMPRESSION + , + 1e-5, comp_info +#endif // USE_TFLM_COMPRESSION + ); } TfLiteStatus TestTransposeConvQuantized( @@ -135,8 +282,8 @@ TfLiteStatus TestTransposeConvQuantized( int* bias_dims_data, const float* bias_data, int32_t* bias_quantized, float* bias_scales, int* bias_zero_points, int* output_dims_data, const float* expected_output_data, int8_t* expected_output_quantized, - float output_scale, int output_zero_point, TfLiteConvParams* conv_params, - int8_t* output_data) { + float output_scale, int output_zero_point, + const TfLiteConvParams* conv_params, int8_t* output_data) { TfLiteIntArray* input_dims = IntArrayFromInts(input_dims_data); TfLiteIntArray* filter_dims = IntArrayFromInts(filter_dims_data); TfLiteIntArray* bias_dims = IntArrayFromInts(bias_dims_data); @@ -181,8 +328,8 @@ TfLiteStatus TestTransposeConvQuantized( int* bias_dims_data, const float* bias_data, T* bias_quantized, float* bias_scales, int* bias_zero_points, int* output_dims_data, const float* expected_output_data, int16_t* expected_output_quantized, - float output_scale, int output_zero_point, TfLiteConvParams* conv_params, - int16_t* output_data) { + float output_scale, int output_zero_point, + const TfLiteConvParams* conv_params, int16_t* output_data) { TfLiteIntArray* input_dims = IntArrayFromInts(input_dims_data); TfLiteIntArray* filter_dims = IntArrayFromInts(filter_dims_data); TfLiteIntArray* bias_dims = IntArrayFromInts(bias_dims_data); @@ -221,6 +368,76 @@ TfLiteStatus TestTransposeConvQuantized( conv_params, output_data, 4.0f); } +#ifdef USE_TFLM_COMPRESSION + +template +TfLiteStatus TestTransposeConvQuantizedCompressed( + int* input_dims_data, const float* input_data, TIO* input_quantized, + float input_scale, int input_zero_point, int* output_dims_data, + const float* expected_output_data, TIO* expected_output_quantized, + TIO* output_quantized, float output_scale, int output_zero_point, + const TfLiteConvParams* 
conv_params, const unsigned int tolerance, + const TestCompressionQuantizedInfo* comp_info) { + // TODO(b/358151309): account for optional bias tensor + // bool null_bias = comp_info->bias_data == nullptr ? true : false; + + TfLiteIntArray* input_dims = IntArrayFromInts(input_dims_data); + TfLiteIntArray* filter_dims = IntArrayFromInts(comp_info->filter_dims_data); + TfLiteIntArray* bias_dims = IntArrayFromInts(comp_info->bias_dims_data); + TfLiteIntArray* output_dims = IntArrayFromInts(output_dims_data); + + TfLiteFloatArray* filter_scales = + FloatArrayFromFloats(comp_info->filter_scales); + TfLiteIntArray* filter_zero_points = + IntArrayFromInts(comp_info->filter_zero_points); + TfLiteFloatArray* bias_scales = FloatArrayFromFloats(comp_info->bias_scales); + TfLiteIntArray* bias_zero_points = + IntArrayFromInts(comp_info->bias_zero_points); + + TfLiteAffineQuantization filter_quant = {}; + TfLiteTensor filter_tensor = CreatePerChannelQuantizedTensor( + comp_info->filter_compressed, filter_dims, filter_scales, + filter_zero_points, &filter_quant, kTransposeConvQuantizedDimension, + false /* is_variable */, kTfLiteInt8); + SymmetricPerChannelQuantize( + comp_info->filter_data, comp_info->filter_value_table, + ElementCount(*filter_dims), filter_scales->size, filter_scales->data); + + TfLiteAffineQuantization bias_quant = {}; + TfLiteTensor bias_tensor = CreatePerChannelQuantizedBiasTensor( + comp_info->bias_compressed, bias_dims, input_scale, filter_scales, + bias_scales, bias_zero_points, &bias_quant, + kTransposeConvQuantizedDimension, false /* is_variable */, + typeToTfLiteType()); + SymmetricPerChannelQuantize(comp_info->bias_data, comp_info->bias_value_table, + ElementCount(*bias_dims), bias_scales->size, + bias_scales->data); + + int output_shape_dims_data[] = {1, 0}; + int32_t* output_shape = nullptr; + TfLiteIntArray* output_shape_dims = IntArrayFromInts(output_shape_dims_data); + + constexpr int tensors_size = kTransposeConvMaxTensors; + TfLiteTensor 
tensors[tensors_size] = { + CreateTensor(output_shape, output_shape_dims), + filter_tensor, + CreateQuantizedTensor(input_data, input_quantized, input_dims, + input_scale, input_zero_point), + bias_tensor, + CreateQuantizedTensor(output_quantized, output_dims, output_scale, + output_zero_point), + }; + + const int output_dims_count = ElementCount(*output_dims); + Quantize(expected_output_data, expected_output_quantized, output_dims_count, + output_scale, output_zero_point); + return ValidateTransposeConvGoldens( + tensors, tensors_size, expected_output_quantized, output_dims_count, + conv_params, output_quantized, tolerance, comp_info); +} + +#endif // USE_TFLM_COMPRESSION + } // namespace } // namespace testing } // namespace tflite @@ -240,6 +457,36 @@ TF_LITE_MICRO_TEST(SimpleTestFloat) { &tflite::testing::common_conv_params, output_data)); } +#ifdef USE_TFLM_COMPRESSION + +TF_LITE_MICRO_TEST(SimpleTestFloatCompressed) { + tflite::testing::TestCompressionInfo comp_info = {}; + comp_info.scheme = tflite::CompressionScheme::kBinQuant; + comp_info.filter_value_table = tflite::testing::kFilterData; + comp_info.filter_value_table_stride = + std::extent::value; + comp_info.filter_bit_width = tflite::testing::kBinQuantFilterBitWidth; + comp_info.bias_value_table = tflite::testing::kBiasData; + comp_info.bias_value_table_stride = + std::extent::value; + comp_info.bias_bit_width = tflite::testing::kBinQuantBiasBitWidth; + + float output_data[tflite::testing::kOutputElements]; + + TF_LITE_MICRO_EXPECT_EQ( + kTfLiteOk, + tflite::testing::TestTransposeConvFloat( + tflite::testing::kInputShape, tflite::testing::kInputData, + tflite::testing::kFilterShape, + reinterpret_cast(tflite::testing::kBinQuantFilterData), + tflite::testing::kBiasShape, + reinterpret_cast(tflite::testing::kBinQuantBiasData), + tflite::testing::kOutputShape, tflite::testing::kGoldenData, + &tflite::testing::common_conv_params, output_data, &comp_info)); +} + +#endif // USE_TFLM_COMPRESSION + 
TF_LITE_MICRO_TEST(fusedRELUTest) { float output_data[tflite::testing::kOutputElements]; float golden_data[] = {29, 24, 0, 0, 99, 72, 0, 0, @@ -476,4 +723,199 @@ TF_LITE_MICRO_TEST(HybridModeIsError) { &tflite::testing::common_conv_params, output_data)); } +#ifdef USE_TFLM_COMPRESSION + +TF_LITE_MICRO_TEST(SimpleTestQuantizedPerChannelSingleChannelCompressed) { + // data from TfLite test: SimpleBiasTestQuantizedPerChannelSingleChannel + const float input_scale = 16.0f / 255.0f; + const float output_scale = 2.0f; + const int input_zero_point = -128; + const int output_zero_point = -128; + constexpr float filter_scales[] = { + tflite::testing::kFilterNumChannelsQ1, + 9.0f / 127.0f, + }; + constexpr int filter_zero_points[] = { + tflite::testing::kFilterNumChannelsQ1, + 0, + }; + // bias scales and zero points will be computed + float bias_scales[std::extent::value] = {}; + int bias_zero_points[std::extent::value] = {}; + + int8_t input_quantized[tflite::testing::kInputElementsQ1]; + int8_t filter_quantized[tflite::testing::kFilterElementsQ1]; + int32_t bias_quantized[tflite::testing::kBiasElementsQ1]; + int8_t golden_quantized[tflite::testing::kOutputElementsQ1]; + int8_t output_quantized[tflite::testing::kOutputElementsQ1]; + + tflite::testing::TestCompressionQuantizedInfo comp_info = {}; + comp_info.scheme = tflite::CompressionScheme::kBinQuant; + + comp_info.filter_value_table = filter_quantized; + comp_info.filter_value_table_stride = + std::extent::value / + tflite::testing::kFilterNumChannelsQ1; + comp_info.filter_bit_width = tflite::testing::kBinQuantFilterBitWidthQ1; + comp_info.filter_compressed = tflite::testing::kBinQuantFilterDataQ1; + comp_info.filter_data = tflite::testing::kFilterDataQ1; + comp_info.filter_dims_data = tflite::testing::kFilterShapeQ1; + comp_info.filter_scales = filter_scales; + comp_info.filter_zero_points = filter_zero_points; + + comp_info.bias_value_table = bias_quantized; + comp_info.bias_value_table_stride = + std::extent::value / 
+ tflite::testing::kFilterNumChannelsQ1; + comp_info.bias_bit_width = tflite::testing::kBinQuantBiasBitWidthQ1; + comp_info.bias_compressed = tflite::testing::kBinQuantBiasDataQ1; + comp_info.bias_data = tflite::testing::kBiasDataQ1; + comp_info.bias_dims_data = tflite::testing::kBiasShapeQ1; + comp_info.bias_scales = bias_scales; + comp_info.bias_zero_points = bias_zero_points; + + TF_LITE_MICRO_EXPECT_EQ( + kTfLiteOk, + tflite::testing::TestTransposeConvQuantizedCompressed( + tflite::testing::kInputShapeQ1, tflite::testing::kInputDataQ1, + input_quantized, input_scale, input_zero_point, + tflite::testing::kOutputShapeQ1, tflite::testing::kGoldenDataQ1, + golden_quantized, output_quantized, output_scale, output_zero_point, + &tflite::testing::common_conv_params, 0, &comp_info)); +} + +TF_LITE_MICRO_TEST( + SimpleBiasTestQuantizedPerChannelBias16MultiChannelCompressed) { + // data from TfLite test: SimpleBiasTestQuantizedPerChannel16x8Bias64 + const float input_scale = 4.0f / 127.0f; + const float output_scale = 128.0f / 65536.0f; + const int input_zero_point = 0; + const int output_zero_point = 0; + constexpr float filter_scales[] = { + tflite::testing::kFilterNumChannelsQ2, + 7.0f / 127.0f, + 8.0f / 127.0f, + }; + constexpr int filter_zero_points[] = { + tflite::testing::kFilterNumChannelsQ2, + 0, + 0, + }; + // bias scales and zero points will be computed + float bias_scales[std::extent::value] = {}; + int bias_zero_points[std::extent::value] = {}; + + int16_t input_quantized[tflite::testing::kInputElementsQ2]; + int8_t filter_quantized[tflite::testing::kFilterElementsQ2]; + int16_t bias_quantized[tflite::testing::kBiasElementsQ2]; + int16_t golden_quantized[tflite::testing::kOutputElementsQ2]; + int16_t output_quantized[tflite::testing::kOutputElementsQ2]; + + tflite::testing::TestCompressionQuantizedInfo comp_info = {}; + comp_info.scheme = tflite::CompressionScheme::kBinQuant; + + comp_info.filter_value_table = filter_quantized; + 
comp_info.filter_value_table_stride = + std::extent< + decltype(tflite::testing::kBinQuantFilterValueTableQ2)>::value / + tflite::testing::kFilterNumChannelsQ2; + comp_info.filter_bit_width = tflite::testing::kBinQuantFilterBitWidthQ2; + comp_info.filter_compressed = tflite::testing::kBinQuantFilterDataQ2; + comp_info.filter_data = tflite::testing::kBinQuantFilterValueTableQ2; + comp_info.filter_dims_data = tflite::testing::kFilterShapeQ2; + comp_info.filter_scales = filter_scales; + comp_info.filter_zero_points = filter_zero_points; + + comp_info.bias_value_table = bias_quantized; + comp_info.bias_value_table_stride = + std::extent::value / + tflite::testing::kFilterNumChannelsQ2; + comp_info.bias_bit_width = tflite::testing::kBinQuantBiasBitWidthQ2; + comp_info.bias_compressed = tflite::testing::kBinQuantBiasDataQ2; + comp_info.bias_data = tflite::testing::kBiasDataQ2; + comp_info.bias_dims_data = tflite::testing::kBiasShapeQ2; + comp_info.bias_scales = bias_scales; + comp_info.bias_zero_points = bias_zero_points; + + // The quantized output is compared to the expected output (quantized). + // A tolerance of 81 is approx. 0.1582f which is less than the TfLite + // tolerance of 0.19f. 
+ TF_LITE_MICRO_EXPECT_EQ( + kTfLiteOk, + tflite::testing::TestTransposeConvQuantizedCompressed( + tflite::testing::kInputShapeQ2, tflite::testing::kInputDataQ2, + input_quantized, input_scale, input_zero_point, + tflite::testing::kOutputShapeQ2, tflite::testing::kGoldenDataQ2, + golden_quantized, output_quantized, output_scale, output_zero_point, + &tflite::testing::common_conv_params, 81, &comp_info)); +} + +TF_LITE_MICRO_TEST( + SimpleBiasTestQuantizedPerChannelBias64MultiChannelCompressed) { + // data from TfLite test: SimpleBiasTestQuantizedPerChannel16x8Bias64 + const float input_scale = 4.0f / 127.0f; + const float output_scale = 128.0f / 65536.0f; + const int input_zero_point = 0; + const int output_zero_point = 0; + constexpr float filter_scales[] = { + tflite::testing::kFilterNumChannelsQ2, + 7.0f / 127.0f, + 8.0f / 127.0f, + }; + constexpr int filter_zero_points[] = { + tflite::testing::kFilterNumChannelsQ2, + 0, + 0, + }; + // bias scales and zero points will be computed + float bias_scales[std::extent::value] = {}; + int bias_zero_points[std::extent::value] = {}; + + int16_t input_quantized[tflite::testing::kInputElementsQ2]; + int8_t filter_quantized[tflite::testing::kFilterElementsQ2]; + int64_t bias_quantized[tflite::testing::kBiasElementsQ2]; + int16_t golden_quantized[tflite::testing::kOutputElementsQ2]; + int16_t output_quantized[tflite::testing::kOutputElementsQ2]; + + tflite::testing::TestCompressionQuantizedInfo comp_info = {}; + comp_info.scheme = tflite::CompressionScheme::kBinQuant; + + comp_info.filter_value_table = filter_quantized; + comp_info.filter_value_table_stride = + std::extent< + decltype(tflite::testing::kBinQuantFilterValueTableQ2)>::value / + tflite::testing::kFilterNumChannelsQ2; + comp_info.filter_bit_width = tflite::testing::kBinQuantFilterBitWidthQ2; + comp_info.filter_compressed = tflite::testing::kBinQuantFilterDataQ2; + comp_info.filter_data = tflite::testing::kBinQuantFilterValueTableQ2; + comp_info.filter_dims_data = 
tflite::testing::kFilterShapeQ2; + comp_info.filter_scales = filter_scales; + comp_info.filter_zero_points = filter_zero_points; + + comp_info.bias_value_table = bias_quantized; + comp_info.bias_value_table_stride = + std::extent::value / + tflite::testing::kFilterNumChannelsQ2; + comp_info.bias_bit_width = tflite::testing::kBinQuantBiasBitWidthQ2; + comp_info.bias_compressed = tflite::testing::kBinQuantBiasDataQ2; + comp_info.bias_data = tflite::testing::kBiasDataQ2; + comp_info.bias_dims_data = tflite::testing::kBiasShapeQ2; + comp_info.bias_scales = bias_scales; + comp_info.bias_zero_points = bias_zero_points; + + // The quantized output is compared to the expected output (quantized). + // A tolerance of 81 is approx. 0.1582f which is less than the TfLite + // tolerance of 0.19f. + TF_LITE_MICRO_EXPECT_EQ( + kTfLiteOk, + tflite::testing::TestTransposeConvQuantizedCompressed( + tflite::testing::kInputShapeQ2, tflite::testing::kInputDataQ2, + input_quantized, input_scale, input_zero_point, + tflite::testing::kOutputShapeQ2, tflite::testing::kGoldenDataQ2, + golden_quantized, output_quantized, output_scale, output_zero_point, + &tflite::testing::common_conv_params, 81, &comp_info)); +} + +#endif // USE_TFLM_COMPRESSION + TF_LITE_MICRO_TESTS_END diff --git a/tensorflow/lite/micro/micro_allocator.cc b/tensorflow/lite/micro/micro_allocator.cc index 930da754bb5..5caefa34764 100644 --- a/tensorflow/lite/micro/micro_allocator.cc +++ b/tensorflow/lite/micro/micro_allocator.cc @@ -1,4 +1,4 @@ -/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -36,6 +36,15 @@ limitations under the License. 
#include "tensorflow/lite/micro/tflite_bridge/flatbuffer_conversions_bridge.h" #include "tensorflow/lite/schema/schema_generated.h" +#ifdef USE_TFLM_COMPRESSION + +#include +#include + +#include "tensorflow/lite/micro/compression/metadata_generated.h" + +#endif // USE_TFLM_COMPRESSION + namespace tflite { namespace { @@ -355,6 +364,151 @@ TfLiteStatus InitializeTfLiteEvalTensorFromFlatbuffer( return kTfLiteOk; } +#ifdef USE_TFLM_COMPRESSION + +const tflite::micro::compression::Metadata* GetCompressionMetadata( + const Model& model) { + const auto metadata_vector = model.metadata(); + if (metadata_vector == nullptr) { + return nullptr; + } + auto buffers = model.buffers(); + if (buffers == nullptr) { + return nullptr; + } + const size_t metadata_string_length = std::strlen(kCompressionMetadataString); + for (size_t metadata_index = 0; metadata_index < metadata_vector->size(); + metadata_index++) { + auto metadata = metadata_vector->Get(metadata_index); + if (metadata->name() == nullptr || metadata->name()->size() == 0) { + continue; + } + const char* s = metadata->name()->c_str(); + if ((metadata->name()->size() == metadata_string_length) && + (std::strncmp(s, kCompressionMetadataString, metadata_string_length) == + 0)) { + auto buffer_index = metadata->buffer(); + if (buffer_index == 0 || buffer_index >= buffers->size()) { + MicroPrintf("Compression: Invalid buffer index %u", buffer_index); + continue; + } + auto vp = buffers->Get(buffer_index)->data(); + if (vp == nullptr || vp->data() == nullptr) { + MicroPrintf("Compression: Invalid data for buffer index %u", + buffer_index); + continue; + } + // TODO(ddavis-2015): support multiple compression methods, possibly + // through multiple verification checks. + // Then return a pair. 
+ auto compression_metadata = + tflite::micro::compression::GetSizePrefixedMetadata(vp); + flatbuffers::Verifier verifier(vp->data(), vp->size(), + flatbuffers::Verifier::Options()); + if (!tflite::micro::compression::VerifyMetadataBuffer(verifier)) { + MicroPrintf("Compression: verification failure"); + return nullptr; + } else { + return compression_metadata; + } + } + } + + return nullptr; +} + +TfLiteStatus InitializeCompressionTensorDataFromFlatbuffer( + const Model& model, const tflite::micro::compression::LutTensor& lut_tensor, + CompressionTensorData* ctd) { + ctd->scheme = CompressionScheme::kBinQuant; + + const size_t subgraph_index = lut_tensor.subgraph(); + if (subgraph_index >= model.subgraphs()->size()) { + MicroPrintf("Compression: invalid subgraph index %u in LutTensor", + subgraph_index); + return kTfLiteError; + } + const size_t tensor_index = lut_tensor.tensor(); + auto tensors = model.subgraphs()->Get(subgraph_index)->tensors(); + if (tensor_index >= tensors->size()) { + MicroPrintf("Compression: invalid tensor index %u in LutTensor", + tensor_index); + return kTfLiteError; + } + const size_t index_bit_width = lut_tensor.index_bitwidth(); + if (index_bit_width > LookupTableData::kMaxBitWidth) { + MicroPrintf("Compression: invalid bit width %u in LutTensor", + index_bit_width); + return kTfLiteError; + } + ctd->data.lut_data->compressed_bit_width = index_bit_width; + const size_t value_buffer_index = lut_tensor.value_buffer(); + if (value_buffer_index >= model.buffers()->size()) { + MicroPrintf("Compression: invalid value_buffer %u in LutTensor", + value_buffer_index); + return kTfLiteError; + } + auto value_buffer = model.buffers()->Get(value_buffer_index)->data(); + if (value_buffer == nullptr || value_buffer->data() == nullptr) { + MicroPrintf("Compression: invalid value table for value_buffer %u", + value_buffer_index); + return kTfLiteError; + } + ctd->data.lut_data->value_table = value_buffer->data(); + auto tensor = + 
model.subgraphs()->Get(subgraph_index)->tensors()->Get(tensor_index); + if (tensor->shape() == nullptr) { + MicroPrintf("Compression: scalar tensors not supported"); + return kTfLiteError; + } + if (tensor->buffer() != lut_tensor.index_buffer()) { + MicroPrintf("Compression: mismatched index_buffer %u != %u in LutTensor", + lut_tensor.index_buffer(), tensor->buffer()); + return kTfLiteError; + } + TfLiteType tensor_type = kTfLiteNoType; + TfLiteStatus status = ConvertTensorType(tensor->type(), &tensor_type); + if (status != kTfLiteOk) { + MicroPrintf("Compression: failed to convert tensor type"); + return kTfLiteError; + } + size_t tensor_type_size = 0; + status = TfLiteTypeSizeOf(tensor_type, &tensor_type_size); + if (status != kTfLiteOk) { + MicroPrintf("Compression: failed to get tensor type size"); + return kTfLiteError; + } + if (tensor->quantization() != nullptr && + tensor->quantization()->scale() != nullptr && + tensor->quantization()->scale()->size() > 1) { + const size_t num_channels = tensor->quantization()->scale()->size(); + ctd->data.lut_data->is_per_channel_quantized = true; + const TfLiteIntArray* dims = + FlatBufferVectorToTfLiteTypeArray(tensor->shape()); + int32_t quantized_axis = tensor->quantization()->quantized_dimension(); + if (quantized_axis == 0) { + ctd->data.lut_data->use_alternate_axis = false; + } else if (quantized_axis == (dims->size - 1)) { + ctd->data.lut_data->use_alternate_axis = true; + } else { + MicroPrintf("Compression: unsupported quantization axis %u", + quantized_axis); + return kTfLiteError; + } + ctd->data.lut_data->value_table_channel_stride = + (value_buffer->size() / tensor_type_size) / num_channels; + } else { + ctd->data.lut_data->is_per_channel_quantized = false; + ctd->data.lut_data->use_alternate_axis = false; + ctd->data.lut_data->value_table_channel_stride = + value_buffer->size() / tensor_type_size; + } + + return kTfLiteOk; +} + +#endif // USE_TFLM_COMPRESSION + } // namespace internal size_t 
MicroAllocator::GetDefaultTailUsage(bool is_memory_planner_given) { @@ -502,7 +656,11 @@ SubgraphAllocations* MicroAllocator::StartModelAllocation(const Model* model) { return nullptr; } - if (AllocateTfLiteEvalTensors(model, output) != kTfLiteOk || + if ( +#ifdef USE_TFLM_COMPRESSION + AllocateCompressedTensorsList(model, output) != kTfLiteOk || +#endif // USE_TFLM_COMPRESSION + AllocateTfLiteEvalTensors(model, output) != kTfLiteOk || AllocateNodeAndRegistrations(model, output) != kTfLiteOk) { return nullptr; } @@ -757,6 +915,108 @@ bool MicroAllocator::IsAllTempDeallocated() { return non_persistent_buffer_allocator_->IsAllTempDeallocated(); } +#ifdef USE_TFLM_COMPRESSION + +TfLiteStatus MicroAllocator::AllocateCompressedTensorsList( + const Model* model, SubgraphAllocations* subgraph_allocations) { + TFLITE_DCHECK(subgraph_allocations != nullptr); + + for (size_t subgraph_idx = 0; subgraph_idx < model->subgraphs()->size(); + subgraph_idx++) { + subgraph_allocations[subgraph_idx].compressed.tensors = nullptr; + } + + const tflite::micro::compression::Metadata* compression_metadata = + internal::GetCompressionMetadata(*model); + if (compression_metadata == nullptr) { + // no compression metadata is available + return kTfLiteOk; + } + if (compression_metadata->lut_tensors() == nullptr) { + MicroPrintf("Compression: invalid LutTensor vector"); + return kTfLiteError; + } + if (compression_metadata->lut_tensors()->size() == 0) { + MicroPrintf("Compression: zero length LutTensor vector"); + return kTfLiteError; + } + + for (size_t lut_tensors_index = 0; + lut_tensors_index < compression_metadata->lut_tensors()->size(); + lut_tensors_index++) { + auto lut_tensor = + compression_metadata->lut_tensors()->Get(lut_tensors_index); + + CompressionTensorData* ctd = reinterpret_cast( + persistent_buffer_allocator_->AllocatePersistentBuffer( + sizeof(CompressionTensorData), alignof(CompressionTensorData))); + if (ctd == nullptr) { + MicroPrintf( + "Compressions: failed to 
allocate memory for CompressionTensorData, " + "%d bytes required", + sizeof(CompressionTensorData)); + return kTfLiteError; + } + + LookupTableData* lut_table = reinterpret_cast( + persistent_buffer_allocator_->AllocatePersistentBuffer( + sizeof(LookupTableData), alignof(LookupTableData))); + if (lut_table == nullptr) { + MicroPrintf( + "Compressions: failed to allocate memory for LookupTableData, " + "%d bytes required", + sizeof(LookupTableData)); + return kTfLiteError; + } + ctd->data.lut_data = lut_table; + + TfLiteStatus status = + internal::InitializeCompressionTensorDataFromFlatbuffer( + *model, *lut_tensor, ctd); + if (status != kTfLiteOk) { + MicroPrintf("Compression: failed to initialize data for LutTensor %u", + lut_tensors_index); + return kTfLiteError; + } + + const size_t subgraph_index = lut_tensor->subgraph(); + if (subgraph_allocations[subgraph_index].compressed.tensors == nullptr) { + size_t alloc_count = + model->subgraphs()->Get(subgraph_index)->tensors()->size(); + const CompressionTensorData** tensors = + reinterpret_cast( + persistent_buffer_allocator_->AllocatePersistentBuffer( + sizeof(CompressionTensorData*) * alloc_count, + alignof(CompressionTensorData*))); + if (tensors == nullptr) { + MicroPrintf( + "Compression: failed to allocate memory for compression tensor " + "list, %d bytes required", + sizeof(CompressionTensorData*) * alloc_count); + return kTfLiteError; + } + + subgraph_allocations[subgraph_index].compressed.tensors = tensors; + std::fill(tensors, tensors + alloc_count, nullptr); + } + + const size_t tensor_index = lut_tensor->tensor(); + if (subgraph_allocations[subgraph_index].compressed.tensors[tensor_index] != + nullptr) { + MicroPrintf("Compression: duplicate LutTensor subgraph %u tensor %u", + subgraph_index, tensor_index); + return kTfLiteError; + } else { + subgraph_allocations[subgraph_index].compressed.tensors[tensor_index] = + ctd; + } + } + + return kTfLiteOk; +} + +#endif // USE_TFLM_COMPRESSION + TfLiteStatus 
MicroAllocator::AllocateTfLiteEvalTensors( const Model* model, SubgraphAllocations* subgraph_allocations) { TFLITE_DCHECK(subgraph_allocations != nullptr); diff --git a/tensorflow/lite/micro/micro_allocator.h b/tensorflow/lite/micro/micro_allocator.h index 02317220e12..215bffc6a8c 100644 --- a/tensorflow/lite/micro/micro_allocator.h +++ b/tensorflow/lite/micro/micro_allocator.h @@ -1,4 +1,4 @@ -/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -26,6 +26,12 @@ limitations under the License. #include "tensorflow/lite/micro/tflite_bridge/flatbuffer_conversions_bridge.h" #include "tensorflow/lite/schema/schema_generated.h" +#ifdef USE_TFLM_COMPRESSION + +#include "tensorflow/lite/micro/compression.h" + +#endif // USE_TFLM_COMPRESSION + namespace tflite { // TODO(b/199402574): rename to tflite_internal or just remove internal @@ -91,6 +97,9 @@ struct ScratchBufferHandle { struct SubgraphAllocations { NodeAndRegistration* node_and_registrations; TfLiteEvalTensor* tensors; +#ifdef USE_TFLM_COMPRESSION + CompressedTensorList compressed; +#endif // USE_TFLM_COMPRESSION }; // Allocator responsible for allocating memory for all intermediate tensors @@ -258,6 +267,15 @@ class MicroAllocator { MicroMemoryPlanner* memory_planner); virtual ~MicroAllocator(); +#ifdef USE_TFLM_COMPRESSION + + // Allocates an array in the arena of pointers to the compressions data + // required to decompress tensors for each subgraph within the model. + virtual TfLiteStatus AllocateCompressedTensorsList( + const Model* model, SubgraphAllocations* subgraph_allocations); + +#endif // USE_TFLM_COMPRESSION + // Allocates an array in the arena to hold pointers to the node and // registration pointers required to represent the inference graph of the // model. 
diff --git a/tensorflow/lite/micro/micro_context.cc b/tensorflow/lite/micro/micro_context.cc index 295b3c34463..c21caac0e89 100644 --- a/tensorflow/lite/micro/micro_context.cc +++ b/tensorflow/lite/micro/micro_context.cc @@ -1,4 +1,4 @@ -/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -18,8 +18,10 @@ limitations under the License. #include #include +#include "tensorflow/lite/kernels/internal/compatibility.h" #include "tensorflow/lite/micro/micro_common.h" #include "tensorflow/lite/micro/micro_log.h" +#include "tensorflow/lite/micro/micro_utils.h" namespace tflite { namespace { @@ -34,6 +36,103 @@ int GetTensorIndex(int index, int max_size, const int* tensor_indices) { return -1; } +#ifdef USE_TFLM_COMPRESSION + +struct DecompressionState { + DecompressionState() = delete; + + DecompressionState(const uint8_t* compressed_indices, + const size_t count_indices, + const CompressionTensorData& comp_data, + const size_t num_channels) + : compressed_indices_(compressed_indices), + count_indices_(count_indices), + comp_data_(comp_data), + num_channels_(num_channels) {} + + template + T* DecompressToBuffer(void* buffer); + + size_t GetNextTableIndex(); + void UpdateBufferAndChannelIndex(); + + private: + const uint8_t* compressed_indices_; + const size_t count_indices_; + const CompressionTensorData& comp_data_; + const size_t num_channels_; + const size_t compressed_bit_width_ = + comp_data_.data.lut_data->compressed_bit_width; + size_t channel_ = 0; + size_t index_in_channel_ = 0; + const size_t elements_per_channel_ = + comp_data_.data.lut_data->use_alternate_axis + ? 
1 + : count_indices_ / num_channels_; + size_t buffer_index_ = 0; + size_t current_offset_ = 0; + size_t current_bits_remaining_ = 8; + uint8_t current_byte_ = compressed_indices_[0]; +}; + +template +T* DecompressionState::DecompressToBuffer(void* buffer) { + while (buffer_index_ < count_indices_) { + const size_t table_index = GetNextTableIndex(); + static_cast(buffer)[buffer_index_] = + static_cast(comp_data_.data.lut_data->value_table) + [table_index + + (channel_ * comp_data_.data.lut_data->value_table_channel_stride)]; + UpdateBufferAndChannelIndex(); + } + + return static_cast(buffer); +} + +size_t DecompressionState::GetNextTableIndex() { + TFLITE_DCHECK(compressed_bit_width_ <= LookupTableData::kMaxBitWidth); + TFLITE_DCHECK(compressed_bit_width_ > 0); + + size_t table_index_bits_to_fill = compressed_bit_width_; + size_t table_index = 0; + + while (table_index_bits_to_fill > 0) { + if (current_bits_remaining_ == 0) { + current_offset_++; + current_byte_ = compressed_indices_[current_offset_]; + current_bits_remaining_ = 8; + } + + const uint8_t mask_bit_count = + std::min(table_index_bits_to_fill, + std::min(compressed_bit_width_, current_bits_remaining_)); + const uint8_t current_byte_mask = (1 << mask_bit_count) - 1; + table_index <<= mask_bit_count; + table_index |= + (current_byte_ >> (current_bits_remaining_ - mask_bit_count)) & + current_byte_mask; + + table_index_bits_to_fill -= mask_bit_count; + current_bits_remaining_ -= mask_bit_count; + } + + return table_index; +} + +void DecompressionState::UpdateBufferAndChannelIndex() { + buffer_index_++; + index_in_channel_++; + if (index_in_channel_ == elements_per_channel_) { + index_in_channel_ = 0; + channel_++; + if (channel_ == num_channels_) { + channel_ = 0; + } + } +} + +#endif // USE_TFLM_COMPRESSION + } // namespace TfLiteTensor* MicroContext::AllocateTempInputTensor(const TfLiteNode* node, @@ -74,4 +173,57 @@ void MicroContextReportOpError(struct TfLiteContext* context, va_end(args); } +#ifdef 
USE_TFLM_COMPRESSION + +void* MicroContext::DecompressTensorToScratchBuffer( + const TfLiteEvalTensor& tensor, + const CompressionTensorData& compression_data, int scratch_buffer_handle) { + TFLITE_DCHECK(compression_data.scheme == CompressionScheme::kBinQuant); + TFLITE_DCHECK(scratch_buffer_handle != -1); + uint8_t* scratch_buffer = + static_cast(GetScratchBuffer(scratch_buffer_handle)); + TFLITE_DCHECK(scratch_buffer != nullptr); + size_t count = ElementCount(*tensor.dims); + size_t num_channels = 1; + + if (compression_data.data.lut_data->is_per_channel_quantized) { + const size_t channel_axis = + compression_data.data.lut_data->use_alternate_axis + ? tensor.dims->size - 1 + : 0; + num_channels = tensor.dims->data[channel_axis]; + } + + DecompressionState ds(static_cast(tensor.data.data), count, + compression_data, num_channels); + + switch (tensor.type) { + case kTfLiteBool: { + return ds.DecompressToBuffer(scratch_buffer); + } break; + case kTfLiteInt8: { + return ds.DecompressToBuffer(scratch_buffer); + } break; + case kTfLiteInt16: { + return ds.DecompressToBuffer(scratch_buffer); + } break; + case kTfLiteInt32: { + return ds.DecompressToBuffer(scratch_buffer); + } break; + case kTfLiteInt64: { + return ds.DecompressToBuffer(scratch_buffer); + } break; + case kTfLiteFloat32: { + return ds.DecompressToBuffer(scratch_buffer); + } break; + default: { + MicroPrintf("Unsupported decompression tensor type %d", tensor.type); + } break; + } + + return nullptr; +} + +#endif // USE_TFLM_COMPRESSION + } // namespace tflite diff --git a/tensorflow/lite/micro/micro_interpreter_context.cc b/tensorflow/lite/micro/micro_interpreter_context.cc index 098df15d522..0ba461fd7b6 100644 --- a/tensorflow/lite/micro/micro_interpreter_context.cc +++ b/tensorflow/lite/micro/micro_interpreter_context.cc @@ -1,4 +1,4 @@ -/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. 
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -18,8 +18,28 @@ limitations under the License. #include #include "tensorflow/lite/kernels/internal/compatibility.h" +#include "tensorflow/lite/micro/micro_utils.h" namespace tflite { + +namespace { + +#ifdef USE_TFLM_COMPRESSION + +int GetInputTensorIndex(const TfLiteNode* node, const int index) { + if (index >= 0 && index < node->inputs->size) { + const int tensor_index = node->inputs->data[index]; + if (tensor_index != kTfLiteOptionalTensor) { + return tensor_index; + } + } + return -1; +} + +#endif // USE_TFLM_COMPRESSION + +} // namespace + MicroInterpreterContext::MicroInterpreterContext(MicroAllocator* allocator, const Model* model, MicroInterpreterGraph* graph) @@ -106,4 +126,83 @@ MicroInterpreterContext::GetInterpreterState() const { return state_; } +#ifdef USE_TFLM_COMPRESSION + +// Available during Prepare & Eval. Returns false if tensor is not +// compressed. +bool MicroInterpreterContext::IsTensorCompressed(const TfLiteNode* node, + int tensor_idx) { + TFLITE_DCHECK(state_ == InterpreterState::kPrepare || + state_ == InterpreterState::kInvoke); + + const SubgraphAllocations* allocations = + &graph_.GetAllocations()[graph_.GetCurrentSubgraphIndex()]; + if (allocations->compressed.tensors == nullptr) { + return false; + } + int index = GetInputTensorIndex(node, tensor_idx); + if (index == -1) { + return false; + } + return allocations->compressed.tensors[index] != nullptr; +} + +// Only available during Prepare. The kernel is responsible for storing the +// scratch buffer handle. 
+int MicroInterpreterContext::AllocateDecompressionScratchBuffer( + const TfLiteNode* node, int tensor_idx) { + TFLITE_DCHECK(state_ == InterpreterState::kPrepare); + + const SubgraphAllocations* allocations = + &graph_.GetAllocations()[graph_.GetCurrentSubgraphIndex()]; + if (allocations->compressed.tensors == nullptr) { + return -1; + } + int index = GetInputTensorIndex(node, tensor_idx); + if (index == -1 || allocations->compressed.tensors[index] == nullptr) { + return -1; + } + const TfLiteEvalTensor* tensor = &allocations->tensors[index]; + const size_t byte_count = EvalTensorBytes(tensor); + int scratch_index = -1; + TfLiteStatus result = RequestScratchBufferInArena(byte_count, &scratch_index); + if (result != kTfLiteOk) { + return -1; + } + + return scratch_index; +} + +// Available during Prepare & Eval. Returns nullptr if tensor is not +// compressed. +const CompressionTensorData* MicroInterpreterContext::GetTensorCompressionData( + const TfLiteNode* node, int tensor_idx) { + TFLITE_DCHECK(state_ == InterpreterState::kPrepare || + state_ == InterpreterState::kInvoke); + + const SubgraphAllocations* allocations = + &graph_.GetAllocations()[graph_.GetCurrentSubgraphIndex()]; + if (allocations->compressed.tensors == nullptr) { + return nullptr; + } + int index = GetInputTensorIndex(node, tensor_idx); + if (index == -1) { + return nullptr; + } + return allocations->compressed.tensors[index]; +} + +// Only available during Eval. Returns nullptr on failure, otherwise returns a +// pointer to the scratch buffer. 
+void* MicroInterpreterContext::DecompressTensorToScratchBuffer( + const TfLiteEvalTensor& tensor, + const CompressionTensorData& compression_data, int scratch_buffer_handle) { + TFLITE_DCHECK(state_ == InterpreterState::kInvoke); + + return MicroContext::DecompressTensorToScratchBuffer(tensor, compression_data, + scratch_buffer_handle); +} + +#endif // USE_TFLM_COMPRESSION + } // namespace tflite diff --git a/tensorflow/lite/micro/micro_interpreter_test.cc b/tensorflow/lite/micro/micro_interpreter_test.cc index e44de6b09aa..873ea96ac1e 100644 --- a/tensorflow/lite/micro/micro_interpreter_test.cc +++ b/tensorflow/lite/micro/micro_interpreter_test.cc @@ -1,4 +1,4 @@ -/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -16,6 +16,7 @@ limitations under the License. #include "tensorflow/lite/micro/micro_interpreter.h" #include +#include #include "tensorflow/lite/micro/arena_allocator/recording_single_arena_buffer_allocator.h" #include "tensorflow/lite/micro/compatibility.h" @@ -108,6 +109,58 @@ TF_LITE_MICRO_TEST(TestInterpreter) { TF_LITE_MICRO_EXPECT_EQ(tflite::testing::MockCustom::freed_, true); } +#ifdef USE_TFLM_COMPRESSION + +TF_LITE_MICRO_TEST(TestInterpreterCompression) { + const tflite::Model* model = tflite::testing::GetSimpleMockModelCompressed(); + TF_LITE_MICRO_EXPECT(nullptr != model); + tflite::testing::TestingOpResolver op_resolver; + TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, + tflite::testing::GetTestingOpResolver(op_resolver)); + + constexpr size_t allocator_buffer_size = 2000; + uint8_t allocator_buffer[allocator_buffer_size]; + + // Create a new scope so that we can test the destructor. 
+ { + tflite::MicroInterpreter interpreter(model, op_resolver, allocator_buffer, + allocator_buffer_size); + TF_LITE_MICRO_EXPECT_EQ(interpreter.AllocateTensors(), kTfLiteOk); + TF_LITE_MICRO_EXPECT_EQ(static_cast(1), interpreter.inputs_size()); + TF_LITE_MICRO_EXPECT_EQ(static_cast(1), interpreter.outputs_size()); + + TfLiteTensor* input = interpreter.input(0); + TF_LITE_MICRO_EXPECT(nullptr != input); + TF_LITE_MICRO_EXPECT_EQ(kTfLiteInt16, input->type); + TF_LITE_MICRO_EXPECT_EQ(1, input->dims->size); + TF_LITE_MICRO_EXPECT_EQ(1, input->dims->data[0]); + TF_LITE_MICRO_EXPECT_EQ(static_cast(2), input->bytes); + TF_LITE_MICRO_EXPECT(nullptr != input->data.data); + static_cast(input->data.data)[0] = 42; + + TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, interpreter.Invoke()); + + const std::initializer_list kGolden = { + 43, 44, 45, 46, 47, 41, 40, 39, 38, 37, 43, 44, 45, 46, 47}; + const int kGoldenCount = kGolden.size(); + TfLiteTensor* output = interpreter.output(0); + TF_LITE_MICRO_EXPECT(nullptr != output); + TF_LITE_MICRO_EXPECT_EQ(kTfLiteInt16, output->type); + TF_LITE_MICRO_EXPECT_EQ(1, output->dims->size); + TF_LITE_MICRO_EXPECT_EQ(kGoldenCount, output->dims->data[0]); + TF_LITE_MICRO_EXPECT_EQ( + static_cast(kGoldenCount * sizeof(*kGolden.begin())), + output->bytes); + TF_LITE_MICRO_EXPECT(nullptr != output->data.data); + for (int i = 0; i < kGoldenCount; i++) { + TF_LITE_MICRO_EXPECT_EQ(static_cast(output->data.data)[i], + kGolden.begin()[i]); + } + } +} + +#endif // USE_TFLM_COMPRESSION + TF_LITE_MICRO_TEST(TestMultiTenantInterpreter) { tflite::testing::TestingOpResolver op_resolver; TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, diff --git a/tensorflow/lite/micro/recording_micro_allocator.cc b/tensorflow/lite/micro/recording_micro_allocator.cc index ee76196d255..18addaee5f7 100644 --- a/tensorflow/lite/micro/recording_micro_allocator.cc +++ b/tensorflow/lite/micro/recording_micro_allocator.cc @@ -1,4 +1,4 @@ -/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. 
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -78,14 +78,15 @@ RecordedAllocation RecordingMicroAllocator::GetRecordedAllocation( return recorded_node_and_registration_array_data_; case RecordedAllocationType::kOpData: return recorded_op_data_; - // the function MicroPrintf was never reached outside the switch, because - // each case has a return. As the intention of the MicroPrintf is to be - // called when no matching case is found, a default case was added to - // contemplate an invalid allocation type +#ifdef USE_TFLM_COMPRESSION + case RecordedAllocationType::kCompressionData: + return recorded_compression_data_; +#endif // USE_TFLM_COMPRESSION default: - MicroPrintf("Invalid allocation type supplied: %d", allocation_type); - return RecordedAllocation(); + break; } + MicroPrintf("Invalid allocation type supplied: %d", allocation_type); + return RecordedAllocation(); } const RecordingSingleArenaBufferAllocator* @@ -117,6 +118,13 @@ void RecordingMicroAllocator::PrintAllocations() const { "NodeAndRegistration structs"); PrintRecordedAllocation(RecordedAllocationType::kOpData, "Operator runtime data", "OpData structs"); + +#ifdef USE_TFLM_COMPRESSION + + PrintRecordedAllocation(RecordedAllocationType::kCompressionData, + "Persistent compression data", "allocations"); + +#endif // USE_TFLM_COMPRESSION } void* RecordingMicroAllocator::AllocatePersistentBuffer(size_t bytes) { @@ -233,6 +241,21 @@ TfLiteStatus RecordingMicroAllocator::PopulateTfLiteTensorFromFlatbuffer( return status; } +#ifdef USE_TFLM_COMPRESSION + +TfLiteStatus RecordingMicroAllocator::AllocateCompressedTensorsList( + const Model* model, SubgraphAllocations* subgraph_allocations) { + RecordedAllocation allocations = SnapshotAllocationUsage(); + + TfLiteStatus status = MicroAllocator::AllocateCompressedTensorsList( + model, subgraph_allocations); + + 
RecordAllocationUsage(allocations, recorded_compression_data_); + return status; +} + +#endif // USE_TFLM_COMPRESSION + RecordedAllocation RecordingMicroAllocator::SnapshotAllocationUsage() const { return {/*requested_bytes=*/recording_memory_allocator_->GetRequestedBytes(), /*used_bytes=*/recording_memory_allocator_->GetUsedBytes(), diff --git a/tensorflow/lite/micro/recording_micro_allocator.h b/tensorflow/lite/micro/recording_micro_allocator.h index b6f69264dc0..80f163240d3 100644 --- a/tensorflow/lite/micro/recording_micro_allocator.h +++ b/tensorflow/lite/micro/recording_micro_allocator.h @@ -1,4 +1,4 @@ -/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -33,6 +33,11 @@ enum class RecordedAllocationType { kTfLiteTensorVariableBufferData, kNodeAndRegistrationArray, kOpData, +#ifdef USE_TFLM_COMPRESSION + kCompressionData, +#endif // USE_TFLM_COMPRESSION + + kNumAllocationTypes, // must be last }; // Container for holding information about allocation recordings by a given @@ -93,6 +98,13 @@ class RecordingMicroAllocator : public MicroAllocator { int subgraph_index, bool allocate_temp) override; +#ifdef USE_TFLM_COMPRESSION + + TfLiteStatus AllocateCompressedTensorsList( + const Model* model, SubgraphAllocations* subgraph_allocations) override; + +#endif // USE_TFLM_COMPRESSION + private: RecordingMicroAllocator(RecordingSingleArenaBufferAllocator* memory_allocator, MicroMemoryPlanner* memory_planner); @@ -113,6 +125,9 @@ class RecordingMicroAllocator : public MicroAllocator { RecordedAllocation recorded_persistent_buffer_data_ = {}; RecordedAllocation recorded_tflite_tensor_variable_buffer_data_ = {}; RecordedAllocation recorded_node_and_registration_array_data_ = {}; +#ifdef USE_TFLM_COMPRESSION + RecordedAllocation recorded_compression_data_ = {}; +#endif 
// USE_TFLM_COMPRESSION // TODO(b/187993291): Re-enable OpData allocating tracking. RecordedAllocation recorded_op_data_ = {}; diff --git a/tensorflow/lite/micro/recording_micro_allocator_test.cc b/tensorflow/lite/micro/recording_micro_allocator_test.cc index 9d3a5965de4..121a74c3324 100644 --- a/tensorflow/lite/micro/recording_micro_allocator_test.cc +++ b/tensorflow/lite/micro/recording_micro_allocator_test.cc @@ -1,4 +1,4 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -317,6 +317,72 @@ TF_LITE_MICRO_TEST(TestMultiSubgraphModel) { num_tensors * TF_LITE_EVAL_TENSOR_STRUCT_SIZE); } +#ifdef USE_TFLM_COMPRESSION + +TF_LITE_MICRO_TEST(TestCompressedModel) { + tflite::ScratchBufferHandle* scratch_buffer_handles = nullptr; + tflite::testing::TestingOpResolver ops_resolver; + const tflite::Model* model = tflite::testing::GetSimpleMockModelCompressed(); + const int arena_size = 2048; + + uint8_t arena[arena_size]; + + tflite::RecordingMicroAllocator* micro_allocator = + tflite::RecordingMicroAllocator::Create(arena, arena_size); + TF_LITE_MICRO_EXPECT(micro_allocator != nullptr); + TF_LITE_MICRO_CHECK_FAIL(); + + tflite::SubgraphAllocations* subgraph_allocations = + micro_allocator->StartModelAllocation(model); + TF_LITE_MICRO_EXPECT(nullptr != subgraph_allocations); + TF_LITE_MICRO_CHECK_FAIL(); + + TfLiteStatus status = micro_allocator->FinishModelAllocation( + model, subgraph_allocations, &scratch_buffer_handles); + TF_LITE_MICRO_EXPECT_EQ(status, kTfLiteOk); + TF_LITE_MICRO_CHECK_FAIL(); + + micro_allocator->PrintAllocations(); + + size_t count_compression_allocations = 0; + size_t size_compression_allocations = 0; + for (size_t subgraph_idx = 0; subgraph_idx < model->subgraphs()->size(); + subgraph_idx++) { + const tflite::CompressionTensorData** ctl = + 
subgraph_allocations[subgraph_idx].compressed.tensors; + if (ctl == nullptr) { + continue; + } + const tflite::SubGraph* subgraph = model->subgraphs()->Get(subgraph_idx); + const size_t num_tensors = subgraph->tensors()->size(); + for (size_t i = 0; i < num_tensors; i++) { + if (ctl[i] != nullptr) { + count_compression_allocations++; + size_compression_allocations += sizeof(tflite::CompressionTensorData); + count_compression_allocations++; + size_compression_allocations += sizeof(tflite::LookupTableData); + } + } + // Add the CompressionTensorData array + count_compression_allocations++; + size_compression_allocations += + num_tensors * sizeof(tflite::CompressionTensorData*); + } + + tflite::RecordedAllocation recorded_allocation = + micro_allocator->GetRecordedAllocation( + tflite::RecordedAllocationType::kCompressionData); + + TF_LITE_MICRO_EXPECT_EQ(recorded_allocation.count, + count_compression_allocations); + TF_LITE_MICRO_EXPECT_EQ(recorded_allocation.requested_bytes, + size_compression_allocations); + TF_LITE_MICRO_EXPECT_GE(recorded_allocation.used_bytes, + size_compression_allocations); +} + +#endif // USE_TFLM_COMPRESSION + // TODO(b/158124094): Find a way to audit OpData allocations on // cross-architectures. diff --git a/tensorflow/lite/micro/test_helper_custom_ops.cc b/tensorflow/lite/micro/test_helper_custom_ops.cc index 374aabcc9df..97577699961 100644 --- a/tensorflow/lite/micro/test_helper_custom_ops.cc +++ b/tensorflow/lite/micro/test_helper_custom_ops.cc @@ -1,4 +1,4 @@ -/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -35,6 +35,18 @@ limitations under the License. 
namespace tflite { namespace testing { +namespace { + +template +void BroadcastAdd(const T input_scalar, const T* weights, T* output, + const size_t count) { + for (size_t i = 0; i < count; i++) { + output[i] = input_scalar + weights[i]; + } +} + +} // namespace + const TFLMRegistration* PackerOp::getRegistration() { return GetMutableRegistration(); } @@ -107,5 +119,180 @@ TfLiteStatus PackerOp::Invoke(TfLiteContext* context, TfLiteNode* node) { bool PackerOp::freed_ = false; +const TFLMRegistration* BroadcastAddOp::getRegistration() { + return GetMutableRegistration(); +} + +TFLMRegistration* BroadcastAddOp::GetMutableRegistration() { + static TFLMRegistration r; + r.init = Init; + r.prepare = Prepare; + r.invoke = Invoke; + return &r; +} + +void* BroadcastAddOp::Init(TfLiteContext* context, const char* buffer, + size_t length) { +#ifdef USE_TFLM_COMPRESSION + + weight_scratch_index_ = -1; + +#endif // USE_TFLM_COMPRESSION + + // Do nothing. + return nullptr; +} + +TfLiteStatus BroadcastAddOp::Prepare(TfLiteContext* context, TfLiteNode* node) { + MicroContext* micro_context = GetMicroContext(context); + + TfLiteTensor* input = micro_context->AllocateTempInputTensor(node, 0); + TF_LITE_ENSURE(context, input != nullptr); + TfLiteTensor* weights = micro_context->AllocateTempInputTensor(node, 1); + TF_LITE_ENSURE(context, weights != nullptr); + TfLiteTensor* output = micro_context->AllocateTempOutputTensor(node, 0); + TF_LITE_ENSURE(context, output != nullptr); + + TF_LITE_ENSURE_TYPES_EQ(context, input->type, output->type); + TF_LITE_ENSURE_TYPES_EQ(context, input->type, weights->type); + TF_LITE_ENSURE( + context, input->type == kTfLiteFloat32 || input->type == kTfLiteInt8 || + input->type == kTfLiteInt16 || input->type == kTfLiteInt32 || + input->type == kTfLiteInt64); + TF_LITE_ENSURE(context, input->quantization.type == kTfLiteNoQuantization); + TF_LITE_ENSURE(context, weights->quantization.type == kTfLiteNoQuantization); + TF_LITE_ENSURE(context, 
output->quantization.type == kTfLiteNoQuantization); + TF_LITE_ENSURE(context, + ElementCount(*weights->dims) == ElementCount(*output->dims)); + TF_LITE_ENSURE(context, ElementCount(*input->dims) == 1); + TF_LITE_ENSURE(context, input->dims->size == 1); + TF_LITE_ENSURE(context, weights->dims->size == 1); + +#ifdef USE_TFLM_COMPRESSION + + // Compression scratch buffers. + // These will only be allocated if the tensor is compressed. + weight_scratch_index_ = + micro_context->AllocateDecompressionScratchBuffer(node, 1); + if (micro_context->IsTensorCompressed(node, 1)) { + TF_LITE_ENSURE(context, weight_scratch_index_ != -1); + } else { + TF_LITE_ENSURE(context, weight_scratch_index_ == -1); + } + +#endif // USE_TFLM_COMPRESSION + + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(weights); + micro_context->DeallocateTempTfLiteTensor(output); + + return kTfLiteOk; +} + +TfLiteStatus BroadcastAddOp::Invoke(TfLiteContext* context, TfLiteNode* node) { + const TfLiteEvalTensor* input = tflite::micro::GetEvalInput(context, node, 0); + TF_LITE_ENSURE(context, input != nullptr); + const TfLiteEvalTensor* weights = + tflite::micro::GetEvalInput(context, node, 1); + TF_LITE_ENSURE(context, weights != nullptr); + TfLiteEvalTensor* output = tflite::micro::GetEvalOutput(context, node, 0); + TF_LITE_ENSURE(context, output != nullptr); + +#ifdef USE_TFLM_COMPRESSION + + MicroContext* micro_context = GetMicroContext(context); + + const CompressionTensorData* weights_comp_td = + micro_context->GetTensorCompressionData(node, 1); + if (micro_context->IsTensorCompressed(node, 1)) { + TF_LITE_ENSURE(context, weights_comp_td != nullptr); + } else { + TF_LITE_ENSURE(context, weights_comp_td == nullptr); + } + +#endif // USE_TFLM_COMPRESSION + + switch (input->type) { + case kTfLiteFloat32: { + BroadcastAdd( + tflite::micro::GetTensorData(input)[0], +#ifdef USE_TFLM_COMPRESSION + tflite::micro::GetTensorData( + micro_context, weights, 
weights_comp_td, weight_scratch_index_), +#else // USE_TFLM_COMPRESSION + tflite::micro::GetTensorData(weights), +#endif // USE_TFLM_COMPRESSION + tflite::micro::GetTensorData(output), + ElementCount(*output->dims)); + } break; + + case kTfLiteInt8: { + BroadcastAdd( + tflite::micro::GetTensorData(input)[0], +#ifdef USE_TFLM_COMPRESSION + tflite::micro::GetTensorData( + micro_context, weights, weights_comp_td, weight_scratch_index_), +#else // USE_TFLM_COMPRESSION + tflite::micro::GetTensorData(weights), +#endif // USE_TFLM_COMPRESSION + tflite::micro::GetTensorData(output), + ElementCount(*output->dims)); + } break; + + case kTfLiteInt16: { + BroadcastAdd( + tflite::micro::GetTensorData(input)[0], +#ifdef USE_TFLM_COMPRESSION + tflite::micro::GetTensorData( + micro_context, weights, weights_comp_td, weight_scratch_index_), +#else // USE_TFLM_COMPRESSION + tflite::micro::GetTensorData(weights), +#endif // USE_TFLM_COMPRESSION + tflite::micro::GetTensorData(output), + ElementCount(*output->dims)); + } break; + + case kTfLiteInt32: { + BroadcastAdd( + tflite::micro::GetTensorData(input)[0], +#ifdef USE_TFLM_COMPRESSION + tflite::micro::GetTensorData( + micro_context, weights, weights_comp_td, weight_scratch_index_), +#else // USE_TFLM_COMPRESSION + tflite::micro::GetTensorData(weights), +#endif // USE_TFLM_COMPRESSION + tflite::micro::GetTensorData(output), + ElementCount(*output->dims)); + } break; + + case kTfLiteInt64: { + BroadcastAdd( + tflite::micro::GetTensorData(input)[0], +#ifdef USE_TFLM_COMPRESSION + tflite::micro::GetTensorData( + micro_context, weights, weights_comp_td, weight_scratch_index_), +#else // USE_TFLM_COMPRESSION + tflite::micro::GetTensorData(weights), +#endif // USE_TFLM_COMPRESSION + tflite::micro::GetTensorData(output), + ElementCount(*output->dims)); + } break; + + default: { + MicroPrintf("Input type %s (%d) not supported.", + TfLiteTypeGetName(input->type), input->type); + return kTfLiteError; + } + } + + return kTfLiteOk; +} + +#ifdef 
USE_TFLM_COMPRESSION + +int BroadcastAddOp::weight_scratch_index_ = -1; + +#endif // USE_TFLM_COMPRESSION + } // namespace testing } // namespace tflite diff --git a/tensorflow/lite/micro/test_helper_custom_ops.h b/tensorflow/lite/micro/test_helper_custom_ops.h index d28bb4038f1..53a8cc3bdd4 100644 --- a/tensorflow/lite/micro/test_helper_custom_ops.h +++ b/tensorflow/lite/micro/test_helper_custom_ops.h @@ -1,4 +1,4 @@ -/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -43,6 +43,23 @@ class PackerOp { static bool freed_; }; +// This op optionally supports compressed weights +class BroadcastAddOp { + public: + static const TFLMRegistration* getRegistration(); + static TFLMRegistration* GetMutableRegistration(); + static void* Init(TfLiteContext* context, const char* buffer, size_t length); + static TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node); + static TfLiteStatus Invoke(TfLiteContext* context, TfLiteNode* node); + + private: +#ifdef USE_TFLM_COMPRESSION + + static int weight_scratch_index_; // decompression scratch buffer index + +#endif // USE_TFLM_COMPRESSION +}; + } // namespace testing } // namespace tflite diff --git a/tensorflow/lite/micro/test_helpers.cc b/tensorflow/lite/micro/test_helpers.cc index 3f0f5ec0826..aeb9a439ecf 100644 --- a/tensorflow/lite/micro/test_helpers.cc +++ b/tensorflow/lite/micro/test_helpers.cc @@ -1,4 +1,4 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -15,6 +15,7 @@ limitations under the License. 
#include "tensorflow/lite/micro/test_helpers.h" +#include #include #include #include @@ -33,6 +34,12 @@ limitations under the License. #include "tensorflow/lite/micro/test_helper_custom_ops.h" #include "tensorflow/lite/schema/schema_generated.h" +#ifdef USE_TFLM_COMPRESSION + +#include "tensorflow/lite/micro/compression/metadata_generated.h" + +#endif // USE_TFLM_COMPRESSION + // TODO(b/170464050): Use TFLM test only version of schema_utils. namespace tflite { @@ -236,7 +243,7 @@ const Model* ModelBuilder::BuildModel( *builder_, 0, builder_->CreateVector(operator_codes_, next_operator_code_id_), builder_->CreateVector(subgraphs, subgraphs_size), - builder_->CreateString("teset_model"), + builder_->CreateString("test_model"), builder_->CreateVector(buffers, buffer_size), 0, builder_->CreateVector(metadata_, ModelBuilder::nbr_of_metadata_buffers_)); @@ -245,7 +252,7 @@ const Model* ModelBuilder::BuildModel( *builder_, 0, builder_->CreateVector(operator_codes_, next_operator_code_id_), builder_->CreateVector(subgraphs, subgraphs_size), - builder_->CreateString("teset_model"), + builder_->CreateString("test_model"), builder_->CreateVector(buffers, buffer_size)); } @@ -578,6 +585,116 @@ const Model* BuildSimpleMockModel() { return model; } +#ifdef USE_TFLM_COMPRESSION + +const flatbuffers::span BuildLutMetadata( + const std::initializer_list& + lut_tensor_structs) { + using flatbuffers::Offset; + namespace compression = tflite::micro::compression; + + flatbuffers::FlatBufferBuilder* builder = BuilderInstance(); + auto lut_tensors = builder->CreateVectorOfStructs(lut_tensor_structs.begin(), + lut_tensor_structs.size()); + auto metadata = compression::CreateMetadata(*builder, lut_tensors); + compression::FinishMetadataBuffer(*builder, metadata); + return builder->GetBufferSpan(); +} + +const Model* BuildSimpleMockModelCompressed() { + using flatbuffers::Offset; + using flatbuffers::Vector; + using tflite::micro::compression::LutTensor; + constexpr uint kEmptyBuffer = 0; + 
constexpr uint kMetadataBuffer = 1; + constexpr uint kWeightsBuffer = 2; + constexpr uint kValueTableBuffer = 3; + // constexpr uint kInputTensor = 0; + constexpr uint kWeightsTensor = 1; + // constexpr uint kOutputTensor = 2; + constexpr uint kSubgraphIndex = 0; + constexpr uint kCompressedBitWidth = 4; + + const std::initializer_list lut_tensors = { + LutTensor(kSubgraphIndex, kWeightsTensor, kCompressedBitWidth, + kWeightsBuffer, kValueTableBuffer), + }; + auto lut_tensors_span = BuildLutMetadata(lut_tensors); + + flatbuffers::FlatBufferBuilder* builder = BuilderInstance(); + + // [1, 2, 3, 4, 5, -1, -2, -3, -4, -5, 1, 2, 3, 4, 5] + const std::initializer_list weights_data = {0x01, 0x23, 0x45, 0x98, + 0x76, 0x01, 0x23, 0x40}; + const std::initializer_list value_table_data = {1, 2, 3, 4, 5, + -1, -5, -4, -3, -2}; + auto value_table_offset = builder->CreateVector(value_table_data).o; + const std::initializer_list> buffers = { + CreateBuffer(*builder), + CreateBuffer(*builder, builder->CreateVector(lut_tensors_span)), + CreateBuffer(*builder, builder->CreateVector(weights_data)), + CreateBuffer(*builder, Offset>(value_table_offset)), + }; + + const std::initializer_list input_shape = {1}; + const std::initializer_list weights_shape = {15}; + const std::initializer_list output_shape = weights_shape; + const std::initializer_list> tensors = { + CreateTensor(*builder, builder->CreateVector(input_shape), + TensorType_INT16, kEmptyBuffer, + builder->CreateString("test_input_tensor"), 0, false), + CreateTensor(*builder, builder->CreateVector(weights_shape), + TensorType_INT16, kWeightsBuffer, + builder->CreateString("test_weight_tensor"), 0, false), + CreateTensor(*builder, builder->CreateVector(output_shape), + TensorType_INT16, kEmptyBuffer, + builder->CreateString("test_output_tensor"), 0, false), + }; + + const std::initializer_list subgraph_inputs = {0}; + const std::initializer_list subgraph_outputs = {2}; + const std::initializer_list operator_inputs = {0, 1}; + 
const std::initializer_list operator_outputs = {2}; + const std::initializer_list> operators = { + CreateOperator(*builder, 0, builder->CreateVector(operator_inputs), + builder->CreateVector(operator_outputs), + BuiltinOptions_NONE), + }; + + const std::initializer_list> subgraphs = { + CreateSubGraph(*builder, builder->CreateVector(tensors), + builder->CreateVector(subgraph_inputs), + builder->CreateVector(subgraph_outputs), + builder->CreateVector(operators), + builder->CreateString("test_subgraph")), + }; + + const std::initializer_list> operator_codes = { + CreateOperatorCodeDirect(*builder, /*deprecated_builtin_code=*/0, + "broadcast_add_op", + /*version=*/0, BuiltinOperator_CUSTOM), + }; + + const std::initializer_list> metadata = { + CreateMetadata(*builder, + builder->CreateString(kCompressionMetadataString), + kMetadataBuffer), + }; + + const Offset model_offset = CreateModel( + *builder, 0, builder->CreateVector(operator_codes), + builder->CreateVector(subgraphs), builder->CreateString("test_model"), + builder->CreateVector(buffers), 0, builder->CreateVector(metadata)); + + FinishModelBuffer(*builder, model_offset); + void* model_pointer = builder->GetBufferPointer(); + const Model* model = flatbuffers::GetRoot(model_pointer); + + return model; +} + +#endif // USE_TFLM_COMPRESSION + const Model* BuildComplexMockModel() { using flatbuffers::Offset; flatbuffers::FlatBufferBuilder* builder = BuilderInstance(); @@ -1665,6 +1782,8 @@ TfLiteStatus GetTestingOpResolver( op_resolver.AddCustom("no_op", NoOp::GetMutableRegistration())); TF_LITE_ENSURE_STATUS(op_resolver.AddCustom( "custom_packer_op", PackerOp::GetMutableRegistration())); + TF_LITE_ENSURE_STATUS(op_resolver.AddCustom( + "broadcast_add_op", BroadcastAddOp::GetMutableRegistration())); TF_LITE_ENSURE_STATUS(op_resolver.AddIf()); return kTfLiteOk; } @@ -1698,6 +1817,18 @@ const Model* GetSimpleMockModel() { return model; } +#ifdef USE_TFLM_COMPRESSION + +const Model* GetSimpleMockModelCompressed() { + 
static Model* model = nullptr; + if (!model) { + model = const_cast(BuildSimpleMockModelCompressed()); + } + return model; +} + +#endif // USE_TFLM_COMPRESSION + const Model* GetSimpleMultipleInputsModel() { static Model* model = nullptr; if (!model) { @@ -1890,100 +2021,6 @@ TfLiteFloatArray* FloatArrayFromFloats(const float* floats) { return reinterpret_cast(const_cast(floats)); } -TfLiteTensor CreateQuantizedBiasTensor(const float* data, int16_t* quantized, - TfLiteIntArray* dims, float input_scale, - float weights_scale, bool is_variable) { - float bias_scale = input_scale * weights_scale; - tflite::SymmetricQuantize(data, quantized, ElementCount(*dims), bias_scale); - - // Quantized int16_t tensors always have a zero point of 0, since the range of - // int16_t values is large, and because zero point costs extra cycles during - // processing. - TfLiteTensor result = - CreateQuantizedTensor(quantized, dims, bias_scale, 0, is_variable); - return result; -} - -TfLiteTensor CreateQuantizedBiasTensor(const float* data, int32_t* quantized, - TfLiteIntArray* dims, float input_scale, - float weights_scale, bool is_variable) { - float bias_scale = input_scale * weights_scale; - tflite::SymmetricQuantize(data, quantized, ElementCount(*dims), bias_scale); - - // Quantized int32_t tensors always have a zero point of 0, since the range of - // int32_t values is large, and because zero point costs extra cycles during - // processing. 
- TfLiteTensor result = - CreateQuantizedTensor(quantized, dims, bias_scale, 0, is_variable); - return result; -} - -TfLiteTensor CreateQuantizedBiasTensor(const float* data, - std::int64_t* quantized, - TfLiteIntArray* dims, float input_scale, - float weights_scale, bool is_variable) { - float bias_scale = input_scale * weights_scale; - tflite::SymmetricQuantize(data, quantized, ElementCount(*dims), bias_scale); - - // Quantized int32_t tensors always have a zero point of 0, since the range of - // int32_t values is large, and because zero point costs extra cycles during - // processing. - TfLiteTensor result = - CreateQuantizedTensor(quantized, dims, bias_scale, 0, is_variable); - return result; -} - -// Quantizes int32_t bias tensor with per-channel weights determined by input -// scale multiplied by weight scale for each channel. -template -TfLiteTensor CreatePerChannelQuantizedBiasTensor( - const float* input, T* quantized, TfLiteIntArray* dims, float input_scale, - float* weight_scales, float* scales, int* zero_points, - TfLiteAffineQuantization* affine_quant, int quantized_dimension, - bool is_variable) { - int input_size = ElementCount(*dims); - int num_channels = dims->data[quantized_dimension]; - // First element is reserved for array length - zero_points[0] = num_channels; - scales[0] = static_cast(num_channels); - float* scales_array = &scales[1]; - for (int i = 0; i < num_channels; i++) { - scales_array[i] = input_scale * weight_scales[i]; - zero_points[i + 1] = 0; - } - - SymmetricPerChannelQuantize(input, quantized, input_size, num_channels, - scales_array); - - affine_quant->scale = FloatArrayFromFloats(scales); - affine_quant->zero_point = IntArrayFromInts(zero_points); - affine_quant->quantized_dimension = quantized_dimension; - - TfLiteTensor result = CreateTensor(quantized, dims, is_variable); - result.quantization = {kTfLiteAffineQuantization, affine_quant}; - return result; -} - -TfLiteTensor CreatePerChannelQuantizedBiasTensor( - const float* 
input, int32_t* quantized, TfLiteIntArray* dims, - float input_scale, float* weight_scales, float* scales, int* zero_points, - TfLiteAffineQuantization* affine_quant, int quantized_dimension, - bool is_variable) { - return CreatePerChannelQuantizedBiasTensor( - input, quantized, dims, input_scale, weight_scales, scales, zero_points, - affine_quant, quantized_dimension, is_variable); -} - -TfLiteTensor CreatePerChannelQuantizedBiasTensor( - const float* input, std::int64_t* quantized, TfLiteIntArray* dims, - float input_scale, float* weight_scales, float* scales, int* zero_points, - TfLiteAffineQuantization* affine_quant, int quantized_dimension, - bool is_variable) { - return CreatePerChannelQuantizedBiasTensor( - input, quantized, dims, input_scale, weight_scales, scales, zero_points, - affine_quant, quantized_dimension, is_variable); -} - TfLiteTensor CreateSymmetricPerChannelQuantizedTensor( const float* input, int8_t* quantized, TfLiteIntArray* dims, float* scales, int* zero_points, TfLiteAffineQuantization* affine_quant, diff --git a/tensorflow/lite/micro/test_helpers.h b/tensorflow/lite/micro/test_helpers.h index 6315b9fecdc..6831b467bc8 100644 --- a/tensorflow/lite/micro/test_helpers.h +++ b/tensorflow/lite/micro/test_helpers.h @@ -1,4 +1,4 @@ -/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -31,6 +31,13 @@ limitations under the License. 
#include "tensorflow/lite/portable_type_to_tflitetype.h" #include "tensorflow/lite/schema/schema_generated.h" +#ifdef USE_TFLM_COMPRESSION + +#include "tensorflow/lite/micro/compression.h" +#include "tensorflow/lite/micro/micro_log.h" + +#endif // USE_TFLM_COMPRESSION + namespace tflite { namespace testing { @@ -112,6 +119,15 @@ TfLiteStatus GetTestingOpResolver(TestingOpResolver& op_resolver); // 1 layer of weights, 1 output Tensor, and 1 operator. const Model* GetSimpleMockModel(); +#ifdef USE_TFLM_COMPRESSION + +// Returns a simple example flatbuffer TensorFlow Lite model. Contains 1 input, +// 1 layer of weights, 1 output Tensor, and 1 operator (BroadcastAddOp). The +// weights tensor is compressed. +const Model* GetSimpleMockModelCompressed(); + +#endif // USE_TFLM_COMPRESSION + // Returns a flatbuffer TensorFlow Lite model with more inputs, variable // tensors, and operators. const Model* GetComplexMockModel(); @@ -220,8 +236,6 @@ TfLiteTensor CreateTensor(const T* data, TfLiteIntArray* dims, result.is_variable = is_variable; result.allocation_type = kTfLiteMemNone; result.data.data = const_cast(data); - result.bytes = ElementCount(*dims) * sizeof(T); - result.data.data = const_cast(data); if (type == kTfLiteInt4) { result.type = kTfLiteInt4; @@ -233,7 +247,13 @@ TfLiteTensor CreateTensor(const T* data, TfLiteIntArray* dims, // a single CreateTensor method. A Const array should be used for immutable // input tensors and non-const array should be used for mutable and output // tensors. 
- result.type = typeToTfLiteType(); + if (type == kTfLiteNoType) { + result.type = typeToTfLiteType(); + } else { + result.type = type; + } + + result.bytes = ElementCount(*dims) * TfLiteTypeGetSize(result.type); } return result; } @@ -260,37 +280,95 @@ TfLiteTensor CreateQuantizedTensor(const float* input, T* quantized, type); } -TfLiteTensor CreateQuantizedBiasTensor(const float* data, int16_t* quantized, +template +TfLiteTensor CreateQuantizedBiasTensor(const float* data, T* quantized, TfLiteIntArray* dims, float input_scale, float weights_scale, - bool is_variable = false); + bool is_variable = false) { + float bias_scale = input_scale * weights_scale; + tflite::SymmetricQuantize(data, quantized, ElementCount(*dims), bias_scale); + + // Quantized bias tensors always have a zero point of 0, since the range of + // values is large, and because zero point costs extra cycles during + // processing. + TfLiteTensor result = + CreateQuantizedTensor(quantized, dims, bias_scale, 0, is_variable); + return result; +} -TfLiteTensor CreateQuantizedBiasTensor(const float* data, int32_t* quantized, - TfLiteIntArray* dims, float input_scale, - float weights_scale, - bool is_variable = false); +// Creates bias tensor with input data, and per-channel weights determined by +// input scale multiplied by weight scale for each channel. Input data will not +// be quantized. 
+template +TfLiteTensor CreatePerChannelQuantizedBiasTensor( + const T* input_data, TfLiteIntArray* dims, float input_scale, + const TfLiteFloatArray* weight_scales, TfLiteFloatArray* scales, + TfLiteIntArray* zero_points, TfLiteAffineQuantization* affine_quant, + int quantized_dimension, bool is_variable = false, + TfLiteType type = kTfLiteNoType) { + int num_channels = dims->data[quantized_dimension]; + zero_points->size = num_channels; + scales->size = num_channels; + for (int i = 0; i < num_channels; i++) { + scales->data[i] = input_scale * weight_scales->data[i]; + zero_points->data[i] = 0; + } -TfLiteTensor CreateQuantizedBiasTensor(const float* data, - std::int64_t* quantized, - TfLiteIntArray* dims, float input_scale, - float weights_scale, - bool is_variable = false); + affine_quant->scale = scales; + affine_quant->zero_point = zero_points; + affine_quant->quantized_dimension = quantized_dimension; -// Quantizes int32_t bias tensor with per-channel weights determined by input -// scale multiplied by weight scale for each channel. -TfLiteTensor CreatePerChannelQuantizedBiasTensor( - const float* input, int32_t* quantized, TfLiteIntArray* dims, - float input_scale, float* weight_scales, float* scales, int* zero_points, - TfLiteAffineQuantization* affine_quant, int quantized_dimension, - bool is_variable = false); + TfLiteTensor result = CreateTensor(input_data, dims, is_variable, type); + result.quantization = {kTfLiteAffineQuantization, affine_quant}; + return result; +} -// Quantizes int64_t bias tensor with per-channel weights determined by input +// Quantizes bias tensor with per-channel weights determined by input // scale multiplied by weight scale for each channel. 
+template TfLiteTensor CreatePerChannelQuantizedBiasTensor( - const float* input, std::int64_t* quantized, TfLiteIntArray* dims, - float input_scale, float* weight_scales, float* scales, int* zero_points, + const float* input, T* quantized, TfLiteIntArray* dims, float input_scale, + const float* weight_scales, float* scales, int* zero_points, TfLiteAffineQuantization* affine_quant, int quantized_dimension, - bool is_variable = false); + bool is_variable = false) { + int input_size = ElementCount(*dims); + int num_channels = dims->data[quantized_dimension]; + // First element is reserved for array length + zero_points[0] = num_channels; + scales[0] = static_cast(num_channels); + float* scales_array = &scales[1]; + for (int i = 0; i < num_channels; i++) { + scales_array[i] = input_scale * weight_scales[i]; + zero_points[i + 1] = 0; + } + + SymmetricPerChannelQuantize(input, quantized, input_size, num_channels, + scales_array); + + affine_quant->scale = FloatArrayFromFloats(scales); + affine_quant->zero_point = IntArrayFromInts(zero_points); + affine_quant->quantized_dimension = quantized_dimension; + + TfLiteTensor result = CreateTensor(quantized, dims, is_variable); + result.quantization = {kTfLiteAffineQuantization, affine_quant}; + + return result; +} + +template +TfLiteTensor CreatePerChannelQuantizedTensor( + const T* quantized, TfLiteIntArray* dims, TfLiteFloatArray* scales, + TfLiteIntArray* zero_points, TfLiteAffineQuantization* affine_quant, + int quantized_dimension, bool is_variable = false, + TfLiteType type = kTfLiteNoType) { + affine_quant->scale = scales; + affine_quant->zero_point = zero_points; + affine_quant->quantized_dimension = quantized_dimension; + + TfLiteTensor result = CreateTensor(quantized, dims, is_variable, type); + result.quantization = {kTfLiteAffineQuantization, affine_quant}; + return result; +} TfLiteTensor CreateSymmetricPerChannelQuantizedTensor( const float* input, int8_t* quantized, TfLiteIntArray* dims, float* scales, @@ -329,6 
+407,128 @@ inline int ZeroPointFromMinMax(const float min, const float max) { static_cast(roundf(-min / ScaleFromMinMax(min, max))); } +#ifdef USE_TFLM_COMPRESSION + +template +struct TestCompressionInfo { + TFILTER* filter_value_table; + size_t filter_value_table_stride; + int filter_bit_width; + TBIAS* bias_value_table; + size_t bias_value_table_stride; + int bias_bit_width; + CompressionScheme scheme; +}; + +template +struct TestCompressionQuantizedInfo : TestCompressionInfo { + const uint8_t* filter_compressed; + const float* filter_data; + const int* filter_dims_data; // TfLiteIntArray + const float* filter_scales; // TfLiteFloatArray + const int* filter_zero_points; // TfLiteIntArray + + const uint8_t* bias_compressed; + const float* bias_data; + const int* bias_dims_data; // TfLiteIntArray + float* bias_scales; // TfLiteFloatArray (computed) + int* bias_zero_points; // TfLiteIntArray (computed) +}; + +template +class TestCompressedList { + public: + TfLiteStatus AddWeight(const TestCompressionInfo& tci, + const TfLiteTensor& tensor, + const size_t tensor_index) { + filter_comp_data_.data.lut_data = &filter_lut_; + filter_comp_data_.scheme = tci.scheme; + filter_comp_data_.data.lut_data->compressed_bit_width = + tci.filter_bit_width; + filter_comp_data_.data.lut_data->value_table = tci.filter_value_table; + filter_comp_data_.data.lut_data->value_table_channel_stride = + tci.filter_value_table_stride; + filter_comp_data_.data.lut_data->is_per_channel_quantized = + IsPerChannelQuantized(tensor); + filter_comp_data_.data.lut_data->use_alternate_axis = UsesAltAxis(tensor); + return SetCompressionData(tensor_index, filter_comp_data_); + } + + TfLiteStatus AddBias(const TestCompressionInfo& tci, + const TfLiteTensor& tensor, const size_t tensor_index) { + bias_comp_data_.data.lut_data = &bias_lut_; + bias_comp_data_.scheme = tci.scheme; + bias_comp_data_.data.lut_data->compressed_bit_width = tci.bias_bit_width; + bias_comp_data_.data.lut_data->value_table = 
tci.bias_value_table; + bias_comp_data_.data.lut_data->value_table_channel_stride = + tci.bias_value_table_stride; + bias_comp_data_.data.lut_data->is_per_channel_quantized = + IsPerChannelQuantized(tensor); + bias_comp_data_.data.lut_data->use_alternate_axis = UsesAltAxis(tensor); + return SetCompressionData(tensor_index, bias_comp_data_); + } + const CompressedTensorList* GetCompressedTensorList() { return &ctl_; } + + private: + LookupTableData filter_lut_ = {}; + CompressionTensorData filter_comp_data_ = {}; + LookupTableData bias_lut_ = {}; + CompressionTensorData bias_comp_data_ = {}; + const CompressionTensorData* ctdp_[N] = {}; + const CompressedTensorList ctl_ = {ctdp_}; + + TfLiteStatus SetCompressionData(const size_t tensor_index, + const CompressionTensorData& cd) { + TFLITE_DCHECK_LT(tensor_index, N); + TFLITE_DCHECK(cd.data.lut_data->value_table != nullptr); + TFLITE_DCHECK(cd.data.lut_data->value_table_channel_stride != 0); + + if (cd.scheme != CompressionScheme::kBinQuant) { + MicroPrintf("TestCompressedList: unsupported compression scheme"); + return kTfLiteError; + } + if (ctdp_[tensor_index] != nullptr) { + MicroPrintf("TestCompressedList: tensor index %d already in use", + tensor_index); + return kTfLiteError; + } + + ctdp_[tensor_index] = &cd; + return kTfLiteOk; + } + + bool IsPerChannelQuantized(const TfLiteTensor& tensor) { + if (tensor.quantization.type == kTfLiteAffineQuantization && + tensor.quantization.params != nullptr) { + const TfLiteAffineQuantization* qp = + static_cast( + tensor.quantization.params); + if (qp->scale->size > 1) { + return true; + } + } + + return false; + } + + bool UsesAltAxis(const TfLiteTensor& tensor) { + if (tensor.quantization.type == kTfLiteAffineQuantization && + tensor.quantization.params != nullptr) { + const TfLiteAffineQuantization* qp = + static_cast( + tensor.quantization.params); + if (qp->quantized_dimension != 0) { + TFLITE_DCHECK_EQ(qp->quantized_dimension, tensor.dims->size - 1); + return true; + 
} + } + + return false; + } +}; + +#endif // USE_TFLM_COMPRESSION + } // namespace testing } // namespace tflite diff --git a/tensorflow/lite/micro/tools/benchmarking/metrics.cc b/tensorflow/lite/micro/tools/benchmarking/metrics.cc index 3a4bf7e4917..f71a4cd139e 100644 --- a/tensorflow/lite/micro/tools/benchmarking/metrics.cc +++ b/tensorflow/lite/micro/tools/benchmarking/metrics.cc @@ -1,4 +1,4 @@ -/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -46,7 +46,8 @@ struct LogAllocationRecord { constexpr int kArenaRows = 3; constexpr int kArenaColumns = 3; -constexpr int kAllocationTypes = 7; +constexpr int kAllocationTypes = + static_cast(tflite::RecordedAllocationType::kNumAllocationTypes); constexpr int kAllocationColumns = 6; constexpr int kMaxBufSize = 100; @@ -85,16 +86,25 @@ LogAllocationRecord GetLogAllocationRecord( tflite::RecordedAllocationType::kPersistentBufferData, tflite::RecordedAllocationType::kTfLiteTensorVariableBufferData, tflite::RecordedAllocationType::kNodeAndRegistrationArray, - tflite::RecordedAllocationType::kOpData}; + tflite::RecordedAllocationType::kOpData, +#ifdef USE_TFLM_COMPRESSION + tflite::RecordedAllocationType::kCompressionData, +#endif // USE_TFLM_COMPRESSION + }; static_assert(std::extent::value == kAllocationTypes, "kAllocationTypes mismatch"); - const char* titles[] = {"Eval tensor data", - "Persistent tensor data", - "Persistent quantization data", - "Persistent buffer data", - "Tensor variable buffer data", - "Node and registration array", - "Operation data"}; + const char* titles[] = { + "Eval tensor data", + "Persistent tensor data", + "Persistent quantization data", + "Persistent buffer data", + "Tensor variable buffer data", + "Node and registration array", + "Operation data", +#ifdef USE_TFLM_COMPRESSION + 
"Compression data", +#endif // USE_TFLM_COMPRESSION + }; static_assert(std::extent::value == kAllocationTypes, "kAllocationTypes mismatch"); const size_t total_bytes = diff --git a/tensorflow/lite/micro/tools/ci_build/test_x86_default.sh b/tensorflow/lite/micro/tools/ci_build/test_x86_default.sh index 998827f24de..f5392dddeec 100755 --- a/tensorflow/lite/micro/tools/ci_build/test_x86_default.sh +++ b/tensorflow/lite/micro/tools/ci_build/test_x86_default.sh @@ -41,6 +41,12 @@ readable_run make -s -j8 -f ${TENSORFLOW_ROOT}tensorflow/lite/micro/tools/make/M readable_run make -s -j8 -f ${TENSORFLOW_ROOT}tensorflow/lite/micro/tools/make/Makefile test TENSORFLOW_ROOT=${TENSORFLOW_ROOT} EXTERNAL_DIR=${EXTERNAL_DIR} readable_run make -s -j8 -f ${TENSORFLOW_ROOT}tensorflow/lite/micro/tools/make/Makefile integration_tests TENSORFLOW_ROOT=${TENSORFLOW_ROOT} EXTERNAL_DIR=${EXTERNAL_DIR} +# optional TFLM tensor compression - execute the unit tests +readable_run make -s -j8 -f ${TENSORFLOW_ROOT}tensorflow/lite/micro/tools/make/Makefile test \ + TENSORFLOW_ROOT=${TENSORFLOW_ROOT} \ + EXTERNAL_DIR=${EXTERNAL_DIR} \ + USE_TFLM_COMPRESSION=yes + # run generic benchmark readable_run make -j$(nproc) -f ${TENSORFLOW_ROOT}tensorflow/lite/micro/tools/make/Makefile \ TENSORFLOW_ROOT=${TENSORFLOW_ROOT} \ diff --git a/tensorflow/lite/micro/tools/make/Makefile b/tensorflow/lite/micro/tools/make/Makefile index 3bf2b549316..e837e9d33b9 100644 --- a/tensorflow/lite/micro/tools/make/Makefile +++ b/tensorflow/lite/micro/tools/make/Makefile @@ -167,6 +167,7 @@ endif COMMON_FLAGS := \ -Werror \ -fno-unwind-tables \ + -fno-asynchronous-unwind-tables \ -ffunction-sections \ -fdata-sections \ -fmessage-length=0 \ @@ -263,6 +264,17 @@ endif # runtime that can be linked in to other programs. 
MICROLITE_LIB_NAME := libtensorflow-microlite.a +# TFLM optional compression support (default disabled) +ENABLE_COMPRESSION := no +ifneq ($(USE_TFLM_COMPRESSION),) + # currently only Linux targets supported + ifeq ($(TARGET), $(filter $(TARGET), linux)) + CXXFLAGS += -DUSE_TFLM_COMPRESSION + CCFLAGS += -DUSE_TFLM_COMPRESSION + ENABLE_COMPRESSION := yes + endif +endif + # Where compiled objects are stored. BASE_GENDIR := gen GENDIR := $(BASE_GENDIR)/$(TARGET)_$(TARGET_ARCH)_$(BUILD_TYPE) @@ -272,6 +284,9 @@ endif ifneq ($(CO_PROCESSOR),) GENDIR := $(GENDIR)_$(CO_PROCESSOR) endif +ifeq ($(ENABLE_COMPRESSION), yes) + GENDIR := $(GENDIR)_compression +endif GENDIR := $(GENDIR)_$(TOOLCHAIN)/ CORE_OBJDIR := $(GENDIR)obj/core/