From 4b7c4aa90cdd8541052ab6d571a15533c01fab09 Mon Sep 17 00:00:00 2001 From: juliannguyen4 <109386615+juliannguyen4@users.noreply.github.com> Date: Wed, 15 Nov 2023 11:45:32 -0800 Subject: [PATCH] [CLIENT-1824] Add HyperLogLog class to represent HLL values (#509) --- aerospike_helpers/__init__.py | 12 ++++++++ doc/data_mapping.rst | 39 ++++++++++------------- src/main/conversions.c | 58 +++++++++++++++++++++++++++++++++-- src/main/serializer.c | 53 ++++++++++++++++++++++++++++++++ test/new_tests/test_hll.py | 38 +++++++++++++++++++++++ 5 files changed, 174 insertions(+), 26 deletions(-) diff --git a/aerospike_helpers/__init__.py b/aerospike_helpers/__init__.py index 17b0fd30a..a2e4c962e 100644 --- a/aerospike_helpers/__init__.py +++ b/aerospike_helpers/__init__.py @@ -13,3 +13,15 @@ # See the License for the specific language governing permissions and # limitations under the License. ########################################################################## + +class HyperLogLog(bytes): + """ + Represents a HyperLogLog value. This can be returned from the server or created in order to be sent to the server. + + The constructor takes a single argument, which is passed on to the :class:`bytes` constructor. + + >>> h = HyperLogLog([1, 2, 3]) + >>> client.put(key, {"hyperloglog": h}) + """ + def __new__(cls, o) -> "HyperLogLog": + return super().__new__(cls, o) diff --git a/doc/data_mapping.rst b/doc/data_mapping.rst index e50164ee2..5dd95bdbd 100644 --- a/doc/data_mapping.rst +++ b/doc/data_mapping.rst @@ -46,29 +46,21 @@ Data Mappings The following table shows which Python types map directly to Aerospike server types. 
-+---------------------------------+------------------------+ -| Python Type | Server type | -+=================================+========================+ -|:class:`int` |`integer`_ | -+---------------------------------+------------------------+ -|:class:`bool` |depends on send_bool_as | -+---------------------------------+------------------------+ -|:class:`str` |`string`_ | -+---------------------------------+------------------------+ -|:class:`unicode` |`string`_ | -+---------------------------------+------------------------+ -|:class:`float` |`double`_ | -+---------------------------------+------------------------+ -|:class:`dict` |`map`_ | -+---------------------------------+------------------------+ -|:class:`aerospike.KeyOrderedDict`|`key ordered map`_ | -+---------------------------------+------------------------+ -|:class:`list` |`list`_ | -+---------------------------------+------------------------+ -|:class:`bytes` |`blob`_ | -+---------------------------------+------------------------+ -|:class:`aerospike.GeoJSON` |`GeoJSON`_ | -+---------------------------------+------------------------+ + ======================================== ========================= + Python Type Server type + ======================================== ========================= + :class:`int` `integer`_ + :class:`bool` depends on send_bool_as + :class:`str` `string`_ + :class:`unicode` `string`_ + :class:`float` `double`_ + :class:`dict` `map`_ + :class:`aerospike.KeyOrderedDict` `key ordered map`_ + :class:`list` `list`_ + :class:`bytes` `blob`_ + :class:`aerospike.GeoJSON` `GeoJSON`_ + :class:`aerospike_helpers.HyperLogLog` `HyperLogLog`_ + ======================================== ========================= .. note:: @@ -86,3 +78,4 @@ as a value. .. _list: https://docs.aerospike.com/server/guide/data-types/cdt-list .. _blob: https://docs.aerospike.com/server/guide/data-types/blob .. _GeoJSON: https://docs.aerospike.com/server/guide/data-types/geospatial +.. 
_HyperLogLog: https://docs.aerospike.com/server/guide/data-types/hll diff --git a/src/main/conversions.c b/src/main/conversions.c index 8a152a946..25cdd9087 100644 --- a/src/main/conversions.c +++ b/src/main/conversions.c @@ -771,6 +771,41 @@ as_status pyobject_to_map(AerospikeClient *self, as_error *err, return err->code; } +static bool is_aerospike_hll_type(PyObject *obj) +{ + if (strcmp(obj->ob_type->tp_name, "HyperLogLog")) { + // Class name is not HyperLogLog + return false; + } + + PyObject *py_module_name = + PyDict_GetItemString(obj->ob_type->tp_dict, "__module__"); + if (!py_module_name) { + // Class does not belong to any module + return false; + } + + bool retval = true; + + Py_INCREF(py_module_name); + if (!PyUnicode_Check(py_module_name)) { + // Invalid module name + retval = false; + goto CLEANUP; + } + + const char *module_name = PyUnicode_AsUTF8(py_module_name); + if (strcmp(module_name, "aerospike_helpers")) { + // Class belongs to the wrong module + retval = false; + goto CLEANUP; + } + +CLEANUP: + Py_DECREF(py_module_name); + return retval; +} + as_status pyobject_to_val(AerospikeClient *self, as_error *err, PyObject *py_obj, as_val **val, as_static_pool *static_pool, int serializer_type) @@ -823,9 +858,23 @@ as_status pyobject_to_val(AerospikeClient *self, as_error *err, Py_DECREF(py_ustr); } else if (PyBytes_Check(py_obj)) { - uint8_t *b = (uint8_t *)PyBytes_AsString(py_obj); - uint32_t b_len = (uint32_t)PyBytes_Size(py_obj); - *val = (as_val *)as_bytes_new_wrap(b, b_len, false); + char *py_obj_buffer = PyBytes_AsString(py_obj); + Py_ssize_t b_len = PyBytes_Size(py_obj); + uint8_t *new_buffer = (uint8_t *)malloc(sizeof(uint8_t) * b_len); + memcpy(new_buffer, py_obj_buffer, sizeof(uint8_t) * b_len); + + as_bytes *bytes = as_bytes_new_wrap(new_buffer, b_len, true); + if (bytes == NULL) { + free(new_buffer); + return as_error_update( + err, AEROSPIKE_ERR_CLIENT, + "Unable to convert Python bytes to C client's as_bytes"); + } + *val = (as_val 
*)bytes; + + if (is_aerospike_hll_type(py_obj)) { + bytes->type = AS_BYTES_HLL; + } } else if (!strcmp(py_obj->ob_type->tp_name, "aerospike.Geospatial")) { PyObject *py_parameter = PyUnicode_FromString("geo_data"); @@ -1032,6 +1081,9 @@ as_status pyobject_to_record(AerospikeClient *self, as_error *err, char *str = PyBytes_AsString(value); as_bytes_set(bytes, 0, (const uint8_t *)str, str_len); + if (is_aerospike_hll_type(value)) { + bytes->type = AS_BYTES_HLL; + } ret_val = as_record_set_bytes(rec, name, bytes); } else if (PyByteArray_Check(value)) { diff --git a/src/main/serializer.c b/src/main/serializer.c index a2d41f2de..bee1e85f4 100644 --- a/src/main/serializer.c +++ b/src/main/serializer.c @@ -476,6 +476,59 @@ extern as_status deserialize_based_on_as_bytes_type(AerospikeClient *self, } } } break; + case AS_BYTES_HLL: { + // Convert bytes to Python bytes object + PyObject *py_bytes = PyBytes_FromStringAndSize( + (const char *)bytes->value, (Py_ssize_t)bytes->size); + if (py_bytes == NULL) { + as_error_update( + error_p, AEROSPIKE_ERR_CLIENT, + "Unable to convert C client's as_bytes to Python bytes"); + goto CLEANUP; + } + // Pass bytes object to new HLL class instance + PyObject *py_aerospike_helpers_module = + PyImport_ImportModule("aerospike_helpers"); + if (py_aerospike_helpers_module == NULL) { + as_error_update(error_p, AEROSPIKE_ERR_CLIENT, + "Unable to import aerospike_helpers module"); + goto HLL_CLEANUP1; + } + + PyObject *py_hll_class = + PyObject_GetAttrString(py_aerospike_helpers_module, "HyperLogLog"); + if (py_hll_class == NULL) { + as_error_update(error_p, AEROSPIKE_ERR, + "Unable to import HyperLogLog class from " + "aerospike_helpers module"); + goto HLL_CLEANUP2; + } + + if (!PyCallable_Check(py_hll_class)) { + as_error_update(error_p, AEROSPIKE_ERR, + "Unable to create HyperLogLog instance; " + "HyperLogLog class is not callable"); + goto HLL_CLEANUP3; + } + + PyObject *py_hll_instance = + PyObject_CallFunctionObjArgs(py_hll_class, py_bytes, 
NULL); + if (py_hll_instance == NULL) { + // An exception has been thrown by calling the HLL constructor + // We want to show the original exception instead of throwing our own exception + goto HLL_CLEANUP3; + } + + *retval = py_hll_instance; + + HLL_CLEANUP3: + Py_DECREF(py_hll_class); + HLL_CLEANUP2: + Py_DECREF(py_aerospike_helpers_module); + HLL_CLEANUP1: + Py_DECREF(py_bytes); + break; + } default: { // First try to return a raw byte array, if that fails raise an error uint32_t bval_size = as_bytes_size(bytes); diff --git a/test/new_tests/test_hll.py b/test/new_tests/test_hll.py index eae2b4385..c2634886e 100644 --- a/test/new_tests/test_hll.py +++ b/test/new_tests/test_hll.py @@ -2,6 +2,7 @@ import pytest from aerospike import exception as e from aerospike_helpers.operations import hll_operations +from aerospike_helpers import HyperLogLog from math import sqrt @@ -451,3 +452,40 @@ def test_pos_hll_update(self): _, _, res = self.as_connection.operate(self.test_keys[0], ops) assert res["hll_bine"] == 3 + + def test_get_put_operate_hll(self): + """ + Can you read and write HLL bins to the server and still perform HLL operations on those bins? 
+ """ + _, _, rec = self.as_connection.get(self.test_keys[0]) + assert type(rec["mh_bin"]) == HyperLogLog + + self.as_connection.put(self.test_keys[0], {"mh_bin": rec["mh_bin"]}) + + # mh_bin should return the same results as before reading and rewriting the bin + ops = [hll_operations.hll_describe("mh_bin")] + _, _, res = self.as_connection.operate(self.test_keys[0], ops) + assert res["mh_bin"] == [6, 12] + + def test_put_get_hll_list(self): + """ + This is to cover putting nested HLLs in the server + Since the conversion for nested HLLs to the C client equivalent is separate from top-level HLLs + """ + # Test setup to retrieve an HLL bin + _, _, rec = self.as_connection.get(self.test_keys[0]) + + self.as_connection.put( + self.test_keys[0], + { + "hll_list": [ + rec["hll_bin"] + ] + } + ) + # Verify we stored the HLL in the list as an HLL type + _, _, rec = self.as_connection.get(self.test_keys[0]) + assert type(rec["hll_list"][0]) == HyperLogLog + + def test_hll_superclass(self): + assert issubclass(HyperLogLog, bytes)