diff --git a/.build.yml b/.build.yml new file mode 100644 index 00000000..75e3101b --- /dev/null +++ b/.build.yml @@ -0,0 +1,41 @@ +name: aerospike-server + +environment: + EEREPO: modules/ee + +dependency: + - url: git@github.com:citrusleaf/aerospike-server-enterprise + dir: $EEREPO + +container: + - base: + - docker.qe.aerospike.com/build/aerospike-server:centos-6 + - docker.qe.aerospike.com/build/aerospike-server:centos-7 + - docker.qe.aerospike.com/build/aerospike-server:debian-7 + - docker.qe.aerospike.com/build/aerospike-server:debian-8 + - docker.qe.aerospike.com/build/aerospike-server:ubuntu-12.04 + - docker.qe.aerospike.com/build/aerospike-server:ubuntu-14.04 + - docker.qe.aerospike.com/build/aerospike-server:ubuntu-16.04 + +build: + - name: community + environment: + EEREPO: + script: + - make + - make $PKG + - make tar + - make source + - cp -p modules/telemetry/{README,TELEMETRY}.md + artifact: + - pkg/packages/* + - modules/telemetry/TELEMETRY.md + - name: enterprise + environment: + EEREPO: /work/source/$EEREPO + script: + - make +ee + - make $PKG+ee + - make source+ee + artifact: + - pkg/packages/* diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md new file mode 100644 index 00000000..76d5afff --- /dev/null +++ b/.github/ISSUE_TEMPLATE.md @@ -0,0 +1,22 @@ +Please direct the following to our [community forum](https://discuss.aerospike.com/): +- general questions +- help requests +- feature requests +- non Aerospike Server issues + +The issues submitted here should be Aerospike Server **code** related. Examples include: +- crashes (please provide stack trace from logs) +- bugs (not behaving as expected/documented) +- code quality + +__________ + +**OS:** *Put your operating system here. For example: "Ubuntu 16.10", "CentOS 7", "Debian 8" etc.* + +**Aerospike version:** *Put your Aerospike release version or `git describe --long --all` output here. 
For example: "3.15.0.1, heads/master-0-g450aee1"* + +**Client version:** *Put which client and the version of the client you are using (if applicable) here. For example: "Java 4.0.8, C 4.2.0"* + +__________ + +*Explain your _Aerospike Server_ issue in detail here and (if applicable) provide log snippets, configuration, and/or reproduction instructions.* diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..f40050ee --- /dev/null +++ b/.gitignore @@ -0,0 +1,20 @@ +.DS_Store +.cproject +.project +.settings +/vg.log +TAGS +run +target + +# emacs backup / temp files +*~ +\#*\# +.\#* + +# TLS credentials +key.pem +cert.pem +chain.pem +cacert.pem +cbl.txt diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 00000000..e42544ba --- /dev/null +++ b/.gitmodules @@ -0,0 +1,27 @@ +[submodule "modules/common"] + path = modules/common + url = https://github.com/aerospike/aerospike-common.git + ignore = dirty +[submodule "modules/mod-lua"] + path = modules/mod-lua + url = https://github.com/aerospike/aerospike-mod-lua.git +[submodule "modules/jansson"] + path = modules/jansson + url = https://github.com/aerospike/jansson.git + ignore = dirty +[submodule "modules/lua-core"] + path = modules/lua-core + url = https://github.com/aerospike/aerospike-lua-core.git +[submodule "modules/luajit"] + path = modules/luajit + url = https://github.com/aerospike/luajit.git +[submodule "modules/s2-geometry-library"] + path = modules/s2-geometry-library + url = https://github.com/aerospike/s2-geometry-library.git +[submodule "modules/telemetry"] + path = modules/telemetry + url = https://github.com/aerospike/aerospike-telemetry-agent.git +[submodule "modules/jemalloc"] + path = modules/jemalloc + url = https://github.com/aerospike/jemalloc.git + ignore = dirty diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 00000000..96473525 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,3 @@ +# Contributing + +For details on contributing to Aerospike, 
please read http://www.aerospike.com/community/contributor/ diff --git a/LICENSE b/LICENSE new file mode 100644 index 00000000..efc2e191 --- /dev/null +++ b/LICENSE @@ -0,0 +1,260 @@ +================================================================================ + +AEROSPIKE SERVER LICENSE + +The Aerospike Server Community Edition is made available under the terms of +the GNU Affero General Public License version 3 (AGPLv3), as stated in the +file `LICENSE-AGPL`. + +Individual files may be made available under their own specific license, +all compatible with AGPLv3. Please see individual files for details. + +================================================================================ + +AEROSPIKE MODULE LICENSE + +The following directories and their subdirectories thereof are made available +under the terms of the Apache License, version 2.0, as stated in the file +`LICENSE-APACHE`, or a compatible license stated in the file itself. Please +see individual files for details. + + - modules/common + - modules/lua-core + - modules/mod-lua + - modules/telemetry + +================================================================================ + +THIRD PARTY LIBRARY LICENSES + +The following are the licenses for 3rd party libraries utilized by Aerospike +Server. + +-------------------------------------------------------------------------------- + +Lua +--- + +Copyright © 1994–2013 Lua.org, PUC-Rio. 
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and +associated documentation files (the "Software"), to deal in the Software without restriction, +including without limitation the rights to use, copy, modify, merge, publish, distribute, +sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial +portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR +PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +-------------------------------------------------------------------------------- + +LuaJIT +------ + +Copyright (C) 2005-2014 Mike Pall. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +[ MIT license: http://www.opensource.org/licenses/mit-license.php ] + +[ LuaJIT includes code from dlmalloc, which has this license statement: ] + +This is a version (aka dlmalloc) of malloc/free/realloc written by +Doug Lea and released to the public domain, as explained at +http://creativecommons.org/licenses/publicdomain + +-------------------------------------------------------------------------------- + +Jansson +------- + +Copyright (c) 2009-2012 Petri Lehtinen + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and +associated documentation files (the "Software"), to deal in the Software without restriction, +including without limitation the rights to use, copy, modify, merge, publish, distribute, +sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or +substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. + +-------------------------------------------------------------------------------- + +jemalloc +-------- + +Copyright (C) 2002-2014 Jason Evans <jasone@canonware.com>. +All rights reserved. +Copyright (C) 2007-2012 Mozilla Foundation. 
All rights reserved. +Copyright (C) 2009-2014 Facebook, Inc. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: +1. Redistributions of source code must retain the above copyright notice(s), + this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright notice(s), + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY EXPRESS +OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY DIRECT, INDIRECT, +INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF +ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +-------------------------------------------------------------------------------- + +BTree +----- + +Copyright 1997-1999, 2001 John-Mark Gurney. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ +-------------------------------------------------------------------------------- + +BCrypt +------ + +Written by Solar Designer in 1998-2011. +No copyright is claimed, and the software is hereby placed in the public +domain. In case this attempt to disclaim copyright and place the software +in the public domain is deemed null and void, then the software is +Copyright (c) 1998-2011 Solar Designer and it is hereby released to the +general public under the following terms: + +Redistribution and use in source and binary forms, with or without +modification, are permitted. + +There's ABSOLUTELY NO WARRANTY, express or implied. + +It is my intent that you should be able to use this on your system, +as part of a software package, or anywhere else to improve security, +ensure compatibility, or for any other purpose. I would appreciate +it if you give credit where it is due and keep your modifications in +the public domain as well, but I don't require that in order to let +you place this code and any modifications you make under a license +of your choice. + +-------------------------------------------------------------------------------- + +Concurrency Kit +--------------- + +Copyright 2010-2013 Samy Al Bahra. +Copyright 2011-2013 AppNexus, Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ +THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +SUCH DAMAGE. + +Hazard Pointers (src/ck_hp.c) also includes this license: + +(c) Copyright 2008, IBM Corporation. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+ +ck_pr_rtm leverages work from Andi Kleen: +Copyright (c) 2012,2013 Intel Corporation + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that: (1) source code distributions +retain the above copyright notice and this paragraph in its entirety, (2) +distributions including binary code include the above copyright notice and +this paragraph in its entirety in the documentation or other materials +provided with the distribution + +THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR IMPLIED +WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. + +-------------------------------------------------------------------------------- + +S2 +-- + +Copyright 2005 Google Inc. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +-------------------------------------------------------------------------------- + diff --git a/LICENSE-AGPL b/LICENSE-AGPL new file mode 100644 index 00000000..2def0e88 --- /dev/null +++ b/LICENSE-AGPL @@ -0,0 +1,661 @@ + GNU AFFERO GENERAL PUBLIC LICENSE + Version 3, 19 November 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. 
+ + Preamble + + The GNU Affero General Public License is a free, copyleft license for +software and other kinds of works, specifically designed to ensure +cooperation with the community in the case of network server software. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +our General Public Licenses are intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + + Developers that use our General Public Licenses protect your rights +with two steps: (1) assert copyright on the software, and (2) offer +you this License which gives you legal permission to copy, distribute +and/or modify the software. + + A secondary benefit of defending all users' freedom is that +improvements made in alternate versions of the program, if they +receive widespread use, become available for other developers to +incorporate. Many developers of free software are heartened and +encouraged by the resulting cooperation. However, in the case of +software used on network servers, this result may fail to come about. +The GNU General Public License permits making a modified version and +letting the public access it on a server without ever releasing its +source code to the public. + + The GNU Affero General Public License is designed specifically to +ensure that, in such cases, the modified source code becomes available +to the community. 
It requires the operator of a network server to +provide the source code of the modified version running there to the +users of that server. Therefore, public use of a modified version, on +a publicly accessible server, gives the public access to the source +code of the modified version. + + An older license, called the Affero General Public License and +published by Affero, was designed to accomplish similar goals. This is +a different license, not a version of the Affero GPL, but Affero has +released a new version of the Affero GPL which permits relicensing under +this license. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU Affero General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. 
Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. + + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. 
+ + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. 
Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. + + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. + + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. 
+ + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. + + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. 
+ + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. + + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. 
If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. + + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. 
+ + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. + + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. 
+ + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. + + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the 
material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. + + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. 
+ + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. + + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. 
If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. + + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. 
+ + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. + + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. 
You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Remote Network Interaction; Use with the GNU General Public License. 
+ + Notwithstanding any other provision of this License, if you modify the +Program, your modified version must prominently offer all users +interacting with it remotely through a computer network (if your version +supports such interaction) an opportunity to receive the Corresponding +Source of your version by providing access to the Corresponding Source +from a network server at no charge, through some standard or customary +means of facilitating copying of software. This Corresponding Source +shall include the Corresponding Source for any work covered by version 3 +of the GNU General Public License that is incorporated pursuant to the +following paragraph. + + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU General Public License into a single +combined work, and to convey the resulting work. The terms of this +License will continue to apply to the part which is the covered work, +but the work with which it is combined will remain governed by version +3 of the GNU General Public License. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU Affero General Public License from time to time. Such new versions +will be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU Affero General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU Affero General Public License, you may choose any version ever published +by the Free Software Foundation. 
+ + If the Program specifies that a proxy can decide which future +versions of the GNU Affero General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. 
+ + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + +Also add information on how to contact you by electronic and paper mail. + + If your software can interact with users remotely through a computer +network, you should also make sure that it provides a way for users to +get its source. For example, if your program is a web application, its +interface could display a "Source" link that leads users to an archive +of the code.
There are many ways you could offer source, and different +solutions will be better for different programs; see section 13 for the +specific requirements. + + You should also get your employer (if you work as a programmer) or school, +if any, to sign a "copyright disclaimer" for the program, if necessary. +For more information on this, and how to apply and follow the GNU AGPL, see +<http://www.gnu.org/licenses/>. \ No newline at end of file diff --git a/LICENSE-APACHE b/LICENSE-APACHE new file mode 100644 index 00000000..d6456956 --- /dev/null +++ b/LICENSE-APACHE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types.
+ + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/LICENSE.3rdParty b/LICENSE.3rdParty new file mode 100644 index 00000000..44db61d9 --- /dev/null +++ b/LICENSE.3rdParty @@ -0,0 +1,235 @@ +================================================================================ + +THIRD PARTY LIBRARY LICENSES + +The following are the licenses for 3rd party libraries utilized by Aerospike +Server. + +-------------------------------------------------------------------------------- + +Lua +--- + +Copyright © 1994–2013 Lua.org, PUC-Rio. +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and +associated documentation files (the "Software"), to deal in the Software without restriction, +including without limitation the rights to use, copy, modify, merge, publish, distribute, +sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial +portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR +PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +-------------------------------------------------------------------------------- + +LuaJIT +------ + +Copyright (C) 2005-2014 Mike Pall. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +[ MIT license: http://www.opensource.org/licenses/mit-license.php ] + +[ LuaJIT includes code from dlmalloc, which has this license statement: ] + +This is a version (aka dlmalloc) of malloc/free/realloc written by +Doug Lea and released to the public domain, as explained at +http://creativecommons.org/licenses/publicdomain + +-------------------------------------------------------------------------------- + +Jansson +------- + +Copyright (c) 2009-2012 Petri Lehtinen + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and +associated documentation files (the "Software"), to deal in the Software without restriction, +including without limitation the rights to use, copy, modify, merge, publish, distribute, +sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or +substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. + +-------------------------------------------------------------------------------- + +jemalloc +-------- + +Copyright (C) 2002-2014 Jason Evans <jasone@canonware.com>. +All rights reserved. +Copyright (C) 2007-2012 Mozilla Foundation. All rights reserved. +Copyright (C) 2009-2014 Facebook, Inc. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: +1.
Redistributions of source code must retain the above copyright notice(s), + this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright notice(s), + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY EXPRESS +OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY DIRECT, INDIRECT, +INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF +ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +-------------------------------------------------------------------------------- + +BTree +----- + +Copyright 1997-1999, 2001 John-Mark Gurney. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +-------------------------------------------------------------------------------- + +BCrypt +------ + +Written by Solar Designer in 1998-2011. +No copyright is claimed, and the software is hereby placed in the public +domain. 
In case this attempt to disclaim copyright and place the software +in the public domain is deemed null and void, then the software is +Copyright (c) 1998-2011 Solar Designer and it is hereby released to the +general public under the following terms: + +Redistribution and use in source and binary forms, with or without +modification, are permitted. + +There's ABSOLUTELY NO WARRANTY, express or implied. + +It is my intent that you should be able to use this on your system, +as part of a software package, or anywhere else to improve security, +ensure compatibility, or for any other purpose. I would appreciate +it if you give credit where it is due and keep your modifications in +the public domain as well, but I don't require that in order to let +you place this code and any modifications you make under a license +of your choice. + +-------------------------------------------------------------------------------- + +Concurrency Kit +--------------- + +Copyright 2010-2013 Samy Al Bahra. +Copyright 2011-2013 AppNexus, Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +SUCH DAMAGE. + +Hazard Pointers (src/ck_hp.c) also includes this license: + +(c) Copyright 2008, IBM Corporation. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +ck_pr_rtm leverages work from Andi Kleen: +Copyright (c) 2012,2013 Intel Corporation + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that: (1) source code distributions +retain the above copyright notice and this paragraph in its entirety, (2) +distributions including binary code include the above copyright notice and +this paragraph in its entirety in the documentation or other materials +provided with the distribution + +THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR IMPLIED +WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. + +-------------------------------------------------------------------------------- + +S2 +-- + +Copyright 2005 Google Inc. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +-------------------------------------------------------------------------------- + diff --git a/LICENSE.CE b/LICENSE.CE new file mode 100644 index 00000000..b45bda4e --- /dev/null +++ b/LICENSE.CE @@ -0,0 +1,25 @@ +================================================================================ + +AEROSPIKE SERVER LICENSE + +The Aerospike Server Community Edition is made available under the terms of +the GNU Affero General Public License version 3 (AGPLv3), as stated in the +file `LICENSE-AGPL`. + +Individual files may be made available under their own specific license, +all compatible with AGPLv3. Please see individual files for details. + +================================================================================ + +AEROSPIKE MODULE LICENSE + +The following directories and their subdirectories thereof are made available +under the terms of the Apache License, version 2.0, as stated in the file +`LICENSE-APACHE`, or a compatible license stated in the file itself. Please +see individual files for details. + + - modules/common + - modules/lua-core + - modules/mod-lua + - modules/telemetry + diff --git a/Makefile b/Makefile new file mode 100644 index 00000000..65c3d595 --- /dev/null +++ b/Makefile @@ -0,0 +1,163 @@ +# Aerospike Server +# Makefile +# +# Main Build Targets: +# +# make {all|server} - Build the Aerospike Server. +# make clean - Remove build products, excluding built packages. +# make cleanpkg - Remove built packages. 
+# make cleanall - Remove all build products, including built packages. +# make cleangit - Remove all files untracked by Git. (Use with caution!) +# make strip - Build stripped versions of the server executables. +# +# Packaging Targets: +# +# make deb - Package server for Debian / Ubuntu platforms as a ".deb" file. +# make rpm - Package server for the Red Hat Package Manager (RPM.) +# make tar - Package server as a compressed tarball for every Linux platform. +# make source - Package the server source code as a compressed "tar" archive. +# +# Building a distribution release is a two step process: +# +# 1). The initial "make" builds the server itself. +# +# 2). The second step packages up the server using "make" with one of the following targets: +# +# rpm: Suitable for building and installing on Red Hat-derived systems. +# deb: Suitable for building and installing on Debian-derived systems. +# tar: Makes an "Every Linux" distribution, packaged as a compressed "tar" archive. +# +# Targets for running the Aerospike Server in the source tree: +# +# make init - Initialize the server run-time directories. +# make start - Start the server. +# make stop - Stop the server. 
+# + +# Common variable definitions: +include make_in/Makefile.vars + +.PHONY: all server +all server: targetdirs version $(JANSSON)/Makefile $(JEMALLOC)/Makefile $(LUAJIT)/src/luaconf.h +ifeq ($(USE_LUAJIT),1) + $(MAKE) -C $(LUAJIT) Q= TARGET_SONAME=libluajit.so CCDEBUG=-g +endif + $(MAKE) -C $(JEMALLOC) + $(MAKE) -C $(JANSSON) + $(MAKE) -C $(COMMON) CF=$(CF) EXT_CFLAGS="$(EXT_CFLAGS)" + $(MAKE) -C $(CF) + $(MAKE) -C $(MOD_LUA) CF=$(CF) COMMON=$(COMMON) LUA_CORE=$(LUA_CORE) EXT_CFLAGS="$(EXT_CFLAGS)" USE_LUAJIT=$(USE_LUAJIT) LUAJIT=$(LUAJIT) TARGET_SERVER=1 + $(MAKE) -C $(S2) + $(MAKE) -C ai + $(MAKE) -C as + +.PHONY: targetdirs +targetdirs: + mkdir -p $(GEN_DIR) $(LIBRARY_DIR) $(BIN_DIR) + mkdir -p $(OBJECT_DIR)/base $(OBJECT_DIR)/fabric $(OBJECT_DIR)/storage $(OBJECT_DIR)/geospatial $(OBJECT_DIR)/transaction + +strip: server + $(MAKE) -C xdr strip + $(MAKE) -C as strip + +.PHONY: init start stop +init: + @echo "Creating and initializing working directories..." + mkdir -p run/log run/work/smd run/work/sys/udf/lua run/work/usr/udf/lua + cp -pr modules/lua-core/src/* run/work/sys/udf/lua + +start: + @echo "Running the Aerospike Server locally..." + @PIDFILE=run/asd.pid ; if [ -f $$PIDFILE ]; then echo "Aerospike already running? Please do \"make stop\" first."; exit 1; fi + @nohup ./modules/telemetry/telemetry.py as/etc/telemetry_dev.conf > /dev/null 2>&1 & + $(BIN_DIR)/asd --config-file as/etc/aerospike_dev.conf + +stop: + @echo "Stopping the local Aerospike Server..."
+ @PIDFILE=run/asd.pid ; if [ -f $$PIDFILE ]; then kill `cat $$PIDFILE`; rm $$PIDFILE; fi + @PID=`pgrep telemetry.py | grep -v grep`; if [ -n "$$PID" ]; then kill $$PID; fi + +.PHONY: clean +clean: cleanmodules cleandist + $(RM) $(VERSION_SRC) $(VERSION_OBJ) + $(RM) -rf $(TARGET_DIR) + +.PHONY: cleanmodules +cleanmodules: + $(MAKE) -C $(COMMON) clean + if [ -e "$(JANSSON)/Makefile" ]; then \ + $(MAKE) -C $(JANSSON) clean; \ + $(MAKE) -C $(JANSSON) distclean; \ + fi + if [ -e "$(JEMALLOC)/Makefile" ]; then \ + $(MAKE) -C $(JEMALLOC) clean; \ + $(MAKE) -C $(JEMALLOC) distclean; \ + fi + if [ -e "$(LUAJIT)/Makefile" ]; then \ + $(MAKE) -C $(LUAJIT) clean; \ + fi + $(MAKE) -C $(MOD_LUA) COMMON=$(COMMON) LUA_CORE=$(LUA_CORE) USE_LUAJIT=$(USE_LUAJIT) LUAJIT=$(LUAJIT) clean + $(MAKE) -C $(S2) clean + +.PHONY: cleandist +cleandist: + $(RM) -r pkg/dist/* + +.PHONY: cleanall +cleanall: clean cleanpkg + +.PHONY: cleanpkg +cleanpkg: + $(RM) pkg/packages/* + +GIT_CLEAN = git clean -fdx + +.PHONY: cleangit +cleangit: + cd $(COMMON); $(GIT_CLEAN) + cd $(JANSSON); $(GIT_CLEAN) + cd $(JEMALLOC); $(GIT_CLEAN) + cd $(LUA_CORE); $(GIT_CLEAN) + cd $(LUAJIT); $(GIT_CLEAN) + cd $(MOD_LUA); $(GIT_CLEAN) + cd $(S2); $(GIT_CLEAN) + $(GIT_CLEAN) + +.PHONY: rpm deb tar src +rpm deb tar src: + $(MAKE) -C pkg/$@ EDITION=$(EDITION) + +$(VERSION_SRC): targetdirs + build/gen_version $(EDITION) $(shell $(DEPTH)/build/os_version) > $(VERSION_SRC) + +$(VERSION_OBJ): $(VERSION_SRC) + $(CC) -o $@ -c $< + +.PHONY: version +version: $(VERSION_OBJ) + +$(JANSSON)/configure: + cd $(JANSSON) && autoreconf -i + +$(JANSSON)/Makefile: $(JANSSON)/configure + cd $(JANSSON) && ./configure $(JANSSON_CONFIG_OPT) + +$(JEMALLOC)/configure: + cd $(JEMALLOC) && autoconf + +$(JEMALLOC)/Makefile: $(JEMALLOC)/configure + cd $(JEMALLOC) && ./configure $(JEM_CONFIG_OPT) + +$(LUAJIT)/src/luaconf.h: $(LUAJIT)/src/luaconf.h.orig + ln -s $(notdir $<) $@ + +.PHONY: source +source: src + +tags etags: + etags `find ai as cf modules xdr
$(EEREPO) -name "*.[ch]" -o -name "*.cc" | egrep -v '(target/Linux|m4)'` `find /usr/include -name "*.h"` + +# Common target definitions: +ifneq ($(EEREPO),) + include $(EEREPO)/make_in/Makefile.targets +endif diff --git a/README.md b/README.md new file mode 100644 index 00000000..cb7c5f0a --- /dev/null +++ b/README.md @@ -0,0 +1,224 @@ +# Aerospike Database Server + +Welcome to the Aerospike Database Server source code tree! + +Aerospike is a distributed, scalable NoSQL database. It is architected with three key objectives: + +- To create a high-performance, scalable platform that would meet the needs of today's web-scale applications +- To provide the robustness and reliability (i.e., ACID) expected from traditional databases. +- To provide operational efficiency (minimal manual involvement) + +For more information on Aerospike, please visit: [`http://aerospike.com`](http://aerospike.com) + +## Telemetry Anonymized Data Collection + +The Aerospike Community Edition collects anonymized server performance statistics. +Please see the +[Aerospike Telemetry web page](http://aerospike.com/aerospike-telemetry) for more +information. The full Telemetry data collection agent source code may be found in the +["telemetry" submodule](https://github.com/aerospike/aerospike-telemetry-agent/blob/master/README.md). + +## Build Prerequisites + +The Aerospike Database Server can be built and deployed on various +current 64-bit GNU/Linux platform versions, such as the Red Hat family (e.g., +CentOS 6 or later), Debian 7 or later, and Ubuntu 10.04 or later. + +### Dependencies + +The majority of the Aerospike source code is written in the C +programming language, conforming to the ANSI C99 standard.
+ +In particular, the following tools and libraries are needed: + +#### C Compiler Toolchain + +Building Aerospike requires the GCC 4.1 or later C compiler toolchain, +with the standard GNU/Linux development tools and libraries installed in +the build environment, including: + +* `autoconf` + +* `automake` + +* `libtool` + +* `make` + +#### C++ + +The C++ compiler is required for the Aerospike geospatial indexing +feature and its dependency, Google's S2 Geometry Library (both written in C++.) + +* The required CentOS 6/7 package to install is: `gcc-c++`. + +* The required Debian 7/8 and Ubuntu 10/12/14/16 package to install is: `g++`. + +#### OpenSSL + +OpenSSL 0.9.8b or later is required for cryptographic hash functions +(RIPEMD-160 & SHA-1) and pseudo-random number generation. + +* The CentOS 6/7 OpenSSL packages to install are: `openssl`, +`openssl-devel`, `openssl-static`. + +* The Debian 7/8 and Ubuntu 10/12/14/16 OpenSSL packages to install are: +`openssl` and `libssl-dev`. + +#### Lua 5.1 + +The [Lua](http://www.lua.org) 5.1 language is required for User Defined +Function (UDF) support. + +* By default, Aerospike builds with Lua 5.1 support provided by the +[LuaJIT](http://luajit.org) submodule. + +* Alternatively, it is possible to build with standard Lua 5.1 provided +by the build environment. In that case: + + * The CentOS 6/7 Lua packages to install are: `lua`, +`lua-devel`, and `lua-static`. + + * The Debian 7/8 and Ubuntu 10/12/14/16 Lua packages to install are: +`lua5.1` and `liblua5.1-dev`. + + * Build by passing the `USE_LUAJIT=0` option to `make`. + +#### Python 2 + +Running the Telemetry Agent requires Python 2.6+, which is available by default on most +platforms, and can be installed on Ubuntu 16.04 as the package `python`. 
+ +### Submodules + +The Aerospike Database Server build depends upon 8 submodules: + +| Submodule | Description | +|---------- | ----------- | +| common | The Aerospike Common Library | +| jansson | C library for encoding, decoding and manipulating JSON data | +| jemalloc | The JEMalloc Memory Allocator | +| lua-core | The Aerospike Core Lua Source Files | +| luajit | The LuaJIT (Just-In-Time Compiler for Lua) | +| mod-lua | The Aerospike Lua Interface | +| s2-geometry-library | The S2 Spherical Geometry Library | +| telemetry | The Aerospike Telemetry Agent (Community Edition only) | + +After the initial cloning of the `aerospike-server` repo., the +submodules must be fetched for the first time using the following +command: + + $ git submodule update --init + +*Note:* As this project uses submodules, the source archive downloadable +via GitHub's `Download ZIP` button will not build unless the correct +revision of each submodule is first manually installed in the appropriate +`modules` subdirectory. + +## Building Aerospike + +### Default Build + + $ make -- Perform the default build (no packaging.) + +*Note:* You can use the `-j` option with `make` to speed up the build +on multiple CPU cores. For example, to run four parallel jobs: + + $ make -j4 + +### Build Options + + $ make deb -- Build the Debian (Ubuntu) package. + + $ make rpm -- Build the Red Hat Package Manager (RPM) package. + + $ make tar -- Build the "Every Linux" compressed "tar" archive (".tgz") package. + + $ make source -- Package the source code as a compressed "tar" archive. + + $ make clean -- Delete any existing build products, excluding built packages. + + $ make cleanpkg -- Delete built packages. + + $ make cleanall -- Delete all existing build products, including built packages. + + $ make cleangit -- Delete all files untracked by Git. (Use with caution!) + + $ make strip -- Build "strip(1)"ed versions of the server executables. 
+ +### Overriding Default Build Options + + $ make {<Target>}* {<VARIABLE>=<VALUE>}* -- Build <Target>(s) with optional variable overrides. + +#### Example: + + $ make USE_JEM=0 -- Default build *without* JEMalloc support. + +## Configuring Aerospike + +Sample Aerospike configuration files are provided in `as/etc`. The +developer configuration file, `aerospike_dev.conf`, contains basic +settings that should work out-of-the-box on most systems. The package +example configuration files, `aerospike.conf`, and the Solid State Drive +(SSD) version, `aerospike_ssd.conf`, are suitable for running Aerospike +as a system daemon. + +These sample files may be modified for specific use cases (e.g., setting +network addresses, defining namespaces, and setting storage engine +properties) and tuned for maximum performance on a particular +system. Also, system resource limits may need to be increased to allow, +e.g., a greater number of concurrent connections to the database. See +"man limits.conf" for how to change the system's limit on a process' +number of open file descriptors ("nofile".) + +## Running Aerospike + +There are several options for running the Aerospike database. Which +option to use depends upon whether the primary purpose is production +deployment or software development. + +The preferred method for running Aerospike in a production environment +is to build and install the Aerospike package appropriate for the target +Linux distribution (i.e., an `".rpm"`, `".deb"`, or `".tgz"` file), and +then to control the state of the Aerospike daemon, either via the SysV +daemon init script commands, e.g., `service aerospike start`, or else +via `systemctl` on `systemd`-based systems, e.g., `systemctl start aerospike`.
+ +A convenient way to run Aerospike in a development environment is to use +the following commands from within the top-level directory of the source +code tree (`aerospike-server`): + +To create and initialize the `run` directory with the files needed for +running Aerospike, use: + + $ make init + +or, equivalently: + + $ mkdir -p run/{log,work/{smd,{sys,usr}/udf/lua}} + $ cp -pr modules/lua-core/src/* run/work/sys/udf/lua + +To launch the server with `as/etc/aerospike_dev.conf` as the config: + + $ make start + +or, equivalently: + + $ nohup ./modules/telemetry/telemetry.py as/etc/telemetry_dev.conf > /dev/null 2>&1 & + $ target/Linux-x86_64/bin/asd --config-file as/etc/aerospike_dev.conf + +To halt the server: + + $ make stop + +or, equivalently: + + $ PID=`pgrep telemetry.py | grep -v grep`; if [ -n "$PID" ]; then kill $PID; fi + $ kill `cat run/asd.pid` ; rm run/asd.pid + +Please refer to the full documentation on the Aerospike web site, +[`http://aerospike.com/docs/`](http://aerospike.com/docs/), for more +detailed information about configuring and running the Aerospike +Database Server, as well as about the Aerospike client API packages +for popular programming languages. + diff --git a/ai/Makefile b/ai/Makefile new file mode 100644 index 00000000..a800e597 --- /dev/null +++ b/ai/Makefile @@ -0,0 +1,9 @@ +# Aerospike Server -- Aerospike Index +# Makefile + +.PHONY: default +default: all + @echo "done." + +%: + $(MAKE) -C src $@ diff --git a/ai/include/ai_btree.h b/ai/include/ai_btree.h new file mode 100644 index 00000000..e431d074 --- /dev/null +++ b/ai/include/ai_btree.h @@ -0,0 +1,68 @@ +/* + * ai_btree.h + * + * Copyright (C) 2013-2014 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. 
+ * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +#include "base/secondary_index.h" + +#include "ai_obj.h" +#include "btreepriv.h" + +#include + +void ai_btree_create(as_sindex_metadata *imd); + +void ai_btree_destroy(as_sindex_metadata *imd); + +int ai_btree_put(as_sindex_metadata *imd, as_sindex_pmetadata *pimd, void *key, cf_digest *value); + +int ai_btree_delete(as_sindex_metadata *imd, as_sindex_pmetadata *pimd, void *key, cf_digest *val); + +int ai_btree_query(as_sindex_metadata *imd, as_sindex_range *range, as_sindex_qctx *qctx); + +uint64_t ai_btree_get_isize(as_sindex_metadata *imd); + +uint64_t ai_btree_get_nsize(as_sindex_metadata *imd); + +uint64_t ai_btree_get_pimd_nsize(as_sindex_pmetadata *pimd); + +uint64_t ai_btree_get_pimd_isize(as_sindex_pmetadata *pimd); + +int ai_btree_list(char *ns, char *set, as_sindex_metadata **imds, int *num_indexes); + +uint64_t ai_btree_get_numkeys(as_sindex_metadata *imd); + +void ai_btree_dump(as_sindex_metadata *imd, char *fname, bool verbose); + +int ai_btree_build_defrag_list(as_sindex_metadata *imd, as_sindex_pmetadata *pimd, struct ai_obj *icol, ulong *nofst, ulong lim, uint64_t * tot_processed, uint64_t * tot_found, cf_ll *apk2d); + +bool ai_btree_defrag_list(as_sindex_metadata *imd, as_sindex_pmetadata *pimd, cf_ll *apk2d, ulong n2del, ulong *deleted); + +int 
ai_btree_key_hash_from_sbin(as_sindex_metadata *imd, as_sindex_bin_data *sbin); + +int ai_btree_key_hash(as_sindex_metadata *imd, void *skey); + +void ai_btree_delete_ibtr(bt *ibtr); + +void ai_btree_reinit_pimd(as_sindex_pmetadata *pimd, col_type_t sktype); + +void ai_btree_reset_pimd(as_sindex_pmetadata * pimd); diff --git a/ai/include/ai_obj.h b/ai/include/ai_obj.h new file mode 100644 index 00000000..9107a949 --- /dev/null +++ b/ai/include/ai_obj.h @@ -0,0 +1,42 @@ +/* + * ai_obj.h + * + * Copyright (C) 2013-2014 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ +/* + * Aerospike Index Object Declarations. + */ + +#pragma once + +#include + +#include "ai_types.h" + +void init_ai_obj(ai_obj *a); + +void init_ai_objLong(ai_obj *a, ulong l); + +void init_ai_objU160(ai_obj *a, uint160 y); + +void ai_objClone(ai_obj *dest, ai_obj *src); + +bool ai_objEQ(ai_obj *a, ai_obj *b); + +void dump_ai_obj_as_digest(FILE *fp, ai_obj *a); diff --git a/ai/include/ai_types.h b/ai/include/ai_types.h new file mode 100644 index 00000000..ab8a1de2 --- /dev/null +++ b/ai/include/ai_types.h @@ -0,0 +1,101 @@ +/* + * ai_types.h + * + * Copyright (C) 2013-2015 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. 
under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ +/* + * SYNOPSIS + * This file provides common declarations and definitions for + * the Aerospike Index module. + */ + +#pragma once + +#include +#include +#include +#include +#include + +#include +#include + +#define uchar unsigned char +#define ushort16 unsigned short +#define uint32 unsigned int +#define ull unsigned long long +#define uint128 __uint128_t + +#define AS_DIGEST_KEY_SZ 20 +typedef struct uint160 { + char digest[AS_DIGEST_KEY_SZ]; +} uint160; + +// Same as as_sindex_ktype +typedef uint8_t col_type_t; +#define COL_TYPE_INVALID 0 +#define COL_TYPE_LONG 1 +#define COL_TYPE_DIGEST 2 +#define COL_TYPE_GEOJSON 3 +#define COL_TYPE_MAX 4 + +#define C_IS_L(ctype) (ctype == COL_TYPE_LONG) +#define C_IS_DG(ctype) (ctype == COL_TYPE_DIGEST) +#define C_IS_G(ctype) (ctype == COL_TYPE_GEOJSON) +// TODO - should this have C_IS_G as well +#define C_IS_NUM(ctype) (C_IS_L(ctype)) + +#define VOIDINT (void *) (long) + +#define SPLICE_160(num) \ + ull ubh, ubm; uint32 u; \ + char *pbu = (char *) &num; \ + memcpy(&ubh, pbu + 12, 8); \ + memcpy(&ubm, pbu + 4, 8); \ + memcpy(&u, pbu, 4); + +#define DEBUG_U160(fp, num) \ + { \ + SPLICE_160(num); \ + fprintf(fp, "DEBUG_U160: high: %llu mid: %llu low: %u", ubh, ubm, u); \ + } + +/***************** Opaque
Forward Type Declarations *****************/ + +/* + * B-Tree Object [Implementation defined in "btreepriv.h".] + */ +typedef struct btree bt; + + +/***************** Type Declarations *****************/ +typedef struct ai_obj { + ulong l; + uint160 y; + col_type_t type; +} ai_obj; + +typedef struct filter { + ai_obj alow; + ai_obj ahigh; +} f_t; + +typedef struct check_sql_where_clause { + f_t wf; +} cswc_t; diff --git a/ai/include/bt.h b/ai/include/bt.h new file mode 100644 index 00000000..8d0f140d --- /dev/null +++ b/ai/include/bt.h @@ -0,0 +1,90 @@ +/* + * bt.h + * + * Copyright (C) 2012-2014 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. 
If not, see http://www.gnu.org/licenses/ + */ +/* + * Creation of different btree types and + * Public Btree Operations w/ stream abstractions under the covers + */ + +#pragma once + +#include "ai_obj.h" +#include "btreepriv.h" + +bt *createIBT (col_type_t ktype, int imatch); +bt *createNBT (col_type_t ktype); + +/* different Btree types */ +#define INDEX_BTREE 0 +#define NODE_BTREE 1 + +// SPAN OUTS +// These values are chosen to fit the node size into multiples +// of the cacheline size (64 bytes) +#define BTREE_LONG_TYPE_DEGREE 31 // node size becomes 504 +#define BTREE_STRING_TYPE_DEGREE 18 // node size becomes 512 + +#define NBT_DG(btr) \ + (btr->s.btype == NODE_BTREE && C_IS_DG(btr->s.ktype)) + +#define NBT(btr) (NBT_DG(btr)) + +typedef struct ulong_ulong_key { + ulong key; + ulong val; +} __attribute__ ((packed)) llk; +#define LL(btr) (btr->s.bflag & BTFLAG_ULONG_ULONG) +#define LL_SIZE 16 +typedef struct u160_ulong_key { + uint160 key; + ulong val; +} __attribute__ ((packed)) ylk; +#define YL(btr) (btr->s.bflag & BTFLAG_U160_ULONG) +#define YL_SIZE 28 + +typedef struct btk_t { + llk LL; + ylk YL; +} btk_t; + +#define DECLARE_BT_KEY(akey, ret) \ + bool med; uint32 ksize; btk_t btk; \ + char *btkey = createBTKey(akey, &med, &ksize, btr, &btk);/*FREE ME 026*/ \ + if (!btkey) return ret; + +typedef struct crs_t { + llk LL_StreamPtr; + ylk YL_StreamPtr; +} crs_t; + +#define OTHER_BT(btr) (btr->s.bflag >= BTFLAG_ULONG_ULONG) +#define NONE_BT(btr) (btr->s.bflag == BTFLAG_U160) +#define BIG_BT(btr) (btr->s.ksize > 8) + +#define IS_GHOST(btr, rrow) (NONE_BT(btr) && rrow && !(*(uchar *)rrow)) + +void btIndAdd (bt *ibtr, ai_obj *ikey, bt *nbtr); +bt *btIndFind (bt *ibtr, ai_obj *ikey); +int btIndDelete(bt *ibtr, ai_obj *ikey); + +bool btIndNodeAdd (bt *nbtr, ai_obj *apk); +bool btIndNodeExist (bt *nbtr, ai_obj *apk); +int btIndNodeDelete (bt *nbtr, ai_obj *apk, ai_obj *ocol); diff --git a/ai/include/bt_iterator.h b/ai/include/bt_iterator.h new file mode 100644 index
00000000..468706c9 --- /dev/null +++ b/ai/include/bt_iterator.h @@ -0,0 +1,98 @@ +/* + * bt_iterator.h + * + * Copyright (C) 2013-2014 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +/* + * This file implements Aerospike Index B-tree iterators.
+ */ + +#pragma once + +#include "ai_types.h" +#include "bt.h" + +typedef struct btEntry { + void *key; + void *val; + void *stream; // some iterators need the raw stream (INDEX CURSORS) + bt_n *x; // some iterators need the position in the bt_n + int i; // some iterators need the position in the bt_n + bool missed; + uint32 dr; // RANGE DELETEs simulate Keys using DR +} btEntry; + +typedef struct bTreeLinkedListNode { // 3ptr(24) 2int(8) -> 32 bytes + struct bTreeLinkedListNode *parent; + struct btreenode *self; + struct bTreeLinkedListNode *child; + int ik; + int in; //TODO in not needed, ik & logic is enough +} bt_ll_n; + +typedef void iter_single(struct btIterator *iter); + +/* using 16 as 8^16 can hold 2.8e14 elements (8 is min members in a btn)*/ +#define MAX_BTREE_DEPTH 16 +typedef struct btIterator { // 60B + 16*bt_ll_n(512) -> dont malloc + bt *btr; + bt_ll_n *bln; + int depth; + iter_single *iNode; // function to iterate on node's + iter_single *iLeaf; // function to iterate on leaf's + bool finished; + long high; // HIGH for INT & LONG + uint160 highy; // HIGH for U160 + uchar num_nodes; // \/-slot in nodes[] + bt_ll_n nodes[MAX_BTREE_DEPTH]; +} btIterator; + +typedef struct btSIter { // btIterator 500+ bytes -> STACK (globals) ALLOCATE + btIterator x; + bool missed; // CURRENT iteration is miss + bool nim; // NEXT iteration is miss + bool empty; + bool scan; + col_type_t ktype; + btEntry be; + ai_obj key; // static AI_OBJ for be.key + char dofree; +} btSIter; + +#define II_FAIL -1 +#define II_OK 0 +#define II_LEAF_EXIT 1 +#define II_ONLY_RIGHT 2 +#define II_MISS 3 +#define II_L_MISS 4 + +bt_ll_n *get_new_iter_child(btIterator *iter); +void to_child(btIterator *iter, bt_n* self); +int init_iterator(bt *btr, bt_data_t simkey, struct btIterator *iter); + +btSIter *btGetRangeIter (bt *btr, ai_obj *alow, ai_obj *ahigh, bool asc); +btSIter *btGetFullRangeIter(bt *btr, bool asc, cswc_t *w); +btSIter *btGetFullXthIter (bt *btr, ulong x, bool asc, cswc_t *w, 
long lim); +btSIter *btSetFullRangeIter(btSIter *iter, bt *btr, bool asc, cswc_t *w); +btSIter *btSetRangeIter (btSIter *iter, bt *btr, ai_obj *alow, ai_obj *ahigh, bool asc); +btEntry *btRangeNext (btSIter *iter, bool asc); +void btReleaseRangeIterator(btSIter *iter); +bool assignMinKey(bt *btr, ai_obj *key); +bool assignMaxKey(bt *btr, ai_obj *key); diff --git a/ai/include/bt_output.h b/ai/include/bt_output.h new file mode 100644 index 00000000..605d7428 --- /dev/null +++ b/ai/include/bt_output.h @@ -0,0 +1,34 @@ +/* + * bt_output.h + * + * Copyright (C) 2013 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ +/* + * SYNOPSIS + * This file provides declarations for the B-Tree output functions. + */ + +#pragma once + +#include <stdio.h> + +#include "bt.h" + +void bt_dump_info(FILE *fp, bt *btr); +void bt_dumptree(FILE *fp, bt *btr, bool is_index, bool verbose); diff --git a/ai/include/btree.h b/ai/include/btree.h new file mode 100644 index 00000000..6b40272c --- /dev/null +++ b/ai/include/btree.h @@ -0,0 +1,87 @@ +/*- + * Copyright 1997, 1998, 2001 John-Mark Gurney. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + */ + +#pragma once + +#include "ai_types.h" + +struct btree; +struct btreenode; + +#define VOIDSIZE 8 /* force to 8, otherwise UU would not work on 32bit */ +#define U160SIZE AS_DIGEST_KEY_SZ + +typedef struct btree_specification { /* size 9B */ + unsigned char ktype; /* [STRING,INT,FLOAT,LONG]--------------------| */ + unsigned char btype; /* [data,index,node] | */ + unsigned char ksize; /* UU&INDEX(8), UL&LU(12), LL(16) | */ + unsigned int bflag; /* [OTHER_BT + BTFLAG_*_INDEX] | */ + unsigned short num; /*--------------------------------------------| */ +} __attribute__ ((packed)) bts_t; + +typedef void * bt_data_t; +typedef int (*bt_cmp_t)(bt_data_t k1, bt_data_t k2); + +// CONSTRUCTOR CONSTRUCTOR CONSTRUCTOR CONSTRUCTOR CONSTRUCTOR CONSTRUCTOR +struct btree *bt_create(bt_cmp_t cmp, bts_t *s, char dirty); + +// CRUD CRUD CRUD CRUD CRUD CRUD CRUD CRUD CRUD CRUD CRUD CRUD CRUD CRUD CRUD +typedef struct data_with_dirt_t { + bt_data_t k; // the data + uint32 dr; // dirty-right +} dwd_t; +bool bt_insert (struct btree *btr, bt_data_t k, uint32 dr); +dwd_t bt_delete (struct btree *btr, bt_data_t k, bool leafd); + +// OPERATORS OPERATORS OPERATORS OPERATORS OPERATORS OPERATORS OPERATORS +bt_data_t bt_max (struct btree *btr); +bt_data_t bt_min (struct btree *btr); +bt_data_t bt_find (struct btree *btr, bt_data_t k, ai_obj *akey); + +// DIRTY DIRTY DIRTY DIRTY DIRTY DIRTY DIRTY DIRTY DIRTY DIRTY DIRTY DIRTY +struct btreenode *addDStoBTN(struct btree *btr, struct btreenode *x, + struct btreenode *p, int pi, char dirty); + +uint32 getDR (struct btree *btr, struct btreenode *x, int i); +bool bt_exist (struct btree *btr, bt_data_t k, ai_obj *akey); + +typedef struct data_with_miss_t { + bt_data_t k; // the data + bool miss; + struct btreenode *x; // NOTE: used for DELETE an EVICTed row + int i; // NOTE: used for DELETE an EVICTed row + struct btreenode *p; // NOTE: used for DELETE an EVICTed row + int pi; // NOTE: used for DELETE an EVICTed row +} dwm_t; + +struct 
ai_obj; +dwm_t findnodekey(struct btree *btr, struct btreenode *x, bt_data_t k, ai_obj *akey); + +// ITERATOR ITERATOR ITERATOR ITERATOR ITERATOR ITERATOR ITERATOR ITERATOR +struct btIterator; +int bt_init_iterator(struct btree *br, bt_data_t k, struct btIterator *iter, ai_obj *alow); +void bt_destroy (struct btree *btr); diff --git a/ai/include/btreepriv.h b/ai/include/btreepriv.h new file mode 100644 index 00000000..f010eeeb --- /dev/null +++ b/ai/include/btreepriv.h @@ -0,0 +1,105 @@ +/*- + * Copyright 1997-1999, 2001 John-Mark Gurney. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#pragma once + +#include "btree.h" + +// BTREE TYPE FLAGS +#define BTFLAG_U160 0x00 +#define BTFLAG_ULONG_ULONG 0x01 +#define BTFLAG_U160_ULONG 0x02 + +struct btree { // 62 Bytes -> 64B + struct btreenode *root; + bt_cmp_t cmp; + + unsigned long msize; + unsigned long nsize; // sizeof underlying nbtr + unsigned long dsize; + + unsigned int numkeys; /* --- 8 bytes | */ + unsigned int numnodes; /* ------------| */ + + unsigned short keyofst; /* --- 8 bytes | */ //TODO can be computed + unsigned short nodeofst; /* | */ //TODO can be computed + unsigned short nbyte; /* | */ + unsigned short kbyte; /* ------------| */ + + unsigned char t; + unsigned char nbits; + bts_t s; // 9 bytes + + unsigned int dirty_left; // 4 bytes (num evicted before 1st key) + unsigned char dirty; // NOTE: bool: if ANY btn in btr is dirty +} __attribute__ ((packed)); + +// Aerospike Index local list ... this is to optimize for space for the high selectivity index. +typedef struct { + uint8_t capacity; + uint8_t used; + uint8_t data[]; +} __attribute__ ((__packed__)) ai_arr; + +/* + * Note: The "ai_arr" structure is limited to 8 bits for capacity / used. + */ +#define AI_ARR_MAX_SIZE 255 + +// Do not change order it is same as struct B-tree inside Aerospike Index ~~ +// pretty hacky stuff. 
Inside Aerospike Index code is_btree is checked +typedef struct { + union { + ai_arr *arr; + bt *nbtr; + } u; + bool is_btree; +} __attribute__ ((__packed__)) ai_nbtr; + +//NOTE: For Aerospike, not currently using EVICT, save one byte in bt_n +// This changes a 2049 allocation to 2048 -> which is IMPORTANT +typedef struct btreenode { // 9 bytes -> 16 bytes + unsigned int scion; /* 4 billion max scion */ + unsigned short n; /* 65 thousand max entries (per bt_n)*/ + unsigned char leaf; + // DIRTY: -1->CLEAN, + // 0->TreeDirty but BTN_clean, 1->ucharDR, 2->ushortDR, 3->uintDR + char dirty; +} __attribute__ ((packed)) bt_n; + +// BTREE access of KEYs & NODEs via position in bt_n +void *KEYS(bt *btr, bt_n *x, int i); +#define NODES(btr, x) ((bt_n **)((char *)x + btr->nodeofst)) + +#define GET_BTN_SIZE(leaf) \ + size_t nsize = leaf ? btr->kbyte : btr->nbyte; +#define GET_BTN_MSIZE(dirty) \ + size_t msize = (dirty == -1) ? nsize : nsize + sizeof(void *); +#define GET_BTN_SIZES(leaf, dirty) \ + GET_BTN_SIZE(leaf) GET_BTN_MSIZE(dirty) +#define GET_DS(x, nsize) (*((void **)((char *)x + nsize))) + +bt_n *findminnode(bt *btr, bt_n *x); diff --git a/ai/include/stream.h b/ai/include/stream.h new file mode 100644 index 00000000..bc6c8658 --- /dev/null +++ b/ai/include/stream.h @@ -0,0 +1,41 @@ +/* + * stream.h + * + * Copyright (C) 2012-2014 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. 
+ * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ +/* + * This file implements stream parsing for rows + */ + +#pragma once + +#include "ai_obj.h" +#include "bt.h" + +int u160Cmp (void *s1, void *s2); +int llCmp (void *s1, void *s2); +int ylCmp (void *s1, void *s2); + +char *createBTKey(ai_obj *key, bool *med, uint32 *ksize, bt *btr, btk_t *btk); +void destroyBTKey(char *btkey, bool med); + +void convertStream2Key(uchar *stream, ai_obj *key, bt *btr); +uchar *parseStream(uchar *stream, bt *btr); +void *createStream(bt *btr, void *val, char *btkey, uint32 klen, uint32 *ssize, crs_t *crs); +bool destroyStream(bt *btr, uchar *ostream); diff --git a/ai/src/Makefile b/ai/src/Makefile new file mode 100644 index 00000000..64cccaf9 --- /dev/null +++ b/ai/src/Makefile @@ -0,0 +1,33 @@ +# Aerospike Server - Aerospike Index +# Makefile + +DEPTH = ../.. +include $(DEPTH)/make_in/Makefile.in + +HEADERS = ai_btree.h ai_types.h ai_obj.h bt.h bt_iterator.h bt_output.h btree.h btreepriv.h stream.h + +SOURCES = ai_btree.c ai_obj.c bt.c bt_code.c bt_iterator.c bt_output.c stream.c + +INCLUDES += $(INCLUDE_DIR:%=-I%) +INCLUDES += -I$(CF)/include -I$(AS)/include +INCLUDES += -I$(AS)/include +INCLUDES += -I$(COMMON)/target/$(PLATFORM)/include +INCLUDES += -I$(MOD_LUA)/target/$(PLATFORM)/include + +LIBRARY = $(LIBRARY_DIR)/libai.a + +OBJECTS = $(SOURCES:%.c=$(OBJECT_DIR)/%.o) +DEPENDENCIES = $(OBJECTS:%.o=%.d) + +.PHONY: all +all: $(LIBRARY) + +.PHONY: clean +clean: + $(RM) $(OBJECTS) $(LIBRARY) + $(RM) $(DEPENDENCIES) + +$(LIBRARY): $(OBJECTS) + $(AR) rs $(LIBRARY) $(OBJECTS) + +include $(DEPTH)/make_in/Makefile.targets diff --git a/ai/src/ai_btree.c b/ai/src/ai_btree.c new file mode 100644 index 00000000..c81ce4b7 --- /dev/null +++ b/ai/src/ai_btree.c @@ -0,0 +1,1178 @@ +/* + * ai_btree.c + * + * Copyright (C) 2013-2014 Aerospike, Inc. 
+ * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#include +#include +#include +#include +#include +#include + +#include "ai_obj.h" +#include "ai_btree.h" +#include "bt_iterator.h" +#include "bt_output.h" +#include "stream.h" +#include "base/thr_sindex.h" +#include "base/cfg.h" +#include "fabric/partition.h" + +#include +#include +#include +#include + +#include "fault.h" + +#define AI_ARR_MAX_USED 32 + +/* + * Global determining whether to use array rather than B-Tree. + */ +bool g_use_arr = true; + +static void +cloneDigestFromai_obj(cf_digest *d, ai_obj *akey) +{ + memcpy(d, &akey->y, CF_DIGEST_KEY_SZ); +} + +static void +init_ai_objFromDigest(ai_obj *akey, cf_digest *d) +{ + init_ai_objU160(akey, *(uint160 *)d); +} + +const uint8_t INIT_CAPACITY = 1; + +static ai_arr * +ai_arr_new() +{ + ai_arr *arr = cf_malloc(sizeof(ai_arr) + (INIT_CAPACITY * CF_DIGEST_KEY_SZ)); + arr->capacity = INIT_CAPACITY; + arr->used = 0; + return arr; +} + +static void +ai_arr_move_to_tree(ai_arr *arr, bt *nbtr) +{ + for (int i = 0; i < arr->used; i++) { + ai_obj apk; + init_ai_objFromDigest(&apk, (cf_digest *)&arr->data[i * CF_DIGEST_KEY_SZ]); + if (!btIndNodeAdd(nbtr, &apk)) { + // what to do ?? 
+ continue; + } + } +} + +/* + * Side effect if success full *arr will be freed + */ +static void +ai_arr_destroy(ai_arr *arr) +{ + if (!arr) return; + cf_free(arr); +} + +static int +ai_arr_size(ai_arr *arr) +{ + if (!arr) return 0; + return(sizeof(ai_arr) + (arr->capacity * CF_DIGEST_KEY_SZ)); +} + +/* + * Finds the digest in the AI array. + * Returns + * idx if found + * -1 if not found + */ +static int +ai_arr_find(ai_arr *arr, cf_digest *dig) +{ + for (int i = 0; i < arr->used; i++) { + if (0 == cf_digest_compare(dig, (cf_digest *)&arr->data[i * CF_DIGEST_KEY_SZ])) { + return i; + } + } + return -1; +} + +static ai_arr * +ai_arr_shrink(ai_arr *arr) +{ + int size = arr->capacity / 2; + + // Do not shrink if the capacity not greater than 4 + // or if the halving capacity is not a extra level + // over currently used + if ((arr->capacity <= 4) || + (size < arr->used * 2)) { + return arr; + } + + ai_arr * temp_arr = cf_realloc(arr, sizeof(ai_arr) + (size * CF_DIGEST_KEY_SZ)); + temp_arr->capacity = size; + return temp_arr; +} + +static ai_arr * +ai_arr_delete(ai_arr *arr, cf_digest *dig, bool *notfound) +{ + int idx = ai_arr_find(arr, dig); + // Nothing to delete + if (idx < 0) { + *notfound = true; + return arr; + } + if (idx != arr->used - 1) { + int dest_offset = idx * CF_DIGEST_KEY_SZ; + int src_offset = (arr->used - 1) * CF_DIGEST_KEY_SZ; + // move last element + memcpy(&arr->data[dest_offset], &arr->data[src_offset], CF_DIGEST_KEY_SZ); + } + arr->used--; + return ai_arr_shrink(arr); +} + +/* + * Returns + * arr pointer in case of successful operation + * NULL in case of failure + */ +static ai_arr * +ai_arr_expand(ai_arr *arr) +{ + int size = arr->capacity * 2; + + if (size > AI_ARR_MAX_SIZE) { + cf_crash(AS_SINDEX, "Refusing to expand ai_arr to %d (beyond limit of %d)", size, AI_ARR_MAX_SIZE); + } + + arr = cf_realloc(arr, sizeof(ai_arr) + (size * CF_DIGEST_KEY_SZ)); + //cf_info(AS_SINDEX, "EXPAND REALLOC to %d", size); + arr->capacity = size; + return arr; 
+} + +/* + * Returns + * arr in case of success + * NULL in case of failure + */ +static ai_arr * +ai_arr_insert(ai_arr *arr, cf_digest *dig, bool *found) +{ + int idx = ai_arr_find(arr, dig); + // already found + if (idx >= 0) { + *found = true; + return arr; + } + if (arr->used == arr->capacity) { + arr = ai_arr_expand(arr); + } + memcpy(&arr->data[arr->used * CF_DIGEST_KEY_SZ], dig, CF_DIGEST_KEY_SZ); + arr->used++; + return arr; +} + +/* + * Returns the size diff + */ +static int +anbtr_check_convert(ai_nbtr *anbtr, col_type_t sktype) +{ + // Nothing to do + if (anbtr->is_btree) + return 0; + + ai_arr *arr = anbtr->u.arr; + if (arr && (arr->used >= AI_ARR_MAX_USED)) { + //cf_info(AS_SINDEX,"Flipped @ %d", arr->used); + ulong ba = ai_arr_size(arr); + // Allocate btree move digest from arr to btree + bt *nbtr = createNBT(sktype); + if (!nbtr) { + cf_warning(AS_SINDEX, "btree allocation failure"); + return 0; + } + + ai_arr_move_to_tree(arr, nbtr); + ai_arr_destroy(anbtr->u.arr); + + // Update anbtr + anbtr->u.nbtr = nbtr; + anbtr->is_btree = true; + + ulong aa = nbtr->msize; + return (aa - ba); + } + return 0; +} + +/* + * return -1 in case of failure + * size of allocation in case of success + */ +static int +anbtr_check_init(ai_nbtr *anbtr, col_type_t sktype) +{ + bool create_arr = false; + bool create_nbtr = false; + + if (anbtr->is_btree) { + if (anbtr->u.nbtr) { + create_nbtr = false; + } else { + create_nbtr = true; + } + } else { + if (anbtr->u.arr) { + create_arr = false; + } else { + if (g_use_arr) { + create_arr = true; + } else { + create_nbtr = true; + } + } + } + + // create array or btree + if (create_arr) { + anbtr->u.arr = ai_arr_new(); + return ai_arr_size(anbtr->u.arr); + } else if (create_nbtr) { + anbtr->u.nbtr = createNBT(sktype); + if (!anbtr->u.nbtr) { + return -1; + } + anbtr->is_btree = true; + return anbtr->u.nbtr->msize; + } else { + if (!anbtr->u.arr && !anbtr->u.nbtr) { + cf_warning(AS_SINDEX, "Something wrong!!!"); + return -1; + } + 
} + return 0; +} + +/* + * Insert operation for the nbtr does the following + * 1. Sets up anbtr if it is not set up + * 2. Inserts in the arr or nbtr depending on the number of elements. + * 3. Cuts over from arr to btr at AI_ARR_MAX_USED + * + * Parameter: ibtr : Btree of key + * acol : Secondary index key + * apk : value (primary key to be inserted) + * sktype : value type (U160 currently) + * + * Returns: + * AS_SINDEX_OK : In case of success + * AS_SINDEX_ERR : In case of failure + * AS_SINDEX_KEY_FOUND : If key already exists + */ +static int +reduced_iAdd(bt *ibtr, ai_obj *acol, ai_obj *apk, col_type_t sktype) +{ + ai_nbtr *anbtr = (ai_nbtr *)btIndFind(ibtr, acol); + ulong ba = 0, aa = 0; + bool allocated_anbtr = false; + if (!anbtr) { + anbtr = cf_malloc(sizeof(ai_nbtr)); + aa += sizeof(ai_nbtr); + memset(anbtr, 0, sizeof(ai_nbtr)); + allocated_anbtr = true; + } + + // Init the array + int ret = anbtr_check_init(anbtr, sktype); + if (ret < 0) { + if (allocated_anbtr) { + cf_free(anbtr); + } + return AS_SINDEX_ERR; + } else if (ret) { + ibtr->nsize += ret; + btIndAdd(ibtr, acol, (bt *)anbtr); + } + + // Convert from arr to nbtr if limit is hit + ibtr->nsize += anbtr_check_convert(anbtr, sktype); + + // If already a btree use it + if (anbtr->is_btree) { + bt *nbtr = anbtr->u.nbtr; + if (!nbtr) { + return AS_SINDEX_ERR; + } + + if (btIndNodeExist(nbtr, apk)) { + return AS_SINDEX_KEY_FOUND; + } + + ba += nbtr->msize; + if (!btIndNodeAdd(nbtr, apk)) { + return AS_SINDEX_ERR; + } + aa += nbtr->msize; + + } else { + ai_arr *arr = anbtr->u.arr; + if (!arr) { + return AS_SINDEX_ERR; + } + + ba += ai_arr_size(anbtr->u.arr); + bool found = false; + ai_arr *t_arr = ai_arr_insert(arr, (cf_digest *)&apk->y, &found); + if (found) { + return AS_SINDEX_KEY_FOUND; + } + anbtr->u.arr = t_arr; + aa += ai_arr_size(anbtr->u.arr); + } + ibtr->nsize += (aa - ba); // ibtr inherits nbtr + + return AS_SINDEX_OK; +} + +/* + * Delete operation for the nbtr does the following. 
Delete in the arr or nbtr + * based on state of anbtr + * + * Parameter: ibtr : Btree of key + * acol : Secondary index key + * apk : value (primary key to be deleted) + * + * Returns: + * AS_SINDEX_OK : In case of success + * AS_SINDEX_ERR : In case of failure + * AS_SINDEX_KEY_NOTFOUND : If key does not exist + */ +static int +reduced_iRem(bt *ibtr, ai_obj *acol, ai_obj *apk) +{ + ai_nbtr *anbtr = (ai_nbtr *)btIndFind(ibtr, acol); + ulong ba = 0, aa = 0; + if (!anbtr) { + return AS_SINDEX_KEY_NOTFOUND; + } + if (anbtr->is_btree) { + if (!anbtr->u.nbtr) return AS_SINDEX_ERR; + + // Remove from nbtr if found + bt *nbtr = anbtr->u.nbtr; + if (!btIndNodeExist(nbtr, apk)) { + return AS_SINDEX_KEY_NOTFOUND; + } + ba = nbtr->msize; + + // TODO - Needs to be cleaner, type convert from signed + // to unsigned. Should be 64 bit !! + int nkeys_before = nbtr->numkeys; + int nkeys_after = btIndNodeDelete(nbtr, apk, NULL); + aa = nbtr->msize; + + if (nkeys_after == nkeys_before) { + return AS_SINDEX_KEY_NOTFOUND; + } + + // remove from ibtr + if (nkeys_after == 0) { + btIndDelete(ibtr, acol); + aa = 0; + bt_destroy(nbtr); + ba += sizeof(ai_nbtr); + cf_free(anbtr); + } + } else { + if (!anbtr->u.arr) return AS_SINDEX_ERR; + + // Remove from arr if found + bool notfound = false; + ba = ai_arr_size(anbtr->u.arr); + anbtr->u.arr = ai_arr_delete(anbtr->u.arr, (cf_digest *)&apk->y, &notfound); + if (notfound) return AS_SINDEX_KEY_NOTFOUND; + aa = ai_arr_size(anbtr->u.arr); + + // Remove from ibtr + if (anbtr->u.arr->used == 0) { + btIndDelete(ibtr, acol); + aa = 0; + ai_arr_destroy(anbtr->u.arr); + ba += sizeof(ai_nbtr); + cf_free(anbtr); + } + } + ibtr->nsize -= (ba - aa); + + return AS_SINDEX_OK; +} + +int +ai_btree_key_hash_from_sbin(as_sindex_metadata *imd, as_sindex_bin_data *b) +{ + uint64_t u; + + if (C_IS_DG(imd->sktype)) { + char *x = (char *) &b->digest; // x += 4; + u = ((* (uint128 *) x) % imd->nprts); + } else { + u = (((uint64_t) b->u.i64) % imd->nprts); + } + + return 
(int) u; +} + +int +ai_btree_key_hash(as_sindex_metadata *imd, void *skey) +{ + uint64_t u; + + if (C_IS_DG(imd->sktype)) { + char *x = (char *) ((cf_digest *)skey); // x += 4; + u = ((* (uint128 *) x) % imd->nprts); + } else { + u = ((*(uint64_t*)skey) % imd->nprts); + } + + return (int) u; +} + +/* + * Return 0 in case of success + * -1 in case of failure + */ +static int +btree_addsinglerec(as_sindex_metadata *imd, ai_obj * key, cf_digest *dig, cf_ll *recl, uint64_t *n_bdigs, + bool * can_partition_query, bool partitions_pre_reserved) +{ + // The digests which belongs to one of the query-able partitions are elligible to go into recl + uint32_t pid = as_partition_getid(dig); + as_namespace * ns = imd->si->ns; + if (partitions_pre_reserved) { + if (!can_partition_query[pid]) { + return 0; + } + } + else { + if (! client_replica_maps_is_partition_queryable(ns, pid)) { + return 0; + } + } + + bool create = (cf_ll_size(recl) == 0) ? true : false; + as_index_keys_arr * keys_arr = NULL; + if (!create) { + cf_ll_element * ele = cf_ll_get_tail(recl); + keys_arr = ((as_index_keys_ll_element*)ele)->keys_arr; + if (keys_arr->num == AS_INDEX_KEYS_PER_ARR) { + create = true; + } + } + if (create) { + keys_arr = as_index_get_keys_arr(); + if (!keys_arr) { + cf_warning(AS_SINDEX, "Fail to allocate sindex key value array"); + return -1; + } + as_index_keys_ll_element * node = cf_malloc(sizeof(as_index_keys_ll_element)); + node->keys_arr = keys_arr; + cf_ll_append(recl, (cf_ll_element *)node); + } + // Copy the digest (value) + memcpy(&keys_arr->pindex_digs[keys_arr->num], dig, CF_DIGEST_KEY_SZ); + + // Copy the key + if (C_IS_DG(imd->sktype)) { + memcpy(&keys_arr->sindex_keys[keys_arr->num].key.str_key, &key->y, CF_DIGEST_KEY_SZ); + } + else { + keys_arr->sindex_keys[keys_arr->num].key.int_key = key->l; + } + + keys_arr->num++; + *n_bdigs = *n_bdigs + 1; + return 0; +} + +/* + * Return 0 in case of success + * -1 in case of failure + */ +static int 
+add_recs_from_nbtr(as_sindex_metadata *imd, ai_obj *ikey, bt *nbtr, as_sindex_qctx *qctx, bool fullrng) +{ + int ret = 0; + ai_obj sfk, efk; + init_ai_obj(&sfk); + init_ai_obj(&efk); + btSIter *nbi; + btEntry *nbe; + btSIter stack_nbi; + + if (fullrng) { + nbi = btSetFullRangeIter(&stack_nbi, nbtr, 1, NULL); + } else { // search from LAST batches end-point + init_ai_objFromDigest(&sfk, &qctx->bdig); + assignMaxKey(nbtr, &efk); + nbi = btSetRangeIter(&stack_nbi, nbtr, &sfk, &efk, 1); + } + if (nbi) { + while ((nbe = btRangeNext(nbi, 1))) { + ai_obj *akey = nbe->key; + // FIRST can be REPEAT (last batch) + if (!fullrng && ai_objEQ(&sfk, akey)) { + continue; + } + if (btree_addsinglerec(imd, ikey, (cf_digest *)&akey->y, qctx->recl, &qctx->n_bdigs, + qctx->can_partition_query, qctx->partitions_pre_reserved)) { + ret = -1; + break; + } + if (qctx->n_bdigs == qctx->bsize) { + if (ikey) { + ai_objClone(qctx->bkey, ikey); + } + cloneDigestFromai_obj(&qctx->bdig, akey); + break; + } + } + btReleaseRangeIterator(nbi); + } else { + cf_warning(AS_QUERY, "Could not find nbtr iterator.. skipping !!"); + } + return ret; +} + +static int +add_recs_from_arr(as_sindex_metadata *imd, ai_obj *ikey, ai_arr *arr, as_sindex_qctx *qctx) +{ + bool ret = 0; + + for (int i = 0; i < arr->used; i++) { + if (btree_addsinglerec(imd, ikey, (cf_digest *)&arr->data[i * CF_DIGEST_KEY_SZ], qctx->recl, + &qctx->n_bdigs, qctx->can_partition_query, qctx->partitions_pre_reserved)) { + ret = -1; + break; + } + // do not break on hitting batch limit, if the tree converts to + // bt from arr, there is no way to know which digest were already + // returned when attempting subsequent batch. Return the entire + // thing. 
+ } + // mark nbtr as finished and copy the offset + qctx->nbtr_done = true; + if (ikey) { + ai_objClone(qctx->bkey, ikey); + } + + return ret; +} + +/* + * Return 0 in case of success + * -1 in case of failure + */ +static int +get_recl(as_sindex_metadata *imd, ai_obj *afk, as_sindex_qctx *qctx) +{ + as_sindex_pmetadata *pimd = &imd->pimd[qctx->pimd_idx]; + ai_nbtr *anbtr = (ai_nbtr *)btIndFind(pimd->ibtr, afk); + + if (!anbtr) { + return 0; + } + + if (anbtr->is_btree) { + if (add_recs_from_nbtr(imd, afk, anbtr->u.nbtr, qctx, qctx->new_ibtr)) { + return -1; + } + } else { + // If already entire batch is returned + if (qctx->nbtr_done) { + return 0; + } + if (add_recs_from_arr(imd, afk, anbtr->u.arr, qctx)) { + return -1; + } + } + return 0; +} + +/* + * Return 0 in case of success + * -1 in case of failure + */ +static int +get_numeric_range_recl(as_sindex_metadata *imd, uint64_t begk, uint64_t endk, as_sindex_qctx *qctx) +{ + ai_obj sfk; + init_ai_objLong(&sfk, qctx->new_ibtr ? begk : qctx->bkey->l); + ai_obj efk; + init_ai_objLong(&efk, endk); + as_sindex_pmetadata *pimd = &imd->pimd[qctx->pimd_idx]; + bool fullrng = qctx->new_ibtr; + int ret = 0; + btSIter *bi = btGetRangeIter(pimd->ibtr, &sfk, &efk, 1); + btEntry *be; + + if (bi) { + while ((be = btRangeNext(bi, 1))) { + ai_obj *ikey = be->key; + ai_nbtr *anbtr = be->val; + + if (!anbtr) { + ret = -1; + break; + } + + // figure out nbtr to deal with. If the key which was + // used last time vanishes work with next key. If the + // key exist but 'last' entry made to list in the last + // iteration; Move to next nbtr + if (!fullrng) { + if (!ai_objEQ(&sfk, ikey)) { + fullrng = 1; // bkey disappeared + } else if (qctx->nbtr_done) { + qctx->nbtr_done = false; + // If we are moving to the next key, we need + // to search the full range. 
+ fullrng = 1; + continue; + } + } + + if (anbtr->is_btree) { + if (add_recs_from_nbtr(imd, ikey, anbtr->u.nbtr, qctx, fullrng)) { + ret = -1; + break; + } + } else { + if (add_recs_from_arr(imd, ikey, anbtr->u.arr, qctx)) { + ret = -1; + break; + } + } + + // Since add_recs_from_arr() returns entire thing and do not support the batch limit, + // >= operator is needed here. + if (qctx->n_bdigs >= qctx->bsize) { + break; + } + + // If it reaches here, this means last key could not fill the batch. + // So if we are to start a new key, search should be done on full range + // and the new nbtr is obviously not done. + fullrng = 1; + qctx->nbtr_done = false; + } + btReleaseRangeIterator(bi); + } + return ret; +} + +int +ai_btree_query(as_sindex_metadata *imd, as_sindex_range *srange, as_sindex_qctx *qctx) +{ + bool err = 1; + if (!srange->isrange) { // EQUALITY LOOKUP + ai_obj afk; + init_ai_obj(&afk); + if (C_IS_DG(imd->sktype)) { + init_ai_objFromDigest(&afk, &srange->start.digest); + } + else { + init_ai_objLong(&afk, srange->start.u.i64); + } + err = get_recl(imd, &afk, qctx); + } else { // RANGE LOOKUP + err = get_numeric_range_recl(imd, srange->start.u.i64, srange->end.u.i64, qctx); + } + return (err ? AS_SINDEX_ERR_NO_MEMORY : + (qctx->n_bdigs >= qctx->bsize) ? 
AS_SINDEX_CONTINUE : AS_SINDEX_OK); +} + +int +ai_btree_put(as_sindex_metadata *imd, as_sindex_pmetadata *pimd, void *skey, cf_digest *value) +{ + ai_obj ncol; + if (C_IS_DG(imd->sktype)) { + init_ai_objFromDigest(&ncol, (cf_digest*)skey); + } + else { + // TODO - ai_obj type is LONG for both Geo and Long + init_ai_objLong(&ncol, *(ulong *)skey); + } + + ai_obj apk; + init_ai_objFromDigest(&apk, value); + + + uint64_t before = pimd->ibtr->msize + pimd->ibtr->nsize; + int ret = reduced_iAdd(pimd->ibtr, &ncol, &apk, COL_TYPE_DIGEST); + uint64_t after = pimd->ibtr->msize + pimd->ibtr->nsize; + cf_atomic64_add(&imd->si->ns->n_bytes_sindex_memory, (after - before)); + + if (ret && ret != AS_SINDEX_KEY_FOUND) { + cf_warning(AS_SINDEX, "Insert into the btree failed"); + return AS_SINDEX_ERR_NO_MEMORY; + } + return ret; +} + +int +ai_btree_delete(as_sindex_metadata *imd, as_sindex_pmetadata *pimd, void * skey, cf_digest * value) +{ + int ret = AS_SINDEX_OK; + + if (!pimd->ibtr) { + return AS_SINDEX_KEY_NOTFOUND; + } + + ai_obj ncol; + if (C_IS_DG(imd->sktype)) { + init_ai_objFromDigest(&ncol, (cf_digest *)skey); + } + else { + // TODO - ai_obj type is LONG for both Geo and Long + init_ai_objLong(&ncol, *(ulong *)skey); + } + + ai_obj apk; + init_ai_objFromDigest(&apk, value); + + uint64_t before = pimd->ibtr->msize + pimd->ibtr->nsize; + ret = reduced_iRem(pimd->ibtr, &ncol, &apk); + uint64_t after = pimd->ibtr->msize + pimd->ibtr->nsize; + cf_atomic64_sub(&imd->si->ns->n_bytes_sindex_memory, (before - after)); + + return ret; +} + +/* + * Internal function which adds digests to the defrag_list + * Mallocs the nodes of defrag_list + * Returns : + * -1 : Error + * number of digests found : success + * + */ +static long +build_defrag_list_from_nbtr(as_namespace *ns, ai_obj *acol, bt *nbtr, ulong nofst, ulong *limit, uint64_t * tot_found, cf_ll *gc_list) +{ + int error = -1; + btEntry *nbe; + // STEP 1: go thru a portion of the nbtr and find to-be-deleted-PKs + // TODO: a 
range query may be smarter then using the Xth Iterator + btSIter *nbi = (nofst ? btGetFullXthIter(nbtr, nofst, 1, NULL, 0) : + btGetFullRangeIter(nbtr, 1, NULL)); + if (!nbi) { + return error; + } + + long found = 0; + long processed = 0; + while ((nbe = btRangeNext(nbi, 1))) { + ai_obj *akey = nbe->key; + int ret = as_sindex_can_defrag_record(ns, (cf_digest *) (&akey->y)); + + if (ret == AS_SINDEX_GC_SKIP_ITERATION) { + *limit = 0; + break; + } else if (ret == AS_SINDEX_GC_OK) { + + bool create = (cf_ll_size(gc_list) == 0) ? true : false; + objs_to_defrag_arr *dt; + + if (!create) { + cf_ll_element * ele = cf_ll_get_tail(gc_list); + dt = ((ll_sindex_gc_element*)ele)->objs_to_defrag; + if (dt->num == SINDEX_GC_NUM_OBJS_PER_ARR) { + create = true; + } + } + if (create) { + dt = as_sindex_gc_get_defrag_arr(); + if (!dt) { + *tot_found += found; + return -1; + } + ll_sindex_gc_element * node; + node = cf_malloc(sizeof(ll_sindex_gc_element)); + node->objs_to_defrag = dt; + cf_ll_append(gc_list, (cf_ll_element *)node); + } + cloneDigestFromai_obj(&(dt->acol_digs[dt->num].dig), akey); + ai_objClone(&(dt->acol_digs[dt->num].acol), acol); + + dt->num += 1; + found++; + } + processed++; + (*limit)--; + if (*limit == 0) break; + } + btReleaseRangeIterator(nbi); + *tot_found += found; + return processed; +} + +static long +build_defrag_list_from_arr(as_namespace *ns, ai_obj *acol, ai_arr *arr, ulong nofst, ulong *limit, uint64_t * tot_found, cf_ll *gc_list) +{ + long found = 0; + long processed = 0; + + for (ulong i = nofst; i < arr->used; i++) { + int ret = as_sindex_can_defrag_record(ns, (cf_digest *) &arr->data[i * CF_DIGEST_KEY_SZ]); + if (ret == AS_SINDEX_GC_SKIP_ITERATION) { + *limit = 0; + break; + } else if (ret == AS_SINDEX_GC_OK) { + bool create = (cf_ll_size(gc_list) == 0) ? 
/*
 * Aerospike Index interface to build a defrag_list.
 *
 * Walks the keys of pimd->ibtr from icol up to the tree's max key and, for
 * each key, hands its digest container (btree or array) to the matching
 * build_defrag_list_from_*() helper until `limit` entries were processed.
 *
 * Parameters :
 *      imd, pimd     : index / partition metadata; both must be non-NULL
 *      icol          : in/out resume key. COL_TYPE_INVALID on the first call
 *                      (it is then set to the tree's min key); on return it
 *                      holds the last key visited
 *      nofst         : in/out offset inside the digest container of icol
 *      limit         : max number of entries to process in this call
 *      tot_processed : incremented by the number of entries processed
 *      tot_found     : incremented (by the helpers) by the digests queued
 *      gc_list       : list receiving the digests to defrag
 *
 * Returns :
 *      AS_SINDEX_DONE     ---> The current pimd has been scanned completely for defragging
 *      AS_SINDEX_CONTINUE ---> Current pimd still may have some candidate digest to be defragged
 *      AS_SINDEX_ERR      ---> Error. Abort this pimd.
 *
 * Notes :  Requires a proper offset value from the caller.
 *          NOTE(review): the original note said the caller has the
 *          responsibility to free the iterators, but the range iterator
 *          created here is released before returning - confirm what that
 *          note referred to.
 */
int
ai_btree_build_defrag_list(as_sindex_metadata *imd, as_sindex_pmetadata *pimd, ai_obj *icol,
		ulong *nofst, ulong limit, uint64_t * tot_processed, uint64_t * tot_found, cf_ll *gc_list)
{
	int ret = AS_SINDEX_ERR;

	if (!pimd || !imd) {
		return ret;
	}

	// Prefer the namespace cached on the index; fall back to a lookup by name.
	as_namespace *ns = imd->si->ns;
	if (!ns) {
		ns = as_namespace_get_byname((char *)imd->ns_name);
	}

	// Nothing to scan: missing or empty tree. ret is still AS_SINDEX_ERR here.
	if (!pimd || !pimd->ibtr || !pimd->ibtr->numkeys) {
		goto END;
	}
	//Entry is range query, FROM previous icol TO maxKey(ibtr)
	if (icol->type == COL_TYPE_INVALID) {
		assignMinKey(pimd->ibtr, icol); // init first call
	}
	ai_obj iH;
	assignMaxKey(pimd->ibtr, &iH);
	btEntry *be = NULL;
	btSIter *bi = btGetRangeIter(pimd->ibtr, icol, &iH, 1);
	if (!bi) {
		goto END;
	}

	while ( true ) {
		be = btRangeNext(bi, 1);
		if (!be) {
			// Ran off the end of the key range: this pimd is fully scanned.
			ret = AS_SINDEX_DONE;
			break;
		}
		ai_obj *acol = be->key;
		ai_nbtr *anbtr = be->val;
		long processed = 0;
		if (!anbtr) {
			// Key without a digest container: stop with ret unchanged
			// (AS_SINDEX_ERR unless an earlier iteration changed it).
			break;
		}
		// `limit` is passed by address: the helpers decrement it as they go.
		if (anbtr->is_btree) {
			processed = build_defrag_list_from_nbtr(ns, acol, anbtr->u.nbtr, *nofst, &limit, tot_found, gc_list);
		} else {
			processed = build_defrag_list_from_arr(ns, acol, anbtr->u.arr, *nofst, &limit, tot_found, gc_list);
		}

		if (processed < 0) { // error .. abort everything.
			cf_detail(AS_SINDEX, "build_defrag_list returns an error. Aborting defrag on current pimd");
			ret = AS_SINDEX_ERR;
			break;
		}
		*tot_processed += processed;
		// This tree may have some more digest to defrag
		if (limit == 0) {
			// Budget exhausted: remember key and offset so the next call resumes here.
			*nofst = *nofst + processed;
			ai_objClone(icol, acol);
			cf_detail(AS_SINDEX, "Current pimd may need more iteration of defragging.");
			ret = AS_SINDEX_CONTINUE;
			break;
		}

		// We have finished this tree. Yet we have not reached our limit to defrag.
		// Goes to next iteration
		*nofst = 0;
		ai_objClone(icol, acol);
	};
	btReleaseRangeIterator(bi);
END:

	return ret;
}
true : false; +} + +void +ai_btree_create(as_sindex_metadata *imd) +{ + for (int i = 0; i < imd->nprts; i++) { + as_sindex_pmetadata *pimd = &imd->pimd[i]; + pimd->ibtr = createIBT(imd->sktype, -1); + if (! pimd->ibtr) { + cf_crash(AS_SINDEX, "Failed to allocate secondary index tree for ns:%s, indexname:%s", + imd->ns_name, imd->iname); + } + } +} + +static void +destroy_index(bt *ibtr, bt_n *n) +{ + if (! n->leaf) { + for (int i = 0; i <= n->n; i++) { + destroy_index(ibtr, NODES(ibtr, n)[i]); + } + } + + for (int i = 0; i < n->n; i++) { + void *be = KEYS(ibtr, n, i); + ai_nbtr *anbtr = (ai_nbtr *) parseStream(be, ibtr); + if (anbtr) { + if (anbtr->is_btree) { + bt_destroy(anbtr->u.nbtr); + } else { + ai_arr_destroy(anbtr->u.arr); + } + cf_free(anbtr); + } + } +} + +void +ai_btree_dump(as_sindex_metadata *imd, char *fname, bool verbose) +{ + FILE *fp = NULL; + if (!(fp = fopen(fname, "w"))) { + return; + } + + fprintf(fp, "Namespace: %s set: %s\n", imd->ns_name, imd->set ? imd->set : "None"); + + for (int i = 0; i < imd->nprts; i++) { + as_sindex_pmetadata *pimd = &imd->pimd[i]; + fprintf(fp, "INDEX: name: %s:%d (%p)\n", imd->iname, i, (void *) pimd->ibtr); + if (pimd->ibtr) { + bt_dumptree(fp, pimd->ibtr, 1, verbose); + } + } + + fclose(fp); +} + +uint64_t +ai_btree_get_numkeys(as_sindex_metadata *imd) +{ + uint64_t val = 0; + + for (int i = 0; i < imd->nprts; i++) { + as_sindex_pmetadata *pimd = &imd->pimd[i]; + PIMD_RLOCK(&pimd->slock); + val += pimd->ibtr->numkeys; + PIMD_RUNLOCK(&pimd->slock); + } + + return val; +} + +uint64_t +ai_btree_get_pimd_isize(as_sindex_pmetadata *pimd) +{ + // TODO - Why check of > 0 + return pimd->ibtr->msize > 0 ? 
pimd->ibtr->msize : 0; +} + +uint64_t +ai_btree_get_isize(as_sindex_metadata *imd) +{ + uint64_t size = 0; + for (int i = 0; i < imd->nprts; i++) { + as_sindex_pmetadata *pimd = &imd->pimd[i]; + PIMD_RLOCK(&pimd->slock); + size += ai_btree_get_pimd_isize(pimd); + PIMD_RUNLOCK(&pimd->slock); + } + return size; +} + +uint64_t +ai_btree_get_pimd_nsize(as_sindex_pmetadata *pimd) +{ + // TODO - Why check of > 0 + return pimd->ibtr->nsize > 0 ? pimd->ibtr->nsize : 0; +} + +uint64_t +ai_btree_get_nsize(as_sindex_metadata *imd) +{ + uint64_t size = 0; + for (int i = 0; i < imd->nprts; i++) { + as_sindex_pmetadata *pimd = &imd->pimd[i]; + PIMD_RLOCK(&pimd->slock); + size += ai_btree_get_pimd_nsize(pimd); + PIMD_RUNLOCK(&pimd->slock) + } + + return size; +} + +void +ai_btree_reinit_pimd(as_sindex_pmetadata * pimd, col_type_t sktype) +{ + if (! pimd->ibtr) { + cf_crash(AS_SINDEX, "IBTR is null"); + } + pimd->ibtr = createIBT(sktype, -1); +} + +void +ai_btree_reset_pimd(as_sindex_pmetadata *pimd) +{ + if (! pimd->ibtr) { + cf_crash(AS_SINDEX, "IBTR is null"); + } + pimd->ibtr = NULL; +} + +void +ai_btree_delete_ibtr(bt * ibtr) +{ + if (! ibtr) { + cf_crash(AS_SINDEX, "IBTR is null"); + } + destroy_index(ibtr, ibtr->root); +} diff --git a/ai/src/ai_obj.c b/ai/src/ai_obj.c new file mode 100644 index 00000000..ac70b03a --- /dev/null +++ b/ai/src/ai_obj.c @@ -0,0 +1,103 @@ +/* + * ai_obj.h + * + * Copyright (C) 2013-2014 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. 
+ * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ +/* + * Aerospike Index Object Implementation. + */ + +#include +#include +#include +#include +#include +#include +#include // For MIN(). + +#include "ai_obj.h" +#include "stream.h" + +#include + +void init_ai_obj(ai_obj *a) +{ + bzero(a, sizeof(ai_obj)); + a->type = COL_TYPE_INVALID; +} + +void init_ai_objLong(ai_obj *a, ulong l) +{ + init_ai_obj(a); + a->l = l; + a->type = COL_TYPE_LONG; +} + +void init_ai_objU160(ai_obj *a, uint160 y) { + a->type = COL_TYPE_DIGEST; + a->y = y; +} + +void ai_objClone(ai_obj *dest, ai_obj *src) +{ + memcpy(dest, src, sizeof(ai_obj)); +} + +static int ai_objCmp(ai_obj *a, ai_obj *b) +{ + if (C_IS_L(a->type) || C_IS_G(a->type)) { + return (a->l == b->l) ? 0 : ((a->l > b->l) ? 
1 : -1); + } else if (C_IS_DG(a->type)) { + return u160Cmp(&a->y, &b->y); + } else { + assert(!"ai_objCmp ERROR"); + } +} + +bool ai_objEQ(ai_obj *a, ai_obj *b) +{ + return !ai_objCmp(a, b); +} + +static void dump_ai_obj_internal(FILE *fp, ai_obj *a, bool as_digest) +{ + if (C_IS_L(a->type) || C_IS_G(a->type)) { + fprintf(fp, "\tLONG ai_obj: val: %lu\n", a->l); + } else if (C_IS_DG(a->type)) { + fprintf(fp, "\tU160 ai_obj:"); + if (as_digest) { + const int len = 20; + char digest_str[2 + (len * 2) + 1]; + digest_str[0] = '\0'; + generate_packed_hex_string((uint8_t *) &(a->y), len, digest_str); + fprintf(fp, "%s\n", digest_str); + } else { + DEBUG_U160(fp, a->y); + fprintf(fp, "\n"); + } + } else { + fprintf(fp, "\tUNINITIALISED ai_obj\n"); + } +} + + +void dump_ai_obj_as_digest(FILE *fp, ai_obj *a) +{ + dump_ai_obj_internal(fp, a, true); +} diff --git a/ai/src/bt.c b/ai/src/bt.c new file mode 100644 index 00000000..b43bc64f --- /dev/null +++ b/ai/src/bt.c @@ -0,0 +1,133 @@ +/* + * bt.c + * + * Copyright (C) 2012-2014 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ +/* + * Creation of different btree types and + * Public B-tree operations w/ stream abstractions under the covers. 
+ */ + +#include +#include +#include +#include +#include + +#include "bt.h" +#include "bt_iterator.h" +#include "stream.h" + +#include + +bt *createIBT(col_type_t ktype, int imatch) { + bt_cmp_t cmp; + bts_t bts; + bts.ktype = ktype; + bts.btype = INDEX_BTREE; + bts.num = imatch; + if (C_IS_L(ktype)) { /* NOTE: under the covers: LL */ + bts.ksize = LL_SIZE; + cmp = llCmp; + bts.bflag = BTFLAG_ULONG_ULONG; + } else if (C_IS_G(ktype)) { /* NOTE: under the covers: LL */ + bts.ksize = LL_SIZE; + cmp = llCmp; + bts.bflag = BTFLAG_ULONG_ULONG; + } else if (C_IS_DG(ktype)) { /* NOTE: under the covers: YL */ + bts.ksize = YL_SIZE; + cmp = ylCmp; + bts.bflag = BTFLAG_U160_ULONG; + } else { /* STRING or FLOAT */ + assert(!"Unsupport Key Type"); + } + + return bt_create(cmp, &bts, 0); +} + +bt *createNBT(col_type_t ktype) { + bt_cmp_t cmp; + bts_t bts; + bts.ktype = ktype; + bts.btype = NODE_BTREE; + bts.num = -1; + if (C_IS_DG(ktype)) { + cmp = u160Cmp; + bts.ksize = U160SIZE; + bts.bflag = BTFLAG_U160; + } else { + assert(!"Unsupport Key Type"); + } + + return bt_create(cmp, &bts, 0); +} + +static void *abt_find(bt *btr, ai_obj *akey) { + DECLARE_BT_KEY(akey, 0) + uchar *stream = bt_find(btr, btkey, akey); + destroyBTKey(btkey, med); /* FREED 026 */ + return parseStream(stream, btr); +} +static bool abt_exist(bt *btr, ai_obj *akey) { //NOTE: Evicted Indexes are NULL + DECLARE_BT_KEY(akey, 0) + bool ret = bt_exist(btr, btkey, akey); + destroyBTKey(btkey, med); /* FREED 026 */ + return ret; +} +static bool abt_del(bt *btr, ai_obj *akey, bool leafd) { // DELETE the row + DECLARE_BT_KEY(akey, 0) + dwd_t dwd = bt_delete(btr, btkey, leafd); /* FREED 028 */ + if (!dwd.k) return 0; + uchar *stream = dwd.k; + destroyBTKey(btkey, med); /* FREED 026 */ + return destroyStream(btr, stream); /* DESTROYED 027 */ +} +static uint32 abt_insert(bt *btr, ai_obj *akey, void *val) { + crs_t crs; + uint32 ssize; + DECLARE_BT_KEY(akey, 0) + char *stream = createStream(btr, val, btkey, ksize, 
&ssize, &crs); // D 027 + if (!stream) return 0; + destroyBTKey(btkey, med); /* FREED 026 */ + if (!bt_insert(btr, stream, 0)) return 0; /* FREE ME 028 */ + return 1; +} + +/* INDEX INDEX INDEX INDEX INDEX INDEX INDEX INDEX INDEX INDEX INDEX INDEX */ +void btIndAdd (bt *ibtr, ai_obj *ikey, bt *nbtr) { + abt_insert (ibtr, ikey, nbtr); +} +bt *btIndFind (bt *ibtr, ai_obj *ikey) { + return abt_find (ibtr, ikey); +} +int btIndDelete(bt *ibtr, ai_obj *ikey) { + abt_del (ibtr, ikey, 0); + return ibtr->numkeys; +} + +bool btIndNodeExist(bt *nbtr, ai_obj *apk) { + return abt_exist(nbtr, apk); +} +bool btIndNodeAdd(bt *nbtr, ai_obj *apk) { //DEBUG_NBT_ADD + return abt_insert(nbtr, apk, NULL); +} +int btIndNodeDelete(bt *nbtr, ai_obj *apk, ai_obj *ocol) { + abt_del (nbtr, ocol ? ocol : apk, 0); + return nbtr->numkeys; +} diff --git a/ai/src/bt_code.c b/ai/src/bt_code.c new file mode 100644 index 00000000..a6fe2561 --- /dev/null +++ b/ai/src/bt_code.c @@ -0,0 +1,1016 @@ +/* + * Copyright 1997-1999, 2001 John-Mark Gurney. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + */ + +#include +#include +#include +#include +#include +#include + +#include "bt.h" +#include "bt_iterator.h" +#include "stream.h" + +#include + +/* CACHE TODO LIST + 8.) U128PK/FK CACHE:[EVICT,MISS] support + + 11.) DS as stream -\/ + 7.) DS in rdbSave/Load (dependency on 11) + + 12.) slab allocator for ALL btn's + + 14.) btFind() in setUniqIndexVal() -> btFindD() + TESTING + + 18.) CREATE TABLE () DIRTY + + 19.) 
btreesplitchild dirty math (only set dirty if new split child has dirty) +*/ + +// DEBUG DEBUG DEBUG DEBUG DEBUG DEBUG DEBUG DEBUG DEBUG DEBUG DEBUG + +//#define DEBUG_DEL_CASE_STATS +//#define BT_MEM_PROFILE +#ifdef BT_MEM_PROFILE +static ulong tot_bt_data = 0; static ulong tot_bt_data_mem = 0; +static ulong tot_num_bt_ns = 0; static ulong tnbtnmem = 0; +static ulong tot_num_bts = 0; static ulong tot_num_bt_mem = 0; + #define BT_MEM_PROFILE_BT {tot_num_bts++; tot_num_bt_mem += size;} + #define BT_MEM_PROFILE_NODE {tot_num_bt_ns++; tnbtnmem += size;} +#else + #define BT_MEM_PROFILE_BT + #define BT_MEM_PROFILE_NODE +#endif + +/* PROTOYPES */ +static void release_dirty_stream(bt *btr, bt_n *x); +static int real_log2 (unsigned int a, int nbits); +static bt_data_t findminkey (bt *btr, bt_n *x); +static bt_data_t findmaxkey (bt *btr, bt_n *x); + +// HELPER HELPER HELPER HELPER HELPER HELPER HELPER HELPER HELPER HELPER +static ulong getNumKey(bt *btr, bt_n *x, int i) { //TODO U128 support + if (i < 0 || i >= x->n) return 0; + else { + ai_obj akey; void *be = KEYS(btr, x, i); + convertStream2Key(be, &akey, btr); + return akey.l; + } +} + +// MEMORY_MANAGEMENT MEMORY_MANAGEMENT MEMORY_MANAGEMENT MEMORY_MANAGEMENT +/* NOTE used-memory bookkeeping maintained at the Btree level */ +static void bt_increment_used_memory(bt *btr, size_t size) { //DEBUG_INCR_MEM + btr->msize += (ull)size; +} +static void bt_decrement_used_memory(bt *btr, size_t size) { //DEBUG_DECR_MEM + btr->msize -= (ull)size; +} +// DIRTY_STREAM DIRTY_STREAM DIRTY_STREAM DIRTY_STREAM DIRTY_STREAM +static uint32 get_dssize(bt *btr, char dirty) { + assert(dirty > 0); + uint32 drsize = (dirty == 3) ? sizeof(uint32) : + (dirty == 2) ? 
sizeof(ushort16) : sizeof(uchar); // 1 + //DEBUG_GETDSSIZE + return (btr->t * 2) * drsize; +} +static void alloc_ds(bt *btr, bt_n *x, size_t size, char dirty) { + assert(dirty != -1); + void **dsp = (void *)((char *)x + size); + if (!dirty) { *dsp = NULL; return; } + size_t dssize = get_dssize(btr, dirty); + void *ds = cf_malloc(dssize); bzero(ds, dssize); // FREEME 108 + bt_increment_used_memory(btr, dssize); + *dsp = ds; //DEBUG_ALLOC_DS +} +void incr_ds(bt *btr, bt_n *x) {//USE: when a DR is too big for its DS (incr_ds) + assert(x->dirty > 0); + GET_BTN_SIZE(x->leaf) + void *ods = GET_DS(x, nsize); + uint32 osize = get_dssize(btr, x->dirty); + uint32 num = (x->leaf ? (btr->t * 2) : btr->t); //DEBUG_RESIZE_DS_1 + alloc_ds(btr, x, nsize, x->dirty + 1); + void *nds = GET_DS(x, nsize); + if (x->dirty == 1) { + uchar *s_ds = (uchar *)ods; ushort16 *d_ds = (ushort16 *)nds; + for (uint32 i = 0; i < num; i++) d_ds[i] = (ushort16)s_ds[i]; + } else if (x->dirty == 2) { + ushort16 *s_ds = (ushort16 *)ods; uint32 *d_ds = (uint32 *)nds; + for (uint32 i = 0; i < num; i++) d_ds[i] = (uint32 )s_ds[i]; + } else assert(!"incr_ds ERROR"); + x->dirty++; //DEBUG_RESIZE_DS_2 + cf_free(ods); bt_decrement_used_memory(btr, osize); +} + +// BT_ALLOC_BTREE BT_ALLOC_BTREE BT_ALLOC_BTREE BT_ALLOC_BTREE BT_ALLOC_BTREE +// BT_ALLOC_BTREE BT_ALLOC_BTREE BT_ALLOC_BTREE BT_ALLOC_BTREE BT_ALLOC_BTREE +static bt_n *allocbtreenode(bt *btr, bool leaf, char dirty) { + btr->numnodes++; + GET_BTN_SIZES(leaf, dirty) BT_MEM_PROFILE_NODE //DEBUG_ALLOC_BTN + bt_n *x = cf_malloc(msize); bzero(x, msize); + bt_increment_used_memory(btr, msize); + x->leaf = -1; + x->dirty = dirty; + if (dirty != -1) alloc_ds(btr, x, nsize, dirty); + return x; +} +static bt *allocbtree() { + int size = sizeof(struct btree); + BT_MEM_PROFILE_BT + bt *btr = (bt *) cf_malloc(size); bzero(btr, size); // FREE ME 035 + bt_increment_used_memory(btr, size); //DEBUG_ALLOC_BTREE + return btr; +} + +static void release_dirty_stream(bt 
*btr, bt_n *x) { //DEBUG_BTF_BTN_DIRTY + assert(x->dirty > 0); + GET_BTN_SIZE(x->leaf) + bt_decrement_used_memory(btr, get_dssize(btr, x->dirty)); + void **dsp = GET_DS(x, nsize); cf_free(dsp); // FREED 108 + x->dirty = 0; +} +static void bt_free_btreenode(bt *btr, bt_n *x) { + GET_BTN_SIZES(x->leaf, x->dirty) bt_decrement_used_memory(btr, msize); + if (x->dirty > 0) release_dirty_stream(btr, x); + cf_free(x); // FREED 035 +} +static void bt_free_btree(bt *btr) { cf_free(btr); } + +// BT_CREATE BT_CREATE BT_CREATE BT_CREATE BT_CREATE BT_CREATE BT_CREATE +bt *bt_create(bt_cmp_t cmp, bts_t *s, char dirty) { + int n = BTREE_LONG_TYPE_DEGREE; + + if (C_IS_L(s->ktype) || C_IS_G(s->ktype)) { + n = BTREE_LONG_TYPE_DEGREE; + } + else if (C_IS_DG(s->ktype)) { + n = BTREE_STRING_TYPE_DEGREE; + } + + uchar t = (uchar)((int)(n + 1) / 2); + int kbyte = sizeof(bt_n) + n * s->ksize; + int nbyte = kbyte + (n + 1) * VOIDSIZE; + bt *btr = allocbtree(); + if (!btr) return NULL; + memcpy(&btr->s, s, sizeof(bts_t)); /* ktype, btype, ksize, bflag, num */ + btr->cmp = cmp; + btr->keyofst = sizeof(bt_n); + uint32 nodeofst = btr->keyofst + n * s->ksize; + btr->nodeofst = (ushort16)nodeofst; + btr->t = t; + int nbits = real_log2(n, sizeof(int) * 8) + 1; + nbits = 1 << (real_log2(nbits, sizeof(int) * 8) + 1); + btr->nbits = (uchar)nbits; + btr->nbyte = nbyte; + btr->kbyte = kbyte; + btr->dirty = dirty; + btr->root = allocbtreenode(btr, 1, dirty ? 0: -1); + if (!btr->root) return NULL; + btr->numnodes = 1; //printf("bt_create\n"); bt_dump_info(printf, btr); + return btr; +} + +// BINARY_SEARCH BINARY_SEARCH BINARY_SEARCH BINARY_SEARCH BINARY_SEARCH +/* This is the real log2 function. It is only called when we don't have + * a value in the table. 
/* Integer log2 by binary search over the bit width.
 * Starting with a window of (nbits + 1) / 2 bits and halving it each round,
 * one result bit is produced per round: 1 when `a` still has a bit set at or
 * above the window (a is then reduced by the window), 0 otherwise (a is
 * masked to the window). For nbits == 32 and a >= 1 this yields
 * floor(log2(a)); a == 0 yields 0. */
static inline int real_log2(unsigned int a, int nbits) {
    unsigned int result = 0;

    for (unsigned int width = (nbits + 1) / 2; width != 0; width /= 2) {
        unsigned int span = (unsigned int)(1 << width);

        result <<= 1;
        if (a >= span) {        // value lives in the top half: keep it, set bit
            a /= span;
            result |= 1;
        } else {                // value lives in the bottom half: bit stays 0
            a &= span - 1;
        }
    }
    return result;
}
i : 0; } + return i; +} + +// KEY_SHUFFLING KEY_SHUFFLING KEY_SHUFFLING KEY_SHUFFLING KEY_SHUFFLING +// NOTE: KEYS are variable sizes: [4,8,12,16,20,24,32 bytes] +#define ISVOID(btr) (btr->s.ksize == VOIDSIZE) + +static inline void **AKEYS(bt *btr, bt_n *x, int i) { + int ofst = (i * btr->s.ksize); + char *v = (char *)x + btr->keyofst + ofst; //DEBUG_AKEYS + return (void **)v; +} +#define OKEYS(btr, x) ((void **)((char *)x + btr->keyofst)) +inline void *KEYS(bt *btr, bt_n *x, int i) { //DEBUG_KEYS + if ISVOID(btr) return OKEYS(btr, x)[i]; + else /* OTHER_BT */ return (void *) AKEYS(btr, x, i); +} + +// SCION SCION SCION SCION SCION SCION SCION SCION SCION SCION SCION SCION +static inline void incr_scion(bt_n *x, int n) { x->scion += n; } +static inline void decr_scion(bt_n *x, int n) { x->scion -= n; } +static inline void move_scion(bt *btr, bt_n *y, bt_n *z, int n) { + for (int i = 0; i < n; i++) { incr_scion(y, NODES(btr, z)[i]->scion); } +} +static inline int get_scion_range(bt *btr, bt_n *x, int beg, int end) { + if (x->dirty <= 0) return end - beg; + int scion = 0; + for (int i = beg; i < end; i++) scion += 1 + getDR(btr, x, i); + return scion; +} + +// DIRTY DIRTY DIRTY DIRTY DIRTY DIRTY DIRTY DIRTY DIRTY DIRTY DIRTY DIRTY +typedef struct btn_pos { + bt_n *x; int i; +} bp_t; +typedef struct two_bp_gens { + bp_t p; /* parent */ bp_t c; /* child */ +} tbg_t; +static inline void free_bp(void *v) { cf_free(v); } + +typedef struct ll_ai_bp_element_s { + cf_ll_element ele; + bp_t * value; +} ll_ai_bp_element; + +void +ll_ai_bp_destroy_fn(cf_ll_element * ele) +{ + cf_free(((ll_ai_bp_element *)ele)->value); + cf_free((ll_ai_bp_element *)ele); +} +int +ll_ai_bp_reduce_fn(cf_ll_element *ele, void *udata) +{ + return CF_LL_REDUCE_DELETE; +} + +//TODO inline +bt_n *addDStoBTN(bt *btr, bt_n *x, bt_n *p, int pi, char dirty) { + bt_n *y = allocbtreenode(btr, x->leaf, dirty); + GET_BTN_SIZE(x->leaf) memcpy(y, x, nsize); + y->dirty = dirty; btr->dirty = 1; + if (x == 
btr->root) btr->root = y; + else NODES(btr, p)[pi] = y; // update parent NODE bookkeeping + bt_free_btreenode(btr, x); //DEBUG_ADD_DS_TO_BTN + return y; +} +uint32 getDR(bt *btr, bt_n *x, int i) { + if (x->dirty <= 0) return 0; + GET_BTN_SIZE(x->leaf) + void *dsp = GET_DS(x, nsize);; + if (x->dirty == 1) { + uchar *ds = (uchar *)dsp; return (uint32)ds[i]; + } else if (x->dirty == 2) { + ushort16 *ds = (ushort16 *)dsp; return (uint32)ds[i]; + } else if (x->dirty == 3) { + uint32 *ds = (uint32 *)dsp; return ds[i]; + } else assert(!"getDR ERROR"); +} +#define INCR_DS_SET_DR \ + { incr_ds(btr, x); __setDR(btr, x, i, dr); return; } + +static void __setDR(bt *btr, bt_n *x, int i, uint32 dr) { + uint32 odr; GET_BTN_SIZE(x->leaf) + void *dsp = GET_DS(x, nsize); + if (x->dirty == 1) { + uchar *ds = (uchar *)dsp; if (dr > UCHAR_MAX) INCR_DS_SET_DR + odr = ds[i]; ds[i] = dr; + } else if (x->dirty == 2) { + ushort16 *ds = (ushort16 *)dsp; if (dr > USHRT_MAX) INCR_DS_SET_DR + odr = ds[i]; ds[i] = dr; + } else if (x->dirty == 3) { + uint32 *ds = (uint32 *)dsp; + odr = ds[i]; ds[i] = dr; + } else assert(!"setDR ERROR"); + (void) odr; // silence compiler warnings +} +static bt_n *setDR(bt *btr, bt_n *x, int i, uint32 dr, bt_n *p, int pi) { + if (!dr) return x; + if (x->dirty <= 0) x = addDStoBTN(btr, x, p, pi, 1); + __setDR(btr, x, i, dr); return x; +} +static bt_n *zeroDR(bt *btr, bt_n *x, int i, bt_n *p, int pi) { + (void)p; (void) pi; // compiler warnings - these will be used later + if (x->dirty <= 0) return x; + __setDR(btr, x, i, 0); return x; +} +static bt_n *incrDR(bt *btr, bt_n *x, int i, uint32 dr, bt_n *p, int pi) { + if (!dr) return x; + if (x->dirty <= 0) x = addDStoBTN(btr, x, p, pi, 1); + uint32 odr = getDR(btr, x, i); + odr += dr; + return setDR(btr, x, i, odr, p, pi); +} +static bt_n *overwriteDR(bt *btr, bt_n *x, int i, uint32 dr, bt_n *p, int pi) { + if (dr) return setDR (btr, x, i, dr, p, pi); + else return zeroDR(btr, x, i, p, pi); +} + +// DEL_CASE_DR 
DEL_CASE_DR DEL_CASE_DR DEL_CASE_DR DEL_CASE_DR DEL_CASE_DR +static bt_n *incrPrevDR(bt *btr, bt_n *x, int i, uint32 dr, + bt_n *p, int pi, cf_ll *plist) { + if (!dr) return x; //DEBUG_INCR_PREV_DR + if (i > 0) return incrDR(btr, x, i - 1, dr, p, pi); // prev sibling + else { + //TODO findminnode() is too inefficient -> needs to be a part of btr + if (x == findminnode(btr, btr->root)) { // MIN KEY + btr->dirty_left += dr; btr->dirty = 1; return x; + } + cf_ll_element * ele; + cf_ll_iterator * iter = cf_ll_getIterator(plist, true); + bt_n *rx = btr->root; int ri = 0; + + while ((ele = cf_ll_getNext(iter))) { + bp_t *bp = ((ll_ai_bp_element *)ele)->value; + if (bp->i) { rx = bp->x; ri = bp->i - 1; break; } + } + bt_n *prx = btr->root; int pri = 0; + if (rx != btr->root) { // get parent + ele = cf_ll_getNext(iter); + bp_t *bp = ((ll_ai_bp_element *)ele)->value; + prx = bp->x; pri = bp->i; + } + cf_ll_releaseIterator(iter); + //printf("rx: %p ri: %d prx: %p pri: %d\n", rx, ri, prx, pri); + incrDR(btr, rx, ri, dr, prx, pri); + return x; // x not modified (only rx) + } +} +static tbg_t get_prev_child_recurse(bt *btr, bt_n *x, int i) { + bt_n *xp = NODES(btr, x)[i]; //DEBUG_GET_C_REC_1 + if (!xp->leaf) return get_prev_child_recurse(btr, xp, xp->n); + tbg_t tbg; + tbg.p.x = x; tbg.p.i = i; + tbg.c.x = xp; tbg.c.i = xp->n - 1; //DEBUG_GET_C_REC_2 + return tbg; +} +static bt_n *incrCase2B(bt *btr, bt_n *x, int i, int dr) { //DEBUG_INCR_CASE2B + tbg_t tbg = get_prev_child_recurse(btr, x, i); //DEBUG_INCR_PREV + bt_n *nc = incrDR(btr, tbg.c.x, tbg.c.i, dr, tbg.p.x, tbg.p.i); + incr_scion(nc, dr); + return x; // x not modified (only tbg.c.x) +} + +// SET_BT_KEY SET_BT_KEY SET_BT_KEY SET_BT_KEY SET_BT_KEY SET_BT_KEY +static void setBTKeyRaw(bt *btr, bt_n *x, int i, void *src) { //PRIVATE + void **dest = AKEYS(btr, x, i); + if ISVOID(btr) *dest = src; + else memcpy(dest, src, btr->s.ksize); + //DEBUG_SET_KEY +} +static bt_n *setBTKey(bt *btr, bt_n *dx, int di, bt_n *sx, int si, + 
bool drt, bt_n *pd, int pdi, bt_n *ps, int psi) { + if (drt) { + uint32 dr = getDR (btr, sx, si); //DEBUG_SET_BTKEY + dx = overwriteDR(btr, dx, di, dr, pd, pdi); + sx = zeroDR (btr, sx, si, ps, psi); + } else sx = zeroDR (btr, sx, si, ps, psi); + setBTKeyRaw(btr, dx, di, KEYS(btr, sx, si)); return dx; +} + +static void mvXKeys(bt *btr, bt_n **dx, int di, + bt_n **sx, int si, uint32 num, uint32 ks, + bt_n *pd, int pdi, + bt_n *ps, int psi) { + if (!num) return; + bool x2x = (*dx == *sx); bool forward = (di >= si); + int i = forward ? (int)num - 1: 0; + int end = forward ? -1 : (int)num; + while (i != end) { // DS remove destDR from dx @i, add srcDR to sx @i + int sii = si + i; int dii = di + i; + uint32 drs = getDR(btr, *sx, sii), drd = getDR(btr, *dx, dii); + if (drs) { //DEBUG_MV_X_KEYS_1 + *dx = setDR (btr, *dx, dii, drs, pd, pdi); + if (x2x && *dx != *sx) *sx = *dx; + *sx = zeroDR(btr, *sx, sii, ps, psi); + if (x2x && *dx != *sx) *dx = *sx; + } else if (drd) { //DEBUG_MV_X_KEYS_2 + *dx = zeroDR(btr, *dx, dii, pd, pdi); + if (x2x && *dx != *sx) *sx = *dx; + } + bt_data_t *dest = AKEYS(btr, *dx, di); + bt_data_t *src = AKEYS(btr, *sx, si); + void *dk = (char *)dest + (i * ks); + void *sk = (char *)src + (i * ks); + memcpy(dk, sk, ks); + if (forward) i--; else i++; + } +} +static inline void mvXNodes(bt *btr, bt_n *x, int xofst, + bt_n *z, int zofst, int num) { + memmove(NODES(btr, x) + xofst, NODES(btr, z) + zofst, (num) * VOIDSIZE); +} + +//NOTE: trimBTN*() do not ever dirty btn's -- TODO they could UN-dirty +static bt_n *trimBTN(bt *btr, bt_n *x, bool drt, bt_n *p, int pi) { + //DEBUG_TRIM_BTN + if (drt) x = zeroDR(btr, x, x->n, p, pi); + x->n--; return x; +} +static bt_n *trimBTN_n(bt *btr, bt_n *x, int n, bool drt, bt_n *p, int pi) { + if (drt) { + for (int i = x->n; i >= (x->n - n); i--) x = zeroDR(btr, x, i, p, pi); + } + x->n -= n; return x; +} + +// INSERT INSERT INSERT INSERT INSERT INSERT INSERT INSERT INSERT INSERT +static bool btreesplitchild(bt *btr, 
bt_n *x, int i, bt_n *y, bt_n *p, int pi) { + ushort16 t = btr->t; //TODO dirtymath + bt_n *z = allocbtreenode(btr, y->leaf, y->dirty); if (!z) return 0; + z->leaf = y->leaf; /* duplicate leaf setting */ + for (int j = 0; j < t - 1; j++) { + z = setBTKey(btr, z, j, y, j + t, 1, p, pi, p, pi); + } + z->scion = get_scion_range(btr, z, 0, t - 1); decr_scion(y, z->scion); + z->n = t - 1; y = trimBTN_n(btr, y, t - 1, 0, p, pi); + if (!y->leaf) { // if it's an internal node, copy the ptr's too + for (int j = 0; j < t; j++) { + uint32_t scion = NODES(btr, y)[j + t]->scion; + decr_scion(y, scion); incr_scion(z, scion); + NODES(btr, z)[j] = NODES(btr, y)[j + t]; + } + } + for (int j = x->n; j > i; j--) { // move nodes in parent down one + NODES(btr, x)[j + 1] = NODES(btr, x)[j]; + } + NODES(btr, x)[i + 1] = z; // store new node + for (int j = x->n - 1; j >= i; j--) { // adjust the keys from previous move + x = setBTKey(btr, x, j + 1, x, j, 1, p, pi, p, pi); + } + decr_scion(y, 1 + getDR(btr, y, y->n - 1)); //NEXT LINE: store new key + x = setBTKey(btr, x, i, y, y->n - 1, 1, p, pi, p, pi); x->n++; + trimBTN(btr, y, 0, p, pi); + return 1; +} + +#define GETN(btr) ((2 * btr->t) - 1) +static bool bt_insertnonfull(bt *btr, bt_n *x, bt_data_t k, bt_n *p, int pi, + int dr) { + if (x->leaf) { /* we are a leaf, just add it in */ + int i = findkindex(btr, x, k, NULL, NULL); + if (i != x->n - 1) { + mvXKeys(btr, &x, i + 2, &x, i + 1, (x->n - i - 1), btr->s.ksize, + p, pi, p, pi); + } + x = overwriteDR(btr, x, i + 1, dr, p, pi); + setBTKeyRaw(btr, x, i + 1, k); x->n++; incr_scion(x, 1); + } else { /* not leaf */ + int i = findkindex(btr, x, k, NULL, NULL) + 1; + if (NODES(btr, x)[i]->n == GETN(btr)) { // if next node is full + if (!btreesplitchild(btr, x, i, NODES(btr, x)[i], x, i)) return 0; + if (btr->cmp(k, KEYS(btr, x, i)) > 0) i++; + } + bt_insertnonfull(btr, NODES(btr, x)[i], k, x, i, dr); incr_scion(x, 1); + } + return 1; +} +bool bt_insert(bt *btr, bt_data_t k, uint32 dr) { + 
bt_n *r = btr->root; + bt_n *p = r; + int pi = 0; + if (r->n == GETN(btr)) { /* NOTE: tree increase height */ + bt_n *s = allocbtreenode(btr, 0, r->dirty); if (!s) return 0; + btr->root = s; + s->leaf = 0; + s->n = 0; + incr_scion(s, r->scion); + NODES(btr, s)[0] = r; + if (!btreesplitchild(btr, s, 0, r, p, pi)) return 0; + p = r = s; + btr->numnodes++; + } + if (!bt_insertnonfull(btr, r, k, p, pi, dr)) return 0; + btr->numkeys++; + return 1; +} + +// DELETE DELETE DELETE DELETE DELETE DELETE DELETE DELETE DELETE DELETE +static bt_n *replaceKeyWithGhost(bt *btr, bt_n *x, int i, bt_data_t k, + uint32 dr, bt_n *p, int pi) { + //printf("replaceKeyWithGhost\n"); + ai_obj akey; convertStream2Key(k, &akey, btr); + crs_t crs; uint32 ssize; DECLARE_BT_KEY(&akey, x) + char *stream = createStream(btr, NULL, btkey, ksize, &ssize, &crs);//DEST027 + x = overwriteDR(btr, x, i, dr, p, pi); + setBTKeyRaw(btr, x, i, stream); + return x; +} + +#define ADD_BP(plist, p, pi) /* used to trace path to deleted key */ \ + if (plist) { \ + bp_t *bp = (bp_t *) cf_malloc(sizeof(bp_t)); /* FREE ME 109 */ \ + bp->x = p; bp->i = pi; \ + ll_ai_bp_element * node = cf_malloc(sizeof(ll_ai_bp_element)); \ + node->value = bp; \ + cf_ll_append(plist, (cf_ll_element *)node); \ + } + +#define CREATE_RETURN_DELETED_KEY(btr, kp, dr) \ + dwd_t dwd; bzero(&dwd, sizeof(dwd_t)); dwd.dr = dr; \ + if (BIG_BT(btr)) { memcpy(delbuf, kp, btr->s.ksize); } \ + dwd.k = BIG_BT(btr) ? delbuf : kp; + +/* NOTE: ksize > 8 bytes needs buffer for CASE 1 */ +#define MAX_KEY_SIZE (AS_DIGEST_KEY_SZ *2) + +#define DK_NONE 0 +#define DK_2A 1 +#define DK_2B 2 + +/* remove an existing key from the tree. KEY MUST EXIST + the s parameter: + 1.) for normal operation pass it as DK_NONE, + 2.) delete the max node, pass it as DK_2A, + 3.) delete the min node, pass it as DK_2B. 
+ */ +typedef struct btds_t { + ulong leaf_del_hits; ulong leaf_del_noop; + ulong ndel; ulong del_calls; + ulong case1_del; + ulong case2A_del; ulong case2B_del; ulong case2C_del; + ulong case3_del; + ulong case3A1_del; ulong case3A2_del; ulong case3B1_del; ulong case3B2_del; +} btds_t; + +btds_t *btds = NULL; + +static dwd_t deletekey(bt *btr, bt_n *x, bt_data_t k, int s, bool drt, + bt_n *p, int pi, cf_ll *plist, void **c2Cp, + bool leafd, char delbuf[]) { btds->del_calls++; + bt_n *xp, *y, *z; bt_data_t kp; + int yn, zn, i = 0, r = -1, ks = btr->s.ksize; + if (s != DK_NONE) { /* min or max node deletion */ + if (x->leaf) r = 0; + else { + if (s == DK_2A) r = 1; // max node + else if (s == DK_2B) r = -1; // min node + } + if (s == DK_2A) i = x->n - 1; // max node/leaf + else if (s == DK_2B) i = -1; // min node/leaf + } else i = findkindex(btr, x, k, &r, NULL); //DEBUG_DEL_POST_S + + if (!drt) decr_scion(x, 1); // scion reduced by 1 every DELETE + + /* Case 1: + * If the key k is in node x and x is a leaf, delete the key k from x. 
*/ + if (x->leaf) { btds->case1_del++; + bool rgst = 0; + if (s == DK_2B) i++; //DEBUG_DEL_CASE_1 + kp = KEYS (btr, x, i); + int dr = getDR(btr, x, i); + CREATE_RETURN_DELETED_KEY(btr, kp, dr) + if (drt) { // CASE: EVICT + if (s == DK_NONE) { //NOTE: only place DR grows + x = incrPrevDR(btr, x, i, (dr + 1), p, pi, plist); + } else decr_scion(x, 1 + dr); //NOTE: key FOR Case2A/B + } else if (s == DK_NONE) { // CASE: DELETE NOT CASE2A/B + if (dr) { + if (NBT(btr)) { x = incrPrevDR(btr, x, i, dr, p, pi, plist); } + else { rgst = 1; // DELETE DataBT KEY w/ DR -> REPLACE w/ GHOST + x = replaceKeyWithGhost(btr, x, i, kp, dr, p, pi); + } + } + } else if (dr) decr_scion(x, dr); // CASE: DELETE CASE2A/B + if (!rgst) { // IF NO REPLACE_W_GHOST -> Remove from BTREE + mvXKeys(btr, &x, i, &x, i + 1, (x->n - i - 1), ks, p, pi, p, pi); + x = trimBTN(btr, x, drt, p, pi); + } + return dwd; + } + dwd_t dwde; bzero(&dwde, sizeof(dwd_t)); + if (r == 0) { /* (r==0) means key found, but in node */ //DEBUG_DEL_CASE_2 + kp = KEYS(btr, x, i); + if (!drt) { // ON DELETE + int dr = getDR(btr, x, i); + if (dr) { // IF DR -> REPLACE_W_GHOST, no recursive delete + x = replaceKeyWithGhost(btr, x, i, kp, dr, p, pi); + CREATE_RETURN_DELETED_KEY(btr, kp, dr) + return dwd; + } + } + /* Case 2: + * if the key k is in the node x, and x is an internal node */ + if ((yn = NODES(btr, x)[i]->n) >= btr->t) { //DEBUG_DEL_CASE_2a + btds->case2A_del++; + if (leafd) return dwde; + /* Case 2a: + * if the node y that precedes k in node x has at least t keys, + * then find the previous sequential key (kp) of k. + * Recursively delete kp, and replace k with kp in x. 
*/ + xp = NODES(btr, x)[i]; + ADD_BP(plist, x, i) + //printf("CASE2A recurse: key: "); printKey(btr, x, i); + dwd_t dwd = deletekey(btr, xp, NULL, DK_2A, drt, + x, i, plist, c2Cp, leafd, delbuf); + //DEBUG_SET_BTKEY_2A + if (drt) x = incrDR(btr, x, i, ++dwd.dr, p, pi); + else x = setDR (btr, x, i, dwd.dr, p, pi); + setBTKeyRaw(btr, x, i, dwd.k); + dwd.k = kp; // swap back in KPs original value + return dwd; + } + if ((zn = NODES(btr, x)[i + 1]->n) >= btr->t) { //DEBUG_DEL_CASE_2b + btds->case2B_del++; + if (leafd) return dwde; + /* Case 2b: + * if the node z that follows k in node x has at least t keys, + * then find the next sequential key (kp) of k. Recursively delete + * kp, and replace k with kp in x. */ + xp = NODES(btr, x)[i + 1]; + ADD_BP(plist, x, i + 1) + //printf("CASE2B recurse: key: "); printKey(btr, x, i); + dwd_t dwd = deletekey(btr, xp, NULL, DK_2B, drt, + x, i + 1, plist, c2Cp, leafd, delbuf); + //DEBUG_SET_BTKEY_2B + if (drt) { // prev key inherits DR+1 + x = incrCase2B (btr, x, i, (getDR(btr, x, i) + 1)); + } + x = overwriteDR(btr, x, i, dwd.dr, p, pi); + setBTKeyRaw(btr, x, i, dwd.k); + dwd.k = kp; // swap back in KPs original value + return dwd; + } + if (yn == btr->t - 1 && zn == btr->t - 1) { //DEBUG_DEL_CASE_2c + btds->case2C_del++; + if (leafd) return dwde; + /* Case 2c: + * if both y and z have only t - 1 keys, merge k + * then all of z into y, so that x loses both k and + * the pointer to z, and y now contains 2t - 1 keys. 
*/ + if (!*c2Cp) *c2Cp = KEYS(btr, x, i); //used in remove_key() + y = NODES(btr, x)[i]; + z = NODES(btr, x)[i + 1]; + dwd_t dwd; dwd.k = k; dwd.dr = getDR(btr, x, i); + incr_scion(y, 1 + dwd.dr); //DEBUG_SET_BTKEY_2C + y = setDR (btr, y, y->n, dwd.dr, x, i); + setBTKeyRaw(btr, y, y->n, dwd.k); y->n++; + incr_scion(y, get_scion_range(btr, z, 0, z->n)); + mvXKeys(btr, &y, y->n, &z, 0, z->n, ks, x, i, x, i + 1); + if (!y->leaf) { + move_scion(btr, y, z, z->n + 1); + mvXNodes (btr, y, y->n, z, 0, (z->n + 1)); + } + y->n += z->n; + mvXKeys (btr, &x, i, &x, i + 1, (x->n - i - 1), ks, p, pi, p, pi); + mvXNodes(btr, x, i + 1, x, i + 2, (x->n - i - 1)); + x = trimBTN(btr, x, drt, p, pi); + bt_free_btreenode(btr, z); + ADD_BP(plist, x, i) + //printf("CASE2C key: "); printKey(btr, x, i); + return deletekey(btr, y, k, s, drt, x, i, plist, c2Cp, leafd, delbuf); + } + } + /* Case 3: + * if k is not present in internal node x, determine the root xp of + * the appropriate subtree that must contain k, if k is in the tree + * at all. If xp has only t - 1 keys, execute step 3a or 3b as + * necessary to guarantee that we descend to a node containing at + * least t keys. Finish by recursing on the appropriate node of x. */ + i++; + if ((xp = NODES(btr, x)[i])->n == btr->t - 1) { /* case 3a-c are !x->leaf */ + /* Case 3a: + * If xp has only (t-1) keys but has a sibling(y) with at least t keys, + give xp an extra key by moving a key from x down into xp, + moving a key from xp's immediate left or right sibling(y) up into x, + & moving the appropriate node from the sibling(y) into xp. 
*/ + if (i > 0 && (y = NODES(btr, x)[i - 1])->n >= btr->t) { + btds->case3A1_del++; + //printf("CASE3A1 key: "); printKey(btr, x, i); + if (leafd) return dwde; + /* left sibling has t keys */ //DEBUG_DEL_CASE_3a1 + mvXKeys(btr, &xp, 1, &xp, 0, xp->n, ks, x, i, x, i); + if (!xp->leaf) mvXNodes(btr, xp, 1, xp, 0, (xp->n + 1)); + incr_scion(xp, 1 + getDR(btr, x, i - 1)); + xp = setBTKey(btr, xp, 0, x, i - 1, drt, x, i, p, pi); xp->n++; + decr_scion(y, 1 + getDR(btr, y, y->n - 1)); + x = setBTKey(btr, x, i - 1, y, y->n - 1, drt, p, pi, x, i - 1); + if (!xp->leaf) { + int dscion = NODES(btr, y)[y->n]->scion; + incr_scion(xp, dscion); decr_scion(y, dscion); + NODES(btr, xp)[0] = NODES(btr, y)[y->n]; + } + y = trimBTN(btr, y, drt, x, i - 1); + } else if (i < x->n && (y = NODES(btr, x)[i + 1])->n >= btr->t) { + btds->case3A2_del++; + //printf("CASE3A2 key: "); printKey(btr, x, i); + if (leafd) return dwde; + /* right sibling has t keys */ //DEBUG_DEL_CASE_3a2 + incr_scion(xp, 1 + getDR(btr, x, i)); + xp = setBTKey(btr, xp, xp->n++, x, i, drt, x, i, p, pi); + decr_scion(y, 1 + getDR(btr, y, 0)); + x = setBTKey(btr, x, i, y, 0, drt, p, pi, x, i + 1); + if (!xp->leaf) { + int dscion = NODES(btr, y)[0]->scion; + incr_scion(xp, dscion); decr_scion(y, dscion); + NODES(btr, xp)[xp->n] = NODES(btr, y)[0]; + } + mvXKeys(btr, &y, 0, &y, 1, y->n - 1, ks, x, i + 1, x, i + 1); + if (!y->leaf) mvXNodes(btr, y, 0, y, 1, y->n); + y = trimBTN(btr, y, drt, x, i + 1); + } + /* Case 3b: + * If xp and all of xp's siblings have t - 1 keys, merge xp with + one sibling, which involves moving a key from x down into the + new merged node to become the median key for that node. 
*/ + else if (i > 0 && (y = NODES(btr, x)[i - 1])->n == btr->t - 1) { + btds->case3B1_del++; + //printf("CASE3B1 key: "); printKey(btr, x, i); + if (leafd) return dwde; + /* merge i with left sibling */ //DEBUG_DEL_CASE_3b1 + incr_scion(y, 1 + getDR(btr, x, i - 1)); + y = setBTKey(btr, y, y->n++, x, i - 1, drt, x, i - 1, p, pi); + incr_scion(y, get_scion_range(btr, xp, 0, xp->n)); + mvXKeys(btr, &y, y->n, &xp, 0, xp->n, ks, x, i - 1, x, i); + if (!xp->leaf) { + move_scion(btr, y, xp, xp->n + 1); + mvXNodes (btr, y, y->n, xp, 0, (xp->n + 1)); + } + y->n += xp->n; + mvXKeys (btr, &x, i - 1, &x, i, (x->n - i), ks, p, pi, p, pi); + mvXNodes(btr, x, i, x, i + 1, (x->n - i)); + x = trimBTN(btr, x, drt, p, pi); + bt_free_btreenode(btr, xp); + xp = y; i--; // i-- for parent-arg in recursion (below) + } else if (i < x->n && (y = NODES(btr, x)[i + 1])->n == btr->t - 1) { + btds->case3B2_del++; + //printf("CASE3B2 key: "); printKey(btr, x, i); + if (leafd) return dwde; + /* merge i with right sibling */ //DEBUG_DEL_CASE_3b2 + incr_scion(xp, 1 + getDR(btr, x, i)); + xp = setBTKey(btr, xp, xp->n++, x, i, drt, x, i, p, pi); + incr_scion(xp, get_scion_range(btr, y, 0, y->n)); + mvXKeys(btr, &xp, xp->n, &y, 0, y->n, ks, x, i, x, i + 1); + if (!xp->leaf) { + move_scion(btr, xp, y, y->n + 1); + mvXNodes (btr, xp, xp->n, y, 0, (y->n + 1)); + } + xp->n += y->n; + mvXKeys (btr, &x, i, &x, i + 1, (x->n - i - 1), ks, p, pi, p, pi); + mvXNodes(btr, x, i + 1, x, i + 2, (x->n - i - 1)); + x = trimBTN(btr, x, drt, p, pi); + bt_free_btreenode(btr, y); + } + } //printf("RECURSE CASE 3\n"); + btds->case3_del++; + ADD_BP(plist, x, i) //DEBUG_DEL_POST_CASE_3 + dwd_t dwd = deletekey(btr, xp, k, s, drt, x, i, plist, c2Cp, leafd, delbuf); + // CASE2A/B pull keys up from depths, scion must be decremented + if (s != DK_NONE) { + if (drt) decr_scion(x, 1 + dwd.dr); + else decr_scion(x, dwd.dr); // DELETE already decr_scion()ed 1 + } + return dwd; +} + +#ifdef DEBUG_DEL_CASE_STATS +static void 
print_del_case_stats(bool leafd, dwd_t dwd, bt *btr) {
+    /* Debug-only (DEBUG_DEL_CASE_STATS): updates the leaf-delete hit/no-op
+     * counters and prints the running per-case delete statistics kept in
+     * the global btds, then flushes all output streams. */
+    if (leafd) {
+        if (!dwd.k) btds->leaf_del_noop++;
+        else btds->leaf_del_hits++;
+        printf("deletes: %lu noop: %lu ratio: %f numkeys: %d\n",
+               btds->leaf_del_hits, btds->leaf_del_noop,
+               (btds->leaf_del_noop && btds->leaf_del_hits) ?
+               (double)((double)btds->leaf_del_hits /
+                        (double)btds->leaf_del_noop) : 0,
+               btr->numkeys);
+    } else
+    printf("ndel: %lu ncalls: %lu C1: %lu(%.2f) C2A: %lu(%.2f) "
+           "C2B: %lu(%.2f) C2C: %lu(%.2f) C3: %lu(%.2f) "
+           "C3A1: %lu(%.2f) C3A2: %lu(%.2f) C3B1: %lu(%.2f) "
+           "C3B2: %lu(%.2f)\n",
+           btds->ndel, btds->del_calls,
+           btds->case1_del,
+           (double)((double)btds->case1_del / (double)btds->del_calls),
+           btds->case2A_del,
+           (double)((double)btds->case2A_del / (double)btds->del_calls),
+           btds->case2B_del,
+           (double)((double)btds->case2B_del / (double)btds->del_calls),
+           btds->case2C_del,
+           (double)((double)btds->case2C_del / (double)btds->del_calls),
+           btds->case3_del,
+           (double)((double)btds->case3_del / (double)btds->del_calls),
+           btds->case3A1_del,
+           (double)((double)btds->case3A1_del / (double)btds->del_calls),
+           btds->case3A2_del,
+           (double)((double)btds->case3A2_del / (double)btds->del_calls),
+           btds->case3B1_del,
+           (double)((double)btds->case3B1_del / (double)btds->del_calls),
+           btds->case3B2_del,
+           (double)((double)btds->case3B2_del / (double)btds->del_calls));
+    fflush(NULL);
+}
+#endif
+
+/* Delete key k from the tree and describe what was removed.
+ *   drt   - passed through to deletekey(); when set, the path from the root
+ *           is recorded in a temporary list (plist) for the DR bookkeeping.
+ *   leafd - passed through to deletekey(); a result with dwd.k == NULL is
+ *           treated as a no-op and leaves numkeys and the root untouched.
+ * On a real delete numkeys is decremented, an emptied non-leaf root is
+ * replaced by its only child, and dwd.k is replaced by the Case 2C key
+ * pointer when deletekey() recorded one.
+ * The path list is released on every exit path once it has been created.
+ * NOTE(review): for BIG_BT() trees deletekey() may point dwd.k at `delbuf`,
+ * which is a local of this function -- callers should not dereference dwd.k
+ * after return in that case; TODO confirm how callers use it. */
+static dwd_t remove_key(bt *btr, bt_data_t k, bool drt, bool leafd) {
+    if (!btds) { btds = cf_malloc(sizeof(btds_t)); bzero(btds, sizeof(btds_t)); }
+    btds->ndel++;
+    if (!btr->root) { dwd_t dwde; bzero(&dwde, sizeof(dwd_t)); return dwde; }
+    void *c2Cp = NULL; /* NOTE: c2Cp gets lost in recursion */ //DEBUG_DEL_START
+    bt_n *p = btr->root; int pi = 0;
+    cf_ll plist_tmp;
+    cf_ll * plist = &plist_tmp; // NOTE: plist stores ancestor line during recursive delete
+    if (drt) {
+        cf_ll_init(plist, ll_ai_bp_destroy_fn, false);
+        ADD_BP(plist, p, pi);//FR110
+    } else plist = NULL;
+    char delbuf[MAX_KEY_SIZE]; // NOTE: ksize > 8B needs buffer for CASE 1
+    dwd_t dwd = deletekey(btr, btr->root, k, DK_NONE, drt,
+                          p, pi, plist, &c2Cp, leafd, delbuf);
+#ifdef DEBUG_DEL_CASE_STATS
+    print_del_case_stats(leafd, dwd, btr);
+#endif
+    if (dwd.k) { // a key was really removed (dwd.k == NULL: leafd NO-OP)
+        btr->numkeys--;                                    //DEBUG_DEL_END
+        /* remove empty non-leaf node from root, */
+        if (!btr->root->n && !btr->root->leaf) { /* NOTE: tree decrease height */
+            btr->numnodes--;
+            bt_n *x = btr->root;
+            btr->root = NODES(btr, x)[0];
+            bt_free_btreenode(btr, x);
+        }
+        if (c2Cp) dwd.k = c2Cp;
+    }
+    // Release the recorded path on the no-op path as well, so the elements
+    // allocated by ADD_BP are not leaked.
+    if (plist) {
+        cf_ll_reduce(plist, true, ll_ai_bp_reduce_fn, NULL);
+        plist = NULL;
+    } // FREED 110
+    return dwd;
+}
+/* Public delete entry point: remove_key() with drt == 0. */
+dwd_t bt_delete(bt *btr, bt_data_t k, bool leafd) {
+    return remove_key(btr, k, 0, leafd);
+}
+
+// ACCESSORS ACCESSORS ACCESSORS ACCESSORS ACCESSORS ACCESSORS ACCESSORS
+/* For numeric key types only: returns 1 when akey->l lies in the closed range
+ * [key, key + DR] of slot i of x (a negative i is treated as 0) and both the
+ * key and its DR are non-zero; returns 0 otherwise. */
+static inline bool key_covers_miss(bt *btr, bt_n *x, int i, ai_obj *akey) {
+    if (!(C_IS_NUM(btr->s.ktype))) return 0;
+    if (i < 0) i = 0;
+    ulong mkey = getNumKey(btr, x, i);
+    ulong dr = (ulong)getDR(btr, x, i);
+    if (mkey && dr) {
+        ulong qkey = akey->l;
+        ulong span = mkey + dr;
+        //DEBUG_CURRKEY_MISS
+        if (qkey >= mkey && qkey <= span) return 1;
+    }
+    return 0;
+}
+#define SET_DWM_XIP { dwm.x = x; dwm.i = i; dwm.p = p; dwm.pi = pi; }
+/* Search for k starting at node x. On an exact match dwm.k is the stored key
+ * and dwm.x/i/p/pi give its node, slot, parent and parent slot. When the
+ * search ends at a leaf without a match, dwm.k is NULL; dwm.miss is set if a
+ * slot on the way down covered akey through its DR range. */
+dwm_t findnodekey(bt *btr, bt_n *x, bt_data_t k, ai_obj *akey) {
+    int r = -1, i = 0;
+    bt_n *p = btr->root; int pi = 0;
+    dwm_t dwm; bzero(&dwm, sizeof(dwm_t)); SET_DWM_XIP
+    while (x) {
+        i = findkindex(btr, x, k, &r, NULL);               //DEBUG_FIND_NODE_KEY
+        if (i >= 0 && !r) { SET_DWM_XIP dwm.k = KEYS(btr, x, i); return dwm; }
+        if (key_covers_miss(btr, x, i, akey)) { SET_DWM_XIP dwm.miss = 1; }
+        if (x->leaf) { dwm.k = NULL; return dwm; }
+        p = x; pi = i + 1; x = NODES(btr, x)[i + 1];
+    }
+    return dwm;
+}
+/* Return the stored key matching k, or NULL when there is no exact match. */
+bt_data_t bt_find(bt *btr, bt_data_t k, ai_obj *akey) { //Indexes still use this
+    dwm_t dwm = findnodekey(btr, btr->root, k, akey);
+    return dwm.k;
+}
+
+static bool check_min_miss(bt
*btr, ai_obj *alow) { + if (!btr->dirty_left) return 0; + ai_obj amin; convertStream2Key(bt_min(btr), &amin, btr); + return ai_objEQ(alow, &amin); +} +int bt_init_iterator(bt *btr, bt_data_t k, btIterator *iter, ai_obj *alow) { + if (!btr->root) return II_FAIL; + int r = -1; + bool lmiss = check_min_miss(btr, alow); + bool miss = 0; + uchar only_right = 1; + bt_n *x = btr->root; + while (x) { + int i = findkindex(btr, x, k, &r, iter); + if (i >= 0 && r == 0) return lmiss ? II_L_MISS : II_OK; + if (key_covers_miss(btr, x, i, alow)) miss = 1; //DEBUG_BT_II + if (miss) return II_MISS; + if (r < 0 || i != (x->n - 1)) only_right = 0; + if (x->leaf) { + if (i != (x->n - 1)) only_right = 0; + return only_right ? II_ONLY_RIGHT : II_LEAF_EXIT; + } + iter->bln->child = get_new_iter_child(iter); + x = NODES(btr, x)[i + 1]; + to_child(iter, x); + } + return II_FAIL; +} + +bool bt_exist(bt *btr, bt_data_t k, ai_obj *akey) { + int r = -1; + bt_n *x = btr->root; + while (x) { + int i = findkindex(btr, x, k, &r, NULL); + if (i >= 0 && r == 0) return 1; + if (key_covers_miss(btr, x, i, akey)) return 1; + if (x->leaf) return 0; + x = NODES(btr, x)[i + 1]; + } + return 0; +} + +static bt_data_t findminkey(bt *btr, bt_n *x) { + if (x->leaf) return KEYS(btr, x, 0); + else return findminkey(btr, NODES(btr, x)[0]); +} +bt_n *findminnode(bt *btr, bt_n *x) { + if (x->leaf) return x; + else return findminnode(btr, NODES(btr, x)[0]); +} +static bt_data_t findmaxkey(bt *btr, bt_n *x) { + if (x->leaf) return KEYS(btr, x, x->n - 1); + else return findmaxkey(btr, NODES(btr, x)[x->n]); +} +bt_data_t bt_min(bt *btr) { + if (!btr->root || !btr->numkeys) return NULL; + else return findminkey(btr, btr->root); +} +bt_data_t bt_max(bt *btr) { + if (!btr->root || !btr->numkeys) return NULL; + else return findmaxkey(btr, btr->root); +} + +// DESTRUCTOR DESTRUCTOR DESTRUCTOR DESTRUCTOR DESTRUCTOR DESTRUCTOR +static void destroy_bt_node(bt *btr, bt_n *x) { + if (!x->leaf) { + for (int i = 0; i <= x->n; 
i++) { + destroy_bt_node(btr, NODES(btr, x)[i]); + } + } + bt_free_btreenode(btr, x); /* memory management in btr */ +} +void bt_destroy(bt *btr) { + if (btr->root) { + if (btr->numkeys) destroy_bt_node (btr, btr->root); + else bt_free_btreenode(btr, btr->root); + btr->root = NULL; + } + bt_free_btree(btr); +} diff --git a/ai/src/bt_iterator.c b/ai/src/bt_iterator.c new file mode 100644 index 00000000..55971bf5 --- /dev/null +++ b/ai/src/bt_iterator.c @@ -0,0 +1,528 @@ +/* + * bt_iterator.c + * + * Copyright (C) 2013-2014 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ +/* + * This file implements Aerospike Index B-tree iterators. + */ + +#include +#include +#include +#include +#include +#include +#include +#include // For MAX() & MIN(). + +#include "ai_obj.h" +#include "bt_iterator.h" +#include "stream.h" +#include + +// HELPER_DEFINES HELPER_DEFINES HELPER_DEFINES HELPER_DEFINES HELPER_DEFINES +#define GET_NEW_CHILD(iter) \ + if (!iter->bln->child) { iter->bln->child = get_new_iter_child(iter); } + +#define SETITER8R(iter, btr, asc, l, lrev, n, nrev) \ + btSIter *siter = setIterator(iter, btr, asc ? l : lrev, asc ? n : nrev); + +#define CR8ITER8R(btr, asc, l, lrev, n, nrev) \ + btSIter *siter = createIterator(btr, asc ? 
l : lrev, asc ? n : nrev); + +bt_ll_n *get_new_iter_child(btIterator *iter) { //printf("get_newiterchild\n"); + assert(iter->num_nodes < MAX_BTREE_DEPTH); + bt_ll_n *nn = &(iter->nodes[iter->num_nodes]); + bzero(nn, sizeof(bt_ll_n)); + iter->num_nodes++; + return nn; +} + +void to_child(btIterator *iter, bt_n* self) { //printf("to_child\n"); + iter->depth++; + iter->bln->child->parent = iter->bln; + iter->bln->child->ik = 0; + iter->bln->child->in = 0; + iter->bln->child->self = self; + iter->bln = iter->bln->child; +} +static void toparentrecurse(btIterator *iter) { //printf("to_parent\n"); + if (!iter->bln->parent) { + iter->finished = 1; /* finished */ + return; + } + iter->depth--; + bt *btr = iter->btr; + void *child = KEYS(btr, iter->bln->self, iter->bln->ik); + iter->bln = iter->bln->parent; /* -> parent */ + void *parent = KEYS(btr, iter->bln->self, iter->bln->ik); + int x = btr->cmp(child, parent); + if (x > 0) { + if ((iter->bln->ik + 1) < iter->bln->self->n) iter->bln->ik++; + if ((iter->bln->in + 1) < iter->bln->self->n) iter->bln->in++; + else toparentrecurse(iter); + } +} +static void iter_leaf(btIterator *iter) { //printf("iter_leaf\n"); + if ((iter->bln->ik + 1) < iter->bln->self->n) iter->bln->ik++; + else toparentrecurse(iter); +} +static void tochildrecurse(btIterator *iter, bt_n* self) { + to_child(iter, self); + if (!iter->bln->self->leaf) { // depth-first + GET_NEW_CHILD(iter) + tochildrecurse(iter, NODES(iter->btr, iter->bln->self)[iter->bln->in]); + } +} +static void iter_node(btIterator *iter) { + if ((iter->bln->ik + 1) < iter->bln->self->n) iter->bln->ik++; + if ((iter->bln->in + 1) <= iter->bln->self->n) iter->bln->in++; + GET_NEW_CHILD(iter) + tochildrecurse(iter, NODES(iter->btr, iter->bln->self)[iter->bln->in]); +} + +static void *btNext(btSIter *siter, bt_n **rx, int *ri, bool asc) { + btIterator *iter = &(siter->x); + if (iter->finished) { + if (siter->scan) siter->missed = siter->nim; + return NULL; + } + if (asc) siter->missed = 
siter->nim; //Curr MISSED = LastLoop's NextIsMissed + bt_n *x = iter->bln->self; + if (rx) *rx = x; + int i = iter->bln->ik; + if (ri) *ri = i; + void *curr = KEYS(iter->btr, x, i); + siter->nim = getDR(iter->btr, x, i) ? 1 : 0; + if (iter->bln->self->leaf) (*iter->iLeaf)(iter); + else (*iter->iNode)(iter); + return curr; +} + +void to_child_rev(btIterator *iter, bt_n* self) { + iter->depth++; + iter->bln->child->parent = iter->bln; + iter->bln->child->ik = self->n - 1; + iter->bln->child->in = self->n; + iter->bln->child->self = self; + iter->bln = iter->bln->child; +} +static void tochildrecurserev(btIterator *iter, bt_n* self) { + to_child_rev(iter, self); + if (!iter->bln->self->leaf) { // depth-first + GET_NEW_CHILD(iter) + tochildrecurserev(iter, + NODES(iter->btr, iter->bln->self)[iter->bln->in]); + } +} +static void toparentrecurserev(btIterator *iter) { + if (!iter->bln->parent) { + iter->finished = 1; /* finished */ + return; + } + iter->depth--; + bt *btr = iter->btr; + void *child = KEYS(btr, iter->bln->self, iter->bln->ik); + iter->bln = iter->bln->parent; /* -> parent */ + void *parent = KEYS(btr, iter->bln->self, iter->bln->ik); + int x = btr->cmp(child, parent); + if (x < 0) { + if (iter->bln->ik) iter->bln->ik--; + if (iter->bln->in) iter->bln->in--; + else toparentrecurserev(iter); + } + if (iter->bln->in == iter->bln->self->n) iter->bln->in--; +} +static void iter_leaf_rev(btIterator *iter) { //printf("iter_leaf_rev\n"); + if (iter->bln->ik) iter->bln->ik--; + else toparentrecurserev(iter); +} +static void iter_node_rev(btIterator *iter) { + GET_NEW_CHILD(iter) + tochildrecurserev(iter, NODES(iter->btr, iter->bln->self)[iter->bln->in]); +} + +// INIT_ITERATOR INIT_ITERATOR INIT_ITERATOR INIT_ITERATOR INIT_ITERATOR +static void *setIter(bt *btr, bt_data_t bkey, btSIter *siter, ai_obj *alow, + bt_n **rx, int *ri, bool asc) { + btIterator *iter = &(siter->x); + int ret = bt_init_iterator(btr, bkey, iter, alow); + //printf("setIter: ret: %d\n", ret); 
+ if (ret == II_FAIL) return NULL; + siter->empty = 0; + if (ret == II_L_MISS) { + siter->nim = siter->missed = 1; + return NULL; + } + else if (ret == II_MISS) siter->nim = siter->missed = 1; + else if (ret != II_OK) { /* range queries, find nearest match */ + int x = btr->cmp(bkey, KEYS(btr, iter->bln->self, iter->bln->ik)); + if (x > 0) { + if (ret == II_ONLY_RIGHT) { // off end of B-tree + siter->empty = 1; + return NULL; + } else { // II_LEAF_EXIT + //printf("setIter: [II_LEAF_EXIT\n"); //TODO needed? + return btNext(siter, rx, ri, asc); // find next + } + } + } + if (rx) *rx = iter->bln->self; + if (ri) *ri = iter->bln->ik; + return KEYS(iter->btr, iter->bln->self, iter->bln->ik); +} +static void init_iter(btIterator *iter, bt *btr, + iter_single *itl, iter_single *itn) { + iter->btr = btr; + iter->high = LONG_MIN; + iter->iLeaf = itl; + iter->iNode = itn; + iter->finished = 0; + iter->num_nodes = 0; + iter->bln = &(iter->nodes[0]); + iter->bln->ik = iter->bln->in = 0; + iter->num_nodes++; + iter->bln->self = btr->root; + iter->bln->parent = iter->bln->child = NULL; + iter->depth = 0; +} + +// AEROSPIKE MULTI_THREAD +static btSIter *newIter() { + btSIter *siter = cf_malloc(sizeof(btSIter)); + bzero(siter, sizeof(btSIter)); + return siter; +} + +static btSIter *getIterator() { + return newIter(); +} + +static void releaseIterator(btSIter *siter) { + if (siter) { + cf_free(siter); + } + return; +} + +static btSIter *createIterator(bt *btr, iter_single *itl, iter_single *itn) { + btSIter *siter = getIterator(); + siter->dofree = 1; + siter->missed = 0; + siter->nim = 0; + siter->empty = 1; + siter->scan = 0; + siter->ktype = btr->s.ktype; + init_ai_obj(&siter->key); + siter->be.key = &(siter->key); + siter->be.val = NULL; + init_iter(&siter->x, btr, itl, itn); + return siter; +} +//extra insertion + +static btSIter *setIterator(btSIter *iter, bt *btr, iter_single *itl, iter_single *itn) { + btSIter *siter = iter; + siter->dofree = 0; + siter->missed = 0; + 
siter->nim = 0; + siter->empty = 1; + siter->scan = 0; + siter->ktype = btr->s.ktype; + init_ai_obj(&siter->key); + siter->be.key = &(siter->key); + siter->be.val = NULL; + init_iter(&siter->x, btr, itl, itn); + return siter; +} +void btReleaseRangeIterator(btSIter *siter) { + if (!siter) return; + if (siter->dofree) { + releaseIterator(siter); + } +} +static void setHigh(btSIter *siter, ai_obj *high, col_type_t ktype) { + if (C_IS_L(ktype) || C_IS_G(ktype)) { + siter->x.high = high->l; + } + else if (C_IS_DG(ktype)) { + siter->x.highy = high->y; + } +} + +static bool streamToBTEntry(uchar *stream, btSIter *siter, bt_n *x, int i) { + if (!stream) return 0; + if (i < 0) i = 0; + convertStream2Key(stream, siter->be.key, siter->x.btr); + siter->be.val = parseStream(stream, siter->x.btr); + bool gost = IS_GHOST(siter->x.btr, siter->be.val); + if (gost) { + siter->missed = 1; // GHOST key + siter->nim = 0; + } + siter->be.dr = x ? getDR(siter->x.btr, x, i) : 0; + siter->be.stream = stream; + siter->be.x = x; + siter->be.i = i; //NOTE: used by bt_validate_dirty + //DUMP_STREAM_TO_BT_ENTRY + return 1; +} +btSIter *btGetRangeIter(bt *btr, ai_obj *alow, ai_obj *ahigh, bool asc) { + if (!btr->root || !btr->numkeys) return NULL; + btk_t btk; + bool med; + uint32 ksize; //bt_dumptree(btr, btr->ktype); + CR8ITER8R(btr, asc, iter_leaf, iter_leaf_rev, iter_node, iter_node_rev); + setHigh(siter, asc ? ahigh : alow, btr->s.ktype); + char *bkey = createBTKey(asc ? alow : ahigh, + &med, &ksize, btr, &btk); //D032 + if (!bkey) goto rangeiter_err; + bt_n *x = NULL; + int i = -1; + uchar *stream = setIter(btr, bkey, siter, asc ? 
alow : ahigh, &x, &i, asc); + destroyBTKey(bkey, med); /* DESTROYED 032 */ + if (!streamToBTEntry(stream, siter, x, i)) goto rangeiter_err; + return siter; + +rangeiter_err: + btReleaseRangeIterator(siter); + return NULL; +} + + +btSIter *btSetRangeIter(btSIter * iter, bt *btr, ai_obj *alow, ai_obj *ahigh, bool asc) { + if (!btr->root || !btr->numkeys) return NULL; + btk_t btk; + bool med; + uint32 ksize; //bt_dumptree(btr, btr->ktype); + SETITER8R(iter, btr, asc, iter_leaf, iter_leaf_rev, iter_node, iter_node_rev); + setHigh(siter, asc ? ahigh : alow, btr->s.ktype); + char *bkey = createBTKey(asc ? alow : ahigh, + &med, &ksize, btr, &btk); //D032 + if (!bkey) goto rangeiter_err; + bt_n *x = NULL; + int i = -1; + uchar *stream = setIter(btr, bkey, siter, asc ? alow : ahigh, &x, &i, asc); + destroyBTKey(bkey, med); /* DESTROYED 032 */ + if (!streamToBTEntry(stream, siter, x, i)) goto rangeiter_err; + return siter; + +rangeiter_err: + btReleaseRangeIterator(siter); + return NULL; +} +btEntry *btRangeNext(btSIter *siter, bool asc) { //printf("btRangeNext\n"); + //printf("btRangeNext: siter: %p\n", (void *)siter); + //if (siter) printf("btRangeNext: empty: %d\n", siter->empty); + if (!siter || siter->empty) return NULL; + bt_n *x = NULL; + int i = -1; + uchar *stream = btNext(siter, &x, &i, asc); + if (!streamToBTEntry(stream, siter, x, i)) return NULL; + if (C_IS_L(siter->ktype) || C_IS_G(siter->ktype)) { + long l = siter->key.l; + if (l == siter->x.high) siter->x.finished = 1; /* exact match */ + if (!asc) { + //printf("btRangeNext: DESC: l: %lu dr: %u\n", + // l, getDR(siter->x.btr, x, i)); + l += getDR(siter->x.btr, x, i); + } + bool over = asc ? (l > siter->x.high) : (l < siter->x.high); + if (over && siter->nim) { + siter->missed = 1; + } + //printf("btRangeNext: over: %d l: %lu high: %lu\n", + // over, l, siter->x.high); + return over ? 
NULL : &(siter->be); + } else if (C_IS_DG(siter->ktype)) { + uint160 yy = siter->key.y; + int ret = u160Cmp(&yy, &siter->x.highy); + if (!ret) siter->x.finished = 1; /* exact match */ + if (!asc) { //TODO is ENDIANness of memcpy() correct + uint32 low; + char *spot = ((char *)&yy) + 12; + memcpy(&low, spot, 4); + low += getDR(siter->x.btr, x, i); + memcpy(spot, &low, 4); + } + bool over = asc ? (ret > 0) : (ret < 0); + return over ? NULL : &(siter->be); + } else { + return NULL; + } +} + +// FULL_BTREE_ITERATOR FULL_BTREE_ITERATOR FULL_BTREE_ITERATOR +bool assignMinKey(bt *btr, ai_obj *akey) { //TODO combine w/ setIter() + void *e = bt_min(btr); + if (!e) return 0; // iter can be initialised + convertStream2Key(e, akey, btr); + return 1; // w/ this lookup +} +bool assignMaxKey(bt *btr, ai_obj *akey) { + void *e = bt_max(btr); + if (!e) return 0; + convertStream2Key(e, akey, btr); + return 1; +} +btSIter *btGetFullRangeIter(bt *btr, bool asc, cswc_t *w) { + cswc_t W; // used in setHigh() + if (!btr->root || !btr->numkeys) return NULL; + if (!w) w = &W; + ai_obj *aL = &w->wf.alow, *aH = &w->wf.ahigh; + if (!assignMinKey(btr, aL) || !assignMaxKey(btr, aH)) return NULL; + btk_t btk; + bool med; + uint32 ksize; + CR8ITER8R(btr, asc, iter_leaf, iter_leaf_rev, iter_node, iter_node_rev); + siter->scan = 1; + setHigh(siter, asc ? aH : aL, btr->s.ktype); + char *bkey = createBTKey(asc ? aL : aH, + &med, &ksize, btr, &btk); //DEST 030 + if (!bkey) goto frangeiter_err; + bt_n *x = NULL; + int i = -1; + uchar *stream = setIter(btr, bkey, siter, asc ? 
aL : aH, &x, &i, asc); + destroyBTKey(bkey, med); /* DESTROYED 030 */ + if (!stream && siter->missed) return siter;//IILMISS + if (!streamToBTEntry(stream, siter, x, i)) goto frangeiter_err; + if (btr->dirty_left) siter->missed = 1; // FULL means 100% FULL + return siter; + +frangeiter_err: + btReleaseRangeIterator(siter); + return NULL; +} + +btSIter *btSetFullRangeIter(btSIter *iter, bt *btr, bool asc, cswc_t *w) { + cswc_t W; // used in setHigh() + if (!btr->root || !btr->numkeys) return NULL; + if (!w) w = &W; + ai_obj *aL = &w->wf.alow, *aH = &w->wf.ahigh; + if (!assignMinKey(btr, aL) || !assignMaxKey(btr, aH)) return NULL; + btk_t btk; + bool med; + uint32 ksize; + SETITER8R(iter, btr, asc, iter_leaf, iter_leaf_rev, iter_node, iter_node_rev); + siter->scan = 1; + setHigh(siter, asc ? aH : aL, btr->s.ktype); + char *bkey = createBTKey(asc ? aL : aH, + &med, &ksize, btr, &btk); //DEST 030 + if (!bkey) goto frangeiter_err; + bt_n *x = NULL; + int i = -1; + uchar *stream = setIter(btr, bkey, siter, asc ? aL : aH, &x, &i, asc); + destroyBTKey(bkey, med); /* DESTROYED 030 */ + if (!stream && siter->missed) return siter;//IILMISS + if (!streamToBTEntry(stream, siter, x, i)) goto frangeiter_err; + if (btr->dirty_left) siter->missed = 1; // FULL means 100% FULL + return siter; + +frangeiter_err: + btReleaseRangeIterator(siter); + return NULL; +} + +typedef struct four_longs { + long cnt; + long ofst; + long diff; + long over; +} fol_t; + +#define INIT_ITER_BEENTRY(siter, btr, x, i) \ + { uchar *iistream = KEYS(btr, x, i); streamToBTEntry(iistream, siter, x, i); } +static bool btScionFind(btSIter *siter, bt_n *x, ulong ofst, bt *btr, bool asc, + cswc_t *w, long lim) { + int i = asc ? 0 : x->n; + int fin = asc ? x->n + 1 : -1; + while (i != fin) { + if (x->leaf) break; + uint32_t scion = NODES(btr, x)[i]->scion; + if (scion >= ofst) { + bool i_end_n = (i == siter->x.bln->self->n); + siter->x.bln->in = i; + siter->x.bln->ik = (i_end_n) ? 
i - 1 : i; + if (scion == ofst) { + if (!asc) { + siter->x.bln->in = siter->x.bln->ik = i - 1; + } + return 1; + } + siter->x.bln->child = get_new_iter_child(&siter->x); + to_child(&siter->x, NODES(btr, x)[i]); + bt_n *kid = NODES(btr, x)[i]; + if (!kid->leaf) { + btScionFind(siter, kid, ofst, btr, asc, w, lim); + return 1; + } else x = kid; + break; + } else ofst -= (scion + 1); // +1 for NODE itself + i = asc ? i + 1 : i - 1; // loop increment + } + // Now Find the rest of the OFFSET (respecting DRs) + uint32 n = siter->x.bln->self->n; + i = asc ? 0 : n - 1; + fin = asc ? MIN(ofst, n) : MAX(-1, (n - ofst)); + int last = asc ? n - 1 : 0; + ulong cnt = 0; + //TODO findminnode() is too inefficient -> needs to be a part of btr + bt_n *minx = findminnode(btr, btr->root); + int btdl = btr->dirty_left; + int dr = 0; + while (i != fin) { + dr = getDR(btr, x, i); + cnt += dr; + if (!i && x == minx) cnt += btdl; + if (cnt >= ofst) break; + cnt++; + i = asc ? i + 1 : i - 1; // loop increment + } + if (i == fin && i == last) { + if (cnt >= x->scion) return 0; + } + else if (cnt < ofst) return 0; //OFST 2big + siter->x.bln->ik = i; + INIT_ITER_BEENTRY(siter, btr, x, siter->x.bln->ik); + if (asc) { + if ((ofst + dr) != cnt) siter->missed = 1; + } + else { + if (!i && x == minx) { + if (ofst != (cnt - btdl)) siter->missed = 1; + } + else { + if (ofst != cnt) siter->missed = 1; + } + } + return 1; +} +btSIter *btGetFullXthIter(bt *btr, ulong oofst, bool asc, cswc_t *w, long lim) { + ulong ofst = oofst; + cswc_t W; // used in setHigh() + if (!btr->root || !btr->numkeys) return NULL; + if (!w) w = &W; + ai_obj *aL = &w->wf.alow, *aH = &w->wf.ahigh; + if (!assignMinKey(btr, aL) || !assignMaxKey(btr, aH)) return NULL; + CR8ITER8R(btr, asc, iter_leaf, iter_leaf_rev, iter_node, iter_node_rev); + setHigh(siter, asc ? 
aH : aL, btr->s.ktype); + if (btScionFind(siter, btr->root, ofst, btr, asc, w, lim)) siter->empty = 0; + return siter; +} diff --git a/ai/src/bt_output.c b/ai/src/bt_output.c new file mode 100644 index 00000000..440fbf01 --- /dev/null +++ b/ai/src/bt_output.c @@ -0,0 +1,178 @@ +/* + * Copyright 1997-1998, 2001 John-Mark Gurney. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + */ + +#include +#include +#include +#include + +#include "bt_output.h" +#include "bt_iterator.h" +#include "stream.h" + +#define PRINT_EVICTED_KEYS + +#define DEBUG_BT_TYPE(fp, btr) \ + fprintf(fp, "btr: %p NBT: %d NONE: %d " \ + "LL: %d YL: %d " \ + "BIG: %d ksize: %d\n", \ + btr, NBT(btr), NONE_BT(btr), \ + LL(btr), YL(btr), \ + BIG_BT(btr), btr->s.ksize); + +static int treeheight(bt *btr) +{ + bt_n *x = btr->root; + if (!x) { + return 0; + } + + int ret = 0; + while (x && !x->leaf) { + x = NODES(btr, x)[0]; + ret++; + } + + return ++ret; +} + +void bt_dump_info(FILE *fp, bt *btr) +{ + fprintf(fp, "BT: %p t: %d nbits: %d nbyte: %d kbyte: %d " + "ksize: %d koff: %d noff: %d numkeys: %d numnodes: %d " + "height: %d btr: %p btype: %d ktype: %d bflag: %d " + "num: %d root: %p dirty_left: %u msize: %ld dsize: %ld " + "dirty: %u\n", + btr, btr->t, btr->nbits, btr->nbyte, btr->kbyte, btr->s.ksize, + btr->keyofst, btr->nodeofst, btr->numkeys, btr->numnodes, + treeheight(btr), (void *)btr, btr->s.btype, btr->s.ktype, + btr->s.bflag, btr->s.num, btr->root, + btr->dirty_left, btr->msize, btr->dsize, btr->dirty); + 
DEBUG_BT_TYPE(fp, btr); +} + +static void bt_dump_array(FILE *fp, ai_arr *arr, bool verbose) +{ + fprintf(fp, "Array: capacity: %d used: %d\n", arr->capacity, arr->used); + if (verbose) { + for (int i = 0; i < arr->used; i++) { + const int len = 20; + char digest_str[2 + (len * 2) + 1]; + digest_str[0] = '\0'; + generate_packed_hex_string((uint8_t *) &arr->data[i * CF_DIGEST_KEY_SZ], len, digest_str); + fprintf(fp, "\tData[%d]: %s\n", i, digest_str); + } + } +} + +static void bt_dump_nbtr(FILE *fp, ai_nbtr *nbtr, bool is_index, bool verbose) +{ + if (nbtr->is_btree) { + bt_dumptree(fp, nbtr->u.nbtr, is_index, verbose); + } else { + bt_dump_array(fp, nbtr->u.arr, verbose); + } +} +/* Print node x to fp: a NODE header (or, when verbose, a LEAF header), then each key held in x, then -- for verbose non-leaf nodes -- every child recursively at depth + 1. */ +static void dump_tree_node(FILE *fp, bt *btr, bt_n *x, int depth, bool is_index, int slot, bool verbose) +{ + if (!x->leaf) { + fprintf(fp, "%d: NODE: ", depth); + if (x->dirty > 0) { + GET_BTN_SIZE(x->leaf); + void *ds = GET_DS(x, nsize); + fprintf(fp, "slot: %d n: %d scion: %d -> (%p) ds: %p dirty: %u\n", + slot, x->n, x->scion, (void *)x, ds, x->dirty); + } else { + fprintf(fp, "slot: %d n: %d scion: %d -> (%p)\n", + slot, x->n, x->scion, (void *) x); + } + } else if (verbose) { + if (x->dirty > 0) { + GET_BTN_SIZE(x->leaf) void *ds = GET_DS(x, nsize); + fprintf(fp, "%d: LEAF: slot: %d n: %d scion: %d -> (%p) ds: %p dirty: %u\n", + depth, slot, x->n, x->scion, (void *)x, ds, x->dirty); + } else { + fprintf(fp, "%d: LEAF: slot: %d n: %d scion: %d -> (%p)\n", + depth, slot, x->n, x->scion, (void *)x); + } + if (btr->dirty_left) { + if (findminnode(btr, btr->root) == x) { +#ifdef PRINT_EVICTED_KEYS + for (uint32 i = 1; i <= btr->dirty_left; i++) { + fprintf(fp, "\t\t\t\t\tEVICTED KEY:\t\t\t%u\n", i); + } +#else + fprintf(fp, "\t\tDL: %u\n", btr->dirty_left); +#endif + } + } + } + + for (int i = 0; i < x->n; i++) { + void *be = KEYS(btr, x, i); + ai_obj akey; + convertStream2Key(be, &akey, btr); + void *rrow = parseStream(be, btr); + if (is_index) { + fprintf(fp, "\tINDEX-KEY: "); + 
dump_ai_obj_as_digest(fp, &akey); + if (!rrow) { fprintf(fp, "\t\tTOTAL EVICTION\n"); } + else { bt_dump_nbtr(fp, (ai_nbtr *) rrow, 0, verbose); } + } else if (verbose) { + bool key_printed = 0; + if (LL(btr)) { + fprintf(fp, "\t\tLL: PTR: %p\t", rrow); + } else { + bool gost = IS_GHOST(btr, rrow); + if (gost) { fprintf(fp, "\t\tROW [%d]: %p \tGHOST-", i, rrow); } + else { fprintf(fp, "\t\tROW [%d]: %p\t", i, rrow); } + } + if (!key_printed) { + fprintf(fp, "KEY: "); + dump_ai_obj_as_digest(fp, &akey); + } + if (x->dirty > 0) { +#ifdef PRINT_EVICTED_KEYS + uint32 dr = getDR(btr, x, i); + if (dr) { fprintf(fp, "\t\t\t\tDR: %d\n", dr); + /* also list each evicted key; this loop used to sit in the else (dr == 0) branch, where it could never run */ + ulong beg = akey.l; + for (ulong j = 1; j <= (ulong)dr; j++) { + fprintf(fp, "\t\t\t\t\tEVICTED KEY:\t\t\t%lu\n", beg + j); + } + } +#else + fprintf(fp, "\t\t\t\tDR: %d\n", getDR(btr, x, i)); +#endif + } + } + } + if (!x->leaf && verbose) { + depth++; + for (int i = 0; i <= x->n; i++) { + fprintf(fp, "\t\tNPTR[%d]: %p\n", i, NODES(btr, x)[i]); + dump_tree_node(fp, btr, NODES(btr, x)[i], depth, is_index, i, verbose); + } + } +} + +void bt_dumptree(FILE *fp, bt *btr, bool is_index, bool verbose) +{ + bt_dump_info(fp, btr); + if (btr->root && btr->numkeys > 0) { + dump_tree_node(fp, btr, btr->root, 0, is_index, 0, verbose); + } + fprintf(fp, "\n"); +} diff --git a/ai/src/stream.c b/ai/src/stream.c new file mode 100644 index 00000000..9fe35bef --- /dev/null +++ b/ai/src/stream.c @@ -0,0 +1,166 @@ +/* + * stream.c + * + * Copyright (C) 2012-2014 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. 
+ * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ +/* + * This file implements stream parsing for rows. + */ + +#include +#include +#include +#include +#include +#include + +#include "ai_obj.h" +#include "bt.h" +#include "stream.h" + +#include + +/* COMPARE COMPARE COMPARE COMPARE COMPARE COMPARE COMPARE COMPARE */ +int u160Cmp(void *s1, void *s2) { + char *p1 = (char *)s1; + char *p2 = (char *)s2; + uint128 x1, x2; + memcpy(&x1, p1 + 4, 16); + memcpy(&x2, p2 + 4, 16); + if (x1 == x2) { + uint32 u1; + memcpy(&u1, p1, 4); + uint32 u2; + memcpy(&u2, p2, 4); + return u1 == u2 ? 0 : (u1 > u2) ? 1 : -1; + } else return (x1 > x2) ? 1 : -1; +} + +static inline int LCmp(void *s1, void *s2) { + llk *ll1 = (llk *)s1; + llk *ll2 = (llk *)s2; + long l1 = ll1->key; + long l2 = ll2->key; + return l1 == l2 ? 0 : (l1 > l2) ? 1 : -1; +} + +int llCmp(void *s1, void *s2) { + return LCmp(s1, s2); +} + +static inline int YCmp(void *s1, void *s2) { + ylk *yl1 = (ylk *)s1; + ylk *yl2 = (ylk *)s2; + uint160 y1 = yl1->key; + uint160 y2 = yl2->key; + return u160Cmp(&y1, &y2); +} +int ylCmp(void *s1, void *s2) { + return YCmp(s1, s2); +} + +void destroyBTKey(char *btkey, bool med) { + if (med) cf_free(btkey); +} +/* Return a key pointer for btr's layout: &akey->y in place for NBT_DG, otherwise akey's value copied into *btk (LL or YL). *med is always set to 0 and *ksize to VOIDSIZE; unknown layouts assert and return NULL. */ +char *createBTKey(ai_obj *akey, bool *med, uint32 *ksize, bt *btr, btk_t *btk) { + *med = 0; + *ksize = VOIDSIZE; + + if (NBT_DG(btr)) { + return (char *)&akey->y; + } else if (LL(btr)) { + btk->LL.key = akey->l; + return (char *)&btk->LL; + } else if (YL(btr)) { + btk->YL.key = akey->y; + return (char *)&btk->YL; + } + + assert(! 
"Unsupported Btree type"); + return NULL; +} + +uchar *parseStream(uchar *stream, bt *btr) { + if (!stream || NBT_DG(btr)) { + return NULL; + } else if (LL(btr)) { + return (uchar *)(*(llk *)(stream)).val; + } else if (YL(btr)) { + return (uchar *)(long)(*(ylk *)(stream)).val; + } + assert(! "Unsupported Btree type"); + return NULL; +} + +void convertStream2Key(uchar *stream, ai_obj *key, bt *btr) { + init_ai_obj(key); + if (NBT_DG(btr)) { + key->type = COL_TYPE_DIGEST; + memcpy(&key->y, stream, AS_DIGEST_KEY_SZ); + } else if (LL(btr)) { + key->type = COL_TYPE_LONG; + key->l = ((llk *)stream)->key; + } else if (YL(btr)) { + key->type = COL_TYPE_DIGEST; + key->y = ((ylk *)stream)->key; + } else { + assert(! "Unsupported Btree type"); + } +} + +static void *OBT_createStream(bt *btr, void *val, char *btkey, crs_t *crs) { + + if (LL(btr)) { + llk *ll = (llk *)btkey; + crs->LL_StreamPtr.key = ll->key; + crs->LL_StreamPtr.val = (ulong) val; + return &crs->LL_StreamPtr; + } else if (YL(btr)) { + ylk *yl = (ylk *)btkey; + crs->YL_StreamPtr.key = yl->key; + crs->YL_StreamPtr.val = (ulong) val; + return &crs->YL_StreamPtr; + } + + assert(! "OBT_createStream ERROR"); + return NULL; +} + +void *createStream(bt *btr, void *val, char *btkey, uint32 klen, uint32 *size, + crs_t *crs) { + *size = 0; + if (NBT(btr)) { + return btkey; + } else if (OTHER_BT(btr)) { + return OBT_createStream(btr, val, btkey, crs); + } + + assert(! "Unsupported Btree type"); + return NULL; +} + +bool destroyStream(bt *btr, uchar *ostream) { + if (!ostream || NBT(btr) || OTHER_BT(btr)) { + return 0; + } + + assert(! 
"Unsupported Btree Type"); + return 1; +} diff --git a/apidocs/Makefile b/apidocs/Makefile new file mode 100644 index 00000000..34da3990 --- /dev/null +++ b/apidocs/Makefile @@ -0,0 +1,10 @@ + +.default: docs + +.PHONY: docs +docs: + doxygen src/doxyfile + +.PHONY: docs-clean +docs-clean: + rm -rf target diff --git a/apidocs/src/doxyfile b/apidocs/src/doxyfile new file mode 100644 index 00000000..aa958e58 --- /dev/null +++ b/apidocs/src/doxyfile @@ -0,0 +1,1792 @@ +# Doxyfile 1.8.1.2 + +# This file describes the settings to be used by the documentation system +# doxygen (www.doxygen.org) for a project. +# +# All text after a hash (#) is considered a comment and will be ignored. +# The format is: +# TAG = value [value, ...] +# For lists items can also be appended using: +# TAG += value [value, ...] +# Values that contain spaces should be placed between quotes (" "). + +#--------------------------------------------------------------------------- +# Project related configuration options +#--------------------------------------------------------------------------- + +# This tag specifies the encoding used for all characters in the config file +# that follow. The default is UTF-8 which is also the encoding used for all +# text before the first occurrence of this tag. Doxygen uses libiconv (or the +# iconv built into libc) for the transcoding. See +# http://www.gnu.org/software/libiconv for the list of possible encodings. + +DOXYFILE_ENCODING = UTF-8 + +# The PROJECT_NAME tag is a single word (or sequence of words) that should +# identify the project. Note that if you do not use Doxywizard you need +# to put quotes around the project name if it contains spaces. + +PROJECT_NAME = "Aerospike Server" + +# The PROJECT_NUMBER tag can be used to enter a project or revision number. +# This could be handy for archiving the generated documentation or +# if some version control system is used. 
+ +PROJECT_NUMBER = + +# Using the PROJECT_BRIEF tag one can provide an optional one line description +# for a project that appears at the top of each page and should give the viewer +# a quick idea about the purpose of the project. Keep the description short. + +PROJECT_BRIEF = + +# With the PROJECT_LOGO tag one can specify a logo or icon that is +# included in the documentation. The maximum height of the logo should not +# exceed 55 pixels and the maximum width should not exceed 200 pixels. +# Doxygen will copy the logo to the output directory. + +PROJECT_LOGO = + +# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) +# base path where the generated documentation will be put. +# If a relative path is entered, it will be relative to the location +# where doxygen was started. If left blank the current directory will be used. + +OUTPUT_DIRECTORY = target + +# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create +# 4096 sub-directories (in 2 levels) under the output directory of each output +# format and will distribute the generated files over these directories. +# Enabling this option can be useful when feeding doxygen a huge amount of +# source files, where putting all generated files in the same directory would +# otherwise cause performance problems for the file system. + +CREATE_SUBDIRS = NO + +# The OUTPUT_LANGUAGE tag is used to specify the language in which all +# documentation generated by doxygen is written. Doxygen will use this +# information to generate all constant output in the proper language. 
+# The default language is English, other supported languages are: +# Afrikaans, Arabic, Brazilian, Catalan, Chinese, Chinese-Traditional, +# Croatian, Czech, Danish, Dutch, Esperanto, Farsi, Finnish, French, German, +# Greek, Hungarian, Italian, Japanese, Japanese-en (Japanese with English +# messages), Korean, Korean-en, Lithuanian, Norwegian, Macedonian, Persian, +# Polish, Portuguese, Romanian, Russian, Serbian, Serbian-Cyrillic, Slovak, +# Slovene, Spanish, Swedish, Ukrainian, and Vietnamese. + +OUTPUT_LANGUAGE = English + +# If the BRIEF_MEMBER_DESC tag is set to YES (the default) Doxygen will +# include brief member descriptions after the members that are listed in +# the file and class documentation (similar to JavaDoc). +# Set to NO to disable this. + +BRIEF_MEMBER_DESC = YES + +# If the REPEAT_BRIEF tag is set to YES (the default) Doxygen will prepend +# the brief description of a member or function before the detailed description. +# Note: if both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the +# brief descriptions will be completely suppressed. + +REPEAT_BRIEF = YES + +# This tag implements a quasi-intelligent brief description abbreviator +# that is used to form the text in various listings. Each string +# in this list, if found as the leading text of the brief description, will be +# stripped from the text and the result after processing the whole list, is +# used as the annotated text. Otherwise, the brief description is used as-is. +# If left blank, the following values are used ("$name" is automatically +# replaced with the name of the entity): "The $name class" "The $name widget" +# "The $name file" "is" "provides" "specifies" "contains" +# "represents" "a" "an" "the" + +ABBREVIATE_BRIEF = + +# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then +# Doxygen will generate a detailed section even if there is only a brief +# description. 
+ +ALWAYS_DETAILED_SEC = NO + +# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all +# inherited members of a class in the documentation of that class as if those +# members were ordinary class members. Constructors, destructors and assignment +# operators of the base classes will not be shown. + +INLINE_INHERITED_MEMB = NO + +# If the FULL_PATH_NAMES tag is set to YES then Doxygen will prepend the full +# path before files name in the file list and in the header files. If set +# to NO the shortest path that makes the file name unique will be used. + +FULL_PATH_NAMES = NO + +# If the FULL_PATH_NAMES tag is set to YES then the STRIP_FROM_PATH tag +# can be used to strip a user-defined part of the path. Stripping is +# only done if one of the specified strings matches the left-hand part of +# the path. The tag can be used to show relative paths in the file list. +# If left blank the directory from which doxygen is run is used as the +# path to strip. + +STRIP_FROM_PATH = + +# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of +# the path mentioned in the documentation of a class, which tells +# the reader which header file to include in order to use a class. +# If left blank only the name of the header file containing the class +# definition is used. Otherwise one should specify the include paths that +# are normally passed to the compiler using the -I flag. + +STRIP_FROM_INC_PATH = + +# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter +# (but less readable) file names. This can be useful if your file system +# doesn't support long names like on DOS, Mac, or CD-ROM. + +SHORT_NAMES = NO + +# If the JAVADOC_AUTOBRIEF tag is set to YES then Doxygen +# will interpret the first line (until the first dot) of a JavaDoc-style +# comment as the brief description. 
If set to NO, the JavaDoc +# comments will behave just like regular Qt-style comments +# (thus requiring an explicit @brief command for a brief description.) + +JAVADOC_AUTOBRIEF = NO + +# If the QT_AUTOBRIEF tag is set to YES then Doxygen will +# interpret the first line (until the first dot) of a Qt-style +# comment as the brief description. If set to NO, the comments +# will behave just like regular Qt-style comments (thus requiring +# an explicit \brief command for a brief description.) + +QT_AUTOBRIEF = NO + +# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make Doxygen +# treat a multi-line C++ special comment block (i.e. a block of //! or /// +# comments) as a brief description. This used to be the default behaviour. +# The new default is to treat a multi-line C++ comment block as a detailed +# description. Set this tag to YES if you prefer the old behaviour instead. + +MULTILINE_CPP_IS_BRIEF = NO + +# If the INHERIT_DOCS tag is set to YES (the default) then an undocumented +# member inherits the documentation from any documented member that it +# re-implements. + +INHERIT_DOCS = YES + +# If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce +# a new page for each member. If set to NO, the documentation of a member will +# be part of the file/class/namespace that contains it. + +SEPARATE_MEMBER_PAGES = NO + +# The TAB_SIZE tag can be used to set the number of spaces in a tab. +# Doxygen uses this value to replace tabs by spaces in code fragments. + +TAB_SIZE = 4 + +# This tag can be used to specify a number of aliases that acts +# as commands in the documentation. An alias has the form "name=value". +# For example adding "sideeffect=\par Side Effects:\n" will allow you to +# put the command \sideeffect (or @sideeffect) in the documentation, which +# will result in a user-defined paragraph with heading "Side Effects:". +# You can put \n's in the value part of an alias to insert newlines. 
+ +ALIASES = + +# This tag can be used to specify a number of word-keyword mappings (TCL only). +# A mapping has the form "name=value". For example adding +# "class=itcl::class" will allow you to use the command class in the +# itcl::class meaning. + +TCL_SUBST = + +# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C +# sources only. Doxygen will then generate output that is more tailored for C. +# For instance, some of the names that are used will be different. The list +# of all members will be omitted, etc. + +OPTIMIZE_OUTPUT_FOR_C = YES + +# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java +# sources only. Doxygen will then generate output that is more tailored for +# Java. For instance, namespaces will be presented as packages, qualified +# scopes will look different, etc. + +OPTIMIZE_OUTPUT_JAVA = NO + +# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran +# sources only. Doxygen will then generate output that is more tailored for +# Fortran. + +OPTIMIZE_FOR_FORTRAN = NO + +# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL +# sources. Doxygen will then generate output that is tailored for +# VHDL. + +OPTIMIZE_OUTPUT_VHDL = NO + +# Doxygen selects the parser to use depending on the extension of the files it +# parses. With this tag you can assign which parser to use for a given extension. +# Doxygen has a built-in mapping, but you can override or extend it using this +# tag. The format is ext=language, where ext is a file extension, and language +# is one of the parsers supported by doxygen: IDL, Java, Javascript, CSharp, C, +# C++, D, PHP, Objective-C, Python, Fortran, VHDL, C, C++. For instance to make +# doxygen treat .inc files as Fortran files (default is PHP), and .f files as C +# (default is Fortran), use: inc=Fortran f=C. Note that for custom extensions +# you also need to set FILE_PATTERNS otherwise the files are not read by doxygen. 
+ +EXTENSION_MAPPING = + +# If MARKDOWN_SUPPORT is enabled (the default) then doxygen pre-processes all +# comments according to the Markdown format, which allows for more readable +# documentation. See http://daringfireball.net/projects/markdown/ for details. +# The output of markdown processing is further processed by doxygen, so you +# can mix doxygen, HTML, and XML commands with Markdown formatting. +# Disable only in case of backward compatibility issues. + +MARKDOWN_SUPPORT = YES + +# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want +# to include (a tag file for) the STL sources as input, then you should +# set this tag to YES in order to let doxygen match function declarations and +# definitions whose arguments contain STL classes (e.g. func(std::string); vs. +# func(std::string) {}). This also makes the inheritance and collaboration +# diagrams that involve STL classes more complete and accurate. + +BUILTIN_STL_SUPPORT = NO + +# If you use Microsoft's C++/CLI language, you should set this option to YES to +# enable parsing support. + +CPP_CLI_SUPPORT = NO + +# Set the SIP_SUPPORT tag to YES if your project consists of sip sources only. +# Doxygen will parse them like normal C++ but will assume all classes use public +# instead of private inheritance when no explicit protection keyword is present. + +SIP_SUPPORT = NO + +# For Microsoft's IDL there are propget and propput attributes to indicate getter +# and setter methods for a property. Setting this option to YES (the default) +# will make doxygen replace the get and set methods by a property in the +# documentation. This will only work if the methods are indeed getting or +# setting a simple type. If this is not the case, or you want to show the +# methods anyway, you should set this option to NO. 
+ +IDL_PROPERTY_SUPPORT = YES + +# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC +# tag is set to YES, then doxygen will reuse the documentation of the first +# member in the group (if any) for the other members of the group. By default +# all members of a group must be documented explicitly. + +DISTRIBUTE_GROUP_DOC = NO + +# Set the SUBGROUPING tag to YES (the default) to allow class member groups of +# the same type (for instance a group of public functions) to be put as a +# subgroup of that type (e.g. under the Public Functions section). Set it to +# NO to prevent subgrouping. Alternatively, this can be done per class using +# the \nosubgrouping command. + +SUBGROUPING = YES + +# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and +# unions are shown inside the group in which they are included (e.g. using +# @ingroup) instead of on a separate page (for HTML and Man pages) or +# section (for LaTeX and RTF). + +INLINE_GROUPED_CLASSES = NO + +# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and +# unions with only public data fields will be shown inline in the documentation +# of the scope in which they are defined (i.e. file, namespace, or group +# documentation), provided this scope is documented. If set to NO (the default), +# structs, classes, and unions are shown on a separate page (for HTML and Man +# pages) or section (for LaTeX and RTF). + +INLINE_SIMPLE_STRUCTS = NO + +# When TYPEDEF_HIDES_STRUCT is enabled, a typedef of a struct, union, or enum +# is documented as struct, union, or enum with the name of the typedef. So +# typedef struct TypeS {} TypeT, will appear in the documentation as a struct +# with name TypeT. When disabled the typedef will appear as a member of a file, +# namespace, or class. And the struct will be named TypeS. 
This can typically +# be useful for C code in case the coding convention dictates that all compound +# types are typedef'ed and only the typedef is referenced, never the tag name. + +TYPEDEF_HIDES_STRUCT = YES + +# The SYMBOL_CACHE_SIZE determines the size of the internal cache used to +# determine which symbols to keep in memory and which to flush to disk. +# When the cache is full, less often used symbols will be written to disk. +# For small to medium size projects (<1000 input files) the default value is +# probably good enough. For larger projects a too small cache size can cause +# doxygen to be busy swapping symbols to and from disk most of the time +# causing a significant performance penalty. +# If the system has enough physical memory increasing the cache will improve the +# performance by keeping more symbols in memory. Note that the value works on +# a logarithmic scale so increasing the size by one will roughly double the +# memory usage. The cache size is given by this formula: +# 2^(16+SYMBOL_CACHE_SIZE). The valid range is 0..9, the default is 0, +# corresponding to a cache size of 2^16 = 65536 symbols. + +SYMBOL_CACHE_SIZE = 0 + +# Similar to the SYMBOL_CACHE_SIZE the size of the symbol lookup cache can be +# set using LOOKUP_CACHE_SIZE. This cache is used to resolve symbols given +# their name and scope. Since this can be an expensive process and often the +# same symbol appears multiple times in the code, doxygen keeps a cache of +# pre-resolved symbols. If the cache is too small doxygen will become slower. +# If the cache is too large, memory is wasted. The cache size is given by this +# formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range is 0..9, the default is 0, +# corresponding to a cache size of 2^16 = 65536 symbols. 
+ +LOOKUP_CACHE_SIZE = 0 + +#--------------------------------------------------------------------------- +# Build related configuration options +#--------------------------------------------------------------------------- + +# If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in +# documentation are documented, even if no documentation was available. +# Private class members and static file members will be hidden unless +# the EXTRACT_PRIVATE and EXTRACT_STATIC tags are set to YES + +EXTRACT_ALL = YES + +# If the EXTRACT_PRIVATE tag is set to YES all private members of a class +# will be included in the documentation. + +EXTRACT_PRIVATE = YES + +# If the EXTRACT_PACKAGE tag is set to YES all members with package or internal scope will be included in the documentation. + +EXTRACT_PACKAGE = YES + +# If the EXTRACT_STATIC tag is set to YES all static members of a file +# will be included in the documentation. + +EXTRACT_STATIC = YES + +# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs) +# defined locally in source files will be included in the documentation. +# If set to NO only classes defined in header files are included. + +EXTRACT_LOCAL_CLASSES = YES + +# This flag is only useful for Objective-C code. When set to YES local +# methods, which are defined in the implementation section but not in +# the interface are included in the documentation. +# If set to NO (the default) only methods in the interface are included. + +EXTRACT_LOCAL_METHODS = YES + +# If this flag is set to YES, the members of anonymous namespaces will be +# extracted and appear in the documentation as a namespace called +# 'anonymous_namespace{file}', where file will be replaced with the base +# name of the file that contains the anonymous namespace. By default +# anonymous namespaces are hidden. 
+ +EXTRACT_ANON_NSPACES = YES + +# If the HIDE_UNDOC_MEMBERS tag is set to YES, Doxygen will hide all +# undocumented members of documented classes, files or namespaces. +# If set to NO (the default) these members will be included in the +# various overviews, but no documentation section is generated. +# This option has no effect if EXTRACT_ALL is enabled. + +HIDE_UNDOC_MEMBERS = YES + +# If the HIDE_UNDOC_CLASSES tag is set to YES, Doxygen will hide all +# undocumented classes that are normally visible in the class hierarchy. +# If set to NO (the default) these classes will be included in the various +# overviews. This option has no effect if EXTRACT_ALL is enabled. + +HIDE_UNDOC_CLASSES = YES + +# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, Doxygen will hide all +# friend (class|struct|union) declarations. +# If set to NO (the default) these declarations will be included in the +# documentation. + +HIDE_FRIEND_COMPOUNDS = YES + +# If the HIDE_IN_BODY_DOCS tag is set to YES, Doxygen will hide any +# documentation blocks found inside the body of a function. +# If set to NO (the default) these blocks will be appended to the +# function's detailed documentation block. + +HIDE_IN_BODY_DOCS = YES + +# The INTERNAL_DOCS tag determines if documentation +# that is typed after a \internal command is included. If the tag is set +# to NO (the default) then the documentation will be excluded. +# Set it to YES to include the internal documentation. + +INTERNAL_DOCS = NO + +# If the CASE_SENSE_NAMES tag is set to NO then Doxygen will only generate +# file names in lower-case letters. If set to YES upper-case letters are also +# allowed. This is useful if you have classes or files whose names only differ +# in case and if your file system supports case sensitive file names. Windows +# and Mac users are advised to set this option to NO. 
+ +CASE_SENSE_NAMES = YES + +# If the HIDE_SCOPE_NAMES tag is set to NO (the default) then Doxygen +# will show members with their full class and namespace scopes in the +# documentation. If set to YES the scope will be hidden. + +HIDE_SCOPE_NAMES = NO + +# If the SHOW_INCLUDE_FILES tag is set to YES (the default) then Doxygen +# will put a list of the files that are included by a file in the documentation +# of that file. + +SHOW_INCLUDE_FILES = YES + +# If the FORCE_LOCAL_INCLUDES tag is set to YES then Doxygen +# will list include files with double quotes in the documentation +# rather than with sharp brackets. + +FORCE_LOCAL_INCLUDES = YES + +# If the INLINE_INFO tag is set to YES (the default) then a tag [inline] +# is inserted in the documentation for inline members. + +INLINE_INFO = YES + +# If the SORT_MEMBER_DOCS tag is set to YES (the default) then doxygen +# will sort the (detailed) documentation of file and class members +# alphabetically by member name. If set to NO the members will appear in +# declaration order. + +SORT_MEMBER_DOCS = YES + +# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the +# brief documentation of file, namespace and class members alphabetically +# by member name. If set to NO (the default) the members will appear in +# declaration order. + +SORT_BRIEF_DOCS = YES + +# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen +# will sort the (brief and detailed) documentation of class members so that +# constructors and destructors are listed first. If set to NO (the default) +# the constructors will appear in the respective orders defined by +# SORT_MEMBER_DOCS and SORT_BRIEF_DOCS. +# This tag will be ignored for brief docs if SORT_BRIEF_DOCS is set to NO +# and ignored for detailed docs if SORT_MEMBER_DOCS is set to NO. + +SORT_MEMBERS_CTORS_1ST = NO + +# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the +# hierarchy of group names into alphabetical order. 
If set to NO (the default) +# the group names will appear in their defined order. + +SORT_GROUP_NAMES = YES + +# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be +# sorted by fully-qualified names, including namespaces. If set to +# NO (the default), the class list will be sorted only by class name, +# not including the namespace part. +# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. +# Note: This option applies only to the class list, not to the +# alphabetical list. + +SORT_BY_SCOPE_NAME = NO + +# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to +# do proper type resolution of all parameters of a function it will reject a +# match between the prototype and the implementation of a member function even +# if there is only one candidate or it is obvious which candidate to choose +# by doing a simple string match. By disabling STRICT_PROTO_MATCHING doxygen +# will still accept a match between prototype and implementation in such cases. + +STRICT_PROTO_MATCHING = NO + +# The GENERATE_TODOLIST tag can be used to enable (YES) or +# disable (NO) the todo list. This list is created by putting \todo +# commands in the documentation. + +GENERATE_TODOLIST = YES + +# The GENERATE_TESTLIST tag can be used to enable (YES) or +# disable (NO) the test list. This list is created by putting \test +# commands in the documentation. + +GENERATE_TESTLIST = YES + +# The GENERATE_BUGLIST tag can be used to enable (YES) or +# disable (NO) the bug list. This list is created by putting \bug +# commands in the documentation. + +GENERATE_BUGLIST = YES + +# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or +# disable (NO) the deprecated list. This list is created by putting +# \deprecated commands in the documentation. + +GENERATE_DEPRECATEDLIST= YES + +# The ENABLED_SECTIONS tag can be used to enable conditional +# documentation sections, marked by \if sectionname ... \endif. 
+ +ENABLED_SECTIONS = + +# The MAX_INITIALIZER_LINES tag determines the maximum number of lines +# the initial value of a variable or macro consists of for it to appear in +# the documentation. If the initializer consists of more lines than specified +# here it will be hidden. Use a value of 0 to hide initializers completely. +# The appearance of the initializer of individual variables and macros in the +# documentation can be controlled using \showinitializer or \hideinitializer +# command in the documentation regardless of this setting. + +MAX_INITIALIZER_LINES = 30 + +# Set the SHOW_USED_FILES tag to NO to disable the list of files generated +# at the bottom of the documentation of classes and structs. If set to YES the +# list will mention the files that were used to generate the documentation. + +SHOW_USED_FILES = YES + +# Set the SHOW_FILES tag to NO to disable the generation of the Files page. +# This will remove the Files entry from the Quick Index and from the +# Folder Tree View (if specified). The default is YES. + +SHOW_FILES = YES + +# Set the SHOW_NAMESPACES tag to NO to disable the generation of the +# Namespaces page. +# This will remove the Namespaces entry from the Quick Index +# and from the Folder Tree View (if specified). The default is YES. + +SHOW_NAMESPACES = NO + +# The FILE_VERSION_FILTER tag can be used to specify a program or script that +# doxygen should invoke to get the current version for each file (typically from +# the version control system). Doxygen will invoke the program by executing (via +# popen()) the command COMMAND INPUT-FILE, where COMMAND is the value of +# the FILE_VERSION_FILTER tag, and INPUT-FILE is the name of an input file +# provided by doxygen. Whatever the program writes to standard output +# is used as the file version. See the manual for examples. + +FILE_VERSION_FILTER = + +# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed +# by doxygen. 
The layout file controls the global structure of the generated +# output files in an output format independent way. To create the layout file +# that represents doxygen's defaults, run doxygen with the -l option. +# You can optionally specify a file name after the option, if omitted +# DoxygenLayout.xml will be used as the name of the layout file. + +LAYOUT_FILE = src/layout.xml + +# The CITE_BIB_FILES tag can be used to specify one or more bib files +# containing the references data. This must be a list of .bib files. The +# .bib extension is automatically appended if omitted. Using this command +# requires the bibtex tool to be installed. See also +# http://en.wikipedia.org/wiki/BibTeX for more info. For LaTeX the style +# of the bibliography can be controlled using LATEX_BIB_STYLE. To use this +# feature you need bibtex and perl available in the search path. + +CITE_BIB_FILES = + +#--------------------------------------------------------------------------- +# configuration options related to warning and progress messages +#--------------------------------------------------------------------------- + +# The QUIET tag can be used to turn on/off the messages that are generated +# by doxygen. Possible values are YES and NO. If left blank NO is used. + +QUIET = NO + +# The WARNINGS tag can be used to turn on/off the warning messages that are +# generated by doxygen. Possible values are YES and NO. If left blank +# NO is used. + +WARNINGS = YES + +# If WARN_IF_UNDOCUMENTED is set to YES, then doxygen will generate warnings +# for undocumented members. If EXTRACT_ALL is set to YES then this flag will +# automatically be disabled. + +WARN_IF_UNDOCUMENTED = YES + +# If WARN_IF_DOC_ERROR is set to YES, doxygen will generate warnings for +# potential errors in the documentation, such as not documenting some +# parameters in a documented function, or documenting parameters that +# don't exist or using markup commands wrongly. 
+ +WARN_IF_DOC_ERROR = YES + +# The WARN_NO_PARAMDOC option can be enabled to get warnings for +# functions that are documented, but have no documentation for their parameters +# or return value. If set to NO (the default) doxygen will only warn about +# wrong or incomplete parameter documentation, but not about the absence of +# documentation. + +WARN_NO_PARAMDOC = NO + +# The WARN_FORMAT tag determines the format of the warning messages that +# doxygen can produce. The string should contain the $file, $line, and $text +# tags, which will be replaced by the file and line number from which the +# warning originated and the warning text. Optionally the format may contain +# $version, which will be replaced by the version of the file (if it could +# be obtained via FILE_VERSION_FILTER) + +WARN_FORMAT = "$file:$line: $text" + +# The WARN_LOGFILE tag can be used to specify a file to which warning +# and error messages should be written. If left blank the output is written +# to stderr. + +WARN_LOGFILE = + +#--------------------------------------------------------------------------- +# configuration options related to the input files +#--------------------------------------------------------------------------- + +# The INPUT tag can be used to specify the files and/or directories that contain +# documented source files. You may enter file names like "myfile.cpp" or +# directories like "/usr/src/myproject". Separate the files or directories +# with spaces. + +INPUT = ../as/src ../cf/src ../modules/common/src ../modules/mod-lua/src + +# This tag can be used to specify the character encoding of the source files +# that doxygen parses. Internally doxygen uses the UTF-8 encoding, which is +# also the default input encoding. Doxygen uses libiconv (or the iconv built +# into libc) for the transcoding. See http://www.gnu.org/software/libiconv for +# the list of possible encodings. 
+ +INPUT_ENCODING = UTF-8 + +# If the value of the INPUT tag contains directories, you can use the +# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp +# and *.h) to filter out the source-files in the directories. If left +# blank the following patterns are tested: +# *.c *.cc *.cxx *.cpp *.c++ *.d *.java *.ii *.ixx *.ipp *.i++ *.inl *.h *.hh +# *.hxx *.hpp *.h++ *.idl *.odl *.cs *.php *.php3 *.inc *.m *.mm *.dox *.py +# *.f90 *.f *.for *.vhd *.vhdl + +FILE_PATTERNS = *.h *.c + +# The RECURSIVE tag can be used to specify whether or not subdirectories +# should be searched for input files as well. Possible values are YES and NO. +# If left blank NO is used. + +RECURSIVE = YES + +# The EXCLUDE tag can be used to specify files and/or directories that should be +# excluded from the INPUT source files. This way you can easily exclude a +# subdirectory from a directory tree whose root is specified with the INPUT tag. +# Note that relative paths are relative to the directory from which doxygen is +# run. + +EXCLUDE = citrusleaf + +# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or +# directories that are symbolic links (a Unix file system feature) are excluded +# from the input. + +EXCLUDE_SYMLINKS = NO + +# If the value of the INPUT tag contains directories, you can use the +# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude +# certain files from those directories. Note that the wildcards are matched +# against the file with absolute path, so to exclude all test directories +# for example use the pattern */test/* + +#EXCLUDE_PATTERNS = + +# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names +# (namespaces, classes, functions, etc.) that should be excluded from the +# output. The symbol name can be a fully qualified name, a word, or if the +# wildcard * is used, a substring. 
Examples: ANamespace, AClass, +# AClass::ANamespace, ANamespace::*Test + +EXCLUDE_SYMBOLS = + +# The EXAMPLE_PATH tag can be used to specify one or more files or +# directories that contain example code fragments that are included (see +# the \include command). + +EXAMPLE_PATH = + +# If the value of the EXAMPLE_PATH tag contains directories, you can use the +# EXAMPLE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp +# and *.h) to filter out the source-files in the directories. If left +# blank all files are included. + +EXAMPLE_PATTERNS = + +# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be +# searched for input files to be used with the \include or \dontinclude +# commands irrespective of the value of the RECURSIVE tag. +# Possible values are YES and NO. If left blank NO is used. + +EXAMPLE_RECURSIVE = NO + +# The IMAGE_PATH tag can be used to specify one or more files or +# directories that contain images that are included in the documentation (see +# the \image command). + +IMAGE_PATH = + +# The INPUT_FILTER tag can be used to specify a program that doxygen should +# invoke to filter for each input file. Doxygen will invoke the filter program +# by executing (via popen()) the command FILTER INPUT-FILE, where FILTER +# is the value of the INPUT_FILTER tag, and INPUT-FILE is the name of an +# input file. Doxygen will then use the output that the filter program writes +# to standard output. +# If FILTER_PATTERNS is specified, this tag will be +# ignored. + +INPUT_FILTER = + +# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern +# basis. +# Doxygen will compare the file name with each pattern and apply the +# filter if there is a match. +# The filters are a list of the form: +# pattern=filter (like *.cpp=my_cpp_filter). See INPUT_FILTER for further +# info on how filters are used. If FILTER_PATTERNS is empty or if +# none of the patterns match the file name, INPUT_FILTER is applied. 
+ +FILTER_PATTERNS = + +# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using +# INPUT_FILTER) will be used to filter the input files when producing source +# files to browse (i.e. when SOURCE_BROWSER is set to YES). + +FILTER_SOURCE_FILES = NO + +# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file +# pattern. A pattern will override the setting for FILTER_PATTERN (if any) +# and it is also possible to disable source filtering for a specific pattern +# using *.ext= (so without naming a filter). This option only has effect when +# FILTER_SOURCE_FILES is enabled. + +FILTER_SOURCE_PATTERNS = + +#--------------------------------------------------------------------------- +# configuration options related to source browsing +#--------------------------------------------------------------------------- + +# If the SOURCE_BROWSER tag is set to YES then a list of source files will +# be generated. Documented entities will be cross-referenced with these sources. +# Note: To get rid of all source code in the generated output, make sure also +# VERBATIM_HEADERS is set to NO. + +SOURCE_BROWSER = YES + +# Setting the INLINE_SOURCES tag to YES will include the body +# of functions and classes directly in the documentation. + +INLINE_SOURCES = NO + +# Setting the STRIP_CODE_COMMENTS tag to YES (the default) will instruct +# doxygen to hide any special comment blocks from generated source code +# fragments. Normal C, C++ and Fortran comments will always remain visible. + +STRIP_CODE_COMMENTS = NO + +# If the REFERENCED_BY_RELATION tag is set to YES +# then for each documented function all documented +# functions referencing it will be listed. + +REFERENCED_BY_RELATION = NO + +# If the REFERENCES_RELATION tag is set to YES +# then for each documented function all documented entities +# called/used by that function will be listed. 
+ +REFERENCES_RELATION = YES + +# If the REFERENCES_LINK_SOURCE tag is set to YES (the default) +# and SOURCE_BROWSER tag is set to YES, then the hyperlinks from +# functions in REFERENCES_RELATION and REFERENCED_BY_RELATION lists will +# link to the source code. +# Otherwise they will link to the documentation. + +REFERENCES_LINK_SOURCE = YES + +# If the USE_HTAGS tag is set to YES then the references to source code +# will point to the HTML generated by the htags(1) tool instead of doxygen +# built-in source browser. The htags tool is part of GNU's global source +# tagging system (see http://www.gnu.org/software/global/global.html). You +# will need version 4.8.6 or higher. + +USE_HTAGS = NO + +# If the VERBATIM_HEADERS tag is set to YES (the default) then Doxygen +# will generate a verbatim copy of the header file for each class for +# which an include is specified. Set to NO to disable this. + +VERBATIM_HEADERS = YES + +#--------------------------------------------------------------------------- +# configuration options related to the alphabetical class index +#--------------------------------------------------------------------------- + +# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index +# of all compounds will be generated. Enable this if the project +# contains a lot of classes, structs, unions or interfaces. + +ALPHABETICAL_INDEX = YES + +# If the alphabetical index is enabled (see ALPHABETICAL_INDEX) then +# the COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns +# in which this list will be split (can be a number in the range [1..20]) + +COLS_IN_ALPHA_INDEX = 5 + +# In case all classes in a project start with a common prefix, all +# classes will be put under the same header in the alphabetical index. +# The IGNORE_PREFIX tag can be used to specify one or more prefixes that +# should be ignored while generating the index headers. 
+ +IGNORE_PREFIX = aerospike_ AEROSPIKE_ as_ AS_ + +#--------------------------------------------------------------------------- +# configuration options related to the HTML output +#--------------------------------------------------------------------------- + +# If the GENERATE_HTML tag is set to YES (the default) Doxygen will +# generate HTML output. + +GENERATE_HTML = YES + +# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. +# If a relative path is entered the value of OUTPUT_DIRECTORY will be +# put in front of it. If left blank `html' will be used as the default path. + +HTML_OUTPUT = html + +# The HTML_FILE_EXTENSION tag can be used to specify the file extension for +# each generated HTML page (for example: .htm,.php,.asp). If it is left blank +# doxygen will generate files with .html extension. + +HTML_FILE_EXTENSION = .html + +# The HTML_HEADER tag can be used to specify a personal HTML header for +# each generated HTML page. If it is left blank doxygen will generate a +# standard header. Note that when using a custom header you are responsible +# for the proper inclusion of any scripts and style sheets that doxygen +# needs, which is dependent on the configuration options used. +# It is advised to generate a default header using "doxygen -w html +# header.html footer.html stylesheet.css YourConfigFile" and then modify +# that header. Note that the header is subject to change so you typically +# have to redo this when upgrading to a newer version of doxygen or when +# changing the value of configuration settings such as GENERATE_TREEVIEW! + +HTML_HEADER = src/header.html + +# The HTML_FOOTER tag can be used to specify a personal HTML footer for +# each generated HTML page. If it is left blank doxygen will generate a +# standard footer. + +HTML_FOOTER = src/footer.html + +# The HTML_STYLESHEET tag can be used to specify a user-defined cascading +# style sheet that is used by each HTML page. 
It can be used to +# fine-tune the look of the HTML output. If the tag is left blank doxygen +# will generate a default style sheet. Note that doxygen will try to copy +# the style sheet file to the HTML output directory, so don't put your own +# style sheet in the HTML output directory as well, or it will be erased! + +HTML_STYLESHEET = src/style.css + +# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or +# other source files which should be copied to the HTML output directory. Note +# that these files will be copied to the base HTML output directory. Use the +# $relpath$ marker in the HTML_HEADER and/or HTML_FOOTER files to load these +# files. In the HTML_STYLESHEET file, use the file name only. Also note that +# the files will be copied as-is; there are no commands or markers available. + +HTML_EXTRA_FILES = + +# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. +# Doxygen will adjust the colors in the style sheet and background images +# according to this color. Hue is specified as an angle on a colorwheel, +# see http://en.wikipedia.org/wiki/Hue for more information. +# For instance the value 0 represents red, 60 is yellow, 120 is green, +# 180 is cyan, 240 is blue, 300 purple, and 360 is red again. +# The allowed range is 0 to 359. + +HTML_COLORSTYLE_HUE = 220 + +# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of +# the colors in the HTML output. For a value of 0 the output will use +# grayscales only. A value of 255 will produce the most vivid colors. + +HTML_COLORSTYLE_SAT = 100 + +# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to +# the luminance component of the colors in the HTML output. Values below +# 100 gradually make the output lighter, whereas values above 100 make +# the output darker. The value divided by 100 is the actual gamma applied, +# so 80 represents a gamma of 0.8, The value 220 represents a gamma of 2.2, +# and 100 does not change the gamma. 
+ +HTML_COLORSTYLE_GAMMA = 80 + +# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML +# page will contain the date and time when the page was generated. Setting +# this to NO can help when comparing the output of multiple runs. + +HTML_TIMESTAMP = YES + +# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML +# documentation will contain sections that can be hidden and shown after the +# page has loaded. + +HTML_DYNAMIC_SECTIONS = NO + +# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of +# entries shown in the various tree structured indices initially; the user +# can expand and collapse entries dynamically later on. Doxygen will expand +# the tree to such a level that at most the specified number of entries are +# visible (unless a fully collapsed tree already exceeds this amount). +# So setting the number of entries 1 will produce a full collapsed tree by +# default. 0 is a special value representing an infinite number of entries +# and will result in a full expanded tree by default. + +HTML_INDEX_NUM_ENTRIES = 100 + +# If the GENERATE_DOCSET tag is set to YES, additional index files +# will be generated that can be used as input for Apple's Xcode 3 +# integrated development environment, introduced with OSX 10.5 (Leopard). +# To create a documentation set, doxygen will generate a Makefile in the +# HTML output directory. Running make will produce the docset in that +# directory and running "make install" will install the docset in +# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find +# it at startup. +# See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html +# for more information. + +GENERATE_DOCSET = NO + +# When GENERATE_DOCSET tag is set to YES, this tag determines the name of the +# feed. A documentation feed provides an umbrella under which multiple +# documentation sets from a single provider (such as a company or product suite) +# can be grouped. 
+ +DOCSET_FEEDNAME = "Aerospike API" + +# When GENERATE_DOCSET tag is set to YES, this tag specifies a string that +# should uniquely identify the documentation set bundle. This should be a +# reverse domain-name style string, e.g. com.mycompany.MyDocSet. Doxygen +# will append .docset to the name. + +DOCSET_BUNDLE_ID = org.doxygen.Project + +# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify +# the documentation publisher. This should be a reverse domain-name style +# string, e.g. com.mycompany.MyDocSet.documentation. + +DOCSET_PUBLISHER_ID = com.aerospike + +# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher. + +DOCSET_PUBLISHER_NAME = Aerospike Inc. + +# If the GENERATE_HTMLHELP tag is set to YES, additional index files +# will be generated that can be used as input for tools like the +# Microsoft HTML help workshop to generate a compiled HTML help file (.chm) +# of the generated HTML documentation. + +GENERATE_HTMLHELP = NO + +# If the GENERATE_HTMLHELP tag is set to YES, the CHM_FILE tag can +# be used to specify the file name of the resulting .chm file. You +# can add a path in front of the file if the result should not be +# written to the html output directory. + +CHM_FILE = + +# If the GENERATE_HTMLHELP tag is set to YES, the HHC_LOCATION tag can +# be used to specify the location (absolute path including file name) of +# the HTML help compiler (hhc.exe). If non-empty doxygen will try to run +# the HTML help compiler on the generated index.hhp. + +HHC_LOCATION = + +# If the GENERATE_HTMLHELP tag is set to YES, the GENERATE_CHI flag +# controls if a separate .chi index file is generated (YES) or that +# it should be included in the master .chm file (NO). + +GENERATE_CHI = NO + +# If the GENERATE_HTMLHELP tag is set to YES, the CHM_INDEX_ENCODING +# is used to encode HtmlHelp index (hhk), content (hhc) and project file +# content. 
+ +CHM_INDEX_ENCODING = + +# If the GENERATE_HTMLHELP tag is set to YES, the BINARY_TOC flag +# controls whether a binary table of contents is generated (YES) or a +# normal table of contents (NO) in the .chm file. + +BINARY_TOC = NO + +# The TOC_EXPAND flag can be set to YES to add extra items for group members +# to the contents of the HTML help documentation and to the tree view. + +TOC_EXPAND = NO + +# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and +# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated +# that can be used as input for Qt's qhelpgenerator to generate a +# Qt Compressed Help (.qch) of the generated HTML documentation. + +GENERATE_QHP = NO + +# If the QHG_LOCATION tag is specified, the QCH_FILE tag can +# be used to specify the file name of the resulting .qch file. +# The path specified is relative to the HTML output folder. + +QCH_FILE = + +# The QHP_NAMESPACE tag specifies the namespace to use when generating +# Qt Help Project output. For more information please see +# http://doc.trolltech.com/qthelpproject.html#namespace + +QHP_NAMESPACE = org.doxygen.Project + +# The QHP_VIRTUAL_FOLDER tag specifies the virtual folder to use when generating +# Qt Help Project output. For more information please see +# http://doc.trolltech.com/qthelpproject.html#virtual-folders + +QHP_VIRTUAL_FOLDER = doc + +# If QHP_CUST_FILTER_NAME is set, it specifies the name of a custom filter to +# add. For more information please see +# http://doc.trolltech.com/qthelpproject.html#custom-filters + +QHP_CUST_FILTER_NAME = + +# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the +# custom filter to add. For more information please see +# http://doc.trolltech.com/qthelpproject.html#custom-filters +# (Qt Help Project / Custom Filters). + +QHP_CUST_FILTER_ATTRS = + +# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this +# project's +# filter section matches. For more information please see +# http://doc.trolltech.com/qthelpproject.html#filter-attributes +# (Qt Help Project / Filter Attributes). 
+ +QHP_SECT_FILTER_ATTRS = + +# If the GENERATE_QHP tag is set to YES, the QHG_LOCATION tag can +# be used to specify the location of Qt's qhelpgenerator. +# If non-empty doxygen will try to run qhelpgenerator on the generated +# .qhp file. + +QHG_LOCATION = + +# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files +# will be generated, which together with the HTML files, form an Eclipse help +# plugin. To install this plugin and make it available under the help contents +# menu in Eclipse, the contents of the directory containing the HTML and XML +# files needs to be copied into the plugins directory of eclipse. The name of +# the directory within the plugins directory should be the same as +# the ECLIPSE_DOC_ID value. After copying Eclipse needs to be restarted before +# the help appears. + +GENERATE_ECLIPSEHELP = YES + +# A unique identifier for the eclipse help plugin. When installing the plugin +# the directory name containing the HTML and XML files should also have +# this name. + +ECLIPSE_DOC_ID = com.aerospike + +# The DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) +# at top of each HTML page. The value NO (the default) enables the index and +# the value YES disables it. Since the tabs have the same information as the +# navigation tree you can set this option to YES if you already set +# GENERATE_TREEVIEW to YES. + +DISABLE_INDEX = NO + +# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index +# structure should be generated to display hierarchical information. +# If the tag value is set to YES, a side panel will be generated +# containing a tree-like index structure (just like the one that +# is generated for HTML Help). For this to work a browser that supports +# JavaScript, DHTML, CSS and frames is required (i.e. any modern browser). +# Windows users are probably better off using the HTML help feature. 
+# Since the tree basically has the same information as the tab index you +# could consider setting DISABLE_INDEX to YES when enabling this option. + +GENERATE_TREEVIEW = NO + +# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values +# (range [0,1..20]) that doxygen will group on one line in the generated HTML +# documentation. Note that a value of 0 will completely suppress the enum +# values from appearing in the overview section. + +ENUM_VALUES_PER_LINE = 4 + +# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be +# used to set the initial width (in pixels) of the frame in which the tree +# is shown. + +TREEVIEW_WIDTH = 250 + +# When the EXT_LINKS_IN_WINDOW option is set to YES doxygen will open +# links to external symbols imported via tag files in a separate window. + +EXT_LINKS_IN_WINDOW = NO + +# Use this tag to change the font size of Latex formulas included +# as images in the HTML documentation. The default is 10. Note that +# when you change the font size after a successful doxygen run you need +# to manually remove any form_*.png images from the HTML output directory +# to force them to be regenerated. + +FORMULA_FONTSIZE = 10 + +# Use the FORMULA_TRANSPARENT tag to determine whether or not the images +# generated for formulas are transparent PNGs. Transparent PNGs are +# not supported properly for IE 6.0, but are supported on all modern browsers. +# Note that when changing this option you need to delete any form_*.png files +# in the HTML output before the changes have effect. + +FORMULA_TRANSPARENT = YES + +# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax +# (see http://www.mathjax.org) which uses client side Javascript for the +# rendering instead of using prerendered bitmaps. Use this if you do not +# have LaTeX installed or if you want formulas to look prettier in the HTML +# output. 
When enabled you may also need to install MathJax separately and +# configure the path to it using the MATHJAX_RELPATH option. + +USE_MATHJAX = NO + +# When MathJax is enabled you need to specify the location relative to the +# HTML output directory using the MATHJAX_RELPATH option. The destination +# directory should contain the MathJax.js script. For instance, if the mathjax +# directory is located at the same level as the HTML output directory, then +# MATHJAX_RELPATH should be ../mathjax. The default value points to +# the MathJax Content Delivery Network so you can quickly see the result without +# installing MathJax. +# However, it is strongly recommended to install a local +# copy of MathJax from http://www.mathjax.org before deployment. + +MATHJAX_RELPATH = http://cdn.mathjax.org/mathjax/latest + +# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax extension +# names that should be enabled during MathJax rendering. + +MATHJAX_EXTENSIONS = + +# When the SEARCHENGINE tag is enabled doxygen will generate a search box +# for the HTML output. The underlying search engine uses javascript +# and DHTML and should work on any modern browser. Note that when using +# HTML help (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets +# (GENERATE_DOCSET) there is already a search function so this one should +# typically be disabled. For large projects the javascript based search engine +# can be slow, then enabling SERVER_BASED_SEARCH may provide a better solution. + +SEARCHENGINE = YES + +# When the SERVER_BASED_SEARCH tag is enabled the search engine will be +# implemented using a PHP enabled web server instead of at the web client +# using Javascript. Doxygen will generate the search PHP script and index +# file to put on the web server. The advantage of the server +# based approach is that it scales better to large projects and allows +# full text search. 
The disadvantages are that it is more difficult to set up +# and does not have live searching capabilities. + +SERVER_BASED_SEARCH = NO + +#--------------------------------------------------------------------------- +# configuration options related to the LaTeX output +#--------------------------------------------------------------------------- + +# If the GENERATE_LATEX tag is set to YES (the default) Doxygen will +# generate Latex output. + +GENERATE_LATEX = NO + +# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put. +# If a relative path is entered the value of OUTPUT_DIRECTORY will be +# put in front of it. If left blank `latex' will be used as the default path. + +LATEX_OUTPUT = latex + +# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be +# invoked. If left blank `latex' will be used as the default command name. +# Note that when enabling USE_PDFLATEX this option is only used for +# generating bitmaps for formulas in the HTML output, but not in the +# Makefile that is written to the output directory. + +LATEX_CMD_NAME = latex + +# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to +# generate index for LaTeX. If left blank `makeindex' will be used as the +# default command name. + +MAKEINDEX_CMD_NAME = makeindex + +# If the COMPACT_LATEX tag is set to YES Doxygen generates more compact +# LaTeX documents. This may be useful for small projects and may help to +# save some trees in general. + +COMPACT_LATEX = NO + +# The PAPER_TYPE tag can be used to set the paper type that is used +# by the printer. Possible values are: a4, letter, legal and +# executive. If left blank a4wide will be used. + +PAPER_TYPE = a4 + +# The EXTRA_PACKAGES tag can be used to specify one or more names of LaTeX +# packages that should be included in the LaTeX output. + +EXTRA_PACKAGES = + +# The LATEX_HEADER tag can be used to specify a personal LaTeX header for +# the generated latex document. 
The header should contain everything until +# the first chapter. If it is left blank doxygen will generate a +# standard header. Notice: only use this tag if you know what you are doing! + +LATEX_HEADER = + +# The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for +# the generated latex document. The footer should contain everything after +# the last chapter. If it is left blank doxygen will generate a +# standard footer. Notice: only use this tag if you know what you are doing! + +LATEX_FOOTER = + +# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated +# is prepared for conversion to pdf (using ps2pdf). The pdf file will +# contain links (just like the HTML output) instead of page references +# This makes the output suitable for online browsing using a pdf viewer. + +PDF_HYPERLINKS = YES + +# If the USE_PDFLATEX tag is set to YES, pdflatex will be used instead of +# plain latex in the generated Makefile. Set this option to YES to get a +# higher quality PDF documentation. + +USE_PDFLATEX = YES + +# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \\batchmode. +# command to the generated LaTeX files. This will instruct LaTeX to keep +# running if errors occur, instead of asking the user for help. +# This option is also used when generating formulas in HTML. + +LATEX_BATCHMODE = NO + +# If LATEX_HIDE_INDICES is set to YES then doxygen will not +# include the index chapters (such as File Index, Compound Index, etc.) +# in the output. + +LATEX_HIDE_INDICES = NO + +# If LATEX_SOURCE_CODE is set to YES then doxygen will include +# source code with syntax highlighting in the LaTeX output. +# Note that which sources are shown also depends on other settings +# such as SOURCE_BROWSER. + +LATEX_SOURCE_CODE = NO + +# The LATEX_BIB_STYLE tag can be used to specify the style to use for the +# bibliography, e.g. plainnat, or ieeetr. The default style is "plain". See +# http://en.wikipedia.org/wiki/BibTeX for more info. 
+ +LATEX_BIB_STYLE = plain + +#--------------------------------------------------------------------------- +# configuration options related to the RTF output +#--------------------------------------------------------------------------- + +# If the GENERATE_RTF tag is set to YES Doxygen will generate RTF output +# The RTF output is optimized for Word 97 and may not look very pretty with +# other RTF readers or editors. + +GENERATE_RTF = NO + +# The RTF_OUTPUT tag is used to specify where the RTF docs will be put. +# If a relative path is entered the value of OUTPUT_DIRECTORY will be +# put in front of it. If left blank `rtf' will be used as the default path. + +RTF_OUTPUT = rtf + +# If the COMPACT_RTF tag is set to YES Doxygen generates more compact +# RTF documents. This may be useful for small projects and may help to +# save some trees in general. + +COMPACT_RTF = NO + +# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated +# will contain hyperlink fields. The RTF file will +# contain links (just like the HTML output) instead of page references. +# This makes the output suitable for online browsing using WORD or other +# programs which support those fields. +# Note: wordpad (write) and others do not support links. + +RTF_HYPERLINKS = NO + +# Load style sheet definitions from file. Syntax is similar to doxygen's +# config file, i.e. a series of assignments. You only have to provide +# replacements, missing definitions are set to their default value. + +RTF_STYLESHEET_FILE = + +# Set optional variables used in the generation of an rtf document. +# Syntax is similar to doxygen's config file. 
+ +RTF_EXTENSIONS_FILE = + +#--------------------------------------------------------------------------- +# configuration options related to the man page output +#--------------------------------------------------------------------------- + +# If the GENERATE_MAN tag is set to YES (the default) Doxygen will +# generate man pages + +GENERATE_MAN = NO + +# The MAN_OUTPUT tag is used to specify where the man pages will be put. +# If a relative path is entered the value of OUTPUT_DIRECTORY will be +# put in front of it. If left blank `man' will be used as the default path. + +MAN_OUTPUT = man + +# The MAN_EXTENSION tag determines the extension that is added to +# the generated man pages (default is the subroutine's section .3) + +MAN_EXTENSION = .3 + +# If the MAN_LINKS tag is set to YES and Doxygen generates man output, +# then it will generate one additional man file for each entity +# documented in the real man page(s). These additional files +# only source the real man page, but without them the man command +# would be unable to find the correct page. The default is NO. + +MAN_LINKS = NO + +#--------------------------------------------------------------------------- +# configuration options related to the XML output +#--------------------------------------------------------------------------- + +# If the GENERATE_XML tag is set to YES Doxygen will +# generate an XML file that captures the structure of +# the code including all documentation. + +GENERATE_XML = NO + +# The XML_OUTPUT tag is used to specify where the XML pages will be put. +# If a relative path is entered the value of OUTPUT_DIRECTORY will be +# put in front of it. If left blank `xml' will be used as the default path. + +XML_OUTPUT = xml + +# The XML_SCHEMA tag can be used to specify an XML schema, +# which can be used by a validating XML parser to check the +# syntax of the XML files. 
+ +XML_SCHEMA = + +# The XML_DTD tag can be used to specify an XML DTD, +# which can be used by a validating XML parser to check the +# syntax of the XML files. + +XML_DTD = + +# If the XML_PROGRAMLISTING tag is set to YES Doxygen will +# dump the program listings (including syntax highlighting +# and cross-referencing information) to the XML output. Note that +# enabling this will significantly increase the size of the XML output. + +XML_PROGRAMLISTING = YES + +#--------------------------------------------------------------------------- +# configuration options for the AutoGen Definitions output +#--------------------------------------------------------------------------- + +# If the GENERATE_AUTOGEN_DEF tag is set to YES Doxygen will +# generate an AutoGen Definitions (see autogen.sf.net) file +# that captures the structure of the code including all +# documentation. Note that this feature is still experimental +# and incomplete at the moment. + +GENERATE_AUTOGEN_DEF = NO + +#--------------------------------------------------------------------------- +# configuration options related to the Perl module output +#--------------------------------------------------------------------------- + +# If the GENERATE_PERLMOD tag is set to YES Doxygen will +# generate a Perl module file that captures the structure of +# the code including all documentation. Note that this +# feature is still experimental and incomplete at the +# moment. + +GENERATE_PERLMOD = NO + +# If the PERLMOD_LATEX tag is set to YES Doxygen will generate +# the necessary Makefile rules, Perl scripts and LaTeX code to be able +# to generate PDF and DVI output from the Perl module output. + +PERLMOD_LATEX = NO + +# If the PERLMOD_PRETTY tag is set to YES the Perl module output will be +# nicely formatted so it can be parsed by a human reader. +# This is useful +# if you want to understand what is going on. 
+# On the other hand, if this +# tag is set to NO the size of the Perl module output will be much smaller +# and Perl will parse it just the same. + +PERLMOD_PRETTY = YES + +# The names of the make variables in the generated doxyrules.make file +# are prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX. +# This is useful so different doxyrules.make files included by the same +# Makefile don't overwrite each other's variables. + +PERLMOD_MAKEVAR_PREFIX = + +#--------------------------------------------------------------------------- +# Configuration options related to the preprocessor +#--------------------------------------------------------------------------- + +# If the ENABLE_PREPROCESSING tag is set to YES (the default) Doxygen will +# evaluate all C-preprocessor directives found in the sources and include +# files. + +ENABLE_PREPROCESSING = YES + +# If the MACRO_EXPANSION tag is set to YES Doxygen will expand all macro +# names in the source code. If set to NO (the default) only conditional +# compilation will be performed. Macro expansion can be done in a controlled +# way by setting EXPAND_ONLY_PREDEF to YES. + +MACRO_EXPANSION = NO + +# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES +# then the macro expansion is limited to the macros specified with the +# PREDEFINED and EXPAND_AS_DEFINED tags. + +EXPAND_ONLY_PREDEF = NO + +# If the SEARCH_INCLUDES tag is set to YES (the default) the includes files +# pointed to by INCLUDE_PATH will be searched when a #include is found. + +SEARCH_INCLUDES = YES + +# The INCLUDE_PATH tag can be used to specify one or more directories that +# contain include files that are not input files but should be processed by +# the preprocessor. + +INCLUDE_PATH = + +# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard +# patterns (like *.h and *.hpp) to filter out the header-files in the +# directories. If left blank, the patterns specified with FILE_PATTERNS will +# be used. 
+ +INCLUDE_FILE_PATTERNS = + +# The PREDEFINED tag can be used to specify one or more macro names that +# are defined before the preprocessor is started (similar to the -D option of +# gcc). The argument of the tag is a list of macros of the form: name +# or name=definition (no spaces). If the definition and the = are +# omitted =1 is assumed. To prevent a macro definition from being +# undefined via #undef or recursively expanded use the := operator +# instead of the = operator. + +PREDEFINED = + +# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then +# this tag can be used to specify a list of macro names that should be expanded. +# The macro definition that is found in the sources will be used. +# Use the PREDEFINED tag if you want to use a different macro definition that +# overrules the definition found in the source code. + +EXPAND_AS_DEFINED = + +# If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then +# doxygen's preprocessor will remove all references to function-like macros +# that are alone on a line, have an all uppercase name, and do not end with a +# semicolon, because these will confuse the parser if not removed. + +SKIP_FUNCTION_MACROS = YES + +#--------------------------------------------------------------------------- +# Configuration::additions related to external references +#--------------------------------------------------------------------------- + +# The TAGFILES option can be used to specify one or more tagfiles. For each +# tag file the location of the external documentation should be added. The +# format of a tag file without this location is as follows: +# +# TAGFILES = file1 file2 ... +# Adding location for the tag files is done as follows: +# +# TAGFILES = file1=loc1 "file2 = loc2" ... +# where "loc1" and "loc2" can be relative or absolute paths +# or URLs. Note that each tag file must have a unique name (where the name does +# NOT include the path). 
If a tag file is not located in the directory in which +# doxygen is run, you must also specify the path to the tagfile here. + +TAGFILES = + +# When a file name is specified after GENERATE_TAGFILE, doxygen will create +# a tag file that is based on the input files it reads. + +GENERATE_TAGFILE = + +# If the ALLEXTERNALS tag is set to YES all external classes will be listed +# in the class index. If set to NO only the inherited external classes +# will be listed. + +ALLEXTERNALS = NO + +# If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed +# in the modules index. If set to NO, only the current project's groups will +# be listed. + +EXTERNAL_GROUPS = YES + +# The PERL_PATH should be the absolute path and name of the perl script +# interpreter (i.e. the result of `which perl'). + +PERL_PATH = /usr/bin/perl + +#--------------------------------------------------------------------------- +# Configuration options related to the dot tool +#--------------------------------------------------------------------------- + +# If the CLASS_DIAGRAMS tag is set to YES (the default) Doxygen will +# generate a inheritance diagram (in HTML, RTF and LaTeX) for classes with base +# or super classes. Setting the tag to NO turns the diagrams off. Note that +# this option also works with HAVE_DOT disabled, but it is recommended to +# install and use dot, since it yields more powerful graphs. + +CLASS_DIAGRAMS = YES + +# You can define message sequence charts within doxygen comments using the \msc +# command. Doxygen will then run the mscgen tool (see +# http://www.mcternan.me.uk/mscgen/) to produce the chart and insert it in the +# documentation. The MSCGEN_PATH tag allows you to specify the directory where +# the mscgen tool resides. If left empty the tool is assumed to be found in the +# default search path. 
+ +MSCGEN_PATH = + +# If set to YES, the inheritance and collaboration graphs will hide +# inheritance and usage relations if the target is undocumented +# or is not a class. + +HIDE_UNDOC_RELATIONS = NO + +# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is +# available from the path. This tool is part of Graphviz, a graph visualization +# toolkit from AT&T and Lucent Bell Labs. The other options in this section +# have no effect if this option is set to NO (the default) + +HAVE_DOT = YES + +# The DOT_NUM_THREADS specifies the number of dot invocations doxygen is +# allowed to run in parallel. When set to 0 (the default) doxygen will +# base this on the number of processors available in the system. You can set it +# explicitly to a value larger than 0 to get control over the balance +# between CPU load and processing speed. + +DOT_NUM_THREADS = 0 + +# By default doxygen will use the Helvetica font for all dot files that +# doxygen generates. When you want a differently looking font you can specify +# the font name using DOT_FONTNAME. You need to make sure dot is able to find +# the font, which can be done by putting it in a standard location or by setting +# the DOTFONTPATH environment variable or by setting DOT_FONTPATH to the +# directory containing the font. + +DOT_FONTNAME = Helvetica + +# The DOT_FONTSIZE tag can be used to set the size of the font of dot graphs. +# The default size is 10pt. + +DOT_FONTSIZE = 10 + +# By default doxygen will tell dot to use the Helvetica font. +# If you specify a different font using DOT_FONTNAME you can use DOT_FONTPATH to +# set the path where dot can find it. + +DOT_FONTPATH = + +# If the CLASS_GRAPH and HAVE_DOT tags are set to YES then doxygen +# will generate a graph for each documented class showing the direct and +# indirect inheritance relations. Setting this tag to YES will force the +# CLASS_DIAGRAMS tag to NO. 
+ +CLASS_GRAPH = YES + +# If the COLLABORATION_GRAPH and HAVE_DOT tags are set to YES then doxygen +# will generate a graph for each documented class showing the direct and +# indirect implementation dependencies (inheritance, containment, and +# class references variables) of the class with other documented classes. + +COLLABORATION_GRAPH = YES + +# If the GROUP_GRAPHS and HAVE_DOT tags are set to YES then doxygen +# will generate a graph for groups, showing the direct groups dependencies + +GROUP_GRAPHS = YES + +# If the UML_LOOK tag is set to YES doxygen will generate inheritance and +# collaboration diagrams in a style similar to the OMG's Unified Modeling +# Language. + +UML_LOOK = YES + +# If the UML_LOOK tag is enabled, the fields and methods are shown inside +# the class node. If there are many fields or methods and many nodes the +# graph may become too big to be useful. The UML_LIMIT_NUM_FIELDS +# threshold limits the number of items for each type to make the size more +# manageable. Set this to 0 for no limit. Note that the threshold may be +# exceeded by 50% before the limit is enforced. + +UML_LIMIT_NUM_FIELDS = 10 + +# If set to YES, the inheritance and collaboration graphs will show the +# relations between templates and their instances. + +TEMPLATE_RELATIONS = NO + +# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDE_GRAPH, and HAVE_DOT +# tags are set to YES then doxygen will generate a graph for each documented +# file showing the direct and indirect include dependencies of the file with +# other documented files. + +INCLUDE_GRAPH = YES + +# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDED_BY_GRAPH, and +# HAVE_DOT tags are set to YES then doxygen will generate a graph for each +# documented header file showing the documented files that directly or +# indirectly include this file. 
+ +INCLUDED_BY_GRAPH = YES + +# If the CALL_GRAPH and HAVE_DOT options are set to YES then +# doxygen will generate a call dependency graph for every global function +# or class method. Note that enabling this option will significantly increase +# the time of a run. So in most cases it will be better to enable call graphs +# for selected functions only using the \callgraph command. + +CALL_GRAPH = YES + +# If the CALLER_GRAPH and HAVE_DOT tags are set to YES then +# doxygen will generate a caller dependency graph for every global function +# or class method. Note that enabling this option will significantly increase +# the time of a run. So in most cases it will be better to enable caller +# graphs for selected functions only using the \callergraph command. + +CALLER_GRAPH = YES + +# If the GRAPHICAL_HIERARCHY and HAVE_DOT tags are set to YES then doxygen +# will generate a graphical hierarchy of all classes instead of a textual one. + +GRAPHICAL_HIERARCHY = YES + +# If the DIRECTORY_GRAPH and HAVE_DOT tags are set to YES +# then doxygen will show the dependencies a directory has on other directories +# in a graphical way. The dependency relations are determined by the #include +# relations between the files in the directories. + +DIRECTORY_GRAPH = YES + +# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images +# generated by dot. Possible values are svg, png, jpg, or gif. +# If left blank png will be used. If you choose svg you need to set +# HTML_FILE_EXTENSION to xhtml in order to make the SVG files +# visible in IE 9+ (other browsers do not have this requirement). + +DOT_IMAGE_FORMAT = png + +# If DOT_IMAGE_FORMAT is set to svg, then this option can be set to YES to +# enable generation of interactive SVG images that allow zooming and panning. +# Note that this requires a modern browser other than Internet Explorer. +# Tested and working are Firefox, Chrome, Safari, and Opera. 
For IE 9+ you +# need to set HTML_FILE_EXTENSION to xhtml in order to make the SVG files +# visible. Older versions of IE do not have SVG support. + +INTERACTIVE_SVG = YES + +# The tag DOT_PATH can be used to specify the path where the dot tool can be +# found. If left blank, it is assumed the dot tool can be found in the path. + +DOT_PATH = + +# The DOTFILE_DIRS tag can be used to specify one or more directories that +# contain dot files that are included in the documentation (see the +# \dotfile command). + +DOTFILE_DIRS = + +# The MSCFILE_DIRS tag can be used to specify one or more directories that +# contain msc files that are included in the documentation (see the +# \mscfile command). + +MSCFILE_DIRS = + +# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of +# nodes that will be shown in the graph. If the number of nodes in a graph +# becomes larger than this value, doxygen will truncate the graph, which is +# visualized by representing a node as a red box. Note that if the +# number of direct children of the root node in a graph is already larger than +# DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note +# that the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH. + +DOT_GRAPH_MAX_NODES = 50 + +# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the +# graphs generated by dot. A depth value of 3 means that only nodes reachable +# from the root by following a path via at most 3 edges will be shown. Nodes +# that lie farther from the root node will be omitted. Note that setting this +# option to 1 or 2 may greatly reduce the computation time needed for large +# code bases. Also note that the size of a graph can be further restricted by +# DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction. + +MAX_DOT_GRAPH_DEPTH = 0 + +# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent +# background. 
This is disabled by default, because dot on Windows does not +# seem to support this out of the box. Warning: Depending on the platform used, +# enabling this option may lead to badly anti-aliased labels on the edges of +# a graph (i.e. they become hard to read). + +DOT_TRANSPARENT = NO + +# Set the DOT_MULTI_TARGETS tag to YES allow dot to generate multiple output +# files in one run (i.e. multiple -o and -T options on the command line). This +# makes dot run faster, but since only newer versions of dot (>1.8.10) +# support this, this feature is disabled by default. + +DOT_MULTI_TARGETS = NO + +# If the GENERATE_LEGEND tag is set to YES (the default) Doxygen will +# generate a legend page explaining the meaning of the various boxes and +# arrows in the dot generated graphs. + +GENERATE_LEGEND = YES + +# If the DOT_CLEANUP tag is set to YES (the default) Doxygen will +# remove the intermediate dot files that are used to generate +# the various graphs. + +DOT_CLEANUP = YES diff --git a/apidocs/src/footer.html b/apidocs/src/footer.html new file mode 100644 index 00000000..80222ad9 --- /dev/null +++ b/apidocs/src/footer.html @@ -0,0 +1,20 @@ + + + + + + + + + diff --git a/apidocs/src/header.html b/apidocs/src/header.html new file mode 100644 index 00000000..f5dbcbb5 --- /dev/null +++ b/apidocs/src/header.html @@ -0,0 +1,52 @@ + + + + + +$projectname: $title +$title + + + +$treeview +$search +$mathjax + + + +
+ + +
+ + + + + + + + + + + + + + + + + + + + + +
+
$projectname +  $projectnumber +
+
$projectbrief
+
+
$projectbrief
+
$searchbox
+
+ + diff --git a/apidocs/src/layout.xml b/apidocs/src/layout.xml new file mode 100644 index 00000000..0e7ea327 --- /dev/null +++ b/apidocs/src/layout.xml @@ -0,0 +1,187 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/apidocs/src/style.css b/apidocs/src/style.css new file mode 100644 index 00000000..403c886a --- /dev/null +++ b/apidocs/src/style.css @@ -0,0 +1,1174 @@ +/* The standard CSS for doxygen */ + +body, table, div, p, dl { + font: 400 11pt Helvetica, sans-serif; +} + +/* @group Heading Levels */ + +h1 { + font-size: 150%; +} + +.title { + font-size: 1.4em; + font-weight: bold; + margin: 10px 2px; +} + +h2 { + border-bottom: 1px solid #879ECB; + color: #354C7B; + font-size: 1.2em; + font-weight: normal; + margin-top: 1.75em; + padding-top: 8px; + padding-bottom: 4px; + width: 100%; +} + +#titlearea * { + background: #FFF; + width: 100%; + font-size: 16pt; +} + +#projectname { + +} + +h3 { + font-size: 100%; +} + +h1, h2, h3, h4, h5, h6 { + -webkit-transition: text-shadow 0.5s linear; + -moz-transition: text-shadow 0.5s linear; + -ms-transition: text-shadow 0.5s linear; + -o-transition: text-shadow 0.5s linear; + transition: text-shadow 0.5s linear; + margin-right: 15px; +} + +h1.glow, h2.glow, h3.glow, h4.glow, h5.glow, h6.glow { + text-shadow: 0 0 15px cyan; +} + +dt { + font-weight: bold; +} + +div.multicol { + -moz-column-gap: 1em; + -webkit-column-gap: 1em; + -moz-column-count: 3; + -webkit-column-count: 3; +} + +p.startli, p.startdd, p.starttd { + margin-top: 2px; +} + +p.endli { + margin-bottom: 0px; +} + +p.enddd { + margin-bottom: 4px; +} + +p.endtd { + margin-bottom: 2px; +} + +/* @end */ + +caption { + 
font-weight: bold; +} + +span.legend { + font-size: 70%; + text-align: center; +} + +h3.version { + font-size: 90%; + text-align: center; +} + +div.qindex, div.navtab{ + background-color: #EBEFF6; + border: 1px solid #A3B4D7; + text-align: center; +} + +div.qindex, div.navpath { + width: 100%; + line-height: 140%; +} + +div.navtab { + margin-right: 15px; +} + +/* @group Link Styling */ + +a { + color: #3D578C; + font-weight: normal; + text-decoration: none; +} + +.contents a:visited { + color: #4665A2; +} + +a:hover { + text-decoration: underline; +} + +a.qindex { + font-weight: bold; +} + +a.qindexHL { + font-weight: bold; + background-color: #9CAFD4; + color: #ffffff; + border: 1px double #869DCA; +} + +.contents a.qindexHL:visited { + color: #ffffff; +} + +a.el { + font-weight: bold; +} + +a.elRef { +} + +a.code, a.code:visited { + color: #4665A2; +} + +a.codeRef, a.codeRef:visited { + color: #4665A2; +} + +/* @end */ + +dl.el { + margin-left: -1cm; +} + +pre.fragment { + border: 1px solid #C4CFE5; + background-color: #FBFCFD; + padding: 4px 6px; + margin: 4px 8px 4px 2px; + overflow: auto; + word-wrap: break-word; + font-size: 9pt; + line-height: 125%; + font-family: monospace, fixed; + font-size: 105%; +} + +div.fragment { + padding: 4px; + margin: 4px; + background-color: #FBFCFD; + border: 1px solid #C4CFE5; +} + +div.line { + font-family: monospace, fixed; + font-size: 13px; + min-height: 13px; + line-height: 1.0; + text-wrap: unrestricted; + white-space: -moz-pre-wrap; /* Moz */ + white-space: -pre-wrap; /* Opera 4-6 */ + white-space: -o-pre-wrap; /* Opera 7 */ + white-space: pre-wrap; /* CSS3 */ + word-wrap: break-word; /* IE 5.5+ */ + text-indent: -53px; + padding-left: 53px; + padding-bottom: 0px; + margin: 0px; + -webkit-transition-property: background-color, box-shadow; + -webkit-transition-duration: 0.5s; + -moz-transition-property: background-color, box-shadow; + -moz-transition-duration: 0.5s; + -ms-transition-property: background-color, 
box-shadow; + -ms-transition-duration: 0.5s; + -o-transition-property: background-color, box-shadow; + -o-transition-duration: 0.5s; + transition-property: background-color, box-shadow; + transition-duration: 0.5s; +} + +div.line.glow { + background-color: cyan; + box-shadow: 0 0 10px cyan; +} + + +span.lineno { + padding-right: 4px; + text-align: right; + border-right: 2px solid #0F0; + background-color: #E8E8E8; + white-space: pre; +} +span.lineno a { + background-color: #D8D8D8; +} + +span.lineno a:hover { + background-color: #C8C8C8; +} + +div.ah { + background-color: black; + font-weight: bold; + color: #ffffff; + margin-bottom: 3px; + margin-top: 3px; + padding: 0.2em; + border: solid thin #333; + border-radius: 0.5em; + -webkit-border-radius: .5em; + -moz-border-radius: .5em; + box-shadow: 2px 2px 3px #999; + -webkit-box-shadow: 2px 2px 3px #999; + -moz-box-shadow: rgba(0, 0, 0, 0.15) 2px 2px 2px; + background-image: -webkit-gradient(linear, left top, left bottom, from(#eee), to(#000),color-stop(0.3, #444)); + background-image: -moz-linear-gradient(center top, #eee 0%, #444 40%, #000); +} + +div.groupHeader { + margin-left: 16px; + margin-top: 12px; + font-weight: bold; +} + +div.groupText { + margin-left: 16px; + font-style: italic; +} + +body { + background-color: white; + color: black; + margin: 0; +} + +div.contents { + margin-top: 10px; + margin-left: 12px; + margin-right: 8px; +} + +td.indexkey { + background-color: #EBEFF6; + font-weight: bold; + border: 1px solid #C4CFE5; + margin: 2px 0px 2px 0; + padding: 2px 10px; + white-space: nowrap; + vertical-align: top; +} + +td.indexvalue { + background-color: #EBEFF6; + border: 1px solid #C4CFE5; + padding: 2px 10px; + margin: 2px 0px; +} + +tr.memlist { + background-color: #EEF1F7; +} + +p.formulaDsp { + text-align: center; +} + +img.formulaDsp { + +} + +img.formulaInl { + vertical-align: middle; +} + +div.center { + text-align: center; + margin-top: 0px; + margin-bottom: 0px; + padding: 0px; +} + 
+div.center img { + border: 0px; +} + +address.footer { + text-align: right; + padding-right: 12px; +} + +img.footer { + border: 0px; + vertical-align: middle; +} + +/* @group Code Colorization */ + +span.keyword { + color: #008000 +} + +span.keywordtype { + color: #604020 +} + +span.keywordflow { + color: #e08000 +} + +span.comment { + color: #800000 +} + +span.preprocessor { + color: #806020 +} + +span.stringliteral { + color: #002080 +} + +span.charliteral { + color: #008080 +} + +span.vhdldigit { + color: #ff00ff +} + +span.vhdlchar { + color: #000000 +} + +span.vhdlkeyword { + color: #700070 +} + +span.vhdllogic { + color: #ff0000 +} + +blockquote { + background-color: #F7F8FB; + border-left: 2px solid #9CAFD4; + margin: 0 24px 0 4px; + padding: 0 12px 0 16px; +} + +/* @end */ + +/* +.search { + color: #003399; + font-weight: bold; +} + +form.search { + margin-bottom: 0px; + margin-top: 0px; +} + +input.search { + font-size: 75%; + color: #000080; + font-weight: normal; + background-color: #e8eef2; +} +*/ + +td.tiny { + font-size: 75%; +} + +.dirtab { + padding: 4px; + border-collapse: collapse; + border: 1px solid #A3B4D7; +} + +th.dirtab { + background: #EBEFF6; + font-weight: bold; +} + +hr { + height: 0px; + border: none; + border-top: 1px solid #4A6AAA; +} + +hr.footer { + height: 1px; +} + +/* @group Member Descriptions */ + +table.memberdecls { + border-spacing: 0px; + padding: 0px; +} + +.memberdecls td, .fieldtable tr { + -webkit-transition-property: background-color, box-shadow; + -webkit-transition-duration: 0.5s; + -moz-transition-property: background-color, box-shadow; + -moz-transition-duration: 0.5s; + -ms-transition-property: background-color, box-shadow; + -ms-transition-duration: 0.5s; + -o-transition-property: background-color, box-shadow; + -o-transition-duration: 0.5s; + transition-property: background-color, box-shadow; + transition-duration: 0.5s; +} + +.memberdecls td.glow, .fieldtable tr.glow { + background-color: cyan; + box-shadow: 0 
0 15px cyan; +} + +.mdescLeft, .mdescRight, +.memItemLeft, .memItemRight, +.memTemplItemLeft, .memTemplItemRight, .memTemplParams { + background-color: #F9FAFC; + border: none; + margin: 4px; + padding: 1px 0 0 8px; +} + +.mdescLeft, .mdescRight { + padding: 0px 8px 4px 8px; + color: #555; +} + +.memItemLeft, .memItemRight, .memTemplParams { + border-bottom: 1px solid #DEE4F0; +} + +.memItemLeft, .memTemplItemLeft { + white-space: nowrap; +} + +.memItemRight { + width: 100%; +} + +.memTemplParams { + color: #4665A2; + white-space: nowrap; +} + +/* @end */ + +/* @group Member Details */ + +/* Styles for detailed member documentation */ + +.memtemplate { + font-size: 80%; + color: #4665A2; + font-weight: normal; + margin-left: 9px; +} + +.memnav { + background-color: #EBEFF6; + border: 1px solid #A3B4D7; + text-align: center; + margin: 2px; + margin-right: 15px; + padding: 2px; +} + +.mempage { + width: 100%; +} + +.memitem { + padding: 0; + margin-bottom: 10px; + margin-right: 5px; + -webkit-transition: box-shadow 0.5s linear; + -moz-transition: box-shadow 0.5s linear; + -ms-transition: box-shadow 0.5s linear; + -o-transition: box-shadow 0.5s linear; + transition: box-shadow 0.5s linear; + display: table !important; + width: 100%; +} + +.memitem.glow { + box-shadow: 0 0 15px cyan; +} + +.memname { + font-weight: bold; + margin-left: 6px; +} + +.memname td { + vertical-align: bottom; +} + +.memproto, dl.reflist dt { + border-top: 1px solid #A8B8D9; + border-left: 1px solid #A8B8D9; + border-right: 1px solid #A8B8D9; + padding: 6px 0px 6px 0px; + color: #253555; + font-weight: bold; + text-shadow: 0px 1px 1px rgba(255, 255, 255, 0.9); + background-image:url('nav_f.png'); + background-repeat:repeat-x; + background-color: #E2E8F2; + /* opera specific markup */ + box-shadow: 5px 5px 5px rgba(0, 0, 0, 0.15); + border-top-right-radius: 4px; + border-top-left-radius: 4px; + /* firefox specific markup */ + -moz-box-shadow: rgba(0, 0, 0, 0.15) 5px 5px 5px; + 
-moz-border-radius-topright: 4px; + -moz-border-radius-topleft: 4px; + /* webkit specific markup */ + -webkit-box-shadow: 5px 5px 5px rgba(0, 0, 0, 0.15); + -webkit-border-top-right-radius: 4px; + -webkit-border-top-left-radius: 4px; + +} + +.memdoc, dl.reflist dd { + border-bottom: 1px solid #A8B8D9; + border-left: 1px solid #A8B8D9; + border-right: 1px solid #A8B8D9; + padding: 6px 10px 2px 10px; + background-color: #FBFCFD; + border-top-width: 0; + background-image:url('nav_g.png'); + background-repeat:repeat-x; + background-color: #FFFFFF; + /* opera specific markup */ + border-bottom-left-radius: 4px; + border-bottom-right-radius: 4px; + box-shadow: 5px 5px 5px rgba(0, 0, 0, 0.15); + /* firefox specific markup */ + -moz-border-radius-bottomleft: 4px; + -moz-border-radius-bottomright: 4px; + -moz-box-shadow: rgba(0, 0, 0, 0.15) 5px 5px 5px; + /* webkit specific markup */ + -webkit-border-bottom-left-radius: 4px; + -webkit-border-bottom-right-radius: 4px; + -webkit-box-shadow: 5px 5px 5px rgba(0, 0, 0, 0.15); +} + +dl.reflist dt { + padding: 5px; +} + +dl.reflist dd { + margin: 0px 0px 10px 0px; + padding: 5px; +} + +.paramkey { + text-align: right; +} + +.paramtype { + white-space: nowrap; +} + +.paramname { + color: #602020; + white-space: nowrap; +} +.paramname em { + font-style: normal; +} +.paramname code { + line-height: 14px; +} + +.params, .retval, .exception, .tparams { + margin-left: 0px; + padding-left: 0px; +} + +.params .paramname, .retval .paramname { + font-weight: bold; + vertical-align: top; +} + +.params .paramtype { + font-style: italic; + vertical-align: top; +} + +.params .paramdir { + font-family: "courier new",courier,monospace; + vertical-align: top; +} + +table.mlabels { + border-spacing: 0px; +} + +td.mlabels-left { + width: 100%; + padding: 0px; +} + +td.mlabels-right { + vertical-align: bottom; + padding: 0px; + white-space: nowrap; +} + +span.mlabels { + margin-left: 8px; +} + +span.mlabel { + background-color: #728DC1; + 
border-top:1px solid #5373B4; + border-left:1px solid #5373B4; + border-right:1px solid #C4CFE5; + border-bottom:1px solid #C4CFE5; + text-shadow: none; + color: white; + margin-right: 4px; + padding: 2px 3px; + border-radius: 3px; + font-size: 7pt; + white-space: nowrap; +} + + + +/* @end */ + +/* these are for tree view when not used as main index */ + +div.directory { + margin: 10px 0px; + border-top: 1px solid #A8B8D9; + border-bottom: 1px solid #A8B8D9; + width: 100%; +} + +.directory table { + border-collapse:collapse; +} + +.directory td { + margin: 0px; + padding: 0px; + vertical-align: top; +} + +.directory td.entry { + white-space: nowrap; + padding-right: 6px; +} + +.directory td.entry a { + outline:none; +} + +.directory td.entry a img { + border: none; +} + +.directory td.desc { + width: 100%; + padding-left: 6px; + padding-right: 6px; + padding-top: 3px; + border-left: 1px solid rgba(0,0,0,0.05); +} + +.directory tr.even { + padding-left: 6px; + background-color: #F7F8FB; +} + +.directory img { + vertical-align: -30%; +} + +.directory .levels { + white-space: nowrap; + width: 100%; + text-align: right; + font-size: 9pt; +} + +.directory .levels span { + cursor: pointer; + padding-left: 2px; + padding-right: 2px; + color: #3D578C; +} + +div.dynheader { + margin-top: 8px; + -webkit-touch-callout: none; + -webkit-user-select: none; + -khtml-user-select: none; + -moz-user-select: none; + -ms-user-select: none; + user-select: none; +} + +address { + font-style: normal; + color: #2A3D61; +} + +table.doxtable { + border-collapse:collapse; + margin-top: 4px; + margin-bottom: 4px; +} + +table.doxtable td, table.doxtable th { + border: 1px solid #2D4068; + padding: 3px 7px 2px; +} + +table.doxtable th { + background-color: #374F7F; + color: #FFFFFF; + font-size: 110%; + padding-bottom: 4px; + padding-top: 5px; +} + +table.fieldtable { + width: 100%; + margin-bottom: 10px; + border: 1px solid #A8B8D9; + border-spacing: 0px; + -moz-border-radius: 4px; + 
-webkit-border-radius: 4px; + border-radius: 4px; + -moz-box-shadow: rgba(0, 0, 0, 0.15) 2px 2px 2px; + -webkit-box-shadow: 2px 2px 2px rgba(0, 0, 0, 0.15); + box-shadow: 2px 2px 2px rgba(0, 0, 0, 0.15); +} + +.fieldtable td, .fieldtable th { + padding: 3px 7px 2px; +} + +.fieldtable td.fieldtype, .fieldtable td.fieldname { + white-space: nowrap; + border-right: 1px solid #A8B8D9; + border-bottom: 1px solid #A8B8D9; + vertical-align: top; +} + +.fieldtable td.fielddoc { + border-bottom: 1px solid #A8B8D9; + width: 100%; +} + +.fieldtable tr:last-child td { + border-bottom: none; +} + +.fieldtable th { + background-image:url('nav_f.png'); + background-repeat:repeat-x; + background-color: #E2E8F2; + font-size: 90%; + color: #253555; + padding-bottom: 4px; + padding-top: 5px; + text-align:left; + -moz-border-radius-topleft: 4px; + -moz-border-radius-topright: 4px; + -webkit-border-top-left-radius: 4px; + -webkit-border-top-right-radius: 4px; + border-top-left-radius: 4px; + border-top-right-radius: 4px; + border-bottom: 1px solid #A8B8D9; +} + + +.tabsearch { + top: 0px; + left: 10px; + height: 36px; + background-image: url('tab_b.png'); + z-index: 101; + overflow: hidden; + font-size: 13px; +} + +.navpath ul +{ + font-size: 11px; + background-image:url('tab_b.png'); + background-repeat:repeat-x; + height:30px; + line-height:30px; + color:#8AA0CC; + border:solid 1px #C2CDE4; + overflow:hidden; + margin:0px; + padding:0px; +} + +.navpath li +{ + list-style-type:none; + float:left; + padding-left:10px; + padding-right:15px; + background-image:url('bc_s.png'); + background-repeat:no-repeat; + background-position:right; + color:#364D7C; +} + +.navpath li.navelem a +{ + height:32px; + display:block; + text-decoration: none; + outline: none; + font-family: 'Lucida Grande',Geneva,Helvetica,Arial,sans-serif; +} + +.navpath li.navelem a:hover +{ + color:#6884BD; +} + +.navpath li.footer +{ + list-style-type:none; + float:right; + padding-left:10px; + padding-right:15px; + 
background-image:none; + background-repeat:no-repeat; + background-position:right; + color:#364D7C; + font-size: 8pt; +} + + +div.summary +{ + float: right; + font-size: 8pt; + padding-right: 5px; + width: 50%; + text-align: right; +} + +div.summary a +{ + white-space: nowrap; +} + +div.ingroups +{ + font-size: 8pt; + width: 50%; + text-align: left; +} + +div.ingroups a +{ + white-space: nowrap; +} + +div.header +{ + background-image:url('nav_h.png'); + background-repeat:repeat-x; + background-color: #F9FAFC; + margin: 0px; + border-bottom: 1px solid #C4CFE5; +} + +div.headertitle +{ + padding: 5px 5px 5px 10px; +} + +dl +{ + padding: 0 0 0 10px; +} + +/* dl.note, dl.warning, dl.attention, dl.pre, dl.post, dl.invariant, dl.deprecated, dl.todo, dl.test, dl.bug */ +dl.section +{ + margin-left: 0px; + padding-left: 0px; +} + +dl.note +{ + margin-left:-7px; + padding-left: 3px; + border-left:4px solid; + border-color: #D0C000; +} + +dl.warning, dl.attention +{ + margin-left:-7px; + padding-left: 3px; + border-left:4px solid; + border-color: #FF0000; +} + +dl.pre, dl.post, dl.invariant +{ + margin-left:-7px; + padding-left: 3px; + border-left:4px solid; + border-color: #00D000; +} + +dl.deprecated +{ + margin-left:-7px; + padding-left: 3px; + border-left:4px solid; + border-color: #505050; +} + +dl.todo +{ + margin-left:-7px; + padding-left: 3px; + border-left:4px solid; + border-color: #00C0E0; +} + +dl.test +{ + margin-left:-7px; + padding-left: 3px; + border-left:4px solid; + border-color: #3030E0; +} + +dl.bug +{ + margin-left:-7px; + padding-left: 3px; + border-left:4px solid; + border-color: #C08050; +} + +dl.section dd { + margin-bottom: 6px; +} + + +#projectlogo +{ + text-align: center; + vertical-align: bottom; + border-collapse: separate; +} + +#projectlogo img +{ + border: 0px none; +} + +#projectname +{ + font-size: 1.5em; + font-weight: 600; + margin: 0px; + padding: 2px 0px; +} + +#projectbrief +{ + font-size: 1.2em; + margin: 0px; + padding: 0px; +} + 
+#projectnumber +{ + font-size: 1em; + margin: 0px; + padding: 0px; +} + +#titlearea +{ + padding: 0px; + margin: 0px; + width: 100%; + border-bottom: 1px solid #5373B4; +} + +.image +{ + text-align: center; +} + +.dotgraph +{ + text-align: center; +} + +.mscgraph +{ + text-align: center; +} + +.caption +{ + font-weight: bold; +} + +div.zoom +{ + border: 1px solid #90A5CE; +} + +dl.citelist { + margin-bottom:50px; +} + +dl.citelist dt { + color:#334975; + float:left; + font-weight:bold; + margin-right:10px; + padding:5px; +} + +dl.citelist dd { + margin:2px 0; + padding:5px 0; +} + +div.toc { + padding: 14px 25px; + background-color: #F4F6FA; + border: 1px solid #D8DFEE; + border-radius: 7px 7px 7px 7px; + float: right; + height: auto; + margin: 0 20px 10px 10px; + width: 200px; +} + +div.toc li { + background: url("bdwn.png") no-repeat scroll 0 5px transparent; + font: 10px/1.2 Verdana,DejaVu Sans,Geneva,sans-serif; + margin-top: 5px; + padding-left: 10px; + padding-top: 2px; +} + +div.toc h3 { + font: bold 12px/1.2 Arial,FreeSans,sans-serif; + color: #4665A2; + border-bottom: 0 none; + margin: 0; +} + +div.toc ul { + list-style: none outside none; + border: medium none; + padding: 0px; +} + +div.toc li.level1 { + margin-left: 0px; +} + +div.toc li.level2 { + margin-left: 15px; +} + +div.toc li.level3 { + margin-left: 30px; +} + +div.toc li.level4 { + margin-left: 45px; +} + +.inherit_header { + font-weight: bold; + color: gray; + cursor: pointer; + -webkit-touch-callout: none; + -webkit-user-select: none; + -khtml-user-select: none; + -moz-user-select: none; + -ms-user-select: none; + user-select: none; +} + +.inherit_header td { + padding: 6px 0px 2px 5px; +} + +.inherit { + display: none; +} + +tr.heading h2 { + margin-top: 12px; + margin-bottom: 4px; +} + +@media print +{ + #top { display: none; } + #side-nav { display: none; } + #nav-path { display: none; } + body { overflow:visible; } + h1, h2, h3, h4, h5, h6 { page-break-after: avoid; } + .summary { 
display: none; } + .memitem { page-break-inside: avoid; } + #doc-content + { + margin-left:0 !important; + height:auto !important; + width:auto !important; + overflow:inherit; + display:inline; + } +} + diff --git a/as/Makefile b/as/Makefile new file mode 100644 index 00000000..e914d9ec --- /dev/null +++ b/as/Makefile @@ -0,0 +1,9 @@ +# Aerospike Server +# Makefile + +.PHONY: default +default: all + @echo "done." + +%: + $(MAKE) -C src $@ diff --git a/as/etc/README.sample.conf.md b/as/etc/README.sample.conf.md new file mode 100644 index 00000000..97909cb1 --- /dev/null +++ b/as/etc/README.sample.conf.md @@ -0,0 +1,15 @@ +# Aerospike Server Sample Configuration Files + +This directory contains sample Aerospike Server configuration files for +various use cases. + +To use a sample configuration, first copy the appropriate file to be +`/etc/aerospike/aerospike.conf`, and then modify it for your particular +environment and use case. + +## List of Sample Configuration Files + +| Filename | Description | +| ------------------- | -------------------------------------------------- | +| aerospike_mesh.conf | Sample using TCP mesh for clustering | +| aerospike_ssd.conf | Sample using SSD devices for storage | diff --git a/as/etc/aerospike-server.sysconfig b/as/etc/aerospike-server.sysconfig new file mode 100644 index 00000000..204b3160 --- /dev/null +++ b/as/etc/aerospike-server.sysconfig @@ -0,0 +1,4 @@ +ASD_CONFIG_FILE=/etc/aerospike/aerospike.conf + +# Uncomment to start with cold start +#ASD_COLDSTART="--cold-start" diff --git a/as/etc/aerospike-server.tmpfiles b/as/etc/aerospike-server.tmpfiles new file mode 100644 index 00000000..e5f83b62 --- /dev/null +++ b/as/etc/aerospike-server.tmpfiles @@ -0,0 +1 @@ +d /run/aerospike 0755 aerospike aerospike - diff --git a/as/etc/aerospike.conf b/as/etc/aerospike.conf new file mode 100644 index 00000000..f3731133 --- /dev/null +++ b/as/etc/aerospike.conf @@ -0,0 +1,67 @@ +# Aerospike database configuration file. 
+ +service { + user root + group root + paxos-single-replica-limit 1 # Number of nodes where the replica count is automatically reduced to 1. + pidfile /var/run/aerospike/asd.pid + proto-fd-max 15000 +} + +logging { + # Log file must be an absolute path. + file /var/log/aerospike/aerospike.log { + context any info + } +} + +network { + service { + address any + port 3000 + } + + heartbeat { + mode multicast + multicast-group 239.1.99.222 + port 9918 + + # To use unicast-mesh heartbeats, remove the 3 lines above, and see + # aerospike_mesh.conf for alternative. + + interval 150 + timeout 10 + } + + fabric { + port 3001 + } + + info { + port 3003 + } +} + +namespace test { + replication-factor 2 + memory-size 4G + default-ttl 30d # 30 days, use 0 to never expire/evict. + + storage-engine memory +} + +namespace bar { + replication-factor 2 + memory-size 4G + default-ttl 30d # 30 days, use 0 to never expire/evict. + + storage-engine memory + + # To use file storage backing, comment out the line above and use the + # following lines instead. +# storage-engine device { +# file /opt/aerospike/data/bar.dat +# filesize 16G +# data-in-memory true # Store data in memory in addition to file. 
+# } +} diff --git a/as/etc/aerospike.service.d/aerospike.conf b/as/etc/aerospike.service.d/aerospike.conf new file mode 100644 index 00000000..e69de29b diff --git a/as/etc/aerospike.service.d/aerospike.conf.coldstart b/as/etc/aerospike.service.d/aerospike.conf.coldstart new file mode 100644 index 00000000..07dfca62 --- /dev/null +++ b/as/etc/aerospike.service.d/aerospike.conf.coldstart @@ -0,0 +1,2 @@ +[Service] +Environment="ASD_OPTIONS=--cold-start" diff --git a/as/etc/aerospike.service.d/aerospike.conf.default b/as/etc/aerospike.service.d/aerospike.conf.default new file mode 100644 index 00000000..e69de29b diff --git a/as/etc/aerospike.service.head b/as/etc/aerospike.service.head new file mode 100644 index 00000000..da04d6bc --- /dev/null +++ b/as/etc/aerospike.service.head @@ -0,0 +1,14 @@ +[Unit] +Description=Aerospike Server +After=network.target +Wants=network.target + +[Service] +LimitNOFILE=100000 +TimeoutSec=15 +User=root +Group=root +EnvironmentFile=/etc/sysconfig/aerospike +PermissionsStartOnly=True +ExecStartPre=/usr/bin/asd-systemd-helper +ExecStart=/usr/bin/asd $ASD_OPTIONS --config-file $ASD_CONFIG_FILE --fgdaemon diff --git a/as/etc/aerospike.service.tail b/as/etc/aerospike.service.tail new file mode 100644 index 00000000..140e4113 --- /dev/null +++ b/as/etc/aerospike.service.tail @@ -0,0 +1,3 @@ + +[Install] +WantedBy=multi-user.target diff --git a/as/etc/aerospike.service.telemetry b/as/etc/aerospike.service.telemetry new file mode 100644 index 00000000..62362af1 --- /dev/null +++ b/as/etc/aerospike.service.telemetry @@ -0,0 +1,2 @@ +ExecStartPre=-/bin/systemctl start aerospike_telemetry +ExecStopPost=-/bin/systemctl stop aerospike_telemetry diff --git a/as/etc/aerospike_dev.conf b/as/etc/aerospike_dev.conf new file mode 100644 index 00000000..1715d249 --- /dev/null +++ b/as/etc/aerospike_dev.conf @@ -0,0 +1,81 @@ +# Aerospike database developer configuration file. 
+ +service { + run-as-daemon false # To work with gdb, and make console logging visible. + paxos-single-replica-limit 1 # Number of nodes where the replica count is automatically reduced to 1. + + # The number of concurrent connections to the database is limited by + # proto-fd-max, and by the system's maximum number of open file descriptors. + # See "man limits.conf" for how to set the system's "nofile" limit. + proto-fd-max 1024 + + work-directory run/work + pidfile run/asd.pid +} + +mod-lua { + user-path run/work/usr/udf/lua + system-path run/work/sys/udf/lua +} + +logging { + # Log file path is relative here, like the other run/ paths in this developer config. + file run/log/aerospike.log { + context any info + } + + console { + context any info + } +} + +network { + service { + address any + port 3000 + } + + heartbeat { + mode multicast + multicast-group 239.1.99.222 + port 9918 + + # To use unicast-mesh heartbeats, remove the 3 lines above, and see + # aerospike_mesh.conf for alternative. + + interval 150 + timeout 10 + } + + fabric { + port 3001 + } + + info { + port 3003 + } +} + +namespace test { + replication-factor 2 + memory-size 4G + default-ttl 30d # 30 days, use 0 to never expire/evict. + + storage-engine memory +} + +namespace bar { + replication-factor 2 + memory-size 4G + default-ttl 30d # 30 days, use 0 to never expire/evict. + + storage-engine memory + + # To use file storage backing, comment out the line above and use the + # following lines instead. +# storage-engine device { +# file /opt/aerospike/data/bar.dat +# filesize 16G +# data-in-memory true # Store data in memory in addition to file. +# } +} diff --git a/as/etc/aerospike_mesh.conf b/as/etc/aerospike_mesh.conf new file mode 100644 index 00000000..5dbc147f --- /dev/null +++ b/as/etc/aerospike_mesh.conf @@ -0,0 +1,70 @@ +# Aerospike database configuration file for deployments using mesh heartbeats. 
+ +service { + user root + group root + paxos-single-replica-limit 1 # Number of nodes where the replica count is automatically reduced to 1. + pidfile /var/run/aerospike/asd.pid + proto-fd-max 15000 +} + +logging { + # Log file must be an absolute path. + file /var/log/aerospike/aerospike.log { + context any info + } +} + +network { + service { + address any + port 3000 + } + + heartbeat { + mode mesh + port 3002 # Heartbeat port for this node. + + # List one or more other nodes, one ip-address & port per line: + mesh-seed-address-port 10.10.10.10 3002 +# mesh-seed-address-port 10.10.10.11 3002 +# mesh-seed-address-port 10.10.10.12 3002 +# mesh-seed-address-port 10.10.10.13 3002 +# mesh-seed-address-port 10.10.10.14 3002 + + interval 250 + timeout 10 + } + + fabric { + port 3001 + } + + info { + port 3003 + } +} + +namespace test { + replication-factor 2 + memory-size 4G + default-ttl 30d # 30 days, use 0 to never expire/evict. + + storage-engine memory +} + +namespace bar { + replication-factor 2 + memory-size 4G + default-ttl 30d # 30 days, use 0 to never expire/evict. + + storage-engine memory + + # To use file storage backing, comment out the line above and use the + # following lines instead. +# storage-engine device { +# file /opt/aerospike/data/bar.dat +# filesize 16G +# data-in-memory true # Store data in memory in addition to file. +# } +} diff --git a/as/etc/aerospike_mesh_systemd.conf b/as/etc/aerospike_mesh_systemd.conf new file mode 100644 index 00000000..4c5b6046 --- /dev/null +++ b/as/etc/aerospike_mesh_systemd.conf @@ -0,0 +1,66 @@ +# Aerospike database configuration file for deployments using mesh heartbeats with systemd. + +service { + paxos-single-replica-limit 1 # Number of nodes where the replica count is automatically reduced to 1. + proto-fd-max 15000 +} + +logging { + console { + context any info + } +} + +network { + service { + address any + port 3000 + } + + heartbeat { + mode mesh + port 3002 # Heartbeat port for this node. 
+ + # List one or more other nodes, one ip-address & port per line: + mesh-seed-address-port 10.10.10.10 3002 +# mesh-seed-address-port 10.10.10.11 3002 +# mesh-seed-address-port 10.10.10.12 3002 +# mesh-seed-address-port 10.10.10.13 3002 +# mesh-seed-address-port 10.10.10.14 3002 + + interval 250 + timeout 10 + } + + fabric { + port 3001 + } + + info { + port 3003 + } +} + +namespace test { + replication-factor 2 + memory-size 4G + default-ttl 30d # 30 days, use 0 to never expire/evict. + + storage-engine memory +} + +namespace bar { + replication-factor 2 + memory-size 4G + default-ttl 30d # 30 days, use 0 to never expire/evict. + + storage-engine memory + + # To use file storage backing, comment out the line above and use the + # following lines instead. +# storage-engine device { +# file /opt/aerospike/data/bar.dat +# filesize 16G +# data-in-memory true # Store data in memory in addition to file. +# } +} diff --git a/as/etc/aerospike_ssd.conf b/as/etc/aerospike_ssd.conf new file mode 100644 index 00000000..c79a7251 --- /dev/null +++ b/as/etc/aerospike_ssd.conf @@ -0,0 +1,65 @@ +# Aerospike database configuration file for deployments using raw storage. + +service { + user root + group root + paxos-single-replica-limit 1 # Number of nodes where the replica count is automatically reduced to 1. + pidfile /var/run/aerospike/asd.pid + proto-fd-max 15000 +} + +logging { + # Log file must be an absolute path. + file /var/log/aerospike/aerospike.log { + context any info + } +} + +network { + service { + address any + port 3000 + } + + heartbeat { + mode multicast + multicast-group 239.1.99.222 + port 9918 + + # To use unicast-mesh heartbeats, remove the 3 lines above, and see + # aerospike_mesh.conf for alternative. + + interval 150 + timeout 10 + } + + fabric { + port 3001 + } + + info { + port 3003 + } +} + +namespace test { + replication-factor 2 + memory-size 4G + default-ttl 30d # 30 days, use 0 to never expire/evict. 
+ + # Warning - legacy data in defined raw partition devices will be erased. + # These partitions must not be mounted by the file system. + storage-engine device { + # Use one or more lines like those below with actual device paths. +# device /dev/sdb +# device /dev/sdc + + # The 2 lines below optimize for SSD. + scheduler-mode noop + write-block-size 128K + + # Use the line below to store data in memory in addition to devices. +# data-in-memory true + } +} + diff --git a/as/etc/aerospike_ssd_systemd.conf b/as/etc/aerospike_ssd_systemd.conf new file mode 100644 index 00000000..06392cd7 --- /dev/null +++ b/as/etc/aerospike_ssd_systemd.conf @@ -0,0 +1,61 @@ +# Aerospike database configuration file for deployments using raw storage with systemd. + +service { + paxos-single-replica-limit 1 # Number of nodes where the replica count is automatically reduced to 1. + proto-fd-max 15000 +} + +logging { + console { + context any info + } +} + +network { + service { + address any + port 3000 + } + + heartbeat { + mode multicast + multicast-group 239.1.99.222 + port 9918 + + # To use unicast-mesh heartbeats, remove the 3 lines above, and see + # aerospike_mesh.conf for alternative. + + interval 150 + timeout 10 + } + + fabric { + port 3001 + } + + info { + port 3003 + } +} + +namespace test { + replication-factor 2 + memory-size 4G + default-ttl 30d # 30 days, use 0 to never expire/evict. + + # Warning - legacy data in defined raw partition devices will be erased. + # These partitions must not be mounted by the file system. + storage-engine device { + # Use one or more lines like those below with actual device paths. +# device /dev/sdb +# device /dev/sdc + + # The 2 lines below optimize for SSD. + scheduler-mode noop + write-block-size 128K + + # Use the line below to store data in memory in addition to devices. 
+# data-in-memory true + } +} + diff --git a/as/etc/aerospike_systemd.conf b/as/etc/aerospike_systemd.conf new file mode 100644 index 00000000..58fa30c0 --- /dev/null +++ b/as/etc/aerospike_systemd.conf @@ -0,0 +1,63 @@ +# Aerospike database configuration file for use with systemd. + +service { + paxos-single-replica-limit 1 # Number of nodes where the replica count is automatically reduced to 1. + proto-fd-max 15000 +} + +logging { + console { + context any info + } +} + +network { + service { + address any + port 3000 + } + + heartbeat { + mode multicast + multicast-group 239.1.99.222 + port 9918 + + # To use unicast-mesh heartbeats, remove the 3 lines above, and see + # aerospike_mesh.conf for alternative. + + interval 150 + timeout 10 + } + + fabric { + port 3001 + } + + info { + port 3003 + } +} + +namespace test { + replication-factor 2 + memory-size 4G + default-ttl 30d # 30 days, use 0 to never expire/evict. + + storage-engine memory +} + +namespace bar { + replication-factor 2 + memory-size 4G + default-ttl 30d # 30 days, use 0 to never expire/evict. + + storage-engine memory + + # To use file storage backing, comment out the line above and use the + # following lines instead. +# storage-engine device { +# file /opt/aerospike/data/bar.dat +# filesize 16G +# data-in-memory true # Store data in memory in addition to file. 
+# } +} diff --git a/as/etc/aerospike_telemetry.service b/as/etc/aerospike_telemetry.service new file mode 100644 index 00000000..694de3eb --- /dev/null +++ b/as/etc/aerospike_telemetry.service @@ -0,0 +1,11 @@ +[Unit] +Description=Aerospike Telemetry Agent +After=network.target +Wants=network.target + +[Service] +User=aerospike +Group=aerospike +EnvironmentFile=/etc/sysconfig/aerospike_telemetry +PermissionsStartOnly=True +ExecStart=/opt/aerospike/telemetry/telemetry.py $TELEMETRY_CONFIG_FILE start --fgdaemon diff --git a/as/etc/aerospike_telemetry.sysconfig b/as/etc/aerospike_telemetry.sysconfig new file mode 100644 index 00000000..6c1d364a --- /dev/null +++ b/as/etc/aerospike_telemetry.sysconfig @@ -0,0 +1 @@ +TELEMETRY_CONFIG_FILE=/etc/aerospike/telemetry.conf diff --git a/as/etc/asd-systemd-helper b/as/etc/asd-systemd-helper new file mode 100644 index 00000000..4f19836c --- /dev/null +++ b/as/etc/asd-systemd-helper @@ -0,0 +1,36 @@ +#!/bin/bash +mem=`/sbin/sysctl -n kernel.shmall` +min=4294967296 +if [ ${#mem} -le ${#min} ]; then + if [ $mem -lt $min ]; then + echo "kernel.shmall too low, setting to 4G pages = 16TB" + /sbin/sysctl -w kernel.shmall=$min + fi +fi + +mem=`/sbin/sysctl -n kernel.shmmax` +min=1073741824 +if [ ${#mem} -le ${#min} ]; then + if [ $mem -lt $min ]; then + echo "kernel.shmmax too low, setting to 1GB" + /sbin/sysctl -w kernel.shmmax=$min + fi +fi + +set_socket_buffer_limit() { + name=${1}; path=${2}; size=${3} + curr=$(cat ${path}) + + if [ ${curr} -lt ${size} ]; then + echo "Increasing ${name} socket buffer limit (${path}): ${curr} -> ${size}" + echo ${size} >${path} + fi +} + +set_socket_buffer_limit read /proc/sys/net/core/rmem_max 15728640 +set_socket_buffer_limit write /proc/sys/net/core/wmem_max 5242880 + +if [ -f /etc/aerospike/initfns ] +then + . 
/etc/aerospike/initfns +fi diff --git a/as/etc/init-script b/as/etc/init-script new file mode 100644 index 00000000..8112d6d8 --- /dev/null +++ b/as/etc/init-script @@ -0,0 +1,193 @@ +#!/bin/sh +# chkconfig: 2345 85 15 +# description: Starts and stops the Aerospike daemon + +### BEGIN INIT INFO +# Provides: aerospike +# Required-Start: $remote_fs $network +# Required-Stop: $remote_fs $network +# Default-Start: 2 3 4 5 +# Default-Stop: 0 1 6 +# Short-Description: Aerospike Clustered Data Service +### END INIT INFO + +. /etc/rc.d/init.d/functions +. /etc/sysconfig/network +[ "$NETWORKING" = "no" ] && exit 0 # exit quietly when NETWORKING is "no" + +ASD=/usr/bin/asd +ASDN=$(basename $ASD) +LOCKFILE=/var/lock/subsys/aerospike +CONFIG_FILE=/etc/aerospike/aerospike.conf +CMD="$ASD --config-file $CONFIG_FILE" +PIDDIR="/var/run/aerospike" +ASD_USER="aerospike" +ASD_GROUP=$ASD_USER +STOP_TIMEOUT=${STOP_TIMEOUT-30} # keep a value already set by the caller; otherwise 30 +EDITION="@EDITION@" + +INITFNS=/etc/aerospike/initfns +if [ -f $INITFNS ]; then . $INITFNS; fi # optional site-local overrides +if [ -n "$LD_PRELOAD" ]; then export LD_PRELOAD; fi # quoted: unquoted, an empty value made the test always true + +# in production, the corefiles are so huge as to prevent +# quick restarts of servers. 
Turn this on only if requested +# DAEMON_COREFILE_LIMIT="unlimited" + +set_shmall() { + mem=`/sbin/sysctl -n kernel.shmall` + min=4294967296 + if [ ${#mem} -le ${#min} ]; then + if [ $mem -lt $min ]; then + echo "kernel.shmall too low, setting to 4G pages = 16TB" + /sbin/sysctl -w kernel.shmall=$min + fi + fi +} + +set_shmmax() { + mem=`/sbin/sysctl -n kernel.shmmax` + min=1073741824 + if [ ${#mem} -le ${#min} ]; then + if [ $mem -lt $min ]; then + echo "kernel.shmmax too low, setting to 1GB" + /sbin/sysctl -w kernel.shmmax=$min + fi + fi +} + +set_socket_buffer_limit() { + name=${1}; path=${2}; size=${3} + curr=$(cat ${path}) + + if [ ${curr} -lt ${size} ]; then + echo "Increasing ${name} socket buffer limit (${path}): ${curr} -> ${size}" + echo ${size} >${path} + fi +} + +set_socket_buffer_limits() { + set_socket_buffer_limit read /proc/sys/net/core/rmem_max 15728640 + set_socket_buffer_limit write /proc/sys/net/core/wmem_max 5242880 +} + +#We are adding create_piddir as /var/run is tmpfs on some distributions. +#This causes the piddir to be removed on reboot +#adding this to centos init for parity +create_piddir() { + if [ ! -d $PIDDIR ] + then + (mkdir $PIDDIR && chown $ASD_USER:$ASD_GROUP $PIDDIR) &> /dev/null + fi +} + +start() { + ulimit -n 100000 + logger -t aerospike "ulimit -n="`ulimit -n` + [ -x $ASD ] || exit 0 + set_shmall + set_shmmax + set_socket_buffer_limits + create_piddir + echo -n $"Starting and checking aerospike: " + daemon "$CMD && pgrep $ASDN &> /dev/null" + retval=$? + echo + [ $retval -eq 0 ] && touch $LOCKFILE + return $retval +} + +coldstart() { + ulimit -n 100000 + logger -t aerospike "ulimit -n="`ulimit -n` + [ -x $ASD ] || exit 0 + set_shmall + set_shmmax + set_socket_buffer_limits + create_piddir + echo -n $"Cold-starting aerospike: " + daemon "$CMD --cold-start && pgrep $ASDN &> /dev/null" + retval=$? 
+ echo + [ $retval -eq 0 ] && touch $LOCKFILE + return $retval +} + +stop() { + echo -n $"Stopping aerospike: " + killproc -d ${STOP_TIMEOUT} $ASDN + retval=$? + echo + [ $retval -eq 0 ] && rm -f $LOCKFILE + return $retval +} + +rh_status() { + status $ASDN +} + +rh_status_quiet() { + status $ASDN >/dev/null 2>&1 +} + +do_telemetry_start () { + if [ "$EDITION" = "community" ]; + then + /sbin/service aerospike_telemetry start >/dev/null 2>&1 + fi +} + +do_telemetry_stop () { + if [ "$EDITION" = "community" ]; + then + /sbin/service aerospike_telemetry stop >/dev/null 2>&1 + fi +} + +case "$1" in + start) + rh_status_quiet + if [ $? -eq 0 ]; # -eq: POSIX numeric comparison ('==' is a bash extension) + then + { echo -n "Already "; $0 status; } + else + $1 + fi + + do_telemetry_start # NOTE(review): this call's status, not start's, becomes the script's exit status + ;; + coldstart) + rh_status_quiet + if [ $? -eq 0 ]; + then + { echo -n "Already "; $0 status; } + else + $1 + fi + + do_telemetry_start + ;; + stop) + rh_status_quiet + if [ $? -eq 3 ]; + then + { echo -n "Already "; $0 status; } + else + $1 + fi + + do_telemetry_stop + ;; + status) + rh_status + ;; + restart) + $0 stop + sleep 3 + $0 start + ;; + *) + echo $"Usage: $0 {start|stop|status|coldstart|restart}" + exit 2 + ;; +esac diff --git a/as/etc/init-script.deb b/as/etc/init-script.deb new file mode 100755 index 00000000..d4afd3b7 --- /dev/null +++ b/as/etc/init-script.deb @@ -0,0 +1,162 @@ +#!/bin/bash +# chkconfig: 2345 85 15 +# description: Starts and stops the Aerospike daemon + +### BEGIN INIT INFO +# Provides: aerospike +# Required-Start: $remote_fs $network +# Required-Stop: $remote_fs $network +# Default-Start: 2 3 4 5 +# Default-Stop: 0 1 6 +# Short-Description: Aerospike Clustered Data Service +### END INIT INFO + +ASD=/usr/bin/asd +ASDN=$(basename $ASD) +CONFIG_FILE=/etc/aerospike/aerospike.conf +OPTS="--config-file $CONFIG_FILE" +COLD_OPTS="$OPTS --cold-start" +PIDDIR="/var/run/aerospike" +PIDFILE=$PIDDIR/asd.pid +ASD_USER="aerospike" +ASD_GROUP=$ASD_USER +EDITION="@EDITION@" + +INITFNS=/etc/aerospike/initfns +if [ -f $INITFNS ]; then .
$INITFNS; fi + +. /lib/lsb/init-functions + +set_shmall() { + mem=`/sbin/sysctl -n kernel.shmall` + min=4294967296 + if [ ${#mem} -le ${#min} ]; then + if [ $mem -lt $min ]; then + echo "kernel.shmall too low, setting to 4G pages = 16TB" + /sbin/sysctl -w kernel.shmall=$min + fi + fi +} + +set_shmmax() { + mem=`/sbin/sysctl -n kernel.shmmax` + min=1073741824 + if [ ${#mem} -le ${#min} ]; then + if [ $mem -lt $min ]; then + echo "kernel.shmmax too low, setting to 1GB" + /sbin/sysctl -w kernel.shmmax=$min + fi + fi +} + +set_socket_buffer_limit() { + name=${1}; path=${2}; size=${3} + curr=$(cat ${path}) + + if [ ${curr} -lt ${size} ]; then + echo "Increasing ${name} socket buffer limit (${path}): ${curr} -> ${size}" + echo ${size} >${path} + fi +} + +set_socket_buffer_limits() { + set_socket_buffer_limit read /proc/sys/net/core/rmem_max 15728640 + set_socket_buffer_limit write /proc/sys/net/core/wmem_max 5242880 +} + +#We are adding create_piddir as /var/run is tmpfs on Debian 7+/Ubuntu 12+. This causes +#the piddir to be removed on reboot +create_piddir() { + if [ ! -d $PIDDIR ] + then + (mkdir $PIDDIR && chown $ASD_USER:$ASD_GROUP $PIDDIR) &> /dev/null + fi +} + +start() { + start-stop-daemon --start --quiet --name $ASDN --pidfile $PIDFILE --exec $ASD -- $OPTS +} + +coldstart() { + start-stop-daemon --start --quiet --name $ASDN --pidfile $PIDFILE --exec $ASD -- $COLD_OPTS +} + +stop() { + [ -f $PIDFILE ] && pid=`cat $PIDFILE` + start-stop-daemon --stop --quiet --pidfile $PIDFILE --name $ASDN + rv=$? 
+ [ $pid ] && while [ -e /proc/$pid ]; do sleep 0.1; done + return $rv +} + +do_telemetry_start () { + if [ $EDITION = "community" ]; + then + /usr/sbin/service aerospike_telemetry start >/dev/null 2>&1 + fi +} + +do_telemetry_stop () { + if [ $EDITION = "community" ]; + then + /usr/sbin/service aerospike_telemetry stop >/dev/null 2>&1 + fi +} + +case "$1" in + start|coldstart) + ulimit -n 100000 + logger -t aerospike "ulimit -n=" `ulimit -n` + set_shmall + set_shmmax + set_socket_buffer_limits + create_piddir + + [ -n "$LD_PRELOAD" ] && export LD_PRELOAD + log_daemon_msg "${1^}ing aerospike" + $1 + case $? in + 0) + log_end_msg 0 + ;; + 1) + echo "aerospike already started" + log_end_msg 0 + ;; + *) + log_end_msg 1 + ;; + esac + + do_telemetry_start + ;; + stop) + log_daemon_msg "Stopping aerospike" + $1 + case $? in + 0) + log_end_msg 0 + ;; + 1) + echo "aerospike already stopped" + log_end_msg 0 + ;; + *) + log_end_msg 1 + ;; + esac + + do_telemetry_stop + ;; + status) + status_of_proc -p $PIDFILE $ASDN aerospike + ;; + restart) + [ -n "`pgrep $ASDN`" ] && $0 stop + $0 start + ;; + *) + echo $"Usage: $0 {start|stop|status|coldstart|restart}" + exit 2 + ;; +esac diff --git a/as/etc/init-telemetry-script b/as/etc/init-telemetry-script new file mode 100644 index 00000000..3c3b2ff2 --- /dev/null +++ b/as/etc/init-telemetry-script @@ -0,0 +1,49 @@ +#!/bin/bash +# chkconfig: 2345 85 15 +# description: Starts and stops the Aerospike Telemetry Agent + +### BEGIN INIT INFO +# Provides: aerospike_telemetry +# Required-Start: $remote_fs $network +# Required-Stop: $remote_fs $network +# Default-Start: 2 3 4 5 +# Default-Stop: 0 1 6 +# Short-Description: Aerospike Telemetry Agent +### END INIT INFO + +# Source function library. +. 
/etc/rc.d/init.d/functions + +DIR=/opt/aerospike/telemetry +DAEMON=$DIR/telemetry.py +CONFIG="/etc/aerospike/telemetry.conf" + +start() { + python $DAEMON $CONFIG start +} + +stop() { + python $DAEMON $CONFIG stop +} + +status() { + python $DAEMON $CONFIG status +} + +restart() { + python $DAEMON $CONFIG restart +} + +try-restart() { + python $DAEMON $CONFIG try-restart +} + +case "$1" in + start|stop|status|restart|try-restart) + ${1} + ;; + *) + echo "Usage: $0 {start|stop|status|restart|try-restart}" + exit 2 + ;; +esac diff --git a/as/etc/init-telemetry-script.deb b/as/etc/init-telemetry-script.deb new file mode 100644 index 00000000..876cf4f7 --- /dev/null +++ b/as/etc/init-telemetry-script.deb @@ -0,0 +1,49 @@ +#!/bin/bash +# chkconfig: 2345 85 15 +# description: Starts and stops the Aerospike Telemetry Agent + +### BEGIN INIT INFO +# Provides: aerospike_telemetry +# Required-Start: $remote_fs $network +# Required-Stop: $remote_fs $network +# Default-Start: 2 3 4 5 +# Default-Stop: 0 1 6 +# Short-Description: Aerospike Telemetry Agent +### END INIT INFO + +# Source function library. +. 
/lib/lsb/init-functions + +DIR=/opt/aerospike/telemetry +DAEMON=$DIR/telemetry.py +CONFIG="/etc/aerospike/telemetry.conf" + +start() { + python $DAEMON $CONFIG start +} + +stop() { + python $DAEMON $CONFIG stop +} + +status() { + python $DAEMON $CONFIG status +} + +restart() { + python $DAEMON $CONFIG restart +} + +try-restart() { + python $DAEMON $CONFIG try-restart +} + +case "$1" in + start|stop|status|restart|try-restart) + ${1} + ;; + *) + echo "Usage: $0 {start|stop|status|restart|try-restart}" + exit 2 + ;; +esac diff --git a/as/etc/irqbalance-ban.sh b/as/etc/irqbalance-ban.sh new file mode 100755 index 00000000..bd934147 --- /dev/null +++ b/as/etc/irqbalance-ban.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +if [ -d "${1}/net" ]; then # quoted so word splitting cannot break the directory test + echo 'ban=true' +fi diff --git a/as/etc/logrotate_asd b/as/etc/logrotate_asd new file mode 100644 index 00000000..bbaa54fe --- /dev/null +++ b/as/etc/logrotate_asd @@ -0,0 +1,12 @@ +/var/log/aerospike/aerospike.log { + daily + rotate 90 + dateext + compress + olddir /var/log/aerospike + missingok + notifempty + postrotate + (kill -HUP `pgrep -x asd`) > /dev/null 2>&1 || true + endscript +} diff --git a/as/etc/logrotate_telemetry b/as/etc/logrotate_telemetry new file mode 100644 index 00000000..69548fc3 --- /dev/null +++ b/as/etc/logrotate_telemetry @@ -0,0 +1,12 @@ +/var/log/aerospike/telemetry.log { + daily + rotate 5 + dateext + compress + olddir /var/log/aerospike + missingok + notifempty + postrotate + service aerospike_telemetry try-restart > /dev/null 2>&1 || true + endscript +} diff --git a/as/etc/telemetry.conf b/as/etc/telemetry.conf new file mode 100644 index 00000000..bf14cdad --- /dev/null +++ b/as/etc/telemetry.conf @@ -0,0 +1,13 @@ +[asd] +config-file = /etc/aerospike/aerospike.conf + +[logging] +logfile = /var/log/aerospike/telemetry.log +loglevel = info + +[main] +disable = false +interval = 600 +home-url = https://telemetry.aerospike.com +user = aerospike +group = aerospike diff --git a/as/etc/telemetry_dev.conf
b/as/etc/telemetry_dev.conf new file mode 100644 index 00000000..3e105053 --- /dev/null +++ b/as/etc/telemetry_dev.conf @@ -0,0 +1,13 @@ +[asd] +config-file = as/etc/aerospike_dev.conf + +[logging] +logfile = run/log/telemetry.log +loglevel = info + +[main] +disable = false +interval = 600 +home-url = https://telemetry.aerospike.com +user = aerospike +group = aerospike diff --git a/as/etc/valgrind.supp b/as/etc/valgrind.supp new file mode 100644 index 00000000..e4ba17e4 --- /dev/null +++ b/as/etc/valgrind.supp @@ -0,0 +1,190 @@ +# I hope one can put comments in here +# this suppression file allows backtraces under valgrind +# Put the following block in `~/valgrind.supp`, then run `valgrind --suppressions=/home/bob/valgrind.supp` +# (note that `valgrind` doesn't understand `~` in pathnames). + +# malloc known suppressions +# +{ + alloc-namespaces + Memcheck:Leak + fun:malloc + fun:cf_malloc_at + fun:cf_rc_alloc_at + fun:as_namespace_create + fun:as_config_init + fun:main +} + +{ + index-trees + Memcheck:Leak + fun:malloc + fun:cf_malloc_at + fun:cf_rc_alloc_at + fun:as_index_tree_create + fun:as_partition_reinit + fun:as_partition_balance_new + fun:as_paxos_init + fun:main +} + +## +# we always send uninit data to the network, so it says, and it's always ok +# +{ + SendToUninit-xxx + Memcheck:Param + socketcall.sendto(msg) + fun:send + fun:as_msg_send_reply + fun:single_transaction_response + fun:send_response + fun:send_success + fun:send_result + fun:udf_apply_record + fun:udf_rw_local + fun:internal_rw_start + fun:as_rw_start + fun:as_write_start + fun:thr_tsvc +} + +{ + SendToUninit-222 + Memcheck:Param + socketcall.sendto(msg) + fun:send + fun:as_msg_send_reply + fun:single_transaction_response + fun:thr_tsvc_read + fun:rw_complete + fun:internal_rw_start + fun:as_rw_start + fun:thr_tsvc + fun:start_thread + obj:* +} +{ + SendToUninit-333 + Memcheck:Param + socketcall.sendto(msg) + fun:send + fun:as_msg_send_reply + fun:single_transaction_response +
fun:send_response.isra.3.constprop.4 + fun:udf_apply_record + fun:udf_rw_local + fun:internal_rw_start + fun:as_rw_start + fun:thr_tsvc + fun:start_thread +} + +{ + udf_write_1 + Memcheck:Param + socketcall.sendto(msg) + fun:send + fun:as_msg_send_reply + fun:single_transaction_response + fun:send_response + fun:udf_apply_record + fun:udf_rw_local + fun:internal_rw_start + fun:as_rw_start + fun:thr_tsvc + fun:start_thread + obj:* +} + +{ + udf_write_2 + Memcheck:Param + socketcall.sendto(msg) + fun:send + fun:as_msg_send_reply + fun:thr_tsvc_read + fun:rw_complete + fun:internal_rw_start + fun:as_rw_start + fun:thr_tsvc + fun:start_thread + obj:* +} + +{ + udf_write_3 + Memcheck:Param + socketcall.sendto(msg) + fun:send + fun:as_msg_send_reply + fun:single_transaction_response + fun:send_response + fun:send_result + fun:udf_rw_local + fun:internal_rw_start + fun:as_rw_start + fun:thr_tsvc + fun:start_thread + obj:* +} + + +# +# known issues - reasonably well investigated +# + +{ + libc-execinfo-backtrace + Memcheck:Addr4 + obj:/lib/tls/i686/cmov/libc-2.7.so + obj:/lib/ld-2.7.so + fun:__libc_dlopen_mode + obj:/lib/tls/i686/cmov/libc-2.7.so + fun:pthread_once + fun:cf_fault_event +} + +{ + storage-files-write + Memcheck:Param + write(buf) + obj:/usr/lib/debug/libpthread-2.8.90.so + fun:write_bins + fun:as_storage_record_close_files + fun:as_storage_record_close + fun:write_local + fun:as_write_start + fun:thr_tsvc + fun:start_thread + fun:clone + obj:* +} + +{ + storage-header-write + Memcheck:Param + write(buf) + obj:/lib64/libpthread-2.11.1.so + fun:as_storage_write_header + fun:as_storage_info_flush_ssd + fun:init_ssd_devices + fun:as_storage_namespace_init_ssd + fun:as_storage_namespace_init + fun:main + obj:* +} + +{ + uninitalized_fabric_message + Memcheck:Param + socketcall.sendto(msg) + fun:send + fun:fabric_process_writable + fun:fabric_worker_fn + fun:start_thread + fun:clone +} + + diff --git a/as/include/base/aggr.h b/as/include/base/aggr.h new file mode 
100644 index 00000000..aeb29a95 --- /dev/null +++ b/as/include/base/aggr.h @@ -0,0 +1,54 @@ +/* + * aggr.h + * + * Copyright (C) 2014-2015 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +#include + +#include "aerospike/as_rec.h" +#include "aerospike/as_result.h" +#include "aerospike/as_stream.h" +#include "aerospike/as_val.h" +#include "citrusleaf/cf_ll.h" + +#include "ai_btree.h" + +#include "transaction/udf.h" + +struct as_namespace_s; +struct as_partition_reservation_s; +struct udf_record_s; + +typedef struct { + as_stream_status (* ostream_write) (void *, as_val *); + void (* set_error) (void *, int); + struct as_partition_reservation_s * (* ptn_reserve) (void *, struct as_namespace_s *, uint32_t, struct as_partition_reservation_s *); + void (* ptn_release) (void *, struct as_partition_reservation_s *); + bool (* pre_check) (void *, struct udf_record_s *, void *); +} as_aggr_hooks; + +typedef struct { + udf_def def; + const as_aggr_hooks * aggr_hooks; +} as_aggr_call; + +int as_aggr_process(struct as_namespace_s *ns, as_aggr_call *ag_call, cf_ll *ap_recl, void *udata, as_result *ap_res); diff --git a/as/include/base/as_stap.h b/as/include/base/as_stap.h new file mode 100644 index 00000000..9919e0cc 
--- /dev/null +++ b/as/include/base/as_stap.h @@ -0,0 +1,52 @@ +/* + * as_stap.h + * + * Copyright (C) 2015 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +#if defined(USE_SYSTEMTAP) +#include +#include "probes.h" +#else +#define ASD_TRANS_DEMARSHAL(arg1,arg2,arg3) +#define ASD_QUERY_STARTING(arg1,arg2) +#define ASD_QUERY_QTRSETUP_STARTING(arg1,arg2) +#define ASD_QUERY_QTRSETUP_FINISHED(arg1,arg2) +#define ASD_QUERY_INIT(arg1,arg2) +#define ASD_QUERY_DONE(arg1,arg2,arg3) +#define ASD_QUERY_TRANS_DONE(arg1,arg2,arg3) +#define ASD_QUERY_QTR_ALLOC(arg1,arg2,arg3) +#define ASD_QUERY_QTR_FREE(arg1,arg2,arg3) +#define ASD_QUERY_IOREQ_STARTING(arg1,arg2) +#define ASD_QUERY_IOREQ_FINISHED(arg1,arg2) +#define ASD_QUERY_IO_STARTING(arg1,arg2) +#define ASD_QUERY_IO_NOTMATCH(arg1,arg2) +#define ASD_QUERY_IO_ERROR(arg1,arg2) +#define ASD_QUERY_IO_FINISHED(arg1,arg2) +#define ASD_QUERY_NETIO_STARTING(arg1,arg2) +#define ASD_QUERY_NETIO_FINISHED(arg1,arg2) +#define ASD_QUERY_ADDFIN(arg1,arg2) +#define ASD_QUERY_SENDPACKET_STARTING(arg1,arg2,arg3) +#define ASD_QUERY_SENDPACKET_CONTINUE(arg1,arg2) +#define ASD_QUERY_SENDPACKET_FINISHED(arg1) +#define ASD_SINDEX_MSGRANGE_STARTING(arg1,arg2) +#define 
ASD_SINDEX_MSGRANGE_FINISHED(arg1,arg2) +#endif diff --git a/as/include/base/batch.h b/as/include/base/batch.h new file mode 100644 index 00000000..36e9cc33 --- /dev/null +++ b/as/include/base/batch.h @@ -0,0 +1,40 @@ +/* + * batch.h + * + * Copyright (C) 2008-2015 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. 
If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +#include "base/transaction.h" +#include "dynbuf.h" + +typedef struct as_batch_shared_s as_batch_shared; + +int as_batch_init(void); // (void): a real prototype, so calls that pass arguments are rejected at compile time +int as_batch_queue_task(as_transaction* tr); +void as_batch_add_result(as_transaction* tr, uint16_t n_bins, as_bin** bins, as_msg_op** ops); +void as_batch_add_proxy_result(as_batch_shared* shared, uint32_t index, cf_digest* digest, cl_msg* cmsg, size_t size); +void as_batch_add_error(as_batch_shared* shared, uint32_t index, int result_code); +int as_batch_threads_resize(uint32_t threads); +void as_batch_queues_info(cf_dyn_buf* db); +int as_batch_unused_buffers(void); +void as_batch_destroy(void); + +as_file_handle* as_batch_get_fd_h(as_batch_shared* shared); diff --git a/as/include/base/cdt.h b/as/include/base/cdt.h new file mode 100644 index 00000000..f70d1552 --- /dev/null +++ b/as/include/base/cdt.h @@ -0,0 +1,492 @@ +/* + * cdt.h + * + * Copyright (C) 2015-2018 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program.
If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +#include +#include +#include + +#include "aerospike/as_msgpack.h" + +#include "base/datamodel.h" +#include "base/proto.h" + +#include "dynbuf.h" + + +//========================================================== +// Typedefs & constants. +// + +#define CDT_MAX_PACKED_INT_SZ (sizeof(uint64_t) + 1) +#define CDT_MAX_STACK_OBJ_SZ (1024 * 1024) +#define CDT_MAX_PARAM_LIST_COUNT (1024 * 1024) + +typedef struct rollback_alloc_s { + cf_ll_buf *ll_buf; + size_t malloc_list_sz; + size_t malloc_list_cap; + bool malloc_ns; + void *malloc_list[]; +} rollback_alloc; + +#define define_rollback_alloc(__name, __alloc_buf, __rollback_size, __malloc_ns) \ + uint8_t __name ## __mem[sizeof(rollback_alloc) + sizeof(void *) * (__alloc_buf ? 0 : __rollback_size)]; \ + rollback_alloc *__name = (rollback_alloc *)__name ## __mem; \ + __name->ll_buf = __alloc_buf; \ + __name->malloc_list_sz = 0; \ + __name->malloc_list_cap = (__alloc_buf ? 0 : __rollback_size); \ + __name->malloc_ns = __malloc_ns; + +typedef struct cdt_process_state_s { + as_cdt_optype type; + as_unpacker pk; + uint32_t ele_count; +} cdt_process_state; + +typedef struct cdt_payload_s { + const uint8_t *ptr; + uint32_t sz; +} cdt_payload; + +typedef struct result_data_s { + as_bin *result; + rollback_alloc *alloc; + result_type_t type; + as_cdt_op_flags flags; + bool is_multi; +} cdt_result_data; + +typedef struct cdt_modify_data_s { + as_bin *b; + as_bin *result; + cf_ll_buf *alloc_buf; + + int ret_code; +} cdt_modify_data; + +typedef struct cdt_read_data_s { + const as_bin *b; + as_bin *result; + + int ret_code; +} cdt_read_data; + +typedef struct cdt_container_builder_s { + as_particle *particle; + uint8_t *write_ptr; + uint32_t *sz; + uint32_t ele_count; +} cdt_container_builder; + +typedef struct cdt_op_table_entry_s { + uint32_t count; + uint32_t opt_args; + const char *name; + const as_cdt_paramtype *args; +} cdt_op_table_entry; + +typedef struct 
cdt_calc_delta_s { + int64_t incr_int; + double incr_double; + + as_val_t type; + + int64_t value_int; + double value_double; +} cdt_calc_delta; + +typedef struct msgpacked_index_s { + uint8_t *ptr; + uint32_t ele_sz; + uint32_t ele_count; +} msgpacked_index; + +typedef struct offset_index_s { + msgpacked_index _; + + const uint8_t *contents; + uint32_t content_sz; + bool is_partial; +} offset_index; + +// Value order index. +typedef struct order_index_s { + msgpacked_index _; + uint32_t max_idx; +} order_index; + +typedef struct order_index_find_s { + uint32_t start; + uint32_t count; + uint32_t target; + uint32_t result; + bool found; +} order_index_find; + +typedef msgpack_compare_t (*order_heap_compare_fn)(const void *ptr, uint32_t index0, uint32_t index1); + +// Value order heap. +typedef struct order_heap_s { + order_index _; + const void *userdata; + order_heap_compare_fn cmp_fn; + msgpack_compare_t cmp; + uint32_t filled; +} order_heap; + +typedef struct cdt_packed_op_s { + // Input. + const uint8_t *packed; + uint32_t packed_sz; + + // Parsed. + uint32_t ele_count; + const uint8_t *contents; + uint32_t content_sz; + + // Result. 
+ uint32_t new_ele_count; +} cdt_packed_op; + +struct order_index_adjust_s; +typedef uint32_t (*order_index_adjust_func)(const struct order_index_adjust_s *via, uint32_t src); + +typedef struct order_index_adjust_s { + order_index_adjust_func f; + uint32_t upper; + uint32_t lower; + int32_t delta; +} order_index_adjust; + +typedef enum { + CDT_FIND_ITEMS_IDXS_FOR_LIST_VALUE, + CDT_FIND_ITEMS_IDXS_FOR_MAP_KEY, + CDT_FIND_ITEMS_IDXS_FOR_MAP_VALUE +} cdt_find_items_idxs_type; + +#define define_offset_index(__name, __contents, __content_sz, __ele_count) \ + offset_index __name; \ + offset_index_init(&__name, NULL, __ele_count, __contents, __content_sz); \ + uint8_t __name ## __offset_index_mem__[offset_index_size(&__name)]; \ + __name._.ptr = __name ## __offset_index_mem__; \ + offset_index_set_filled(&__name, 1) + +#define cond_vla_order_index2(__name, __max_idx, __alloc_count, __cond) \ + union { \ + order_index ordidx; \ + uint8_t mem_temp[sizeof(order_index) + ((__cond) ? order_index_calc_size(__max_idx, __alloc_count) : 0)]; \ + } __name; \ + order_index_init2(&__name.ordidx, __name.mem_temp + sizeof(order_index), __max_idx, __alloc_count) + +#define define_order_index(__name, __ele_count) \ + order_index __name; \ + uint8_t __name ## __order_index_mem__[order_index_calc_size(__ele_count, __ele_count)]; \ + order_index_init(&__name, __name ## __order_index_mem__, __ele_count) + +#define define_order_index2(__name, __max_idx, __alloc_count) \ + order_index __name; \ + uint8_t __name ## __order_index_mem__[order_index_calc_size(__max_idx, __alloc_count)]; \ + order_index_init2(&__name, __name ## __order_index_mem__, __max_idx, __alloc_count) + +#define define_int_list_builder(__name, __alloc, __count) \ + cdt_container_builder __name; \ + cdt_int_list_builder_start(&__name, __alloc, __count) + +#define define_cdt_idx_mask(__name, __ele_count) \ + uint64_t __name[cdt_idx_mask_count(__ele_count)]; \ + cdt_idx_mask_init(__name, __ele_count) + +#define 
cond_define_cdt_idx_mask(__name, __ele_count, __cond) \ + uint64_t __name[__cond ? cdt_idx_mask_count(__ele_count) : 1]; \ + if (__cond) { \ + cdt_idx_mask_init(__name, __ele_count); \ + } + +#define define_build_order_heap_by_range(__name, __idx, __count, __ele_count, __udata, __cmp_fn, __success) \ + order_heap __name; \ + uint8_t __name ## __order_heap_mem__[order_index_calc_size(__ele_count, __ele_count)]; \ + bool __success = order_heap_init_build_by_range(&__name, __name ## __order_heap_mem__, __idx, __count, __ele_count, __cmp_fn, __udata) + +#define VA_NARGS_SEQ 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 +#define VA_NARGS_EXTRACT_N(_9, _8, _7, _6, _5, _4, _3, _2, _1, _0, N, ...) N +#define VA_NARGS_SEQ2N(...) VA_NARGS_EXTRACT_N(__VA_ARGS__) +#define VA_NARGS(...) VA_NARGS_SEQ2N(_, ##__VA_ARGS__, VA_NARGS_SEQ) + +// Get around needing to pass last named arg to va_start(). +#define CDT_OP_TABLE_GET_PARAMS(state, ...) cdt_process_state_get_params(state, VA_NARGS(__VA_ARGS__), __VA_ARGS__) + +static const uint8_t msgpack_nil[1] = {0xC0}; + + +//========================================================== +// Function declarations. 
+// + +bool calc_index_count(int64_t in_index, uint64_t in_count, uint32_t ele_count, uint32_t *out_index, uint32_t *out_count, bool is_multi); + +// cdt_result_data +bool result_data_set_not_found(cdt_result_data *rd, int64_t index); +void result_data_set_list_int2x(cdt_result_data *rd, int64_t i1, int64_t i2); +int result_data_set_index_rank_count(cdt_result_data *rd, uint32_t start, uint32_t count, uint32_t ele_count); +int result_data_set_range(cdt_result_data *rd, uint32_t start, uint32_t count, uint32_t ele_count); +void result_data_set_by_irc(cdt_result_data *rd, const order_index *ordidx, const order_index *idx_map, uint32_t total_count); +void result_data_set_by_itemlist_irc(cdt_result_data *rd, const order_index *items_ord, order_index *ranks, uint32_t total_count); +void result_data_set_int_list_by_mask(cdt_result_data *rd, const uint64_t *mask, uint32_t count, uint32_t ele_count); + +// as_bin +void as_bin_set_int(as_bin *b, int64_t value); +void as_bin_set_double(as_bin *b, double value); +void as_bin_set_unordered_empty_list(as_bin *b, rollback_alloc *alloc_buf); +void as_bin_set_empty_packed_map(as_bin *b, rollback_alloc *alloc_buf, uint8_t flags); + +// cdt_delta_value +bool cdt_calc_delta_init(cdt_calc_delta *cdv, const cdt_payload *delta_value, bool is_decrement); +bool cdt_calc_delta_add(cdt_calc_delta *cdv, as_unpacker *pk_value); +void cdt_calc_delta_pack_and_result(cdt_calc_delta *cdv, cdt_payload *value, as_bin *result); + +// cdt_payload +bool cdt_payload_is_int(const cdt_payload *payload); +int64_t cdt_payload_get_int64(const cdt_payload *payload); +void cdt_payload_pack_int(cdt_payload *packed, int64_t value); +void cdt_payload_pack_double(cdt_payload *packed, double value); + +// cdt_process_state +bool cdt_process_state_init(cdt_process_state *cdt_state, const as_msg_op *op); +bool cdt_process_state_get_params(cdt_process_state *state, size_t n, ...); +const char *cdt_process_state_get_op_name(const cdt_process_state *state); + +// 
cdt_process_state_packed_list +bool cdt_process_state_packed_list_modify_optype(cdt_process_state *state, cdt_modify_data *cdt_udata); +bool cdt_process_state_packed_list_read_optype(cdt_process_state *state, cdt_read_data *cdt_udata); + +void cdt_container_builder_add(cdt_container_builder *builder, const uint8_t *buf, uint32_t sz); +void cdt_container_builder_add_n(cdt_container_builder *builder, const uint8_t *buf, uint32_t count, uint32_t sz); +void cdt_container_builder_add_int64(cdt_container_builder *builder, int64_t value); +void cdt_container_builder_add_int_range(cdt_container_builder *builder, uint32_t start, uint32_t count, uint32_t ele_count, bool reverse); +void cdt_container_builder_set_result(cdt_container_builder *builder, cdt_result_data *result); + +void cdt_list_builder_start(cdt_container_builder *builder, rollback_alloc *alloc_buf, uint32_t ele_count, uint32_t max_sz); +void cdt_map_builder_start(cdt_container_builder *builder, rollback_alloc *alloc_buf, uint32_t ele_count, uint32_t content_max_sz, uint8_t flags); + +// cdt_process_state_packed_map +bool cdt_process_state_packed_map_modify_optype(cdt_process_state *state, cdt_modify_data *cdt_udata); +bool cdt_process_state_packed_map_read_optype(cdt_process_state *state, cdt_read_data *cdt_udata); + +// rollback_alloc +void rollback_alloc_push(rollback_alloc *packed_alloc, void *ptr); +uint8_t *rollback_alloc_reserve(rollback_alloc *alloc_buf, size_t sz); +void rollback_alloc_rollback(rollback_alloc *alloc_buf); +bool rollback_alloc_from_msgpack(rollback_alloc *alloc_buf, as_bin *b, const cdt_payload *seg); + +// msgpacked_index +void msgpacked_index_set(msgpacked_index *idxs, uint32_t index, uint32_t value); +void msgpacked_index_incr(msgpacked_index *idxs, uint32_t index); +void msgpacked_index_set_ptr(msgpacked_index *idxs, uint8_t *ptr); +void *msgpacked_index_get_mem(const msgpacked_index *idxs, uint32_t index); +uint32_t msgpacked_index_size(const msgpacked_index *idxs); +uint32_t 
msgpacked_index_ptr2value(const msgpacked_index *idxs, const void *ptr);
+uint32_t msgpacked_index_get(const msgpacked_index *idxs, uint32_t index);
+void msgpacked_index_print(const msgpacked_index *idxs, const char *name);
+bool msgpacked_index_find_index_sorted(const msgpacked_index *sorted_indexes, uint32_t find_index, uint32_t count, uint32_t *where);
+
+// offset_index
+// NOTE(review): throughout these declarations, parameters whose names end in
+// "_r" are non-const pointers and look like output parameters - confirm
+// against the definitions.
+void offset_index_init(offset_index *offidx, uint8_t *idx_mem_ptr, uint32_t ele_count, const uint8_t *contents, uint32_t content_sz);
+void offset_index_set(offset_index *offidx, uint32_t index, uint32_t value);
+bool offset_index_set_next(offset_index *offidx, uint32_t index, uint32_t value);
+void offset_index_set_filled(offset_index *offidx, uint32_t ele_filled);
+void offset_index_set_ptr(offset_index *offidx, uint8_t *idx_mem, const uint8_t *packed_mem);
+void offset_index_copy(offset_index *dest, const offset_index *src, uint32_t d_start, uint32_t s_start, uint32_t count, int delta);
+void offset_index_append_size(offset_index *offidx, uint32_t delta);
+
+bool offset_index_find_items(offset_index *full_offidx, cdt_find_items_idxs_type find_type, as_unpacker *items_pk, order_index *items_ordidx_r, bool inverted, uint64_t *rm_mask, uint32_t *rm_count_r, order_index *rm_ranks_r);
+
+// Read-only accessors - all take a const offset_index.
+void *offset_index_get_mem(const offset_index *offidx, uint32_t index);
+uint32_t offset_index_size(const offset_index *offidx);
+bool offset_index_is_null(const offset_index *offidx);
+bool offset_index_is_valid(const offset_index *offidx);
+bool offset_index_is_full(const offset_index *offidx);
+uint32_t offset_index_get_const(const offset_index *offidx, uint32_t idx);
+uint32_t offset_index_get_delta_const(const offset_index *offidx, uint32_t index);
+uint32_t offset_index_get_filled(const offset_index *offidx);
+
+void offset_index_print(const offset_index *offidx, const char *name);
+void offset_index_delta_print(const offset_index *offidx, const char *name);
+
+// order_index
+void order_index_init(order_index *ordidx, uint8_t *ptr, uint32_t ele_count);
+void order_index_init2(order_index *ordidx, uint8_t *ptr, uint32_t max_idx, uint32_t ele_count);
+void order_index_init_ref(order_index *dst, const order_index *src, uint32_t start, uint32_t count);
+void order_index_set(order_index *ordidx, uint32_t index, uint32_t value);
+void order_index_set_ptr(order_index *ordidx, uint8_t *ptr);
+void order_index_incr(order_index *ordidx, uint32_t index);
+void order_index_clear(order_index *ordidx);
+bool order_index_sorted_mark_dup_eles(order_index *ordidx, const offset_index *full_offidx, uint32_t *count_r, uint32_t *sz_r);
+
+uint32_t order_index_size(const order_index *ordidx);
+bool order_index_is_null(const order_index *ordidx);
+bool order_index_is_valid(const order_index *ordidx);
+bool order_index_is_filled(const order_index *ordidx);
+
+void *order_index_get_mem(const order_index *ordidx, uint32_t index);
+uint32_t order_index_ptr2value(const order_index *ordidx, const void *ptr);
+uint32_t order_index_get(const order_index *ordidx, uint32_t index);
+
+bool order_index_find_rank_by_value(const order_index *ordidx, const cdt_payload *value, const offset_index *full_offidx, order_index_find *find);
+
+uint32_t order_index_get_ele_size(const order_index *ordidx, uint32_t count, const offset_index *full_offidx);
+uint8_t *order_index_write_eles(const order_index *ordidx, uint32_t count, const offset_index *full_offidx, uint8_t *ptr, bool invert);
+
+uint32_t order_index_adjust_value(const order_index_adjust *via, uint32_t src);
+void order_index_copy(order_index *dest, const order_index *src, uint32_t d_start, uint32_t s_start, uint32_t count, const order_index_adjust *adjust);
+size_t order_index_calc_size(uint32_t max_idx, uint32_t ele_count);
+
+void order_index_print(const order_index *ordidx, const char *name);
+
+// order_heap
+// Element access goes through the inline order_heap_set()/order_heap_get()
+// helpers in the "Inline functions" section, which cast the heap to an
+// order_index and forward to order_index_set()/order_index_get().
+bool order_heap_init_build_by_range(order_heap *heap, uint8_t *heap_mem, uint32_t idx, uint32_t count, uint32_t ele_count, order_heap_compare_fn cmp_fn, const void *udata);
+void order_heap_swap(order_heap *heap, uint32_t index1, uint32_t index2);
+bool order_heap_remove_top(order_heap *heap);
+bool order_heap_replace_top(order_heap *heap, uint32_t value);
+bool order_heap_heapify(order_heap *heap, uint32_t index);
+bool order_heap_build(order_heap *heap, bool init);
+bool order_heap_order_at_end(order_heap *heap, uint32_t count);
+void order_heap_reverse_end(order_heap *heap, uint32_t count);
+
+void order_heap_print(const order_heap *heap);
+
+// cdt_idx_mask
+// The mask is passed around as a bare uint64_t array; the element count is
+// not stored with it and is supplied by the caller where needed.
+size_t cdt_idx_mask_count(uint32_t ele_count);
+void cdt_idx_mask_init(uint64_t *mask, uint32_t ele_count);
+void cdt_idx_mask_set(uint64_t *mask, uint32_t idx);
+void cdt_idx_mask_set_by_ordidx(uint64_t *mask, const order_index *ordidx, uint32_t start, uint32_t count, bool inverted);
+void cdt_idx_mask_set_by_irc(uint64_t *mask, const order_index *rankcount, const order_index *idx_map, bool inverted);
+void cdt_idx_mask_invert(uint64_t *mask, uint32_t ele_count);
+
+uint64_t cdt_idx_mask_get(const uint64_t *mask, uint32_t idx);
+
+bool cdt_idx_mask_is_set(const uint64_t *mask, uint32_t idx);
+
+uint32_t cdt_idx_mask_find(const uint64_t *mask, uint32_t start, uint32_t end, bool is_find0);
+uint8_t *cdt_idx_mask_write_eles(const uint64_t *mask, uint32_t count, const offset_index *full_offidx, uint8_t *ptr, bool invert);
+uint32_t cdt_idx_mask_get_content_sz(const uint64_t *mask, uint32_t count, const offset_index *full_offidx);
+
+void cdt_idx_mask_print(const uint64_t *mask, uint32_t ele_count, const char *name);
+
+// list
+bool list_full_offset_index_fill_all(offset_index *offidx);
+bool list_order_index_sort(order_index *ordidx, const offset_index *full_offidx, as_cdt_sort_flags flags);
+
+bool list_param_parse(const cdt_payload *items, as_unpacker *pk, uint32_t *count_r);
+
+// Debugging support
+// NOTE(review): print_hex() takes a caller-supplied, writable buf of buf_sz
+// bytes - presumably the formatted output goes there; confirm.
+void print_hex(const uint8_t *packed, uint32_t packed_sz, char *buf, uint32_t buf_sz);
+void print_packed(const
uint8_t *packed, uint32_t sz, const char *name);
+void cdt_bin_print(const as_bin *b, const char *name);
+
+
+//==========================================================
+// Inline functions.
+//
+
+// True if the AS_CDT_OP_FLAG_INVERTED bit is set in rd->flags.
+// Takes a const pointer, like the other result_data_is_*() predicates - it
+// only reads rd. rd must not be NULL.
+static inline bool
+result_data_is_inverted(const cdt_result_data *rd)
+{
+	return (rd->flags & AS_CDT_OP_FLAG_INVERTED) != 0;
+}
+
+// Split a combined result_type word into rd->type (the bits selected by
+// AS_CDT_OP_FLAG_RESULT_MASK) and rd->flags (all remaining bits), and record
+// is_multi. rd must not be NULL.
+static inline void
+result_data_set(cdt_result_data *rd, uint64_t result_type, bool is_multi)
+{
+	rd->type = (result_type_t)(result_type & AS_CDT_OP_FLAG_RESULT_MASK);
+	rd->flags = (as_cdt_op_flags)(result_type & (~AS_CDT_OP_FLAG_RESULT_MASK));
+	rd->is_multi = is_multi;
+}
+
+// Store an integer in rd->result via as_bin_set_int(). Unlike the other
+// helpers here, a NULL rd is tolerated and makes this a no-op.
+static inline void
+result_data_set_int(cdt_result_data *rd, int64_t value)
+{
+	if (rd) {
+		as_bin_set_int(rd->result, value);
+	}
+}
+
+// The predicates below classify rd->type. None of them checks rd for NULL.
+
+// True for RESULT_TYPE_KEY, RESULT_TYPE_VALUE and RESULT_TYPE_MAP.
+static inline bool
+result_data_is_return_elements(const cdt_result_data *rd)
+{
+	return (rd->type == RESULT_TYPE_KEY || rd->type == RESULT_TYPE_VALUE ||
+			rd->type == RESULT_TYPE_MAP);
+}
+
+// True for RESULT_TYPE_INDEX and RESULT_TYPE_REVINDEX.
+static inline bool
+result_data_is_return_index(const cdt_result_data *rd)
+{
+	return (rd->type == RESULT_TYPE_INDEX || rd->type == RESULT_TYPE_REVINDEX);
+}
+
+// True for RESULT_TYPE_INDEX_RANGE and RESULT_TYPE_REVINDEX_RANGE.
+static inline bool
+result_data_is_return_index_range(const cdt_result_data *rd)
+{
+	return (rd->type == RESULT_TYPE_INDEX_RANGE ||
+			rd->type == RESULT_TYPE_REVINDEX_RANGE);
+}
+
+// True for RESULT_TYPE_RANK and RESULT_TYPE_REVRANK.
+static inline bool
+result_data_is_return_rank(const cdt_result_data *rd)
+{
+	return (rd->type == RESULT_TYPE_REVRANK || rd->type == RESULT_TYPE_RANK);
+}
+
+// True for RESULT_TYPE_RANK_RANGE and RESULT_TYPE_REVRANK_RANGE.
+static inline bool
+result_data_is_return_rank_range(const cdt_result_data *rd)
+{
+	return (rd->type == RESULT_TYPE_REVRANK_RANGE ||
+			rd->type == RESULT_TYPE_RANK_RANGE);
+}
+
+// Set heap element 'index' by treating the heap as an order_index.
+// NOTE(review): the cast assumes order_heap can be used as an order_index
+// (presumably it embeds one as its first member) - confirm against the struct.
+static inline void
+order_heap_set(order_heap *heap, uint32_t index, uint32_t value)
+{
+	order_index_set((order_index *)heap, index, value);
+}
+
+// Read heap element 'index' by treating the heap as an order_index (same
+// cast assumption as order_heap_set()).
+static inline uint32_t
+order_heap_get(const order_heap *heap, uint32_t index)
+{
+	return order_index_get((const order_index *)heap, index);
+}
+
+// Calculate index given index and max_index.
+//
+// A non-negative index is returned unchanged. A negative index is counted back
+// from max_index (e.g. -1 yields max_index - 1). Nothing is clamped: the result
+// is still negative when index < -max_index, and a non-negative index is not
+// checked against max_index.
+static inline int64_t
+calc_index(int64_t index, uint32_t max_index)
+{
+	return index < 0 ? (int64_t)max_index + index : index;
+}
+
+// Start a list builder for ele_count integers, passing
+// CDT_MAX_PACKED_INT_SZ * ele_count as the last argument of
+// cdt_list_builder_start() (presumably the maximum content size - confirm).
+// The multiplication is not overflow-checked here.
+static inline void
+cdt_int_list_builder_start(cdt_container_builder *builder,
+		rollback_alloc *alloc_buf, uint32_t ele_count)
+{
+	cdt_list_builder_start(builder, alloc_buf, ele_count,
+			CDT_MAX_PACKED_INT_SZ * ele_count);
+}
diff --git a/as/include/base/cfg.h b/as/include/base/cfg.h
new file mode 100644
index 00000000..8278bdf3
--- /dev/null
+++ b/as/include/base/cfg.h
@@ -0,0 +1,284 @@
+/*
+ * cfg.h
+ *
+ * Copyright (C) 2008-2016 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+#pragma once
+
+//==========================================================
+// Includes.
+//
+
+// NOTE(review): the header names of the five system includes below are
+// missing from this copy of the patch - restore them from the repository.
+#include
+#include
+#include
+#include
+#include
+
+#include "xdr_config.h"
+
+#include "aerospike/mod_lua_config.h"
+#include "citrusleaf/cf_atomic.h"
+
+#include "enhanced_alloc.h"
+#include "hardware.h"
+#include "node.h"
+#include "socket.h"
+#include "tls.h"
+
+#include "base/security_config.h"
+#include "fabric/clustering.h"
+#include "fabric/fabric.h"
+#include "fabric/hb.h"
+#include "fabric/hlc.h"
+
+
+//==========================================================
+// Forward declarations.
+//
+
+struct as_namespace_s;
+
+
+//==========================================================
+// Typedefs and constants.
+//
+
+#ifndef AS_NAMESPACE_SZ
+#define AS_NAMESPACE_SZ 2
+#endif
+
+#define AS_CLUSTER_NAME_SZ 65
+
+#define MAX_DEMARSHAL_THREADS 256
+#define MAX_BATCH_THREADS 256
+#define MAX_TLS_SPECS 10
+
+// Declare bools with PAD_BOOL so they can't share a 4-byte space with other
+// bools, chars or shorts. This prevents adjacent bools set concurrently in
+// different threads (albeit very unlikely) from interfering with each other.
+// Add others (e.g. PAD_UINT8, PAD_UINT16 ...) as needed.
+//
+// PAD_BOOL expands to "bool pad_<line>[3]; bool", i.e. three pad bytes followed
+// by the declared bool. The pad member's name is derived from __LINE__, so
+// each PAD_BOOL member must be declared on its own source line.
+#define PGLUE(a, b) a##b
+#define PBOOL(line) bool PGLUE(pad_, line)[3]; bool
+#define PAD_BOOL PBOOL(__LINE__)
+
+// Server-wide configuration. A single instance is exposed as the global
+// g_config (declared in the Public API section).
+typedef struct as_config_s {
+
+	// The order here matches that in the configuration parser's enum,
+	// cfg_case_id. This is for organizational sanity.
+
+	//--------------------------------------------
+	// service context.
+	//
+
+	// Normally visible, in canonical configuration file order:
+
+	uid_t uid;
+	gid_t gid;
+	uint32_t paxos_single_replica_limit; // cluster size at which, and below, the cluster will run with replication factor 1
+	char* pidfile;
+	int n_proto_fd_max;
+
+	// Normally hidden:
+
+	// Note - advertise-ipv6 affects a cf_socket_ee.c global, so can't be here.
+	cf_topo_auto_pin auto_pin;
+	int n_batch_threads;
+	uint32_t batch_max_buffers_per_queue; // maximum number of buffers allowed in a buffer queue at any one time, fail batch if full
+	uint32_t batch_max_requests; // maximum count of database requests in a single batch
+	uint32_t batch_max_unused_buffers; // maximum number of buffers allowed in buffer pool at any one time
+	uint32_t batch_priority; // number of records between an enforced context switch, used by old batch only
+	uint32_t n_batch_index_threads;
+	char cluster_name[AS_CLUSTER_NAME_SZ];
+	as_clustering_config clustering_config;
+	PAD_BOOL fabric_benchmarks_enabled;
+	PAD_BOOL svc_benchmarks_enabled;
+	PAD_BOOL info_hist_enabled;
+	const char* feature_key_file;
+	uint32_t hist_track_back; // total time span in seconds over which to cache data
+	uint32_t hist_track_slice; // period in seconds at which to cache histogram data
+	char* hist_track_thresholds; // comma-separated bucket (ms) values to track
+	int n_info_threads;
+	// Note - log-local-time affects a cf_fault.c global, so can't be here.
+	uint32_t migrate_max_num_incoming;
+	uint32_t n_migrate_threads;
+	char* node_id_interface;
+	uint32_t nsup_delete_sleep; // sleep this many microseconds between generating delete transactions, default 0
+	uint32_t nsup_period;
+	PAD_BOOL nsup_startup_evict;
+	int proto_fd_idle_ms; // after this many milliseconds, connections are aborted unless transaction is in progress
+	int proto_slow_netio_sleep_ms; // dynamic only
+	uint32_t query_bsize;
+	uint64_t query_buf_size; // dynamic only
+	uint32_t query_bufpool_size;
+	PAD_BOOL query_in_transaction_thr;
+	uint32_t query_long_q_max_size;
+	PAD_BOOL query_enable_histogram;
+	PAD_BOOL partitions_pre_reserved; // query will reserve all partitions up front
+	uint32_t query_priority;
+	uint64_t query_sleep_us;
+	uint64_t query_rec_count_bound;
+	PAD_BOOL query_req_in_query_thread;
+	uint32_t query_req_max_inflight;
+	uint32_t query_short_q_max_size;
+	uint32_t query_threads;
+	uint32_t query_threshold;
+	uint64_t query_untracked_time_ms;
+	uint32_t query_worker_threads;
+	PAD_BOOL run_as_daemon;
+	uint32_t scan_max_active; // maximum number of active scans allowed
+	uint32_t scan_max_done; // maximum number of finished scans kept for monitoring
+	uint32_t scan_max_udf_transactions; // maximum number of active transactions per UDF background scan
+	uint32_t scan_threads; // size of scan thread pool
+	uint32_t n_service_threads;
+	uint32_t sindex_builder_threads; // secondary index builder thread pool size
+	uint32_t sindex_gc_max_rate; // Max sindex entries processed per second for gc
+	uint32_t sindex_gc_period; // same as nsup_period for sindex gc
+	uint32_t ticker_interval;
+	uint64_t transaction_max_ns;
+	uint32_t transaction_pending_limit; // 0 means no limit
+	uint32_t n_transaction_queues;
+	uint32_t transaction_retry_ms;
+	uint32_t n_transaction_threads_per_queue;
+	char* work_directory;
+
+	// For special debugging or bug-related repair:
+
+	cf_alloc_debug debug_allocations; // how to instrument the memory allocation API
+	PAD_BOOL fabric_dump_msgs; // whether to log information about existing "msg" objects and queues
+	uint32_t prole_extra_ttl; // seconds beyond expiry time after which we garbage collect, 0 for no garbage collection
+
+	//--------------------------------------------
+	// network::service context.
+	//
+
+	// Normally visible, in canonical configuration file order:
+
+	cf_serv_spec service; // client service
+
+	// Normally hidden:
+
+	cf_serv_spec tls_service; // TLS client service
+
+	//--------------------------------------------
+	// network::heartbeat context.
+	//
+
+	cf_serv_spec hb_serv_spec; // literal binding address spec parsed from config
+	cf_serv_spec hb_tls_serv_spec; // literal binding address spec for TLS parsed from config
+	cf_addr_list hb_multicast_groups; // literal multicast groups parsed from config
+	as_hb_config hb_config;
+
+	//--------------------------------------------
+	// network::fabric context.
+	//
+
+	// Normally visible, in canonical configuration file order:
+
+	cf_serv_spec fabric; // fabric service
+	cf_serv_spec tls_fabric; // TLS fabric service
+
+	// Normally hidden:
+
+	uint32_t n_fabric_channel_fds[AS_FABRIC_N_CHANNELS];
+	uint32_t n_fabric_channel_recv_threads[AS_FABRIC_N_CHANNELS];
+	PAD_BOOL fabric_keepalive_enabled;
+	int fabric_keepalive_intvl;
+	int fabric_keepalive_probes;
+	int fabric_keepalive_time;
+	uint32_t fabric_latency_max_ms; // time window for ordering
+	uint32_t fabric_recv_rearm_threshold;
+	uint32_t n_fabric_send_threads;
+
+	//--------------------------------------------
+	// network::info context.
+	//
+
+	// Normally visible, in canonical configuration file order:
+
+	cf_serv_spec info; // info service
+
+	//--------------------------------------------
+	// Remaining configuration top-level contexts.
+	//
+
+	mod_lua_config mod_lua;
+	as_sec_config sec_cfg;
+
+	uint32_t n_tls_specs;
+	cf_tls_spec tls_specs[MAX_TLS_SPECS];
+
+
+	//======================================================
+	// Not (directly) configuration. Many should probably be
+	// relocated...
+	//
+
+	// Global variable that just shouldn't be here.
+	cf_node self_node;
+
+	// Global variables that just shouldn't be here.
+	cf_node xdr_clmap[AS_CLUSTER_SZ]; // cluster map as known to XDR
+	xdr_node_lst xdr_peers_lst[AS_CLUSTER_SZ]; // last XDR shipping info of other nodes
+	uint64_t xdr_self_lastshiptime[DC_MAX_NUM]; // last XDR shipping by this node
+
+	// Namespaces.
+	struct as_namespace_s* namespaces[AS_NAMESPACE_SZ];
+	uint32_t n_namespaces;
+
+	// To speed up transaction enqueue's determination of whether to "inline":
+	uint32_t n_namespaces_inlined;
+	uint32_t n_namespaces_not_inlined;
+
+} as_config;
+
+
+//==========================================================
+// Public API.
+//
+
+as_config* as_config_init(const char* config_file);
+void as_config_post_process(as_config* c, const char* config_file);
+
+void as_config_cluster_name_get(char* cluster_name);
+bool as_config_cluster_name_set(const char* cluster_name);
+bool as_config_cluster_name_matches(const char* cluster_name);
+
+// Declared with (void) so this is a real prototype - in C an empty parameter
+// list leaves the arguments unspecified and unchecked.
+bool as_config_error_enterprise_only(void);
+
+extern as_config g_config;
+
+
+//==========================================================
+// Private API - for enterprise separation only.
+//
+
+// Parsed configuration file line.
+typedef struct cfg_line_s {
+	int num;
+	char* name_tok;
+	char* val_tok_1;
+	char* val_tok_2;
+	char* val_tok_3;
+} cfg_line;
+
+void cfg_enterprise_only(const cfg_line* p_line);
+// Declared with (void) so this is a real prototype - in C an empty parameter
+// list leaves the arguments unspecified and unchecked.
+void cfg_post_process(void);
+cf_tls_spec* cfg_link_tls(const char* which, char** our_name);
diff --git a/as/include/base/datamodel.h b/as/include/base/datamodel.h
new file mode 100644
index 00000000..66fe9779
--- /dev/null
+++ b/as/include/base/datamodel.h
@@ -0,0 +1,1207 @@
+/*
+ * datamodel.h
+ *
+ * Copyright (C) 2008-2016 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+/*
+ * core data model structures and definitions
+ */
+
+#pragma once
+
+// NOTE(review): the header names of the six system includes below are
+// missing from this copy of the patch - restore them from the repository.
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "aerospike/as_val.h"
+#include "citrusleaf/cf_atomic.h"
+#include "citrusleaf/cf_clock.h"
+#include "citrusleaf/cf_digest.h"
+
+#include "arenax.h"
+#include "dynbuf.h"
+#include "hist.h"
+#include "hist_track.h"
+#include "linear_hist.h"
+#include "msg.h"
+#include "node.h"
+#include "shash.h"
+#include "vmapx.h"
+
+#include "base/cfg.h"
+#include "base/proto.h"
+#include "base/rec_props.h"
+#include "base/transaction_policy.h"
+#include "base/truncate.h"
+#include "fabric/hb.h"
+#include "fabric/partition.h"
+#include "storage/storage.h"
+
+
+#define AS_STORAGE_MAX_DEVICES (64) // maximum devices per namespace
+#define AS_STORAGE_MAX_FILES (64) // maximum files per namespace
+#define AS_STORAGE_MAX_DEVICE_SIZE (2L * 1024L * 1024L * 1024L * 1024L) // 2 TiB (in bytes), due to rblock_id in as_index
+
+#define OBJ_SIZE_HIST_NUM_BUCKETS 100
+#define TTL_HIST_NUM_BUCKETS 100
+
+#define MAX_ALLOWED_TTL (3600 * 24 * 365 * 10) // 10 years, in seconds (365-day years)
+
+// Which digest bytes are used for what:
+// [0-1] for partition-id
+// [1-2] for tree sprigs and locks
+// [2-3] for the olock
+// [4-7] for rw_request hash
+#define DIGEST_SCRAMBLE_BYTE1 4
+// [8-11] for SSD device hash
+#define DIGEST_STORAGE_BASE_BYTE 8
+
+/* SYNOPSIS
+ * Data model
+ *
+ * Objects are stored in a hierarchy: namespace:record:bin:particle.
+ * The records in a namespace are further partitioned for distribution
+ * amongst the participating nodes in the cluster.
+ */
+
+
+
+/* Forward declarations */
+typedef struct as_namespace_s as_namespace;
+typedef struct as_index_s as_record;
+typedef struct as_bin_s as_bin;
+typedef struct as_index_ref_s as_index_ref;
+typedef struct as_set_s as_set;
+typedef struct as_treex_s as_treex;
+
+struct as_index_tree_s;
+
+
+/* AS_ID_[NAMESPACE,SET,BIN,INAME]_SZ
+ * The maximum length, in bytes, of an identification field; by convention,
+ * these values are null-terminated UTF-8 */
+#define AS_ID_NAMESPACE_SZ 32
+#define AS_ID_BIN_SZ 15 // size used in storage format
+#define AS_ID_INAME_SZ 256
+#define VMAP_BIN_NAME_MAX_SZ ((AS_ID_BIN_SZ + 3) & ~3) // round up to multiple of 4
+#define MAX_BIN_NAMES 0x10000 // no need for more - numeric ID is 16 bits
+#define BIN_NAMES_QUOTA (MAX_BIN_NAMES / 2) // don't add more names than this via client transactions
+
+/*
+ * Wrap-around aware "lhs < rhs" for 16-bit generation counts.
+ *
+ * The answer is correct as long as:
+ *
+ * - rhs is ahead of lhs, but rhs isn't ahead more than 32,768.
+ * - lhs is ahead of rhs, but lhs isn't ahead more than 32,767.
+ */
+
+static inline bool
+as_gen_less_than(uint16_t lhs, uint16_t rhs)
+{
+	// Distance from rhs to lhs modulo 2^16 - the upper half of the range
+	// means lhs is behind rhs.
+	uint16_t wrapped_diff = (uint16_t)(lhs - rhs);
+
+	return wrapped_diff >= 0x8000;
+}
+
+
+/* as_particle_type
+ * Particles are typed, which reflects their contents:
+ * NULL: no associated content (not sure I really need this internally?)
+ * INTEGER: a signed, 64-bit integer
+ * FLOAT: a floating point
+ * STRING: a null-terminated UTF-8 string
+ * BLOB: arbitrary-length binary data
+ * TIMESTAMP: milliseconds since 1 January 1970, 00:00:00 GMT
+ * DIGEST: an internal Aerospike key digest
+ *
+ * NOTE(review): there is no DIGEST enumerator below (value 6 is named
+ * AS_PARTICLE_TYPE_UNUSED_6), and the language-specific blob, MAP, LIST and
+ * GEOJSON types are not described in this list. */
+typedef enum {
+	AS_PARTICLE_TYPE_NULL = 0,
+	AS_PARTICLE_TYPE_INTEGER = 1,
+	AS_PARTICLE_TYPE_FLOAT = 2,
+	AS_PARTICLE_TYPE_STRING = 3,
+	AS_PARTICLE_TYPE_BLOB = 4,
+	AS_PARTICLE_TYPE_TIMESTAMP = 5,
+	AS_PARTICLE_TYPE_UNUSED_6 = 6,
+	AS_PARTICLE_TYPE_JAVA_BLOB = 7,
+	AS_PARTICLE_TYPE_CSHARP_BLOB = 8,
+	AS_PARTICLE_TYPE_PYTHON_BLOB = 9,
+	AS_PARTICLE_TYPE_RUBY_BLOB = 10,
+	AS_PARTICLE_TYPE_PHP_BLOB = 11,
+	AS_PARTICLE_TYPE_ERLANG_BLOB = 12,
+	AS_PARTICLE_TYPE_MAP = 19,
+	AS_PARTICLE_TYPE_LIST = 20,
+	AS_PARTICLE_TYPE_GEOJSON = 23,
+	AS_PARTICLE_TYPE_MAX = 24,
+	AS_PARTICLE_TYPE_BAD = AS_PARTICLE_TYPE_MAX
+} as_particle_type;
+
+/* as_particle
+ * The common part of a particle
+ * this is poor man's subclassing - IE, how to do a subclassed interface in C
+ * Go look in particle.c to see all the subclass implementation and structure */
+typedef struct as_particle_s {
+	uint8_t metadata; // used by the iparticle for is_integer and inuse, as well as version in multi bin mode only
+	// used by *particle for type
+	uint8_t data[];
+} __attribute__ ((__packed__)) as_particle;
+
+// Bit Flag constants used for the particle state value (4 bits, 16 values)
+#define AS_BIN_STATE_UNUSED 0
+#define AS_BIN_STATE_INUSE_INTEGER 1
+#define AS_BIN_STATE_RECYCLE_ME 2 // was - hidden bin
+#define AS_BIN_STATE_INUSE_OTHER 3
+#define AS_BIN_STATE_INUSE_FLOAT 4
+
+// View of the first as_particle byte as two 4-bit fields (version, state).
+typedef struct as_particle_iparticle_s {
+	uint8_t version: 4; // now unused - and can't be used in single-bin config
+	uint8_t state: 4; // see AS_BIN_STATE_...
+	uint8_t data[];
+} __attribute__ ((__packed__)) as_particle_iparticle;
+
+/* Particle function declarations */
+
+// True for the particle types treated as embedded: integer and float.
+static inline bool
+is_embedded_particle_type(as_particle_type type)
+{
+	switch (type) {
+	case AS_PARTICLE_TYPE_INTEGER:
+	case AS_PARTICLE_TYPE_FLOAT:
+		return true;
+	default:
+		return false;
+	}
+}
+
+extern as_particle_type as_particle_type_from_asval(const as_val *val);
+extern as_particle_type as_particle_type_from_msgpack(const uint8_t *packed, uint32_t packed_size);
+
+extern uint32_t as_particle_size_from_asval(const as_val *val);
+
+extern uint32_t as_particle_asval_client_value_size(const as_val *val);
+extern uint32_t as_particle_asval_to_client(const as_val *val, as_msg_op *op);
+
+// as_bin particle function declarations
+
+extern void as_bin_particle_destroy(as_bin *b, bool free_particle);
+extern uint32_t as_bin_particle_size(as_bin *b);
+
+// wire:
+extern int as_bin_particle_alloc_modify_from_client(as_bin *b, const as_msg_op *op);
+extern int as_bin_particle_stack_modify_from_client(as_bin *b, cf_ll_buf *particles_llb, const as_msg_op *op);
+extern int as_bin_particle_alloc_from_client(as_bin *b, const as_msg_op *op);
+extern int as_bin_particle_stack_from_client(as_bin *b, cf_ll_buf *particles_llb, const as_msg_op *op);
+extern int as_bin_particle_alloc_from_pickled(as_bin *b, const uint8_t **p_pickled, const uint8_t *end);
+extern int as_bin_particle_stack_from_pickled(as_bin *b, cf_ll_buf *particles_llb, const uint8_t **p_pickled, const uint8_t *end);
+extern int as_bin_particle_compare_from_pickled(const as_bin *b, uint8_t **p_pickled);
+extern uint32_t as_bin_particle_client_value_size(const as_bin *b);
+extern uint32_t as_bin_particle_to_client(const as_bin *b, as_msg_op *op);
+extern uint32_t as_bin_particle_pickled_size(const as_bin *b);
+extern uint32_t as_bin_particle_to_pickled(const as_bin *b, uint8_t *pickled);
+
+// Different for CDTs - the operations may return results, so we don't use the
+// normal APIs and particle table functions.
+extern int as_bin_cdt_read_from_client(const as_bin *b, as_msg_op *op, as_bin *result);
+extern int as_bin_cdt_alloc_modify_from_client(as_bin *b, as_msg_op *op, as_bin *result);
+extern int as_bin_cdt_stack_modify_from_client(as_bin *b, cf_ll_buf *particles_llb, as_msg_op *op, as_bin *result);
+
+// as_val:
+extern int as_bin_particle_replace_from_asval(as_bin *b, const as_val *val);
+extern void as_bin_particle_stack_from_asval(as_bin *b, uint8_t* stack, const as_val *val);
+extern as_val *as_bin_particle_to_asval(const as_bin *b);
+
+// msgpack:
+extern int as_bin_particle_alloc_from_msgpack(as_bin *b, const uint8_t *packed, uint32_t packed_size);
+
+// flat:
+extern int as_bin_particle_cast_from_flat(as_bin *b, uint8_t *flat, uint32_t flat_size);
+extern int as_bin_particle_replace_from_flat(as_bin *b, const uint8_t *flat, uint32_t flat_size);
+extern uint32_t as_bin_particle_flat_size(as_bin *b);
+extern uint32_t as_bin_particle_to_flat(const as_bin *b, uint8_t *flat);
+
+// odd as_bin particle functions for specific particle types
+
+// integer:
+extern int64_t as_bin_particle_integer_value(const as_bin *b);
+extern void as_bin_particle_integer_set(as_bin *b, int64_t i);
+
+// string:
+extern uint32_t as_bin_particle_string_ptr(const as_bin *b, char **p_value);
+
+// geojson:
+// geo_region_t is an opaque handle (void *) as far as this header is concerned.
+typedef void * geo_region_t;
+#define MAX_REGION_CELLS 32
+#define MAX_REGION_LEVELS 30
+extern size_t as_bin_particle_geojson_cellids(const as_bin *b, uint64_t **pp_cells);
+extern bool as_particle_geojson_match(as_particle *p, uint64_t cellid, geo_region_t region, bool is_strict);
+extern bool as_particle_geojson_match_asval(const as_val *val, uint64_t cellid, geo_region_t region, bool is_strict);
+char const *as_geojson_mem_jsonstr(const as_particle *p, size_t *p_jsonsz);
+
+// list:
+// Forward declarations only - these headers do not need the full definitions.
+struct cdt_payload_s;
+struct rollback_alloc_s;
+extern void as_bin_particle_list_get_packed_val(const as_bin *b, struct cdt_payload_s *packed);
+extern int as_bin_cdt_packed_read(const as_bin *b, const as_msg_op *op, as_bin *result);
+extern int as_bin_cdt_packed_modify(as_bin *b, const as_msg_op *op, as_bin *result, cf_ll_buf *particles_llb);
+
+
+/* as_bin
+ * A bin container - null name means unused
+ *
+ * The inline helpers further down (as_bin_inuse(), as_bin_state() ...) read
+ * the bin's state by viewing its first byte as an as_particle_iparticle.
+ * The struct is packed, so no padding is inserted between members. */
+struct as_bin_s {
+	as_particle iparticle; // 1 byte
+	as_particle *particle; // for embedded particle this is value, not pointer
+
+	// Never read or write these bytes in single-bin configuration:
+	uint16_t id; // ID of bin name
+	uint8_t unused; // pad to 12 bytes (multiple of 4) - legacy
+} __attribute__ ((__packed__)) ;
+
+// For data-in-memory namespaces in multi-bin mode, we keep an array of as_bin
+// structs in memory, accessed via this struct.
+typedef struct as_bin_space_s {
+	uint16_t n_bins;
+	as_bin bins[]; // flexible array - n_bins presumably gives its length
+} __attribute__ ((__packed__)) as_bin_space;
+
+// TODO - Do we really need to pad as_bin to 12 bytes for thread safety?
+// Do we ever write & read adjacent as_bin structures in a bins array from
+// different threads when not under the record lock? And if we're worried about
+// 4-byte alignment for that or any other reason, wouldn't we also have to pad
+// after n_bins in as_bin_space?
+
+// For data-in-memory namespaces in multi-bin mode, if we're storing extra
+// record metadata, we access it via this struct. In this case the index points
+// here instead of directly to an as_bin_space.
+typedef struct as_rec_space_s {
+	as_bin_space* bin_space;
+
+	// So far the key is the only extra record metadata we store in memory.
+	uint32_t key_size;
+	uint8_t key[]; // flexible array - key_size presumably gives its length
+} __attribute__ ((__packed__)) as_rec_space;
+
+// For copying as_bin structs without the last 3 bytes.
+static inline void
+as_single_bin_copy(as_bin *to, const as_bin *from)
+{
+	// Only these two members are copied - id and unused are left alone.
+	to->particle = from->particle;
+	to->iparticle = from->iparticle;
+}
+
+// True if the bin's state is anything other than AS_BIN_STATE_UNUSED.
+static inline bool
+as_bin_inuse(const as_bin *b)
+{
+	return ((const as_particle_iparticle *)b)->state != AS_BIN_STATE_UNUSED;
+}
+
+// Raw 4-bit state value (one of AS_BIN_STATE_...).
+static inline uint8_t
+as_bin_state(const as_bin *b)
+{
+	const as_particle_iparticle *ip = (const as_particle_iparticle *)b;
+
+	return ip->state;
+}
+
+// Overwrite the 4-bit state field.
+static inline void
+as_bin_state_set(as_bin *b, uint8_t val)
+{
+	as_particle_iparticle *ip = (as_particle_iparticle *)b;
+
+	ip->state = val;
+}
+
+// Set the bin state that corresponds to a particle type.
+static inline void
+as_bin_state_set_from_type(as_bin *b, as_particle_type type)
+{
+	uint8_t new_state;
+
+	if (type == AS_PARTICLE_TYPE_INTEGER) {
+		new_state = AS_BIN_STATE_INUSE_INTEGER;
+	}
+	else if (type == AS_PARTICLE_TYPE_FLOAT) {
+		new_state = AS_BIN_STATE_INUSE_FLOAT;
+	}
+	else if (type == AS_PARTICLE_TYPE_NULL ||
+			type == AS_PARTICLE_TYPE_TIMESTAMP) {
+		// TODO - timestamp is unsupported, so it is treated like null.
+		new_state = AS_BIN_STATE_UNUSED;
+	}
+	else {
+		new_state = AS_BIN_STATE_INUSE_OTHER;
+	}
+
+	as_bin_state_set(b, new_state);
+}
+
+static inline bool
+as_bin_inuse_has(as_storage_rd *rd)
+{
+	if (! rd->n_bins) {
+		return false;
+	}
+
+	// In-use bins are at the beginning - only need to check the first bin.
+	return as_bin_inuse(rd->bins);
+}
+
+static inline void
+as_bin_set_empty(as_bin *b)
+{
+	as_bin_state_set(b, AS_BIN_STATE_UNUSED);
+}
+
+static inline void
+as_bin_set_empty_shift(as_storage_rd *rd, uint32_t i)
+{
+	// Close the gap left by bin i by moving the in-use bins that follow it
+	// down one slot. The "emptied" bin may be overwritten - that's fine.
+
+	// Find the first slot after i that is not in use (or the end).
+	uint16_t end = i + 1;
+
+	while (end < rd->n_bins && as_bin_inuse(&rd->bins[end])) {
+		end++;
+	}
+
+	uint16_t n_to_move = end - (i + 1);
+
+	if (n_to_move != 0) {
+		memmove(&rd->bins[i], &rd->bins[i + 1], n_to_move * sizeof(as_bin));
+	}
+
+	// The last bin that was *formerly* in use is now a duplicate (or is bin i
+	// itself) - mark it as null.
+	as_bin_set_empty(&rd->bins[end - 1]);
+}
+
+// Mark every bin from index 'from' to the end as unused.
+static inline void
+as_bin_set_empty_from(as_storage_rd *rd, uint16_t from) {
+	uint16_t i = from;
+
+	while (i < rd->n_bins) {
+		as_bin_set_empty(&rd->bins[i]);
+		i++;
+	}
+}
+
+static inline void
+as_bin_set_all_empty(as_storage_rd *rd) {
+	as_bin_set_empty_from(rd, 0);
+}
+
+// True if the bin state is in-use integer or in-use float.
+static inline bool
+as_bin_is_embedded_particle(const as_bin *b) {
+	uint8_t state = as_bin_state(b);
+
+	return state == AS_BIN_STATE_INUSE_INTEGER ||
+			state == AS_BIN_STATE_INUSE_FLOAT;
+}
+
+// True if the bin state is AS_BIN_STATE_INUSE_OTHER.
+static inline bool
+as_bin_is_external_particle(const as_bin *b) {
+	return as_bin_state(b) == AS_BIN_STATE_INUSE_OTHER;
+}
+
+// Embedded particles live in the bin itself; anything else is reached through
+// the particle pointer.
+static inline as_particle *
+as_bin_get_particle(as_bin *b) {
+	if (as_bin_is_embedded_particle(b)) {
+		return &b->iparticle;
+	}
+
+	return b->particle;
+}
+
+// "Embedded" types like integer are stored directly, but other bin types
+// ("other") must follow an indirection to get the actual type.
+static inline uint8_t
+as_bin_get_particle_type(const as_bin *b) {
+	uint8_t state = as_bin_state(b);
+
+	if (state == AS_BIN_STATE_INUSE_INTEGER) {
+		return AS_PARTICLE_TYPE_INTEGER;
+	}
+
+	if (state == AS_BIN_STATE_INUSE_FLOAT) {
+		return AS_PARTICLE_TYPE_FLOAT;
+	}
+
+	if (state == AS_BIN_STATE_INUSE_OTHER) {
+		return b->particle->metadata;
+	}
+
+	return AS_PARTICLE_TYPE_NULL;
+}
+
+
+/* Bin function declarations */
+extern int16_t as_bin_get_id(as_namespace *ns, const char *name);
+extern uint16_t as_bin_get_or_assign_id(as_namespace *ns, const char *name);
+extern uint16_t as_bin_get_or_assign_id_w_len(as_namespace *ns, const char *name, size_t len);
+extern const char* as_bin_get_name_from_id(as_namespace *ns, uint16_t id);
+extern bool as_bin_name_within_quota(as_namespace *ns, const char *name);
+extern void as_bin_init(as_namespace *ns, as_bin *b, const char *name);
+extern void as_bin_copy(as_namespace *ns, as_bin *to, const as_bin *from);
+extern int as_storage_rd_load_n_bins(as_storage_rd *rd);
+extern int
as_storage_rd_load_bins(as_storage_rd *rd, as_bin *stack_bins);
+extern uint16_t as_bin_inuse_count(as_storage_rd *rd);
+extern void as_bin_get_all_p(as_storage_rd *rd, as_bin **bin_ptrs);
+extern as_bin *as_bin_get_by_id(as_storage_rd *rd, uint32_t id);
+extern as_bin *as_bin_get(as_storage_rd *rd, const char *name);
+extern as_bin *as_bin_get_from_buf(as_storage_rd *rd, const uint8_t *name, size_t len);
+extern as_bin *as_bin_create(as_storage_rd *rd, const char *name);
+extern as_bin *as_bin_create_from_buf(as_storage_rd *rd, const uint8_t *name, size_t len, int *result);
+extern as_bin *as_bin_get_or_create(as_storage_rd *rd, const char *name);
+extern as_bin *as_bin_get_or_create_from_buf(as_storage_rd *rd, const uint8_t *name, size_t len, int *result);
+extern int32_t as_bin_get_index(as_storage_rd *rd, const char *name);
+extern int32_t as_bin_get_index_from_buf(as_storage_rd *rd, const uint8_t *name, size_t len);
+extern void as_bin_destroy(as_storage_rd *rd, uint16_t i);
+extern void as_bin_allocate_bin_space(as_storage_rd *rd, int32_t delta);
+
+
+typedef enum {
+	AS_NAMESPACE_CONFLICT_RESOLUTION_POLICY_UNDEF = 0,
+	AS_NAMESPACE_CONFLICT_RESOLUTION_POLICY_GENERATION = 1,
+	AS_NAMESPACE_CONFLICT_RESOLUTION_POLICY_LAST_UPDATE_TIME = 2,
+	AS_NAMESPACE_CONFLICT_RESOLUTION_POLICY_CP = 3
+} conflict_resolution_pol;
+
+/* Record function declarations */
+// (void) makes this a real prototype - in C an empty parameter list leaves the
+// arguments unspecified and unchecked.
+extern uint32_t clock_skew_stop_writes_sec(void);
+extern void handle_clock_skew(as_namespace* ns, uint64_t skew_ms);
+extern uint16_t plain_generation(uint16_t regime_generation, const as_namespace* ns);
+extern void as_record_set_lut(as_record *r, uint32_t regime, uint64_t now_ms, const as_namespace* ns);
+extern void as_record_increment_generation(as_record *r, const as_namespace* ns);
+extern bool as_record_is_live(const as_record *r);
+extern int as_record_get_create(struct as_index_tree_s *tree, cf_digest *keyd, as_index_ref *r_ref, as_namespace *ns);
+extern int as_record_get(struct as_index_tree_s *tree, cf_digest *keyd, as_index_ref *r_ref);
+extern int as_record_get_live(struct as_index_tree_s *tree, cf_digest *keyd, as_index_ref *r_ref, as_namespace *ns);
+extern int as_record_exists(struct as_index_tree_s *tree, cf_digest *keyd);
+extern int as_record_exists_live(struct as_index_tree_s *tree, cf_digest *keyd, as_namespace *ns);
+extern void as_record_rescue(as_index_ref *r_ref, as_namespace *ns);
+
+extern void as_record_destroy_bins_from(as_storage_rd *rd, uint16_t from);
+extern void as_record_destroy_bins(as_storage_rd *rd);
+extern void as_record_free_bin_space(as_record *r);
+
+extern void as_record_destroy(as_record *r, as_namespace *ns);
+extern void as_record_done(as_index_ref *r_ref, as_namespace *ns);
+
+void as_record_drop_stats(as_record* r, as_namespace* ns);
+
+extern void as_record_allocate_key(as_record* r, const uint8_t* key, uint32_t key_size);
+extern void as_record_remove_key(as_record* r);
+extern int as_record_resolve_conflict(conflict_resolution_pol policy, uint16_t left_gen, uint64_t left_lut, uint16_t right_gen, uint64_t right_lut);
+extern uint8_t *as_record_pickle(as_storage_rd *rd, size_t *len_r);
+extern int as_record_write_from_pickle(as_storage_rd *rd);
+extern int as_record_set_set_from_msg(as_record *r, as_namespace *ns, as_msg *m);
+
+// True if the 16-bit value at the start of a pickle is zero.
+//
+// buf must point to at least 2 readable bytes. The two bytes are tested
+// individually: a 16-bit value is zero exactly when both of its bytes are
+// zero, whatever the byte order, and this avoids casting away const and
+// loading a uint16_t from a possibly unaligned address.
+static inline bool
+as_record_pickle_is_binless(const uint8_t *buf)
+{
+	return buf[0] == 0 && buf[1] == 0;
+}
+
+// For enterprise split only.
+int record_resolve_conflict_cp(uint16_t left_gen, uint64_t left_lut, uint16_t right_gen, uint64_t right_lut);
+
+// Three-way compare of last-update-times: 0 if equal, 1 if right is greater
+// (later), -1 if left is greater.
+static inline int
+resolve_last_update_time(uint64_t left, uint64_t right)
+{
+	return left == right ? 0 : (right > left ? 1 : -1);
+}
+
+typedef struct as_remote_record_s {
+	cf_node src;
+	as_partition_reservation *rsv;
+	cf_digest *keyd;
+
+	uint8_t *record_buf;
+	size_t record_buf_sz;
+
+	uint32_t generation;
+	uint32_t void_time;
+	uint64_t last_update_time;
+
+	const char *set_name;
+	size_t set_name_len;
+
+	const uint8_t *key;
+	size_t key_size;
+
+	uint8_t repl_state; // relevant only for enterprise edition
+} as_remote_record;
+
+int as_record_replace_if_better(as_remote_record *rr, bool is_repl_write, bool skip_sindex, bool do_xdr_write);
+
+// a simpler call that gives seconds in the right epoch
+#define as_record_void_time_get() cf_clepoch_seconds()
+bool as_record_is_expired(const as_record *r); // TODO - eventually inline
+
+// True if the record is expired or has been truncated.
+static inline bool
+as_record_is_doomed(const as_record *r, struct as_namespace_s *ns)
+{
+	return as_record_is_expired(r) || as_truncate_record_is_truncated(r, ns);
+}
+
+#define AS_SINDEX_MAX 256
+
+#define MIN_PARTITIONS_PER_INDEX 1
+#define MAX_PARTITIONS_PER_INDEX 256
+#define DEFAULT_PARTITIONS_PER_INDEX 32
+#define MAX_PARTITIONS_PER_INDEX_CHAR 3 // Number of characters in max partitions per index
+
+// as_sindex structure which hangs from the ns.
+#define AS_SINDEX_INACTIVE 1 // On init, pre-loading
+#define AS_SINDEX_ACTIVE 2 // On creation and afterwards
+#define AS_SINDEX_DESTROY 3 // On destroy
+// dummy sindex state when ai_btree_create() returns error this
+// sindex is not available for any of the DML operations
+#define AS_SINDEX_NOTCREATED 4 // Un-used flag
+#define AS_SINDEX_FLAG_WACTIVE 0x01 // On ai btree create of sindex, never reset
+#define AS_SINDEX_FLAG_RACTIVE 0x02 // When sindex scan of database is completed
+#define AS_SINDEX_FLAG_DESTROY_CLEANUP 0x04 // Called for AI clean-up during si deletion
+#define AS_SINDEX_FLAG_MIGRATE_CLEANUP 0x08 // Un-used
+#define AS_SINDEX_FLAG_POPULATING 0x10 // Indicates current si scan job, reset when scan is done.
+ +struct as_sindex_s; +struct as_sindex_config_s; + +#define AS_SET_MAX_COUNT 0x3FF // ID's 10 bits worth minus 1 (ID 0 means no set) +#define AS_BINID_HAS_SINDEX_SIZE MAX_BIN_NAMES / ( sizeof(uint32_t) * CHAR_BIT ) + + +// TODO - would be nice to put this in as_index.h: +// Callback invoked when as_index is destroyed. +typedef void (*as_index_value_destructor) (struct as_index_s* v, void* udata); + +// TODO - would be nice to put this in as_index.h: +typedef struct as_index_tree_shared_s { + as_index_value_destructor destructor; + void* destructor_udata; + + // Number of lock pairs and sprigs per partition tree. + uint32_t n_lock_pairs; + uint32_t n_sprigs; + + // Bit-shifts used to calculate indexes from digest bits. + uint32_t locks_shift; + uint32_t sprigs_shift; + + // Offset into as_index_tree struct's variable-sized data. + uint32_t sprigs_offset; +} as_index_tree_shared; + + +struct as_namespace_s { + + char name[AS_ID_NAMESPACE_SZ]; + uint32_t id; // this is 1-based + uint32_t namehash; + + //-------------------------------------------- + // Persistent memory. + // + + // Persistent memory "base" block ID for this namespace. + uint32_t xmem_id; + + // Pointer to the persistent memory "base" block. + uint8_t* xmem_base; + + // Pointer to partition tree info in persistent memory "treex" block. + as_treex* xmem_roots; + + // Pointer to arena structure (not stages) in persistent memory base block. + cf_arenax* arena; + + // Pointer to bin name vmap in persistent memory base block. + cf_vmapx* p_bin_name_vmap; + + // Pointer to set information vmap in persistent memory base block. + cf_vmapx* p_sets_vmap; + + // Temporary array of sets to hold config values until sets vmap is ready. + as_set* sets_cfg_array; + uint32_t sets_cfg_count; + + // Configuration flags relevant for warm or cool restart. + uint32_t xmem_flags; + + //-------------------------------------------- + // Cold start. + // + + // If true, read storage devices to build index at startup. 
+ bool cold_start; + + // Flag for ticker during initial loading of records from device. + bool loading_records; + + // For cold start eviction. + pthread_mutex_t cold_start_evict_lock; + uint32_t cold_start_record_add_count; + cf_atomic32 cold_start_threshold_void_time; + uint32_t cold_start_max_void_time; + + //-------------------------------------------- + // Memory management. + // + + // JEMalloc arena to be used for long-term storage in this namespace (-1 if nonexistent.) + int jem_arena; + + // Cached partition ownership info for clients. + client_replica_map* replica_maps; + + // Common partition tree information. Contains two configuration items. + as_index_tree_shared tree_shared; + + //-------------------------------------------- + // Storage management. + // + + // This is typecast to (drv_ssds*) in storage code. + void* storage_private; + + uint64_t ssd_size; // discovered (and rounded) size of drive + int storage_last_avail_pct; // most recently calculated available percent + int storage_max_write_q; // storage_max_write_cache is converted to this + uint32_t saved_defrag_sleep; // restore after defrag at startup is done + uint32_t defrag_lwm_size; // storage_defrag_lwm_pct % of storage_write_block_size + + // For data-not-in-memory, we optionally cache swbs after writing to device. + // To track fraction of reads from cache: + cf_atomic32 n_reads_from_cache; + cf_atomic32 n_reads_from_device; + + uint8_t storage_encryption_key[32]; + + //-------------------------------------------- + // Truncate records. + // + + as_truncate truncate; + + //-------------------------------------------- + // Secondary index. + // + + int sindex_cnt; + uint32_t n_setless_sindexes; + struct as_sindex_s* sindex; // array with AS_MAX_SINDEX metadata + cf_shash* sindex_set_binid_hash; + cf_shash* sindex_iname_hash; + uint32_t binid_has_sindex[AS_BINID_HAS_SINDEX_SIZE]; + + //-------------------------------------------- + // Configuration. 
+ // + + uint32_t cfg_replication_factor; + uint32_t replication_factor; // indirect config - can become less than cfg_replication_factor + uint64_t memory_size; + uint64_t default_ttl; + + PAD_BOOL enable_xdr; + PAD_BOOL sets_enable_xdr; // namespace-level flag to enable set-based xdr shipping + PAD_BOOL ns_forward_xdr_writes; // namespace-level flag to enable forwarding of xdr writes + PAD_BOOL ns_allow_nonxdr_writes; // namespace-level flag to allow nonxdr writes or not + PAD_BOOL ns_allow_xdr_writes; // namespace-level flag to allow xdr writes or not + + uint32_t cold_start_evict_ttl; + conflict_resolution_pol conflict_resolution_policy; + PAD_BOOL cp; // relevant only for enterprise edition + PAD_BOOL cp_allow_drops; // relevant only for enterprise edition + PAD_BOOL data_in_index; // with single-bin, allows warm restart for data-in-memory (with storage-engine device) + PAD_BOOL write_dup_res_disabled; + PAD_BOOL disallow_null_setname; + PAD_BOOL batch_sub_benchmarks_enabled; + PAD_BOOL read_benchmarks_enabled; + PAD_BOOL udf_benchmarks_enabled; + PAD_BOOL udf_sub_benchmarks_enabled; + PAD_BOOL write_benchmarks_enabled; + PAD_BOOL proxy_hist_enabled; + uint32_t evict_hist_buckets; + uint32_t evict_tenths_pct; + uint32_t hwm_disk_pct; + uint32_t hwm_memory_pct; + uint64_t max_ttl; + uint32_t migrate_order; + uint32_t migrate_retransmit_ms; + uint32_t migrate_sleep; + cf_atomic32 obj_size_hist_max; // TODO - doesn't need to be atomic, really. 
+ uint32_t rack_id; + as_read_consistency_level read_consistency_level; + PAD_BOOL single_bin; // restrict the namespace to objects with exactly one bin + uint32_t stop_writes_pct; + uint32_t tomb_raider_eligible_age; // relevant only for enterprise edition + uint32_t tomb_raider_period; // relevant only for enterprise edition + as_write_commit_level write_commit_level; + cf_vector xdr_dclist_v; + + as_storage_type storage_type; + + char* storage_devices[AS_STORAGE_MAX_DEVICES]; + char* storage_shadows[AS_STORAGE_MAX_DEVICES]; + char* storage_files[AS_STORAGE_MAX_FILES]; + uint64_t storage_filesize; + char* storage_scheduler_mode; // relevant for devices only, not files + uint32_t storage_write_block_size; + PAD_BOOL storage_data_in_memory; + + PAD_BOOL storage_cold_start_empty; + PAD_BOOL storage_commit_to_device; // relevant only for enterprise edition + uint32_t storage_commit_min_size; // relevant only for enterprise edition + uint32_t storage_defrag_lwm_pct; + uint32_t storage_defrag_queue_min; + uint32_t storage_defrag_sleep; + int storage_defrag_startup_minimum; + PAD_BOOL storage_disable_odirect; + PAD_BOOL storage_benchmarks_enabled; // histograms are per-drive except device-read-size & device-write-size + PAD_BOOL storage_enable_osync; + char* storage_encryption_key_file; + uint64_t storage_flush_max_us; + uint64_t storage_fsync_max_us; + uint64_t storage_max_write_cache; + uint32_t storage_min_avail_pct; + cf_atomic32 storage_post_write_queue; // number of swbs/device held after writing to device + uint32_t storage_tomb_raider_sleep; // relevant only for enterprise edition + uint32_t storage_write_threads; + + uint32_t sindex_num_partitions; + + PAD_BOOL geo2dsphere_within_strict; + uint16_t geo2dsphere_within_min_level; + uint16_t geo2dsphere_within_max_level; + uint16_t geo2dsphere_within_max_cells; + uint16_t geo2dsphere_within_level_mod; + uint32_t geo2dsphere_within_earth_radius_meters; + + //-------------------------------------------- + // 
Statistics and histograms. + // + + // Object counts. + + cf_atomic64 n_objects; + cf_atomic64 n_tombstones; // relevant only for enterprise edition + + // Consistency info. + + uint32_t n_dead_partitions; + uint32_t n_unavailable_partitions; + bool clock_skew_stop_writes; + + // Expiration & eviction (nsup) stats. + + cf_atomic32 stop_writes; + cf_atomic32 hwm_breached; + + uint64_t non_expirable_objects; + + cf_atomic64 n_expired_objects; + cf_atomic64 n_evicted_objects; + + cf_atomic64 evict_ttl; + + uint32_t nsup_cycle_duration; // seconds taken for most recent nsup cycle + uint32_t nsup_cycle_sleep_pct; // fraction of most recent nsup cycle that was spent sleeping + + // Memory usage stats. + + cf_atomic_int n_bytes_memory; + cf_atomic64 n_bytes_sindex_memory; + + // Persistent storage stats. + + float cache_read_pct; + + // Migration stats. + + cf_atomic_int migrate_tx_partitions_imbalance; // debug only + cf_atomic_int migrate_tx_instance_count; // debug only + cf_atomic_int migrate_rx_instance_count; // debug only + cf_atomic_int migrate_tx_partitions_active; + cf_atomic_int migrate_rx_partitions_active; + cf_atomic_int migrate_tx_partitions_initial; + cf_atomic_int migrate_tx_partitions_remaining; + cf_atomic_int migrate_rx_partitions_initial; + cf_atomic_int migrate_rx_partitions_remaining; + cf_atomic_int migrate_signals_active; + cf_atomic_int migrate_signals_remaining; + cf_atomic_int appeals_tx_active; // relevant only for enterprise edition + cf_atomic_int appeals_rx_active; // relevant only for enterprise edition + cf_atomic_int appeals_tx_remaining; // relevant only for enterprise edition + + // Per-record migration stats: + cf_atomic_int migrate_records_skipped; // relevant only for enterprise edition + cf_atomic_int migrate_records_transmitted; + cf_atomic_int migrate_record_retransmits; + cf_atomic_int migrate_record_receives; + cf_atomic_int appeals_records_exonerated; // relevant only for enterprise edition + + // From-client transaction 
stats. + + cf_atomic64 n_client_tsvc_error; + cf_atomic64 n_client_tsvc_timeout; + + cf_atomic64 n_client_proxy_complete; + cf_atomic64 n_client_proxy_error; + cf_atomic64 n_client_proxy_timeout; + + cf_atomic64 n_client_read_success; + cf_atomic64 n_client_read_error; + cf_atomic64 n_client_read_timeout; + cf_atomic64 n_client_read_not_found; + + cf_atomic64 n_client_write_success; + cf_atomic64 n_client_write_error; + cf_atomic64 n_client_write_timeout; + + // Subset of n_client_write_... above, respectively. + cf_atomic64 n_xdr_write_success; + cf_atomic64 n_xdr_write_error; + cf_atomic64 n_xdr_write_timeout; + + cf_atomic64 n_client_delete_success; + cf_atomic64 n_client_delete_error; + cf_atomic64 n_client_delete_timeout; + cf_atomic64 n_client_delete_not_found; + + // Subset of n_client_delete_... above, respectively. + cf_atomic64 n_xdr_delete_success; + cf_atomic64 n_xdr_delete_error; + cf_atomic64 n_xdr_delete_timeout; + cf_atomic64 n_xdr_delete_not_found; + + cf_atomic64 n_client_udf_complete; + cf_atomic64 n_client_udf_error; + cf_atomic64 n_client_udf_timeout; + + cf_atomic64 n_client_lang_read_success; + cf_atomic64 n_client_lang_write_success; + cf_atomic64 n_client_lang_delete_success; + cf_atomic64 n_client_lang_error; + + // Batch sub-transaction stats. + + cf_atomic64 n_batch_sub_tsvc_error; + cf_atomic64 n_batch_sub_tsvc_timeout; + + cf_atomic64 n_batch_sub_proxy_complete; + cf_atomic64 n_batch_sub_proxy_error; + cf_atomic64 n_batch_sub_proxy_timeout; + + cf_atomic64 n_batch_sub_read_success; + cf_atomic64 n_batch_sub_read_error; + cf_atomic64 n_batch_sub_read_timeout; + cf_atomic64 n_batch_sub_read_not_found; + + // Internal-UDF sub-transaction stats. 
+ + cf_atomic64 n_udf_sub_tsvc_error; + cf_atomic64 n_udf_sub_tsvc_timeout; + + cf_atomic64 n_udf_sub_udf_complete; + cf_atomic64 n_udf_sub_udf_error; + cf_atomic64 n_udf_sub_udf_timeout; + + cf_atomic64 n_udf_sub_lang_read_success; + cf_atomic64 n_udf_sub_lang_write_success; + cf_atomic64 n_udf_sub_lang_delete_success; + cf_atomic64 n_udf_sub_lang_error; + + // Transaction retransmit stats. + + uint64_t n_retransmit_client_read_dup_res; + + uint64_t n_retransmit_client_write_dup_res; + uint64_t n_retransmit_client_write_repl_write; + + uint64_t n_retransmit_client_delete_dup_res; + uint64_t n_retransmit_client_delete_repl_write; + + uint64_t n_retransmit_client_udf_dup_res; + uint64_t n_retransmit_client_udf_repl_write; + + uint64_t n_retransmit_batch_sub_dup_res; + + uint64_t n_retransmit_udf_sub_dup_res; + uint64_t n_retransmit_udf_sub_repl_write; + + // Scan stats. + + cf_atomic64 n_scan_basic_complete; + cf_atomic64 n_scan_basic_error; + cf_atomic64 n_scan_basic_abort; + + cf_atomic64 n_scan_aggr_complete; + cf_atomic64 n_scan_aggr_error; + cf_atomic64 n_scan_aggr_abort; + + cf_atomic64 n_scan_udf_bg_complete; + cf_atomic64 n_scan_udf_bg_error; + cf_atomic64 n_scan_udf_bg_abort; + + // Query stats. 
+ + cf_atomic64 query_reqs; + cf_atomic64 query_fail; + cf_atomic64 query_short_queue_full; + cf_atomic64 query_long_queue_full; + cf_atomic64 query_short_reqs; + cf_atomic64 query_long_reqs; + + cf_atomic64 n_lookup; + cf_atomic64 n_lookup_success; + cf_atomic64 n_lookup_abort; + cf_atomic64 n_lookup_errs; + cf_atomic64 lookup_response_size; + cf_atomic64 lookup_num_records; + + cf_atomic64 n_aggregation; + cf_atomic64 n_agg_success; + cf_atomic64 n_agg_abort; + cf_atomic64 n_agg_errs; + cf_atomic64 agg_response_size; + cf_atomic64 agg_num_records; + + cf_atomic64 n_query_udf_bg_success; + cf_atomic64 n_query_udf_bg_failure; + + // Geospatial query stats: + cf_atomic64 geo_region_query_count; // number of region queries + cf_atomic64 geo_region_query_cells; // number of cells used by region queries + cf_atomic64 geo_region_query_points; // number of valid points found + cf_atomic64 geo_region_query_falsepos; // number of false positives found + + // Re-replication stats - relevant only for enterprise edition. + + cf_atomic64 n_re_repl_success; + cf_atomic64 n_re_repl_error; + cf_atomic64 n_re_repl_timeout; + + // Special errors that deserve their own counters: + + cf_atomic64 n_fail_xdr_forbidden; + cf_atomic64 n_fail_key_busy; + cf_atomic64 n_fail_generation; + cf_atomic64 n_fail_record_too_big; + + // Special non-error counters: + + cf_atomic64 n_deleted_last_bin; + + // One-way automatically activated histograms. + + cf_hist_track* read_hist; + cf_hist_track* write_hist; + cf_hist_track* udf_hist; + cf_hist_track* query_hist; + histogram* query_rec_count_hist; + histogram* re_repl_hist; // relevant only for enterprise edition + + PAD_BOOL read_hist_active; + PAD_BOOL write_hist_active; + PAD_BOOL udf_hist_active; + PAD_BOOL query_hist_active; + PAD_BOOL query_rec_count_hist_active; + PAD_BOOL re_repl_hist_active; // relevant only for enterprise edition + + // Activate-by-config histograms. 
+ + histogram* proxy_hist; + + histogram* read_start_hist; + histogram* read_restart_hist; + histogram* read_dup_res_hist; + histogram* read_repl_ping_hist; + histogram* read_local_hist; + histogram* read_response_hist; + + histogram* write_start_hist; + histogram* write_restart_hist; + histogram* write_dup_res_hist; + histogram* write_master_hist; // split this? + histogram* write_repl_write_hist; + histogram* write_response_hist; + + histogram* udf_start_hist; + histogram* udf_restart_hist; + histogram* udf_dup_res_hist; + histogram* udf_master_hist; // split this? + histogram* udf_repl_write_hist; + histogram* udf_response_hist; + + histogram* batch_sub_start_hist; + histogram* batch_sub_restart_hist; + histogram* batch_sub_dup_res_hist; + histogram* batch_sub_repl_ping_hist; + histogram* batch_sub_read_local_hist; + histogram* batch_sub_response_hist; + + histogram* udf_sub_start_hist; + histogram* udf_sub_restart_hist; + histogram* udf_sub_dup_res_hist; + histogram* udf_sub_master_hist; // split this? + histogram* udf_sub_repl_write_hist; + histogram* udf_sub_response_hist; + + histogram* device_read_size_hist; + histogram* device_write_size_hist; + + // Histograms of master object storage sizes. (Meaningful for drive-backed + // namespaces only.) + linear_hist* obj_size_hist; + linear_hist* set_obj_size_hists[AS_SET_MAX_COUNT + 1]; + + // Histograms used for general eviction and expiration. + linear_hist* evict_hist; // not just for info + linear_hist* ttl_hist; + linear_hist* set_ttl_hists[AS_SET_MAX_COUNT + 1]; + + //-------------------------------------------- + // Data partitions. + // + + as_partition partitions[AS_PARTITIONS]; + + //-------------------------------------------- + // Information for rebalancing. 
+ // + + uint32_t cluster_size; + cf_node succession[AS_CLUSTER_SZ]; + as_partition_version cluster_versions[AS_CLUSTER_SZ][AS_PARTITIONS]; + uint32_t rack_ids[AS_CLUSTER_SZ]; // is observed-rack-ids in CP mode + + // Observed nodes - relevant only for enterprise edition. + uint32_t observed_cluster_size; + cf_node observed_succession[AS_CLUSTER_SZ]; + + // Roster management - relevant only for enterprise edition. + uint32_t smd_roster_generation; + uint32_t smd_roster_count; + cf_node smd_roster[AS_CLUSTER_SZ]; + uint32_t smd_roster_rack_ids[AS_CLUSTER_SZ]; + uint32_t roster_generation; + uint32_t roster_count; + cf_node roster[AS_CLUSTER_SZ]; + uint32_t roster_rack_ids[AS_CLUSTER_SZ]; + + // Master regimes - relevant only for enterprise edition. + uint32_t eventual_regime; + uint32_t rebalance_regime; + uint32_t rebalance_regimes[AS_CLUSTER_SZ]; +}; + +#define AS_SET_NAME_MAX_SIZE 64 // includes space for null-terminator + +#define INVALID_SET_ID 0 + +#define IS_SET_EVICTION_DISABLED(p_set) (cf_atomic32_get(p_set->disable_eviction) == 1) +#define DISABLE_SET_EVICTION(p_set, on_off) (cf_atomic32_set(&p_set->disable_eviction, on_off ? 1 : 0)) + +typedef enum { + AS_SET_ENABLE_XDR_DEFAULT = 0, + AS_SET_ENABLE_XDR_TRUE = 1, + AS_SET_ENABLE_XDR_FALSE = 2 +} as_set_enable_xdr_flag; + +// Caution - changing this struct could break warm or cool restart. 
+struct as_set_s { + char name[AS_SET_NAME_MAX_SIZE]; + cf_atomic64 n_objects; + cf_atomic64 n_tombstones; // relevant only for enterprise edition + cf_atomic64 n_bytes_memory; // for data-in-memory only - sets's total record data size + cf_atomic64 stop_writes_count; // restrict number of records in a set + uint64_t truncate_lut; // records with last-update-time less than this are truncated + cf_atomic32 disable_eviction; // don't evict anything in this set (note - expiration still works) + cf_atomic32 enable_xdr; // white-list (AS_SET_ENABLE_XDR_TRUE) or black-list (AS_SET_ENABLE_XDR_FALSE) a set for XDR replication + uint32_t n_sindexes; + uint8_t padding[12]; +}; + +static inline bool +as_set_stop_writes(as_set *p_set) { + uint64_t n_objects = cf_atomic64_get(p_set->n_objects); + uint64_t stop_writes_count = cf_atomic64_get(p_set->stop_writes_count); + + return stop_writes_count != 0 && n_objects >= stop_writes_count; +} + +// These bin functions must be below definition of struct as_namespace_s: + +static inline void +as_bin_set_id_from_name_buf(as_namespace *ns, as_bin *b, const uint8_t *buf, + int len) { + if (! ns->single_bin) { + b->id = as_bin_get_or_assign_id_w_len(ns, (const char *)buf, len); + } +} + +static inline void +as_bin_set_id_from_name(as_namespace *ns, as_bin *b, const char *name) { + if (! ns->single_bin) { + b->id = as_bin_get_or_assign_id(ns, name); + } +} + +static inline size_t +as_bin_memcpy_name(as_namespace *ns, uint8_t *buf, as_bin *b) { + size_t len = 0; + + if (! 
ns->single_bin) { + const char *name = as_bin_get_name_from_id(ns, b->id); + + len = strlen(name); + memcpy(buf, name, len); + } + + return len; +} + +// forward ref +struct as_msg_field_s; + +/* Namespace function declarations */ +extern as_namespace *as_namespace_create(char *name); +extern void as_namespaces_init(bool cold_start_cmd, uint32_t instance); +extern void as_namespaces_setup(bool cold_start_cmd, uint32_t instance, uint32_t stage_capacity); +extern bool as_namespace_configure_sets(as_namespace *ns); +extern as_namespace *as_namespace_get_byname(char *name); +extern as_namespace *as_namespace_get_byid(uint32_t id); +extern as_namespace *as_namespace_get_bybuf(uint8_t *name, size_t len); +extern as_namespace *as_namespace_get_bymsgfield(struct as_msg_field_s *fp); +extern const char *as_namespace_get_set_name(as_namespace *ns, uint16_t set_id); +extern uint16_t as_namespace_get_set_id(as_namespace *ns, const char *set_name); +extern uint16_t as_namespace_get_create_set_id(as_namespace *ns, const char *set_name); +extern int as_namespace_set_set_w_len(as_namespace *ns, const char *set_name, size_t len, uint16_t *p_set_id, bool apply_restrictions); +extern int as_namespace_get_create_set_w_len(as_namespace *ns, const char *set_name, size_t len, as_set **pp_set, uint16_t *p_set_id); +extern as_set *as_namespace_get_set_by_name(as_namespace *ns, const char *set_name); +extern as_set* as_namespace_get_set_by_id(as_namespace* ns, uint16_t set_id); +extern as_set* as_namespace_get_record_set(as_namespace *ns, const as_record *r); +extern void as_namespace_get_set_info(as_namespace *ns, const char *set_name, cf_dyn_buf *db); +extern void as_namespace_adjust_set_memory(as_namespace *ns, uint16_t set_id, int64_t delta_bytes); +extern void as_namespace_release_set_id(as_namespace *ns, uint16_t set_id); +extern void as_namespace_get_bins_info(as_namespace *ns, cf_dyn_buf *db, bool show_ns); +extern void as_namespace_get_hist_info(as_namespace *ns, char *set_name, 
char *hist_name, + cf_dyn_buf *db, bool show_ns); + +static inline bool +as_namespace_cool_restarts(const as_namespace *ns) +{ + return ns->storage_data_in_memory && ! ns->data_in_index; +} + +static inline const char* +as_namespace_start_mode_str(const as_namespace *ns) +{ + return as_namespace_cool_restarts(ns) ? "cool" : "warm"; +} + +// Persistent Memory Management + +struct as_treex_s { + uint64_t root_h: 40; +} __attribute__ ((__packed__)); + +void as_namespace_xmem_trusted(as_namespace *ns); + +// Not namespace class functions, but they live in namespace.c: +uint32_t as_mem_check(); + +// XXX POST-JUMP - remove in "six months". +static inline uint32_t +truncate_void_time(as_namespace *ns, uint32_t void_time) +{ + uint32_t max_void_time = as_record_void_time_get() + (uint32_t)ns->max_ttl; + return void_time > max_void_time ? max_void_time : void_time; +} diff --git a/as/include/base/features.h b/as/include/base/features.h new file mode 100644 index 00000000..062ce760 --- /dev/null +++ b/as/include/base/features.h @@ -0,0 +1,30 @@ +/* + * features.h + * + * Copyright (C) 2018 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. 
If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + + +//========================================================== +// Public API. +// + +const char *as_features_info(); diff --git a/as/include/base/index.h b/as/include/base/index.h new file mode 100644 index 00000000..09c9d61c --- /dev/null +++ b/as/include/base/index.h @@ -0,0 +1,337 @@ +/* + * index.h + * + * Copyright (C) 2008-2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +#include +#include +#include + +#include "citrusleaf/cf_atomic.h" +#include "citrusleaf/cf_digest.h" + +#include "arenax.h" +#include "cf_mutex.h" + +#include "base/datamodel.h" + + +//========================================================== +// Index tree node - as_index, also known as as_record. +// +// There's one for every record. Contains metadata, and +// points to record data in memory and/or on storage device. +// + +typedef struct as_index_s { + + // offset: 0 + cf_atomic32 rc; + + // offset: 4 + cf_digest keyd; + + // offset: 24 + uint64_t right_h: 40; + uint64_t left_h: 40; + + // offset: 34 + // Don't use the free bits here for record info - this is accessed outside + // the record lock. 
+ uint16_t color: 1; + uint16_t unused_but_unsafe: 15; + + // Everything below here is used under the record lock. + + // offset: 36 + uint32_t tombstone: 1; + uint32_t cenotaph: 1; + uint32_t void_time: 30; + + // offset: 40 + uint64_t last_update_time: 40; + uint64_t generation: 16; + + // offset: 47 + // Used by the storage engines. + uint64_t rblock_id: 34; // can address 2^34 * 128b = 2Tb drive + uint64_t n_rblocks: 14; // is enough for 1Mb/128b = 8K rblocks + uint64_t file_id: 6; // can spec 2^6 = 64 drives + + uint64_t set_id_bits: 10; // do not use directly, used for set-ID + + // offset: 55 + // In single-bin mode for data-in-memory namespaces, this offset is cast to + // an as_bin, but only 4 bits get used (for the iparticle state). The other + // 4 bits are used for replication state and index flags. + uint8_t repl_state: 2; + uint8_t unused_flag: 1; + uint8_t key_stored: 1; + uint8_t single_bin_state: 4; // used indirectly, only in single-bin mode + + // offset: 56 + // For data-not-in-memory namespaces, these 8 bytes are currently unused. + // For data-in-memory namespaces: in single-bin mode the as_bin is embedded + // here (these 8 bytes plus 4 bits in flex_bits above), but in multi-bin + // mode this is a pointer to either of: + // - an as_bin_space containing n_bins and an array of as_bin structs + // - an as_rec_space containing an as_bin_space pointer and other metadata + void* dim; + + // final size: 64 + +} __attribute__ ((__packed__)) as_index; + +#define AS_INDEX_SINGLE_BIN_OFFSET 55 // can't use offsetof() with bit fields + + +//========================================================== +// Accessor functions for bits in as_index. +// + +// Size in bytes of as_index, currently the same for all namespaces. +static inline +uint32_t as_index_size_get(as_namespace *ns) +{ + return (uint32_t)sizeof(as_index); +} + +// Fast way to clear the record portion of as_index. +// Note - relies on current layout and size of as_index! 
+static inline +void as_index_clear_record_info(as_index *index) { + *(uint32_t*)((uint8_t*)index + 36) = 0; + + uint64_t *p_clear = (uint64_t*)((uint8_t*)index + 40); + + *p_clear++ = 0; + *p_clear++ = 0; + *p_clear = 0; +} + +// Generation 0 is never written, and generation plays no role in record +// destruction, so it works to flag both "half created" and deleted records. +static inline +void as_index_invalidate_record(as_index *index) { + index->generation = 0; +} + +static inline +bool as_index_is_valid_record(as_index *index) { + return index->generation != 0; +} + + +//------------------------------------------------ +// Single bin, as_bin_space & as_rec_space. +// + +static inline +as_bin *as_index_get_single_bin(const as_index *index) { + // We only use 4 bits of the first byte for the bin state. + return (as_bin*)((uint8_t *)index + AS_INDEX_SINGLE_BIN_OFFSET); +} + +static inline +as_bin_space* as_index_get_bin_space(const as_index *index) { + return index->key_stored == 1 ? + ((as_rec_space*)index->dim)->bin_space : (as_bin_space*)index->dim; +} + +static inline +void as_index_set_bin_space(as_index* index, as_bin_space* bin_space) { + if (index->key_stored == 1) { + ((as_rec_space*)index->dim)->bin_space = bin_space; + } + else { + index->dim = (void*)bin_space; + } +} + + +//------------------------------------------------ +// Set-ID bits. +// + +static inline +uint16_t as_index_get_set_id(const as_index *index) { + return index->set_id_bits; +} + +static inline +void as_index_set_set_id(as_index *index, uint16_t set_id) { + // TODO - check that it fits in the 10 bits ??? + index->set_id_bits = set_id; +} + +static inline +bool as_index_has_set(const as_index *index) { + return index->set_id_bits != 0; +} + + +//------------------------------------------------ +// Set-ID helpers. 
+//
+
+static inline
+int as_index_set_set_w_len(as_index *index, as_namespace *ns,
+		const char *set_name, size_t len, bool apply_restrictions) {
+	uint16_t set_id; // filled in by as_namespace_set_set_w_len() below
+	int rv = as_namespace_set_set_w_len(ns, set_name, len, &set_id,
+			apply_restrictions);
+
+	if (rv != 0) { // hand the namespace-level result straight back
+		return rv;
+	}
+
+	as_index_set_set_id(index, set_id);
+	return 0;
+}
+
+static inline
+int as_index_set_set(as_index *index, as_namespace *ns, const char *set_name,
+		bool apply_restrictions) {
+	return as_index_set_set_w_len(index, ns, set_name, strlen(set_name),
+			apply_restrictions); // strlen() - set_name must be null-terminated
+}
+
+static inline
+const char *as_index_get_set_name(as_index *index, as_namespace *ns) {
+	// TODO - don't really need this check - remove?
+	if (! as_index_has_set(index)) {
+		return NULL; // set-id bits are 0 - nothing to look up
+	}
+
+	return as_namespace_get_set_name(ns, as_index_get_set_id(index));
+}
+
+
+//==========================================================
+// Handling as_index objects.
+//
+
+// Container for as_index pointer with lock and location.
+struct as_index_ref_s {
+	bool skip_lock;
+	as_index *r;
+	cf_arenax_handle r_h;
+	cf_mutex *olock;
+};
+
+
+//==========================================================
+// Index tree.
+//
+
+typedef struct as_index_tree_s {
+	// Data common to all trees in a namespace.
+	as_index_tree_shared *shared;
+
+	// Where we allocate from and free to. Left out of 'shared' since we may
+	// later use multiple arenas per namespace.
+	cf_arenax *arena;
+
+	// Variable length data, dependent on configuration.
+	uint8_t data[];
+} as_index_tree;
+
+
+//==========================================================
+// as_index_tree variable length data components.
+//
+
+typedef struct as_lock_pair_s {
+	// Note: reduce_lock's scope is always inside of lock's scope.
+	cf_mutex lock; // insert, delete vs. insert, delete, get
+	cf_mutex reduce_lock; // insert, delete vs. reduce
+} as_lock_pair;
+
+typedef struct as_sprig_s {
+	cf_arenax_handle root_h;
+	uint64_t n_elements;
+} as_sprig;
+
+static inline as_lock_pair *
+tree_locks(as_index_tree *tree)
+{
+	return (as_lock_pair*)tree->data; // lock pairs sit at the start of data[]
+}
+
+static inline as_sprig *
+tree_sprigs(as_index_tree *tree)
+{
+	return (as_sprig*)(tree->data + tree->shared->sprigs_offset); // byte offset into data[]
+}
+
+
+//------------------------------------------------
+// as_index_tree public API.
+//
+
+void as_index_tree_gc_init();
+int as_index_tree_gc_queue_size();
+
+as_index_tree *as_index_tree_create(as_index_tree_shared *shared, cf_arenax *arena);
+as_index_tree *as_index_tree_resume(as_index_tree_shared *shared, cf_arenax *arena, as_treex *treex);
+void as_index_tree_shutdown(as_index_tree *tree, as_treex *treex);
+int as_index_tree_release(as_index_tree *tree);
+uint64_t as_index_tree_size(as_index_tree *tree);
+
+typedef void (*as_index_reduce_fn) (as_index_ref *value, void *udata);
+
+void as_index_reduce(as_index_tree *tree, as_index_reduce_fn cb, void *udata);
+void as_index_reduce_partial(as_index_tree *tree, uint64_t sample_count, as_index_reduce_fn cb, void *udata);
+
+void as_index_reduce_live(as_index_tree *tree, as_index_reduce_fn cb, void *udata);
+void as_index_reduce_partial_live(as_index_tree *tree, uint64_t sample_count, as_index_reduce_fn cb, void *udata);
+
+int as_index_exists(as_index_tree *tree, cf_digest *keyd);
+int as_index_get_vlock(as_index_tree *tree, cf_digest *keyd, as_index_ref *index_ref);
+int as_index_get_insert_vlock(as_index_tree *tree, cf_digest *keyd, as_index_ref *index_ref);
+int as_index_delete(as_index_tree *tree, cf_digest *keyd);
+
+#define as_index_reserve(_r) cf_atomic32_incr(&((_r)->rc)) // (_r) parenthesized: any pointer expression is a safe argument
+#define as_index_release(_r) cf_atomic32_decr(&((_r)->rc))
+
+
+//------------------------------------------------
+// Private API - for enterprise separation only.
+//
+
+// Container for sprig-level function parameters.
+typedef struct as_index_sprig_s {
+	as_index_value_destructor destructor;
+	void *destructor_udata;
+
+	cf_arenax *arena;
+
+	as_lock_pair *pair;
+	as_sprig *sprig;
+} as_index_sprig;
+
+#define SENTINEL_H 0
+
+#define RESOLVE_H(__h) ((as_index*)cf_arenax_resolve(isprig->arena, __h)) // expands to a use of 'isprig' - a variable of that name must be in scope
+
+// Flag to indicate full index reduce.
+#define AS_REDUCE_ALL (-1L)
diff --git a/as/include/base/job_manager.h b/as/include/base/job_manager.h
new file mode 100644
index 00000000..39d3bb69
--- /dev/null
+++ b/as/include/base/job_manager.h
@@ -0,0 +1,171 @@
+/*
+ * job_manager.h
+ *
+ * Copyright (C) 2015 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+#pragma once
+
+#include
+#include
+#include
+
+#include "citrusleaf/cf_atomic.h"
+#include "citrusleaf/cf_queue.h"
+#include "citrusleaf/cf_queue_priority.h"
+
+struct as_job_s;
+struct as_job_manager_s;
+struct as_mon_jobstat_s;
+struct as_namespace_s;
+struct as_partition_reservation_s;
+
+//----------------------------------------------------------
+// as_priority_thread_pool - class header.
+// TODO - move to common.
+//
+
+typedef struct as_priority_thread_pool_s {
+	pthread_mutex_t lock;
+	cf_queue_priority* dispatch_queue;
+	cf_queue* complete_queue;
+	uint32_t n_threads;
+} as_priority_thread_pool;
+
+typedef void (*as_priority_thread_pool_task_fn)(void* task); // as_job_slice(), declared later in this header, has this shape
+
+// Same as cf_queue_priority scheme, so no internal conversion needed:
+#define THREAD_POOL_PRIORITY_LOW	CF_QUEUE_PRIORITY_LOW
+#define THREAD_POOL_PRIORITY_MEDIUM	CF_QUEUE_PRIORITY_MEDIUM
+#define THREAD_POOL_PRIORITY_HIGH	CF_QUEUE_PRIORITY_HIGH
+
+bool as_priority_thread_pool_init(as_priority_thread_pool* pool, uint32_t n_threads);
+void as_priority_thread_pool_shutdown(as_priority_thread_pool* pool);
+bool as_priority_thread_pool_resize(as_priority_thread_pool* pool, uint32_t n_threads);
+bool as_priority_thread_pool_queue_task(as_priority_thread_pool* pool, as_priority_thread_pool_task_fn task_fn, void* task, int priority); // NOTE(review): priority is presumably a THREAD_POOL_PRIORITY_* value - confirm
+bool as_priority_thread_pool_remove_task(as_priority_thread_pool* pool, void* task);
+void as_priority_thread_pool_change_task_priority(as_priority_thread_pool* pool, void* task, int new_priority);
+
+//----------------------------------------------------------
+// as_job - base class header.
+//
+
+typedef void (*as_job_slice_fn)(struct as_job_s* _job, struct as_partition_reservation_s* rsv);
+typedef void (*as_job_finish_fn)(struct as_job_s* _job);
+typedef void (*as_job_destroy_fn)(struct as_job_s* _job);
+typedef void (*as_job_info_fn)(struct as_job_s* _job, struct as_mon_jobstat_s* stat);
+
+typedef struct as_job_vtable_s {
+	as_job_slice_fn slice_fn;
+	as_job_finish_fn finish_fn;
+	as_job_destroy_fn destroy_fn;
+	as_job_info_fn info_mon_fn;
+} as_job_vtable;
+
+typedef enum {
+	RSV_WRITE = 0,
+	RSV_MIGRATE = 1
+} as_job_rsv_type;
+
+// Same as the thread pool (and so cf_queue_priority) scheme, so no internal conversion needed:
+#define AS_JOB_PRIORITY_LOW	THREAD_POOL_PRIORITY_LOW
+#define AS_JOB_PRIORITY_MEDIUM	THREAD_POOL_PRIORITY_MEDIUM
+#define AS_JOB_PRIORITY_HIGH	THREAD_POOL_PRIORITY_HIGH
+
+// Same as proto result codes (AS_PROTO_RESULT_* in base/proto.h) so connected scans don't have to convert:
+#define AS_JOB_FAIL_UNKNOWN	AS_PROTO_RESULT_FAIL_UNKNOWN
+#define AS_JOB_FAIL_PARAMETER	AS_PROTO_RESULT_FAIL_PARAMETER
+#define AS_JOB_FAIL_CLUSTER_KEY	AS_PROTO_RESULT_FAIL_CLUSTER_KEY_MISMATCH
+#define AS_JOB_FAIL_USER_ABORT	AS_PROTO_RESULT_FAIL_SCAN_ABORT
+#define AS_JOB_FAIL_FORBIDDEN	AS_PROTO_RESULT_FAIL_FORBIDDEN
+
+// These result codes can't make it back to the client, but show in monitor:
+#define AS_JOB_FAIL_RESPONSE_ERROR	(-1)
+#define AS_JOB_FAIL_RESPONSE_TIMEOUT	(-2)
+
+typedef struct as_job_s {
+	// Mandatory interface for derived classes:
+	as_job_vtable vtable; // held by value - NOTE(review): presumably copied from the pointer given to as_job_init(); confirm
+
+	// Parent:
+	struct as_job_manager_s* mgr;
+
+	// Which partitions to reduce:
+	as_job_rsv_type rsv_type;
+
+	// Unique identifier:
+	uint64_t trid;
+
+	// Job scope:
+	struct as_namespace_s* ns;
+	uint16_t set_id;
+
+	// Handle active phase:
+	pthread_mutex_t requeue_lock;
+	int priority;
+	cf_atomic32 active_rc;
+	volatile int next_pid;
+	volatile int abandoned;
+
+	// For tracking:
+	uint64_t start_ms;
+	uint64_t finish_ms;
+	cf_atomic64 n_records_read;
+} as_job;
+
+void as_job_init(as_job* _job, const as_job_vtable* vtable,
+		struct as_job_manager_s* manager, as_job_rsv_type rsv_type,
+		uint64_t trid, struct as_namespace_s* ns, uint16_t set_id,
+		int priority);
+void as_job_slice(void* task); // same signature as as_priority_thread_pool_task_fn
+void as_job_finish(as_job* _job);
+void as_job_destroy(as_job* _job);
+void as_job_info(as_job* _job, struct as_mon_jobstat_s* stat);
+void as_job_active_reserve(as_job* _job);
+void as_job_active_release(as_job* _job);
+
+//----------------------------------------------------------
+// as_job_manager - class header.
+//
+
+typedef struct as_job_manager_s {
+	pthread_mutex_t lock;
+	cf_queue* active_jobs;
+	cf_queue* finished_jobs;
+	as_priority_thread_pool thread_pool;
+
+	// Manager configuration:
+	uint32_t max_active;
+	uint32_t max_done;
+} as_job_manager;
+
+void as_job_manager_init(as_job_manager* mgr, uint32_t max_active, uint32_t max_done, uint32_t n_threads);
+int as_job_manager_start_job(as_job_manager* mgr, as_job* _job);
+void as_job_manager_requeue_job(as_job_manager* mgr, as_job* _job);
+void as_job_manager_finish_job(as_job_manager* mgr, as_job* _job);
+void as_job_manager_abandon_job(as_job_manager* mgr, as_job* _job, int reason);
+bool as_job_manager_abort_job(as_job_manager* mgr, uint64_t trid);
+int as_job_manager_abort_all_jobs(as_job_manager* mgr);
+bool as_job_manager_change_job_priority(as_job_manager* mgr, uint64_t trid, int priority);
+void as_job_manager_limit_active_jobs(as_job_manager* mgr, uint32_t max_active);
+void as_job_manager_limit_finished_jobs(as_job_manager* mgr, uint32_t max_done);
+void as_job_manager_resize_thread_pool(as_job_manager* mgr, uint32_t n_threads);
+struct as_mon_jobstat_s* as_job_manager_get_job_info(as_job_manager* mgr, uint64_t trid);
+struct as_mon_jobstat_s* as_job_manager_get_info(as_job_manager* mgr, int* size);
+int as_job_manager_get_active_job_count(as_job_manager* mgr);
diff --git a/as/include/base/json_init.h b/as/include/base/json_init.h
new file mode 100644
index 00000000..66f2475d
--- /dev/null
+++ b/as/include/base/json_init.h
@@ -0,0 +1,34 @@
+/*
+ * json_init.h
+ *
+ * Copyright (C) 2015 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+#pragma once
+
+/* SYNOPSIS
+ * This module handles initialization of the Jansson JSON API by
+ * setting the memory allocation functions to be used internally
+ * by Jansson to the CF allocation-related functions.
+ */
+
+/*
+ * Initialize the JSON module: make Jansson allocate through the CF allocation functions.
+ */
+void as_json_init();
diff --git a/as/include/base/monitor.h b/as/include/base/monitor.h
new file mode 100644
index 00000000..82e37e43
--- /dev/null
+++ b/as/include/base/monitor.h
@@ -0,0 +1,103 @@
+/*
+ * monitor.h
+ *
+ * Copyright (C) 2012-2014 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+/*
+ * Long Running Job Monitoring interface
+ *
+ * This file implements the generic interface for the long running jobs
+ * in Aerospike like query / scan / batch etc. The idea is to be able to see
+ * what is going on in the system.
+ *
+ * Each module which needs to show up in the monitoring needs to register
+ * and implement the interfaces.
+ */
+
+#pragma once
+
+#include
+
+#include "dynbuf.h"
+
+#include "base/datamodel.h"
+
+
+#define AS_MON_OK 0
+#define AS_MON_ERR (-1) // parenthesized like the negative codes in base/job_manager.h
+#define AS_MON_EXIST (-2)
+#define TRID_LIST_SIZE 1000
+
+typedef enum {
+	QUERY_MOD = 0,
+	SCAN_MOD = 1,
+	SBLD_MOD = 2
+} as_mon_module_slot;
+
+extern const char * AS_MON_MODULES[];
+
+// Stat for currently running job
+typedef struct as_mon_jobstat_s {
+	uint64_t trid;
+	char job_type[32];
+	char ns[AS_ID_NAMESPACE_SZ];
+	char set[AS_SET_NAME_MAX_SIZE];
+	uint32_t priority;
+	char status[64];
+	float progress_pct;
+	uint64_t run_time;
+	uint64_t time_since_done;
+	uint64_t recs_read;
+	uint64_t net_io_bytes;
+	float cpu;
+	char jdata[512];
+} as_mon_jobstat;
+
+typedef struct as_mon_cb_s {
+	as_mon_jobstat *(*get_jobstat) (uint64_t trid);
+	as_mon_jobstat *(*get_jobstat_all) (int * size);
+
+	// Per transaction
+	int (*set_priority) (uint64_t trid, uint32_t priority);
+	int (*kill) (uint64_t trid);
+	int (*suspend) (uint64_t trid);
+
+	// Per Module
+	// Number of pending transactions of this job type allowed in the queue -
+	// incoming transactions beyond this will be rejected.
+	int (*set_pendingmax) (int);
+
+	// Set the number of transactions that can be in flight at
+	// any point of time.
+	int (*set_maxinflight) (int);
+
+	// Any individual transaction priority has upper bound of max
+	// priority of jobtype
+	int (*set_maxpriority) (int);
+} as_mon_cb;
+
+// Structure to register module with as mon interface.
+typedef struct as_mon_s {
+	char *type;
+	as_mon_cb cb;
+} as_mon;
+
+void as_mon_info_cmd(const char *module, char *cmd, uint64_t trid, uint32_t priority, cf_dyn_buf *db);
+int as_mon_init();
diff --git a/as/include/base/packet_compression.h b/as/include/base/packet_compression.h
new file mode 100644
index 00000000..1af088f5
--- /dev/null
+++ b/as/include/base/packet_compression.h
@@ -0,0 +1,81 @@
+/*
+ * packet_compression.h
+ *
+ * Copyright (C) 2012-2014 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program.
If not, see http://www.gnu.org/licenses/
+ */
+
+#pragma once
+
+#include
+
+typedef enum compression_type_e {
+	COMPRESSION_ZLIB = 1
+} compression_type;
+
+/**
+ * Function to decompress the given data
+ * Expected arguments
+ * @param type         Type of compression
+ * @param buf_len      Length of buffer to be decompressed
+ * @param buf          Pointer to buffer to be decompressed
+ * @param out_buf_len  Pointer to length of buffer to hold decompressed data
+ * @param out_buf      Pointer to buffer to hold decompressed data
+ * @return 0 if successful
+ */
+int
+as_decompress(compression_type type, size_t buf_len, const uint8_t *buf, size_t *out_buf_len, uint8_t *out_buf);
+
+/**
+ * Function to get back decompressed packet from PROTO_TYPE_AS_MSG_COMPRESSED packet
+ * Packet :  Header - Original size of message - Compressed message
+ * @param buf                 Pointer to PROTO_TYPE_AS_MSG_COMPRESSED packet.
+ * @param output_packet       Pointer holding address of decompressed packet.
+ * @param output_packet_size  Pointer to size of output_packet buffer
+ */
+int
+as_packet_decompression(uint8_t *buf, uint8_t **output_packet, size_t *output_packet_size);
+
+/*
+ * Function to compress the given data
+ * Expected arguments (NOTE(review): presumably passed in this order via argv - confirm)
+ * 1. Type of compression
+ *    1 for zlib
+ * 2. Length of buffer to be compressed - mandatory
+ * 3. Pointer to buffer to be compressed - mandatory
+ * 4. Length of buffer to hold compressed data - mandatory
+ * 5. Pointer to buffer to hold compressed data - mandatory
+ * 6. Compression level - Optional, default Z_DEFAULT_COMPRESSION
+ *    Z_NO_COMPRESSION         0
+ *    Z_BEST_SPEED             1
+ *    Z_BEST_COMPRESSION       9
+ *    Z_DEFAULT_COMPRESSION  (-1)
+ */
+int
+as_compress(int argc, uint8_t *argv[]);
+
+/*
+ * Function to create packet to send compressed data.
+ * Packet :  Header - Original size of message - Compressed message.
+ * Input : buf - Pointer to data to be compressed. - Input
+ *         buf_sz - Size of the data to be compressed. - Input
+ *         compressed_packet : Pointer holding address of compressed packet.
- Output
+ *         compressed_packet_sz : Size of the compressed packet. - Output
+ */
+int
+as_packet_compression(uint8_t *buf, size_t buf_sz, uint8_t **compressed_packet, size_t *compressed_packet_sz);
diff --git a/as/include/base/particle.h b/as/include/base/particle.h
new file mode 100644
index 00000000..591e5578
--- /dev/null
+++ b/as/include/base/particle.h
@@ -0,0 +1,98 @@
+/*
+ * particle.h
+ *
+ * Copyright (C) 2015 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+#pragma once
+
+#include
+#include "aerospike/as_val.h"
+#include "base/datamodel.h"
+
+//------------------------------------------------
+// Particle interface specification - functions.
+// One function-pointer type per as_particle_vtable member.
+
+// Destructor, etc.
+typedef void (*as_particle_destructor_fn) (as_particle *p);
+typedef uint32_t (*as_particle_size_fn) (const as_particle *p);
+
+// Handle "wire" format.
+typedef int32_t (*as_particle_concat_size_from_wire_fn) (as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp); // signed size - NOTE(review): negative presumably signals an error; confirm
+typedef int (*as_particle_append_from_wire_fn) (as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp);
+typedef int (*as_particle_prepend_from_wire_fn) (as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp);
+typedef int (*as_particle_incr_from_wire_fn) (as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp);
+typedef int32_t (*as_particle_size_from_wire_fn) (const uint8_t *wire_value, uint32_t value_size); // signed, unlike the uint32_t wire_size below
+typedef int (*as_particle_from_wire_fn) (as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp);
+typedef int (*as_particle_compare_from_wire_fn) (const as_particle *p, as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size);
+typedef uint32_t (*as_particle_wire_size_fn) (const as_particle *p);
+typedef uint32_t (*as_particle_to_wire_fn) (const as_particle *p, uint8_t *wire);
+
+// Handle as_val translation.
+typedef uint32_t (*as_particle_size_from_asval_fn) (const as_val *val);
+typedef void (*as_particle_from_asval_fn) (const as_val *val, as_particle **pp);
+typedef as_val *(*as_particle_to_asval_fn) (const as_particle *p);
+typedef uint32_t (*as_particle_asval_wire_size_fn) (const as_val *val);
+typedef uint32_t (*as_particle_asval_to_wire_fn) (const as_val *val, uint8_t *wire);
+
+// Handle msgpack translation.
+typedef uint32_t (*as_particle_size_from_msgpack_fn) (const uint8_t *packed, uint32_t packed_size);
+typedef void (*as_particle_from_msgpack_fn) (const uint8_t *packed, uint32_t packed_size, as_particle **pp);
+
+// Handle on-device "flat" format.
+typedef int32_t (*as_particle_size_from_flat_fn) (const uint8_t *flat, uint32_t flat_size);
+typedef int (*as_particle_cast_from_flat_fn) (uint8_t *flat, uint32_t flat_size, as_particle **pp); // non-const flat, unlike from_flat - NOTE(review): presumably *pp ends up pointing into flat; confirm
+typedef int (*as_particle_from_flat_fn) (const uint8_t *flat, uint32_t flat_size, as_particle **pp);
+typedef uint32_t (*as_particle_flat_size_fn) (const as_particle *p);
+typedef uint32_t (*as_particle_to_flat_fn) (const as_particle *p, uint8_t *flat);
+
+//------------------------------------------------
+// Particle interface specification - vtable.
+// One member per function type, in the same order as the typedefs.
+
+typedef struct as_particle_vtable_s {
+	as_particle_destructor_fn destructor_fn;
+	as_particle_size_fn size_fn;
+
+	as_particle_concat_size_from_wire_fn concat_size_from_wire_fn;
+	as_particle_append_from_wire_fn append_from_wire_fn;
+	as_particle_prepend_from_wire_fn prepend_from_wire_fn;
+	as_particle_incr_from_wire_fn incr_from_wire_fn;
+	as_particle_size_from_wire_fn size_from_wire_fn;
+	as_particle_from_wire_fn from_wire_fn;
+	as_particle_compare_from_wire_fn compare_from_wire_fn;
+	as_particle_wire_size_fn wire_size_fn;
+	as_particle_to_wire_fn to_wire_fn;
+
+	as_particle_size_from_asval_fn size_from_asval_fn;
+	as_particle_from_asval_fn from_asval_fn;
+	as_particle_to_asval_fn to_asval_fn;
+	as_particle_asval_wire_size_fn asval_wire_size_fn;
+	as_particle_asval_to_wire_fn asval_to_wire_fn;
+
+	as_particle_size_from_msgpack_fn size_from_msgpack_fn;
+	as_particle_from_msgpack_fn from_msgpack_fn;
+
+	as_particle_size_from_flat_fn size_from_flat_fn; // TODO - unused - remove?
+	as_particle_cast_from_flat_fn cast_from_flat_fn;
+	as_particle_from_flat_fn from_flat_fn;
+	as_particle_flat_size_fn flat_size_fn;
+	as_particle_to_flat_fn to_flat_fn;
+} as_particle_vtable;
diff --git a/as/include/base/particle_blob.h b/as/include/base/particle_blob.h
new file mode 100644
index 00000000..62b90884
--- /dev/null
+++ b/as/include/base/particle_blob.h
@@ -0,0 +1,63 @@
+/*
+ * particle_blob.h
+ *
+ * Copyright (C) 2015 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+#pragma once
+
+#include
+#include "aerospike/as_val.h"
+#include "base/datamodel.h"
+
+// The BLOB particle interface function declarations are in this header file
+// since BLOB functions are used by other particles derived from BLOB.
+
+// Destructor, etc.
+void blob_destruct(as_particle *p);
+uint32_t blob_size(const as_particle *p);
+
+// Handle "wire" format.
+int32_t blob_concat_size_from_wire(as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp);
+int blob_append_from_wire(as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp);
+int blob_prepend_from_wire(as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp);
+int blob_incr_from_wire(as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp);
+int32_t blob_size_from_wire(const uint8_t *wire_value, uint32_t value_size);
+int blob_from_wire(as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp);
+int blob_compare_from_wire(const as_particle *p, as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size);
+uint32_t blob_wire_size(const as_particle *p);
+uint32_t blob_to_wire(const as_particle *p, uint8_t *wire);
+
+// Handle as_val translation - signatures match the as_particle_*asval*_fn types in base/particle.h.
+uint32_t blob_size_from_asval(const as_val *val);
+void blob_from_asval(const as_val *val, as_particle **pp);
+as_val *blob_to_asval(const as_particle *p);
+uint32_t blob_asval_wire_size(const as_val *val);
+uint32_t blob_asval_to_wire(const as_val *val, uint8_t *wire);
+
+// Handle msgpack translation - signatures match the as_particle_*msgpack_fn types in base/particle.h.
+uint32_t blob_size_from_msgpack(const uint8_t *packed, uint32_t packed_size);
+void blob_from_msgpack(const uint8_t *packed, uint32_t packed_size, as_particle **pp);
+
+// Handle on-device "flat" format.
+int32_t blob_size_from_flat(const uint8_t *flat, uint32_t flat_size);
+int blob_cast_from_flat(uint8_t *flat, uint32_t flat_size, as_particle **pp); // non-const flat, unlike blob_from_flat() - NOTE(review): presumably *pp points into flat; confirm
+int blob_from_flat(const uint8_t *flat, uint32_t flat_size, as_particle **pp);
+uint32_t blob_flat_size(const as_particle *p);
+uint32_t blob_to_flat(const as_particle *p, uint8_t *flat);
diff --git a/as/include/base/particle_integer.h b/as/include/base/particle_integer.h
new file mode 100644
index 00000000..2ebe2e5d
--- /dev/null
+++ b/as/include/base/particle_integer.h
@@ -0,0 +1,63 @@
+/*
+ * particle_integer.h
+ *
+ * Copyright (C) 2015 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+#pragma once
+
+#include
+#include "aerospike/as_val.h"
+#include "base/datamodel.h"
+
+// The INTEGER particle interface function declarations are in this header file
+// since INTEGER functions are used by other particles derived from INTEGER.
+
+// Destructor, etc.
+void integer_destruct(as_particle *p);
+uint32_t integer_size(const as_particle *p);
+
+// Handle "wire" format.
+int32_t integer_concat_size_from_wire(as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp);
+int integer_append_from_wire(as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp);
+int integer_prepend_from_wire(as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp);
+int integer_incr_from_wire(as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp);
+int32_t integer_size_from_wire(const uint8_t *wire_value, uint32_t value_size);
+int integer_from_wire(as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp);
+int integer_compare_from_wire(const as_particle *p, as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size);
+uint32_t integer_wire_size(const as_particle *p);
+uint32_t integer_to_wire(const as_particle *p, uint8_t *wire);
+
+// Handle as_val translation - signatures match the as_particle_*asval*_fn types in base/particle.h.
+uint32_t integer_size_from_asval(const as_val *val);
+void integer_from_asval(const as_val *val, as_particle **pp);
+as_val *integer_to_asval(const as_particle *p);
+uint32_t integer_asval_wire_size(const as_val *val);
+uint32_t integer_asval_to_wire(const as_val *val, uint8_t *wire);
+
+// Handle msgpack translation - signatures match the as_particle_*msgpack_fn types in base/particle.h.
+uint32_t integer_size_from_msgpack(const uint8_t *packed, uint32_t packed_size);
+void integer_from_msgpack(const uint8_t *packed, uint32_t packed_size, as_particle **pp);
+
+// Handle on-device "flat" format.
+int32_t integer_size_from_flat(const uint8_t *flat, uint32_t flat_size);
+int integer_cast_from_flat(uint8_t *flat, uint32_t flat_size, as_particle **pp); // non-const flat, unlike integer_from_flat() - NOTE(review): presumably *pp points into flat; confirm
+int integer_from_flat(const uint8_t *flat, uint32_t flat_size, as_particle **pp);
+uint32_t integer_flat_size(const as_particle *p);
+uint32_t integer_to_flat(const as_particle *p, uint8_t *flat);
diff --git a/as/include/base/predexp.h b/as/include/base/predexp.h
new file mode 100644
index 00000000..93454107
--- /dev/null
+++ b/as/include/base/predexp.h
@@ -0,0 +1,57 @@
+/*
+ * predexp.h
+ *
+ * Copyright (C) 2016 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program.
If not, see http://www.gnu.org/licenses/
+ */
+
+/*
+ * Predicate expression declarations.
+ *
+ */
+
+#pragma once
+
+#include "base/datamodel.h"
+#include "base/index.h"
+
+// A "compiled" predicate expression
+typedef struct predexp_eval_base_s predexp_eval_t;
+
+// A named variable
+typedef struct predexp_var_s as_predexp_var_t;
+
+// Arguments to predicate expressions
+typedef struct predexp_args_s {
+	as_namespace* ns; // always present
+	as_record* md; // always present
+	as_predexp_var_t* vl; // always present
+	as_storage_rd* rd; // NULL during metadata phase
+} predexp_args_t;
+
+extern predexp_eval_t* predexp_build(as_msg_field* pfp);
+
+// Called with argsp->rd == NULL (the metadata phase).
+extern bool predexp_matches_metadata(predexp_eval_t* eval,
+		predexp_args_t* argsp);
+
+// Called with both md and rd set in argsp.
+extern bool predexp_matches_record(predexp_eval_t* eval,
+		predexp_args_t* argsp);
+
+extern void predexp_destroy(predexp_eval_t* eval);
diff --git a/as/include/base/proto.h b/as/include/base/proto.h
new file mode 100644
index 00000000..9b16a912
--- /dev/null
+++ b/as/include/base/proto.h
@@ -0,0 +1,693 @@
+/*
+ * proto.h
+ *
+ * Copyright (C) 2008-2017 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program.
If not, see http://www.gnu.org/licenses/
+ */
+
+/*
+ * Wire protocol definition.
+ */
+
+#pragma once
+
+#include
+#include
+#include
+#include
+
+#include "aerospike/as_val.h"
+#include "citrusleaf/cf_digest.h"
+#include "citrusleaf/cf_vector.h"
+
+#include "dynbuf.h"
+#include "socket.h"
+
+
+// Forward declarations.
+struct as_bin_s;
+struct as_index_s;
+struct as_storage_rd_s;
+struct as_namespace_s;
+struct as_file_handle_s;
+struct as_transaction_s;
+
+// These numbers match with cl_types.h on the client
+
+#define AS_PROTO_RESULT_OK	0
+#define AS_PROTO_RESULT_FAIL_UNKNOWN	1 // unknown failure - consider retry
+#define AS_PROTO_RESULT_FAIL_NOT_FOUND	2
+#define AS_PROTO_RESULT_FAIL_GENERATION	3
+#define AS_PROTO_RESULT_FAIL_PARAMETER	4 // aliased as AS_JOB_FAIL_PARAMETER in base/job_manager.h
+#define AS_PROTO_RESULT_FAIL_RECORD_EXISTS	5 // if 'WRITE_ADD', could fail because already exists
+#define AS_PROTO_RESULT_FAIL_UNUSED_6	6 // recycle - was AS_PROTO_RESULT_FAIL_BIN_EXISTS
+#define AS_PROTO_RESULT_FAIL_CLUSTER_KEY_MISMATCH	7 // aliased as AS_JOB_FAIL_CLUSTER_KEY in base/job_manager.h
+#define AS_PROTO_RESULT_FAIL_OUT_OF_SPACE	8
+#define AS_PROTO_RESULT_FAIL_TIMEOUT	9
+#define AS_PROTO_RESULT_FAIL_ALWAYS_FORBIDDEN	10 // operation not allowed for current (static) configuration
+#define AS_PROTO_RESULT_FAIL_UNAVAILABLE	11 // error returned during node down and partition isn't available
+#define AS_PROTO_RESULT_FAIL_INCOMPATIBLE_TYPE	12 // op and bin type incompatibility
+#define AS_PROTO_RESULT_FAIL_RECORD_TOO_BIG	13
+#define AS_PROTO_RESULT_FAIL_KEY_BUSY	14
+#define AS_PROTO_RESULT_FAIL_SCAN_ABORT	15 // aliased as AS_JOB_FAIL_USER_ABORT in base/job_manager.h
+#define AS_PROTO_RESULT_FAIL_UNSUPPORTED_FEATURE	16 // asked to do something we don't do for a particular configuration
+#define AS_PROTO_RESULT_FAIL_UNUSED_17	17 // recycle - was AS_PROTO_RESULT_FAIL_BIN_NOT_FOUND
+#define AS_PROTO_RESULT_FAIL_DEVICE_OVERLOAD	18
+#define AS_PROTO_RESULT_FAIL_KEY_MISMATCH	19
+#define AS_PROTO_RESULT_FAIL_NAMESPACE	20
+#define AS_PROTO_RESULT_FAIL_BIN_NAME	21
+#define AS_PROTO_RESULT_FAIL_FORBIDDEN	22 // operation temporarily not possible
+#define AS_PROTO_RESULT_FAIL_ELEMENT_NOT_FOUND	23
+#define AS_PROTO_RESULT_FAIL_ELEMENT_EXISTS	24
+#define AS_PROTO_RESULT_FAIL_ENTERPRISE_ONLY	25 // attempting enterprise functionality on community build
+
+// Security result codes. Must be <= 255, to fit in one byte. Defined here to
+// ensure no overlap with other result codes.
+#define AS_SEC_RESULT_OK_LAST	50 // the last message
+	// Security message errors.
+#define AS_SEC_ERR_NOT_SUPPORTED	51 // security features not supported
+#define AS_SEC_ERR_NOT_ENABLED	52 // security features not enabled
+#define AS_SEC_ERR_SCHEME	53 // security scheme not supported
+#define AS_SEC_ERR_COMMAND	54 // unrecognized command
+#define AS_SEC_ERR_FIELD	55 // can't parse field
+#define AS_SEC_ERR_STATE	56 // e.g. unexpected command
+	// Security procedure errors.
+#define AS_SEC_ERR_USER	60 // no user or unknown user
+#define AS_SEC_ERR_USER_EXISTS	61 // user already exists
+#define AS_SEC_ERR_PASSWORD	62 // no password or bad password
+#define AS_SEC_ERR_EXPIRED_PASSWORD	63 // expired password
+#define AS_SEC_ERR_FORBIDDEN_PASSWORD	64 // forbidden password (e.g. recently used)
+#define AS_SEC_ERR_CREDENTIAL	65 // no credential or bad credential
+	// ... room for more ...
+#define AS_SEC_ERR_ROLE	70 // no role(s) or unknown role(s)
+#define AS_SEC_ERR_ROLE_EXISTS	71 // role already exists
+#define AS_SEC_ERR_PRIVILEGE	72 // no privileges or unknown privileges
+	// Permission errors.
+#define AS_SEC_ERR_NOT_AUTHENTICATED	80 // socket not authenticated
+#define AS_SEC_ERR_ROLE_VIOLATION	81 // role (privilege) violation
+
+// UDF Errors (100 - 109)
+#define AS_PROTO_RESULT_FAIL_UDF_EXECUTION	100
+
+// Batch Errors (150 - 159)
+#define AS_PROTO_RESULT_FAIL_BATCH_DISABLED	150 // batch functionality has been disabled
+#define AS_PROTO_RESULT_FAIL_BATCH_MAX_REQUESTS	151 // batch-max-requests has been exceeded
+#define AS_PROTO_RESULT_FAIL_BATCH_QUEUES_FULL	152 // all batch queues are full
+
+// Geo Errors (160 - 169)
+#define AS_PROTO_RESULT_FAIL_GEO_INVALID_GEOJSON	160 // Invalid GeoJSON on insert/update
+
+// Secondary Index Query Failure Codes (200 - 219)
+#define AS_PROTO_RESULT_FAIL_INDEX_FOUND	200
+#define AS_PROTO_RESULT_FAIL_INDEX_NOTFOUND	201
+#define AS_PROTO_RESULT_FAIL_INDEX_OOM	202
+#define AS_PROTO_RESULT_FAIL_INDEX_NOTREADABLE	203
+#define AS_PROTO_RESULT_FAIL_INDEX_GENERIC	204
+#define AS_PROTO_RESULT_FAIL_INDEX_NAME_MAXLEN	205
+#define AS_PROTO_RESULT_FAIL_INDEX_MAXCOUNT	206
+
+#define AS_PROTO_RESULT_FAIL_QUERY_USERABORT	210
+#define AS_PROTO_RESULT_FAIL_QUERY_QUEUEFULL	211
+#define AS_PROTO_RESULT_FAIL_QUERY_TIMEOUT	212
+#define AS_PROTO_RESULT_FAIL_QUERY_CBERROR	213
+#define AS_PROTO_RESULT_FAIL_QUERY_NETIO_ERR	214
+#define AS_PROTO_RESULT_FAIL_QUERY_DUPLICATE	215
+
+/* SYNOPSIS
+ * Aerospike wire protocol
+ *
+ * Version 2
+ *
+ * Aerospike uses a message-oriented wire protocol to transfer information.
+ * Each message consists of a header, which determines the type and the length
+ * to follow. This is called the 'proto_msg'.
+ *
+ * these messages are vectored out to the correct handler. Over TCP, they can be
+ * pipelined (but not out of order). If we wish to support out of order responses,
+ * we should upgrade the protocol.
+ *
+ * the most common type of message is the as_msg, a message which reads or writes
+ * a single row to the data store.
+ * + */ + +#define PROTO_VERSION 2 + +#define PROTO_TYPE_INFO 1 // ascii-format message for determining server info +#define PROTO_TYPE_SECURITY 2 +#define PROTO_TYPE_AS_MSG 3 +#define PROTO_TYPE_AS_MSG_COMPRESSED 4 +#define PROTO_TYPE_INTERNAL_XDR 5 +#define PROTO_TYPE_MAX 6 // if you see 6, it's illegal + +#define PROTO_SIZE_MAX (128 * 1024 * 1024) // used simply for validation, as we've been corrupting msgp's + +#define PROTO_FIELD_LENGTH_MAX 1024 +#define PROTO_OP_LENGTH_MAX 131072 + +typedef struct as_proto_s { + uint8_t version; + uint8_t type; + uint64_t sz: 48; + uint8_t data[]; +} __attribute__ ((__packed__)) as_proto; + +/* + * zlib decompression API needs original size of the compressed data. + * So we need to transfer it to another end. + * This structure packs together - + * header + original size of data + compressed data + */ +typedef struct as_comp_proto_s { + as_proto proto; // Protocol header + uint64_t org_sz; // Original size of compressed data hold in 'data' + uint8_t data[]; // Compressed data +} as_comp_proto; + +/* as_msg_field +* Aerospike message field */ +typedef struct as_msg_field_s { +#define AS_MSG_FIELD_TYPE_NAMESPACE 0 +#define AS_MSG_FIELD_TYPE_SET 1 +#define AS_MSG_FIELD_TYPE_KEY 2 +#define AS_MSG_FIELD_TYPE_DIGEST_RIPE 4 +#define AS_MSG_FIELD_TYPE_DIGEST_RIPE_ARRAY 6 +#define AS_MSG_FIELD_TYPE_TRID 7 +#define AS_MSG_FIELD_TYPE_SCAN_OPTIONS 8 +#define AS_MSG_FIELD_TYPE_SOCKET_TIMEOUT 9 + +#define AS_MSG_FIELD_TYPE_INDEX_NAME 21 +#define AS_MSG_FIELD_TYPE_INDEX_RANGE 22 +#define AS_MSG_FIELD_TYPE_INDEX_TYPE 26 + +// UDF RANGE: 30-39 +#define AS_MSG_FIELD_TYPE_UDF_FILENAME 30 +#define AS_MSG_FIELD_TYPE_UDF_FUNCTION 31 +#define AS_MSG_FIELD_TYPE_UDF_ARGLIST 32 +#define AS_MSG_FIELD_TYPE_UDF_OP 33 + +#define AS_MSG_FIELD_TYPE_QUERY_BINLIST 40 +#define AS_MSG_FIELD_TYPE_BATCH 41 +#define AS_MSG_FIELD_TYPE_BATCH_WITH_SET 42 +#define AS_MSG_FIELD_TYPE_PREDEXP 43 + + /* NB: field_sz is sizeof(type) + sizeof(data) */ + uint32_t field_sz; 
// get the data size through the accessor function, don't worry, it's a small macro + uint8_t type; // ordering matters :-( see as_transaction_prepare + uint8_t data[]; +} __attribute__((__packed__)) as_msg_field; + +// For as_transaction::field_types, a bit-field to mark which fields are in the +// as_msg. +#define AS_MSG_FIELD_BIT_NAMESPACE 0x00000001 +#define AS_MSG_FIELD_BIT_SET 0x00000002 +#define AS_MSG_FIELD_BIT_KEY 0x00000004 +#define AS_MSG_FIELD_BIT_DIGEST_RIPE 0x00000008 +#define AS_MSG_FIELD_BIT_DIGEST_RIPE_ARRAY 0x00000010 +#define AS_MSG_FIELD_BIT_TRID 0x00000020 +#define AS_MSG_FIELD_BIT_SCAN_OPTIONS 0x00000040 +#define AS_MSG_FIELD_BIT_SOCKET_TIMEOUT 0x00000080 +#define AS_MSG_FIELD_BIT_INDEX_NAME 0x00000100 +#define AS_MSG_FIELD_BIT_INDEX_RANGE 0x00000200 +#define AS_MSG_FIELD_BIT_INDEX_TYPE 0x00000400 +#define AS_MSG_FIELD_BIT_UDF_FILENAME 0x00000800 +#define AS_MSG_FIELD_BIT_UDF_FUNCTION 0x00001000 +#define AS_MSG_FIELD_BIT_UDF_ARGLIST 0x00002000 +#define AS_MSG_FIELD_BIT_UDF_OP 0x00004000 +#define AS_MSG_FIELD_BIT_QUERY_BINLIST 0x00008000 +#define AS_MSG_FIELD_BIT_BATCH 0x00010000 +#define AS_MSG_FIELD_BIT_BATCH_WITH_SET 0x00020000 +#define AS_MSG_FIELD_BIT_PREDEXP 0x00040000 + +// as_msg ops + +#define AS_MSG_OP_READ 1 // read the value in question +#define AS_MSG_OP_WRITE 2 // write the value in question + +// Prospective CDT top-level ops: +#define AS_MSG_OP_CDT_READ 3 +#define AS_MSG_OP_CDT_MODIFY 4 + +#define AS_MSG_OP_INCR 5 // arithmetically add a value to an existing value, works only on integers +// Unused - 6 +// Unused - 7 +// Unused - 8 +#define AS_MSG_OP_APPEND 9 // append a value to an existing value, works on strings and blobs +#define AS_MSG_OP_PREPEND 10 // prepend a value to an existing value, works on strings and blobs +#define AS_MSG_OP_TOUCH 11 // touch a value without doing anything else to it - will increment the generation + +#define AS_MSG_OP_MC_INCR 129 // Memcache-compatible version of the increment command +#define 
AS_MSG_OP_MC_APPEND 130 // append the value to an existing value, works only strings for now +#define AS_MSG_OP_MC_PREPEND 131 // prepend a value to an existing value, works only strings for now +#define AS_MSG_OP_MC_TOUCH 132 // Memcache-compatible touch - does not change generation + +#define OP_IS_MODIFY(op) ( \ + (op) == AS_MSG_OP_INCR \ + || (op) == AS_MSG_OP_APPEND \ + || (op) == AS_MSG_OP_PREPEND \ + || (op) == AS_MSG_OP_MC_INCR \ + || (op) == AS_MSG_OP_MC_APPEND \ + || (op) == AS_MSG_OP_MC_PREPEND \ + ) + +#define OP_IS_TOUCH(op) ((op) == AS_MSG_OP_TOUCH || (op) == AS_MSG_OP_MC_TOUCH) + +typedef struct as_msg_op_s { + uint32_t op_sz; + uint8_t op; + uint8_t particle_type; + uint8_t version; // now unused + uint8_t name_sz; + uint8_t name[]; // UTF-8 + // there's also a value here but you can't have two variable size arrays +} __attribute__((__packed__)) as_msg_op; + +static inline uint8_t * as_msg_op_get_value_p(as_msg_op *op) +{ + return (uint8_t*)op + sizeof(as_msg_op) + op->name_sz; +} + +static inline uint32_t as_msg_op_get_value_sz(const as_msg_op *op) +{ + return op->op_sz - (4 + op->name_sz); +} + +static inline uint32_t as_msg_field_get_value_sz(as_msg_field *f) +{ + return f->field_sz - 1; +} + +// Copy the field's value into dst as a null-terminated string, truncating it to +// fit the sz-byte buffer. Returns the number of bytes copied, not counting the +// terminator. Writes nothing and returns 0 if sz <= 0. A malformed field with +// field_sz 0 (no type byte) is treated as having an empty value. +static inline uint32_t as_msg_field_get_strncpy(as_msg_field *f, char *dst, int sz) +{ + if (sz <= 0) { + return 0; // no room even for the terminator + } + + // field_sz includes the type byte - don't let a zero field_sz wrap around. + uint32_t fsz = f->field_sz == 0 ? 0 : f->field_sz - 1; + uint32_t n = (uint32_t)sz > fsz ? fsz : (uint32_t)sz - 1; + + memcpy(dst, f->data, n); + dst[n] = 0; + + return n; +} + +typedef struct as_msg_s { + /*00 [x00] (08) */ uint8_t header_sz; // number of bytes in this header - 22 + /*01 [x01] (09) */ uint8_t info1; // bitfield about this request + /*02 [x02] (10) */ uint8_t info2; // filled up, need another + /*03 [x03] (11) */ uint8_t info3; // nice extra space. Mmm, tasty extra space.
+ /*04 [x04] (12) */ uint8_t unused; + /*05 [x05] (13) */ uint8_t result_code; + /*06 [x06] (14) */ uint32_t generation; + /*10 [x0A] (18) */ uint32_t record_ttl; + /*14 [x0E] (22) */ uint32_t transaction_ttl; + /*18 [x12] (26) */ uint16_t n_fields; // number of fields + /*20 [x14] (28) */ uint16_t n_ops; // number of operations + /*22 [x16] (30) */ uint8_t data[]; // data contains first the fields, then the ops +} __attribute__((__packed__)) as_msg; + +/* cl_msg + * Aerospike message - the as_proto header immediately followed by the as_msg + * proto.sz: size of the payload, not including the as_proto header */ +typedef struct cl_msg_s { + as_proto proto; + as_msg msg; +} __attribute__((__packed__)) cl_msg; + +#define AS_MSG_INFO1_READ (1 << 0) // contains a read operation +#define AS_MSG_INFO1_GET_ALL (1 << 1) // get all bins, period +// (Note: Bit 2 is unused.) +#define AS_MSG_INFO1_BATCH (1 << 3) // new batch protocol +#define AS_MSG_INFO1_XDR (1 << 4) // operation is being performed by XDR +#define AS_MSG_INFO1_GET_NO_BINS (1 << 5) // get record metadata only - no bin metadata or data +#define AS_MSG_INFO1_CONSISTENCY_LEVEL_B0 (1 << 6) // read consistency level - bit 0 +#define AS_MSG_INFO1_CONSISTENCY_LEVEL_B1 (1 << 7) // read consistency level - bit 1 + +#define AS_MSG_INFO2_WRITE (1 << 0) // contains a write semantic +#define AS_MSG_INFO2_DELETE (1 << 1) // delete record +#define AS_MSG_INFO2_GENERATION (1 << 2) // pay attention to the generation +#define AS_MSG_INFO2_GENERATION_GT (1 << 3) // apply write if new generation > old, good for restore +#define AS_MSG_INFO2_DURABLE_DELETE (1 << 4) // op resulting in record deletion leaves tombstone (Enterprise only) +#define AS_MSG_INFO2_CREATE_ONLY (1 << 5) // write record only if it doesn't exist +// (Note: Bit 6 is unused.)
+#define AS_MSG_INFO2_RESPOND_ALL_OPS (1 << 7) // all bin ops (read, write, or modify) require a response, in request order + +#define AS_MSG_INFO3_LAST (1 << 0) // this is the last of a multi-part message +#define AS_MSG_INFO3_COMMIT_LEVEL_B0 (1 << 1) // write commit level - bit 0 +#define AS_MSG_INFO3_COMMIT_LEVEL_B1 (1 << 2) // write commit level - bit 1 +#define AS_MSG_INFO3_UPDATE_ONLY (1 << 3) // update existing record only, do not create new record +#define AS_MSG_INFO3_CREATE_OR_REPLACE (1 << 4) // completely replace existing record, or create new record +#define AS_MSG_INFO3_REPLACE_ONLY (1 << 5) // completely replace existing record, do not create new record +#define AS_MSG_INFO3_LINEARIZE_READ (1 << 6) // enterprise only +// (Note: Bit 7 is unused.) + +#define AS_MSG_FIELD_SCAN_UNUSED_2 (0x02) // was - whether to send ldt bin data back to the client +#define AS_MSG_FIELD_SCAN_DISCONNECTED_JOB (0x04) // for sproc jobs that won't be sending results back to the client [UNUSED] +#define AS_MSG_FIELD_SCAN_FAIL_ON_CLUSTER_CHANGE (0x08) // if we should fail when cluster is migrating or cluster changes +// The argument is parenthesized so that an expression argument (e.g. a | b) is +// masked as a whole instead of binding to the & operator piecemeal. +#define AS_MSG_FIELD_SCAN_PRIORITY(__cl_byte) ((0xF0 & (__cl_byte)) >> 4) // 4 bit value indicating the scan priority (high nibble of the byte) + +static inline as_msg_field * +as_msg_field_get_next(as_msg_field *mf) +{ + return (as_msg_field*)(((uint8_t*)mf) + sizeof(mf->field_sz) + mf->field_sz); +} + +static inline uint8_t * +as_msg_field_skip(as_msg_field *mf) +{ + // At least 1 byte always follow field_sz. + return mf->field_sz == 0 ?
NULL : (uint8_t*)mf + sizeof(mf->field_sz) + mf->field_sz; +} + +/* as_msg_field_get + * Retrieve a specific field from a message */ +static inline as_msg_field * +as_msg_field_get(const as_msg *msg, uint8_t type) +{ + uint16_t n; + as_msg_field *fp = NULL; + + fp = (as_msg_field*)msg->data; + + for (n = 0; n < msg->n_fields; n++) { + + if (fp->type == type) { + break; + } + + fp = as_msg_field_get_next(fp); + } + + if (n == msg->n_fields) { + return NULL; + } + else { + return fp; + } +} + +static inline as_msg_op * +as_msg_op_get_next(as_msg_op *op) +{ + return (as_msg_op*)(((uint8_t*)op) + sizeof(uint32_t) + op->op_sz); +} + +static inline uint8_t * +as_msg_op_skip(as_msg_op *op) +{ + // At least 4 bytes always follow op_sz. + return (uint32_t)op->name_sz + 4 > op->op_sz ? + NULL : (uint8_t*)op + sizeof(op->op_sz) + op->op_sz; +} + +/* as_msg_op_iterate + * Iterator for all the ops of a message. + * First time through: pass NULL as current, you'll get the first op (the + * fields are skipped over) and *n is set to 0 - or NULL if the message has + * no ops. + * Next time through: pass the previous op as current, you'll get NULL when + * there are no more. + */ +static inline as_msg_op * +as_msg_op_iterate(as_msg *msg, as_msg_op *current, int *n) +{ + // Skip over the fields the first time. + if (! 
current) { + if (msg->n_ops == 0) { + return 0; // short cut + } + + as_msg_field *mf = (as_msg_field*)msg->data; + + for (uint16_t i = 0; i < msg->n_fields; i++) { + mf = as_msg_field_get_next(mf); + } + + current = (as_msg_op*)mf; + *n = 0; + + return current; + } + + (*n)++; + + if (*n >= msg->n_ops) { + return 0; + } + + return as_msg_op_get_next(current); +} + +static inline size_t +as_proto_size_get(const as_proto *proto) +{ + return sizeof(as_proto) + proto->sz; +} + +static inline bool +as_proto_is_valid_type(const as_proto *proto) +{ + return proto->type != 0 && proto->type < PROTO_TYPE_MAX; +} + +static inline bool +as_proto_wrapped_is_valid(const as_proto *proto, size_t size) +{ + return proto->version == PROTO_VERSION && + proto->type == PROTO_TYPE_AS_MSG && // currently we only wrap as_msg + as_proto_size_get(proto) == size; +} + +void as_proto_swap(as_proto *proto); +void as_msg_swap_header(as_msg *m); +void as_msg_swap_field(as_msg_field *mf); +void as_msg_swap_op(as_msg_op *op); + +cl_msg *as_msg_create_internal(const char *ns_name, const cf_digest *keyd, + uint8_t info1, uint8_t info2, uint8_t info3); + +cl_msg *as_msg_make_response_msg(uint32_t result_code, uint32_t generation, + uint32_t void_time, as_msg_op **ops, struct as_bin_s **bins, + uint16_t bin_count, struct as_namespace_s *ns, cl_msg *msgp_in, + size_t *msg_sz_in, uint64_t trid); +int32_t as_msg_make_response_bufbuilder(cf_buf_builder **bb_r, + struct as_storage_rd_s *rd, bool no_bin_data, bool include_key, + bool skip_empty_records, cf_vector *select_bins); +cl_msg *as_msg_make_val_response(bool success, const as_val *val, + uint32_t result_code, uint32_t generation, uint32_t void_time, + uint64_t trid, size_t *p_msg_sz); +void as_msg_make_val_response_bufbuilder(const as_val *val, + cf_buf_builder **bb_r, uint32_t val_sz, bool); + +int as_msg_send_reply(struct as_file_handle_s *fd_h, uint32_t result_code, + uint32_t generation, uint32_t void_time, as_msg_op **ops, + struct as_bin_s 
**bins, uint16_t bin_count, struct as_namespace_s *ns, + uint64_t trid); +int as_msg_send_ops_reply(struct as_file_handle_s *fd_h, cf_dyn_buf *db); +bool as_msg_send_fin(cf_socket *sock, uint32_t result_code); +size_t as_msg_send_fin_timeout(cf_socket *sock, uint32_t result_code, + int32_t timeout); + +// Async IO +typedef int (* as_netio_finish_cb) (void *udata, int retcode); +typedef int (* as_netio_start_cb) (void *udata, int seq); +typedef struct as_netio_s { + as_netio_finish_cb finish_cb; + as_netio_start_cb start_cb; + void * data; + // fd and buffer + struct as_file_handle_s * fd_h; + cf_buf_builder * bb_r; + uint32_t offset; + uint32_t seq; + bool slow; + uint64_t start_time; +} as_netio; + +void as_netio_init(); +int as_netio_send(as_netio *io, bool slow, bool blocking); + +#define AS_NETIO_OK 0 +#define AS_NETIO_CONTINUE 1 +#define AS_NETIO_ERR 2 +#define AS_NETIO_IO_ERR 3 + +// These values correspond to client protocol values - do not change them! +typedef enum as_udf_op { + AS_UDF_OP_KVS = 0, + AS_UDF_OP_AGGREGATE = 1, + AS_UDF_OP_BACKGROUND = 2, + AS_UDF_OP_FOREGROUND = 3 // not supported yet +} as_udf_op; + +#define CDT_MAGIC 0xC0 // so we know it can't be (first byte of) msgpack list/map + +typedef enum as_cdt_paramtype_e { + AS_CDT_PARAM_NONE = 0, + + AS_CDT_PARAM_INDEX = 1, + AS_CDT_PARAM_COUNT = 2, + AS_CDT_PARAM_PAYLOAD = 3, + AS_CDT_PARAM_FLAGS = 4, +} as_cdt_paramtype; + +typedef enum result_type_e { + RESULT_TYPE_NONE = 0, + RESULT_TYPE_INDEX = 1, + RESULT_TYPE_REVINDEX = 2, + RESULT_TYPE_RANK = 3, + RESULT_TYPE_REVRANK = 4, + RESULT_TYPE_COUNT = 5, + RESULT_TYPE_KEY = 6, + RESULT_TYPE_VALUE = 7, + RESULT_TYPE_MAP = 8, + RESULT_TYPE_INDEX_RANGE = 9, + RESULT_TYPE_REVINDEX_RANGE = 10, + RESULT_TYPE_RANK_RANGE = 11, + RESULT_TYPE_REVRANK_RANGE = 12, +} result_type_t; + +typedef enum { + AS_CDT_OP_FLAG_RESULT_MASK = 0x0000ffff, + AS_CDT_OP_FLAG_INVERTED = 0x00010000 +} as_cdt_op_flags; + +typedef enum { + AS_CDT_SORT_ASCENDING = 0, + 
AS_CDT_SORT_DESCENDING = 1, + AS_CDT_SORT_DROP_DUPLICATES = 2 +} as_cdt_sort_flags; + +typedef enum { + AS_CDT_LIST_MODIFY_DEFAULT = 0x00, + AS_CDT_LIST_ADD_UNIQUE = 0x01, + AS_CDT_LIST_INSERT_BOUNDED = 0x02 +} as_cdt_list_modify_flags; + +typedef enum as_cdt_optype_e { + // ------------------------------------------------------------------------ + // List Operation + + AS_CDT_OP_LIST_SET_TYPE = 0, + + // Adds + AS_CDT_OP_LIST_APPEND = 1, + AS_CDT_OP_LIST_APPEND_ITEMS = 2, + AS_CDT_OP_LIST_INSERT = 3, + AS_CDT_OP_LIST_INSERT_ITEMS = 4, + + // Removes + AS_CDT_OP_LIST_POP = 5, + AS_CDT_OP_LIST_POP_RANGE = 6, + AS_CDT_OP_LIST_REMOVE = 7, + AS_CDT_OP_LIST_REMOVE_RANGE = 8, + + // Modifies + AS_CDT_OP_LIST_SET = 9, + AS_CDT_OP_LIST_TRIM = 10, + AS_CDT_OP_LIST_CLEAR = 11, + AS_CDT_OP_LIST_INCREMENT = 12, + + AS_CDT_OP_LIST_SORT = 13, + + // Reads + AS_CDT_OP_LIST_SIZE = 16, + AS_CDT_OP_LIST_GET = 17, + AS_CDT_OP_LIST_GET_RANGE = 18, + + // GET_BYs + AS_CDT_OP_LIST_GET_BY_INDEX = 19, + AS_CDT_OP_LIST_GET_BY_VALUE = 20, + AS_CDT_OP_LIST_GET_BY_RANK = 21, + + AS_CDT_OP_LIST_GET_ALL_BY_VALUE = 22, + AS_CDT_OP_LIST_GET_ALL_BY_VALUE_LIST = 23, + + AS_CDT_OP_LIST_GET_BY_INDEX_RANGE = 24, + AS_CDT_OP_LIST_GET_BY_VALUE_INTERVAL = 25, + AS_CDT_OP_LIST_GET_BY_RANK_RANGE = 26, + + // REMOVE_BYs + AS_CDT_OP_LIST_REMOVE_BY_INDEX = 32, + AS_CDT_OP_LIST_REMOVE_BY_VALUE = 33, + AS_CDT_OP_LIST_REMOVE_BY_RANK = 34, + + AS_CDT_OP_LIST_REMOVE_ALL_BY_VALUE = 35, + AS_CDT_OP_LIST_REMOVE_ALL_BY_VALUE_LIST = 36, + + AS_CDT_OP_LIST_REMOVE_BY_INDEX_RANGE = 37, + AS_CDT_OP_LIST_REMOVE_BY_VALUE_INTERVAL = 38, + AS_CDT_OP_LIST_REMOVE_BY_RANK_RANGE = 39, + + // ------------------------------------------------------------------------ + // Map Operation + + // Create and flags + AS_CDT_OP_MAP_SET_TYPE = 64, + + // Modify Ops + AS_CDT_OP_MAP_ADD = 65, + AS_CDT_OP_MAP_ADD_ITEMS = 66, + AS_CDT_OP_MAP_PUT = 67, + AS_CDT_OP_MAP_PUT_ITEMS = 68, + AS_CDT_OP_MAP_REPLACE = 69, + AS_CDT_OP_MAP_REPLACE_ITEMS = 
70, + AS_CDT_OP_MAP_RESERVED_0 = 71, + AS_CDT_OP_MAP_RESERVED_1 = 72, + + AS_CDT_OP_MAP_INCREMENT = 73, + AS_CDT_OP_MAP_DECREMENT = 74, + + AS_CDT_OP_MAP_CLEAR = 75, + + AS_CDT_OP_MAP_REMOVE_BY_KEY = 76, + AS_CDT_OP_MAP_REMOVE_BY_INDEX = 77, + AS_CDT_OP_MAP_REMOVE_BY_VALUE = 78, + AS_CDT_OP_MAP_REMOVE_BY_RANK = 79, + + AS_CDT_OP_MAP_RESERVED_2 = 80, + AS_CDT_OP_MAP_REMOVE_BY_KEY_LIST = 81, + AS_CDT_OP_MAP_REMOVE_ALL_BY_VALUE = 82, + AS_CDT_OP_MAP_REMOVE_BY_VALUE_LIST = 83, + + AS_CDT_OP_MAP_REMOVE_BY_KEY_INTERVAL = 84, + AS_CDT_OP_MAP_REMOVE_BY_INDEX_RANGE = 85, + AS_CDT_OP_MAP_REMOVE_BY_VALUE_INTERVAL = 86, + AS_CDT_OP_MAP_REMOVE_BY_RANK_RANGE = 87, + + // Read ops + AS_CDT_OP_MAP_SIZE = 96, + + AS_CDT_OP_MAP_GET_BY_KEY = 97, + AS_CDT_OP_MAP_GET_BY_INDEX = 98, + AS_CDT_OP_MAP_GET_BY_VALUE = 99, + AS_CDT_OP_MAP_GET_BY_RANK = 100, + + AS_CDT_OP_MAP_RESERVED_3 = 101, + AS_CDT_OP_MAP_GET_ALL_BY_VALUE = 102, + + AS_CDT_OP_MAP_GET_BY_KEY_INTERVAL = 103, + AS_CDT_OP_MAP_GET_BY_INDEX_RANGE = 104, + AS_CDT_OP_MAP_GET_BY_VALUE_INTERVAL = 105, + AS_CDT_OP_MAP_GET_BY_RANK_RANGE = 106, + + AS_CDT_OP_MAP_GET_BY_KEY_LIST = 107, + AS_CDT_OP_MAP_GET_BY_VALUE_LIST = 108, + +} as_cdt_optype; + +#define AS_CDT_OP_LIST_LAST AS_CDT_OP_LIST_REMOVE_BY_RANK_RANGE diff --git a/as/include/base/rec_props.h b/as/include/base/rec_props.h new file mode 100644 index 00000000..14f4f7dd --- /dev/null +++ b/as/include/base/rec_props.h @@ -0,0 +1,79 @@ +/* + * rec_props.h + * + * Copyright (C) 2012-2014 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. 
+ * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +/* + * A list of record properties. + * + */ + +#pragma once + + +//========================================================== +// Includes +// + +#include +#include + + +//========================================================== +// Typedefs +// + +// Values stored on drive - be careful. +typedef enum { + CL_REC_PROPS_FIELD_SET_NAME = 0, + CL_REC_PROPS_FIELD_UNUSED_1 = 1, + CL_REC_PROPS_FIELD_KEY = 2, + CL_REC_PROPS_FIELD_LAST_PLUS_1 +} as_rec_props_field_id; + +//------------------------------------------------ +// Class Member Data +// +typedef struct as_rec_props_s { + uint8_t* p_data; + uint32_t size; +} as_rec_props; + + +//========================================================== +// Public API +// + +void as_rec_props_clear(as_rec_props *_this); +int as_rec_props_get_value(const as_rec_props *_this, + as_rec_props_field_id id, uint32_t *p_value_size, uint8_t **pp_value); +uint32_t as_rec_props_sizeof_field(uint32_t value_size); +void as_rec_props_init(as_rec_props *_this, uint8_t *p_data); +void as_rec_props_init_malloc(as_rec_props *_this, uint32_t malloc_size); +void as_rec_props_add_field(as_rec_props *_this, + as_rec_props_field_id id, uint32_t value_size, const uint8_t *p_value); +void as_rec_props_add_field_null_terminate(as_rec_props *_this, + as_rec_props_field_id id, uint32_t value_len, const uint8_t *p_value); + +size_t as_rec_props_size_all(const uint8_t *set_name, size_t set_name_len, + const uint8_t *key, size_t key_size); +void as_rec_props_fill_all(as_rec_props *_this, uint8_t *p_data, + const uint8_t *set_name, size_t 
set_name_len, const uint8_t *key, + size_t key_size); diff --git a/as/include/base/scan.h b/as/include/base/scan.h new file mode 100644 index 00000000..f71f3f32 --- /dev/null +++ b/as/include/base/scan.h @@ -0,0 +1,58 @@ +/* + * scan.h + * + * Copyright (C) 2015 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +//========================================================== +// Includes. +// + +#include + +#include "dynbuf.h" + + +//========================================================== +// Forward declarations. +// + +struct as_mon_jobstat_s; +struct as_namespace_s; +struct as_transaction_s; + + +//========================================================== +// Public API. 
+// + +void as_scan_init(); +int as_scan(struct as_transaction_s *tr, struct as_namespace_s *ns); +void as_scan_limit_active_jobs(uint32_t max_active); +void as_scan_limit_finished_jobs(uint32_t max_done); +void as_scan_resize_thread_pool(uint32_t n_threads); +int as_scan_get_active_job_count(); +int as_scan_list(char* name, cf_dyn_buf* db); +struct as_mon_jobstat_s* as_scan_get_jobstat(uint64_t trid); +struct as_mon_jobstat_s* as_scan_get_jobstat_all(int* size); +int as_scan_abort(uint64_t trid); +int as_scan_abort_all(); +int as_scan_change_job_priority(uint64_t trid, uint32_t priority); diff --git a/as/include/base/secondary_index.h b/as/include/base/secondary_index.h new file mode 100644 index 00000000..8fecf337 --- /dev/null +++ b/as/include/base/secondary_index.h @@ -0,0 +1,691 @@ +/* + * secondary_index.h + * + * Copyright (C) 2012-2015 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +/* + * SYNOPSIS + * Abstraction to support secondary indexes with multiple implementations. 
+ */ + +#pragma once + +#include "base/datamodel.h" +#include "base/monitor.h" +#include "base/proto.h" +#include "base/system_metadata.h" +#include "base/transaction.h" +#include "fabric/partition.h" + +#include "citrusleaf/cf_atomic.h" +#include "citrusleaf/cf_digest.h" +#include "citrusleaf/cf_ll.h" + +#include "dynbuf.h" +#include "hist.h" +#include +#include +#include +#include +#include "storage/storage.h" + + +/* + * HARD LIMIT ON SIZES + * + * Every size that is computed from other constants is fully parenthesized, so + * it expands safely inside a larger expression. + */ +// ************************************************************************************************** +#define AS_SINDEX_MAX_STRING_KSIZE 2048 +#define AS_SINDEX_MAX_GEOJSON_KSIZE (1024 * 1024) +#define OLD_SINDEX_SMD_KEY_SIZE (AS_ID_INAME_SZ + AS_ID_NAMESPACE_SZ) +#define SINDEX_SMD_KEY_SIZE (AS_ID_NAMESPACE_SZ + AS_SET_NAME_MAX_SIZE + AS_SINDEX_MAX_PATH_LENGTH + 1 + 2 + 2) +#define SINDEX_SMD_VALUE_SIZE (AS_SMD_MAJORITY_CONSENSUS_KEYSIZE) +#define OLD_SINDEX_MODULE "sindex_module" +#define SINDEX_MODULE "sindex" +#define AS_SINDEX_MAX_PATH_LENGTH 256 +#define AS_SINDEX_MAX_DEPTH 10 +#define AS_SINDEX_TYPE_STR_SIZE 20 // LIST / MAPKEYS / MAPVALUES / DEFAULT(NONE) +#define AS_SINDEXDATA_STR_SIZE (AS_SINDEX_MAX_PATH_LENGTH + 1 + 8) // binpath + separator (,) + keytype (string/numeric) +#define AS_INDEX_KEYS_ARRAY_QUEUE_HIGHWATER 512 +#define AS_INDEX_KEYS_PER_ARR 51 +// ************************************************************************************************** + +/* + * Return status codes for index object functions. 
+ * + * NB: When adding error code add the string in the as_sindex_err_str + * in secondary_index.c + * + * Negative > 10 are the ones which show up and goes till client + * + * Positive are < 10 are something which are internal + */ +// ************************************************************************************************** +typedef enum { + AS_SINDEX_ERR_INAME_MAXLEN = -17, + AS_SINDEX_ERR_MAXCOUNT = -16, + AS_SINDEX_ERR_SET_MISMATCH = -15, + AS_SINDEX_ERR_UNKNOWN_KEYTYPE = -14, + AS_SINDEX_ERR_BIN_NOTFOUND = -13, + AS_SINDEX_ERR_TYPE_MISMATCH = -11, + + // Needed when attempting index create/query + AS_SINDEX_ERR_FOUND = -6, + AS_SINDEX_ERR_NOTFOUND = -5, + AS_SINDEX_ERR_NO_MEMORY = -4, + AS_SINDEX_ERR_PARAM = -3, + AS_SINDEX_ERR_NOT_READABLE = -2, + AS_SINDEX_ERR = -1, + AS_SINDEX_OK = 0, + + // Internal Not needed + AS_SINDEX_CONTINUE = 1, + AS_SINDEX_DONE = 2, + // Needed when inserting object in the btree. + AS_SINDEX_KEY_FOUND = 3, + AS_SINDEX_KEY_NOTFOUND = 4 +} as_sindex_status; +// ************************************************************************************************** + +/* + * SINDEX OP TYPES. 
+ */ +// ************************************************************************************************** +typedef enum { + AS_SINDEX_OP_UPDATE = 0, + AS_SINDEX_OP_DELETE = 1, + AS_SINDEX_OP_INSERT = 2, + AS_SINDEX_OP_READ = 3 +} as_sindex_op; +// ************************************************************************************************** + +/* + * SINDEX GC RETURN ENUMS + */ +// ************************************************************************************************** +typedef enum { + AS_SINDEX_GC_OK = 0, + AS_SINDEX_GC_ERROR = 1, + AS_SINDEX_GC_SKIP_ITERATION = 2 +} as_sindex_gc_status; +// ************************************************************************************************** + +/* + * SECONDARY INDEX KEY TYPES same as COL_TYPE* + */ +// ************************************************************************************************** +typedef uint8_t as_sindex_ktype; +// ************************************************************************************************** + +/* + * SINDEX TYPES. + * THEY WOULD BE IN SYNC WITH THE CLIENTS. + * Do not change the order of this enum + */ +// ************************************************************************************************** +typedef enum { + AS_SINDEX_ITYPE_DEFAULT = 0, + AS_SINDEX_ITYPE_LIST = 1, + AS_SINDEX_ITYPE_MAPKEYS = 2, + AS_SINDEX_ITYPE_MAPVALUES = 3, + AS_SINDEX_ITYPE_MAX = 4 +} as_sindex_type; +#define AS_SINDEX_ITYPE_MAX_TO_STR_SZ 2 +// ************************************************************************************************** + +/* + * STRUCTURES FROM ALCHEMY + */ +// ***************************** +struct btree; +// ************************************************************************************************** + +/* + * STATS AND CONFIG STRUCTURE + * Stats are collected about memory utilization based on simple index + * overhead. Any insert delete from the secondary index would update + * this number and the memory management folks has to use this info. 
+ */ +// ************************************************************************************************** +typedef struct as_sindex_stat_s { + cf_atomic64 n_objects; + int n_keys; + cf_atomic64 mem_used; + + cf_atomic64 n_reads; + cf_atomic64 read_errs; + + cf_atomic64 n_writes; + cf_atomic64 write_errs; + histogram * _write_hist; // Histogram to track time spent writing to the sindex + histogram * _si_prep_hist; + + cf_atomic64 n_deletes; + cf_atomic64 delete_errs; + histogram * _delete_hist; // Histogram to track time spent deleting from sindex + + // Background thread stats + cf_atomic64 loadtime; + cf_atomic64 recs_pending; + + cf_atomic64 n_defrag_records; + cf_atomic64 defrag_time; + + // Query Stats + histogram * _query_hist; // Histogram to track query latency + histogram * _query_batch_lookup; // Histogram to track latency of batch request from sindex tree. + histogram * _query_batch_io; // Histogram to track time spent doing I/O per batch + // --aggregation stats + cf_atomic64 n_aggregation; + cf_atomic64 agg_response_size; + cf_atomic64 agg_num_records; + cf_atomic64 agg_errs; + // --lookup stats + cf_atomic64 n_lookup; + cf_atomic64 lookup_response_size; + cf_atomic64 lookup_num_records; + cf_atomic64 lookup_errs; + + histogram * _query_rcnt_hist; // Histogram to track record counts from queries + histogram * _query_diff_hist; // Histogram to track the false positives found by queries +} as_sindex_stat; + +typedef struct as_sindex_config_s { + volatile uint16_t flag; // TODO change_name +} as_sindex_config; + +// ************************************************************************************************** + + +/* + * SINDEX METADATAS + */ +// ************************************************************************************************** +typedef struct as_sindex_physical_metadata_s { + pthread_rwlock_t slock; + struct btree *ibtr; +} as_sindex_pmetadata; + + +typedef struct as_sindex_path_s { + as_particle_type type; // MAP/LIST + union { + 
int index; // For index of lists. + char * key_str; // For string type keys in maps. + uint64_t key_int; // For integer type keys in maps. + } value; + as_particle_type mapkey_type; // This could be either string or integer type +} as_sindex_path; + +typedef struct as_sindex_metadata_s { + pthread_rwlock_t slock; + // Protected by lock + as_sindex_pmetadata * pimd; + uint32_t flag; + + // Static Data. Does not need protection + struct as_sindex_s * si; + char * ns_name; + char * set; + char * iname; + char * bname; + uint32_t binid; // Redundant info to aid search + as_sindex_ktype sktype; // Same as Aerospike Index type + as_sindex_type itype; + as_sindex_path path[AS_SINDEX_MAX_DEPTH]; + int path_length; + char * path_str; + int nprts; // Aerospike Index Number of Index partitions +} as_sindex_metadata; + +/* + * This structure right now hangs from the namespace structure for the + * Aerospike Index B-tree. + */ +typedef struct as_sindex_s { + int simatch; //self, shash match by name + // Protected by SI_GWLOCK + uint8_t state; + + // TODO : shift to imd + volatile uint16_t flag; + // No need to be volatile; little stale info + // about this is ok. 
And it is not checked + // in busy loop + bool enable_histogram; // default false; + + as_namespace *ns; + + // Protected by si reference + struct as_sindex_metadata_s *imd; + struct as_sindex_metadata_s *recreate_imd; + + as_sindex_stat stats; + as_sindex_config config; +} as_sindex; + +// ************************************************************************************************** +/* + * SBINS STRUCTURES + */ +typedef struct sbin_value_pool_s{ + uint32_t used_sz; + uint8_t *value; +} sbin_value_pool; + +#define AS_SINDEX_VALUESZ_ON_STACK (16 * 1000) // bytes passed to alloca(); parenthesized so it expands safely inside larger expressions +#define SINDEX_BINS_SETUP(skey_bin, size) \ + sbin_value_pool value_pool; \ + value_pool.value = alloca(AS_SINDEX_VALUESZ_ON_STACK); \ + value_pool.used_sz = 0; \ + as_sindex_bin skey_bin[(size)]; \ + for (int id = 0; id < (size); id++) { \ + skey_bin[id].si = NULL; \ + skey_bin[id].stack_buf = &value_pool; \ + } + +/* + * Used as structure to call into secondary indexes sindex_* interface + * TODO: as_sindex_bin is not appropriate name for this structure. + * maybe as_sindex_transaction + */ +typedef struct as_sindex_bin_s { + union { // we use this if we need to store only one value inside sbin. + int64_t int_val; // accessing this is much faster than accessing any other value + cf_digest str_val; // value on the stack. + } value; + uint64_t num_values; + void * values; // If there are more than 1 value in the sbin, we use this to + as_particle_type type; // point to them. the type of data which is going to get indexed + as_sindex_op op; // (STRING or INTEGER). Should we delete or insert this values + bool to_free; // from/into the secondary index tree. If the values are malloced. + as_sindex * si; // simatch of the si this bin is pointing to. + sbin_value_pool * stack_buf; + uint32_t heap_capacity; +} as_sindex_bin; + +// TODO: Reorganise this structure. +// No need of union.
+typedef struct as_sindex_bin_data_s { + uint32_t id; + as_particle_type type; // this type is citrusleaf type + // Union is to support sindex for other datatypes in future. + // Currently sindex is supported for only int64 and string. + union { + int64_t i64; + } u; + cf_digest digest; +} as_sindex_bin_data; + +// Caution: Using this will waste 12 bytes per long type skey +typedef struct as_sindex_key_s { + union { + cf_digest str_key; + uint64_t int_key; + } key; +} as_sindex_key; +// ************************************************************************************************** + + +// ************************************************************************************************** + +/* + * STRUCTURES FOR QUERY MODULE + */ +// ************************************************************************************************** +struct ai_obj; +typedef struct as_sindex_query_context_s { + uint64_t bsize; + cf_ll *recl; + uint64_t n_bdigs; + + int range_index; + + // Physical Tree offset + bool new_ibtr; // If new tree + int pimd_idx; + + // IBTR offset + bool nbtr_done; // If nbtr was finished + // next iteration starts + // from key next to bkey + struct ai_obj *bkey; // offset in ibtr + + // NBTR offset + cf_digest bdig; + + // If true all query-able partitions will be reserved before processing the query + bool partitions_pre_reserved; + // Cache information about query-able partitions + bool can_partition_query[AS_PARTITIONS]; +} as_sindex_qctx; + +/* + * The range structure used to define the lower and upper limit + * along with the key types.
+ * + * [0, endl] + * [startl, -1(inf)] + * [startl, endl] + */ +typedef struct as_sindex_range_s { + uint8_t num_binval; + bool isrange; + as_sindex_bin_data start; + as_sindex_bin_data end; + as_sindex_type itype; + char bin_path[AS_SINDEX_MAX_PATH_LENGTH]; + uint64_t cellid; // target of regions-containing-point query + geo_region_t region; // target of points-in-region query +} as_sindex_range; + +/* + * sindex_keys are used by Secondary index queries to validate the keys against + * the values of bins + * All the jobs which run over these queries also use them + * Like - Aggregation Query + */ +typedef struct as_index_keys_arr_s { + uint32_t num; + cf_digest pindex_digs[AS_INDEX_KEYS_PER_ARR]; + as_sindex_key sindex_keys[AS_INDEX_KEYS_PER_ARR]; +} __attribute__ ((packed)) as_index_keys_arr; + +typedef struct as_index_keys_ll_element_s { + cf_ll_element ele; + as_index_keys_arr * keys_arr; +} as_index_keys_ll_element; + + +// ************************************************************************************************** + + +// APIs exposed to other modules +// TODO return values is actually enum. + +/* + * MODULE INIT AND SHUTDOWN + */ +// ************************************************************************************************** + +/* Index abstraction layer functions. */ +/* + * Initialize an instantiation of the index abstraction layer + * using the array of index type-specific parameters passed in. + * + * All indexes created during this instantiation will use these type-specific + * parameters (e.g., maximum data structure sizes, allocation policies, and any + * other tuning parameters.) + * + * Call once before creating any type of index object. + */ +extern int as_sindex_init(as_namespace *ns); + +/* + * Terminate an instantiation of the index abstraction layer. + * + * Do not use any "sindex" functions after calling this function, so free your indexes beforehand.
+ */ +extern int as_sindex_reinit(char *name, char *params, cf_dyn_buf *db); +// ************************************************************************************************** + +/* + * INDEX BOOT + */ +// ************************************************************************************************** +extern int as_sindex_populate_done(as_sindex *si); +extern int as_sindex_boot_populateall_done(as_namespace *ns); +extern int as_sindex_boot_populateall(); +// ************************************************************************************************** + +/* + * DDL AND METADATA QUERY + * +*/ +// ************************************************************************************************** +extern int as_sindex_create(as_namespace *ns, as_sindex_metadata *imd); +extern int as_sindex_destroy(as_namespace *ns, as_sindex_metadata *imd); +extern int as_sindex_recreate(as_sindex_metadata *imd); +extern void as_sindex_destroy_pmetadata(as_sindex *si); +// ************************************************************************************************** + + +/* + * CREATION AND UPDATION OF SINDEX BIN + */ +// ************************************************************************************************** +extern int as_sindex_sbins_from_rd(as_storage_rd *rd, uint16_t from_bin, uint16_t to_bin, + as_sindex_bin sbins[], as_sindex_op op); +extern int as_sindex_sbins_from_bin(as_namespace *ns, const char *set, const as_bin *b, + as_sindex_bin * start_sbin, as_sindex_op op); +extern int as_sindex_update_by_sbin(as_namespace *ns, const char *set, as_sindex_bin *start_sbin, + int num_sbins, cf_digest * pkey); +extern uint32_t as_sindex_sbins_populate(as_sindex_bin *sbins, as_namespace *ns, const char *set_name, + const as_bin *b_old, const as_bin *b_new); +// ************************************************************************************************** + + +/* + * DMLs USING RECORDS + */ +// 
************************************************************************************************** +int as_sindex_put_rd(as_sindex *si, as_storage_rd *rd); +void as_sindex_putall_rd(as_namespace *ns, as_storage_rd *rd); +// ************************************************************************************************** + + +/* + * UTILS + */ +// ************************************************************************************************** +extern int as_sindex_ns_has_sindex(as_namespace *ns); +extern const char * as_sindex_err_str(int err_code); +extern uint8_t as_sindex_err_to_clienterr(int err, char *fname, int lineno); +extern bool as_sindex_isactive(as_sindex *si); +extern int as_sindex_get_err(int op_code, char *filename, int lineno); +extern as_sindex_status as_sindex__delete_from_set_binid_hash(as_namespace * ns, + as_sindex_metadata * imd); +extern as_val * as_sindex_extract_val_from_path(as_sindex_metadata * imd, as_val * v); +extern as_sindex_gc_status as_sindex_can_defrag_record(as_namespace *ns, cf_digest *keyd); +extern as_sindex_status as_sindex_extract_bin_path(as_sindex_metadata * imd, char * path_str); +int as_sindex_create_check_params(as_namespace* ns, as_sindex_metadata* imd); +bool as_sindex_delete_checker(as_namespace *ns, as_sindex_metadata *imd); +as_particle_type as_sindex_pktype(as_sindex_metadata * imd); +extern const char * as_sindex_ktype_str(as_sindex_ktype type); +extern as_sindex_ktype as_sindex_ktype_from_string(const char * type_str); +int as_sindex_arr_lookup_by_set_binid_lockfree(as_namespace * ns, + const char *set, int binid, as_sindex ** si_arr); +void as_sindex_delete_set(as_namespace * ns, char * set_name); +// ************************************************************************************************** + +/* + * INFO AND CONFIGS + */ +// ************************************************************************************************** +extern int as_sindex_list_str(as_namespace *ns, cf_dyn_buf *db); +extern int 
as_sindex_stats_str(as_namespace *ns, char * iname, cf_dyn_buf *db); +extern int as_sindex_set_config(as_namespace *ns, as_sindex_metadata *imd, char *params); +extern void as_sindex_dump(char *nsname, char *iname, char *fname, bool verbose); +extern void as_sindex_gconfig_default(struct as_config_s *c); +extern int as_info_parse_params_to_sindex_imd(char* params, as_sindex_metadata *imd, cf_dyn_buf* db, + bool is_create, bool *is_smd_op, char * cmd); +void as_sindex__config_default(as_sindex *si); +void as_sindex_ticker_start(as_namespace * ns, as_sindex * si); +void as_sindex_ticker(as_namespace * ns, as_sindex * si, uint64_t n_obj_scanned, uint64_t start_time); +void as_sindex_ticker_done(as_namespace * ns, as_sindex * si, uint64_t start_time); +// ************************************************************************************************** + +/* + * HISTOGRAMS + */ +// ************************************************************************************************** +extern int as_sindex_histogram_enable(as_namespace *ns, char * iname, bool enable); +extern int as_sindex_histogram_dumpall(as_namespace *ns); +#define SINDEX_HIST_INSERT_DATA_POINT(si, type, start_time_ns) /* calls histogram_insert_data_point() on si->stats._<type> only if enabled, start_time_ns != 0 and the histogram is non-NULL; si is evaluated more than once */ \ +do { \ + if ((si)->enable_histogram && (start_time_ns) != 0) { \ + if ((si)->stats._ ##type) { \ + histogram_insert_data_point((si)->stats._ ##type, (start_time_ns)); \ + } \ + } \ +} while(0); + +#define SINDEX_HIST_INSERT_RAW(si, type, value) /* calls histogram_insert_raw() on si->stats._<type> only if enabled and the histogram is non-NULL; si is evaluated more than once */ \ +do { \ + if ((si)->enable_histogram) { \ + if ((si)->stats._ ##type) { \ + histogram_insert_raw((si)->stats._ ##type, (value)); \ + } \ + } \ +} while(0); + + +// ************************************************************************************************** + +/* + * UTILS FOR QUERIES +*/ +// ************************************************************************************************** +extern int as_sindex_query(as_sindex *si, as_sindex_range *range, as_sindex_qctx *qctx); +extern int as_sindex_range_free(as_sindex_range **srange); +extern int
as_sindex_rangep_from_msg(as_namespace *ns, as_msg *msgp, as_sindex_range **srange); +extern int as_sindex_range_from_msg(as_namespace *ns, as_msg *msgp, as_sindex_range *srange); +extern bool as_sindex_can_query(as_sindex *si); +extern as_sindex * as_sindex_from_msg(as_namespace *ns, as_msg *msgp); +extern as_sindex * as_sindex_from_range(as_namespace *ns, char *set, as_sindex_range *srange); +extern int as_index_keys_reduce_fn(cf_ll_element *ele, void *udata); +extern void as_index_keys_destroy_fn(cf_ll_element *ele); +// ************************************************************************************************** + + +/* + * RESERVE, RELEASE AND FREE + */ +// ************************************************************************************************** +#define AS_SINDEX_RESERVE(si) \ + as_sindex_reserve((si), __FILE__, __LINE__); +#define AS_SINDEX_RELEASE(si) \ + as_sindex_release((si), __FILE__, __LINE__); +extern int as_sindex_reserve(as_sindex *si, char *fname, int lineno); +extern void as_sindex_release(as_sindex *si, char *fname, int lineno); +extern int as_sindex_imd_free(as_sindex_metadata *imd); +extern int as_sindex_sbin_free(as_sindex_bin *sbin); +extern int as_sindex_sbin_freeall(as_sindex_bin *sbin, int numval); +void as_sindex_release_arr(as_sindex *si_arr[], int si_arr_sz); +// ************************************************************************************************** + +/* + * SINDEX LOCKS + */ +// ************************************************************************************************** +extern pthread_rwlock_t g_sindex_rwlock; +#define SINDEX_GRLOCK() \ +do { \ + int ret = pthread_rwlock_rdlock(&g_sindex_rwlock); \ + if (ret) cf_warning(AS_SINDEX, "GRLOCK(%d) %s:%d",ret, __FILE__, __LINE__); \ +} while (0); + +#define SINDEX_GWLOCK() \ +do { \ + int ret = pthread_rwlock_wrlock(&g_sindex_rwlock); \ + if (ret) cf_warning(AS_SINDEX, "GWLOCK(%d) %s:%d", ret, __FILE__, __LINE__); \ +} while (0); + +#define SINDEX_GRUNLOCK() 
\ +do { \ + int ret = pthread_rwlock_unlock(&g_sindex_rwlock); \ + if (ret) cf_warning(AS_SINDEX, "GRUNLOCK (%d) %s:%d",ret, __FILE__, __LINE__); \ +} while (0); + +#define SINDEX_GWUNLOCK() \ +do { \ + int ret = pthread_rwlock_unlock(&g_sindex_rwlock); \ + if (ret) cf_warning(AS_SINDEX, "GWUNLOCK (%d) %s:%d",ret, __FILE__, __LINE__); \ +} while (0); + +#define PIMD_RLOCK(l) \ +do { \ + int ret = pthread_rwlock_rdlock((l)); \ + if (ret) cf_warning(AS_SINDEX, "RLOCK_ONLY (%d) %s:%d", ret, __FILE__, __LINE__); \ +} while(0); + +#define PIMD_WLOCK(l) \ +do { \ + int ret = pthread_rwlock_wrlock((l)); \ + if (ret) cf_warning(AS_SINDEX, "WLOCK_ONLY (%d) %s:%d",ret, __FILE__, __LINE__); \ +} while(0); + +#define PIMD_RUNLOCK(l) \ +do { \ + int ret = pthread_rwlock_unlock((l)); \ + if (ret) cf_warning(AS_SINDEX, "RUNLOCK_ONLY (%d) %s:%d",ret, __FILE__, __LINE__); \ +} while(0); + +#define PIMD_WUNLOCK(l) \ +do { \ + int ret = pthread_rwlock_unlock((l)); \ + if (ret) cf_warning(AS_SINDEX, "WUNLOCK_ONLY (%d) %s:%d",ret, __FILE__, __LINE__); \ +} while(0); + +// ************************************************************************************************** + +/* + * APIs for SMD + */ +// ************************************************************************************************** +extern void as_sindex_init_smd(); +extern void as_sindex_imd_to_smd_key(const as_sindex_metadata *imd, char *smd_key); +extern bool as_sindex_delete_imd_to_smd_key(as_namespace *ns, as_sindex_metadata *imd, char *smd_key); +extern int as_sindex_smd_accept_cb(char *module, as_smd_item_list_t *items, void *udata, + uint32_t accept_opt); +// ************************************************************************************************** + +/* + * QUERY MACROS + */ +// ************************************************************************************************** +#define AS_QUERY_OK AS_SINDEX_OK +#define AS_QUERY_ERR AS_SINDEX_ERR +#define AS_QUERY_CONTINUE AS_SINDEX_CONTINUE +#define 
AS_QUERY_DONE AS_SINDEX_DONE +// ************************************************************************************************** + +/* + * QUERY APIs exposed to other modules + */ +// ************************************************************************************************** +extern void as_query_init(); +extern int as_query(as_transaction *tr, as_namespace *ns); +extern int as_query_reinit(int set_size, int *actual_size); +extern int as_query_worker_reinit(int set_size, int *actual_size); +extern int as_query_list(char *name, cf_dyn_buf *db); +extern int as_query_kill(uint64_t trid); +extern void as_query_gconfig_default(struct as_config_s *c); +extern as_mon_jobstat * as_query_get_jobstat(uint64_t trid); +extern as_mon_jobstat * as_query_get_jobstat_all(int * size); +extern int as_query_set_priority(uint64_t trid, uint32_t priority); +extern void as_query_histogram_dumpall(); +extern as_index_keys_arr * as_index_get_keys_arr(); +extern void as_index_keys_release_arr_to_queue(as_index_keys_arr *v); +extern int as_index_keys_ll_reduce_fn(cf_ll_element *ele, void *udata); +extern void as_index_keys_ll_destroy_fn(cf_ll_element *ele); + +extern cf_atomic32 g_query_short_running; +extern cf_atomic32 g_query_long_running; +// ************************************************************************************************** diff --git a/as/include/base/security.h b/as/include/base/security.h new file mode 100644 index 00000000..c34fe07b --- /dev/null +++ b/as/include/base/security.h @@ -0,0 +1,106 @@ +/* + * security.h + * + * Copyright (C) 2014-2017 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. 
+ * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +//========================================================== +// Includes. +// + +#include +#include + + +//========================================================== +// Forward declarations. +// + +struct as_file_handle_s; +struct as_namespace_s; +struct as_transaction_s; + + +//========================================================== +// Typedefs & constants. +// + +// Security permissions. +typedef enum { + PERM_NONE = 0, + + // Data transactions. + PERM_READ = 0x0001, + PERM_SCAN = 0x0002, + PERM_QUERY = 0x0004, + PERM_WRITE = 0x0008, + PERM_DELETE = 0x0010, + PERM_UDF_APPLY = 0x0020, + PERM_UDF_SCAN = 0x0040, + PERM_UDF_QUERY = 0x0080, + // ... 8 unused bits ... + + // Data transactions' system metadata management. + PERM_INDEX_MANAGE = 0x00010000, + PERM_UDF_MANAGE = 0x00020000, + PERM_SCAN_MANAGE = 0x00040000, + PERM_QUERY_MANAGE = 0x00080000, + PERM_JOB_MONITOR = 0x00100000, + PERM_TRUNCATE = 0x00200000, + // ... 2 unused bits ... + + // Deployment operations management. + PERM_SET_CONFIG = 0x01000000, + PERM_LOGGING_CTRL = 0x02000000, + PERM_SERVICE_CTRL = 0x04000000, + + // Database users and roles management. + PERM_USER_ADMIN = 0x100000000000 +} as_sec_perm; + +// Current security message version. +#define AS_SEC_MSG_SCHEME 0 + +// Security protocol message container. 
+typedef struct as_sec_msg_s { + uint8_t scheme; // security scheme/version + uint8_t result; // result code (only for responses, except MORE) + uint8_t command; // security command (only for requests) + uint8_t n_fields; // number of fields in this message + + uint8_t unused[12]; // reserved bytes round as_sec_msg size to 16 bytes + + uint8_t fields[]; // the fields (name/value pairs) +} __attribute__ ((__packed__)) as_sec_msg; + + +//========================================================== +// Public API. +// + +void as_security_init(); +uint8_t as_security_check(const struct as_file_handle_s* fd_h, as_sec_perm perm); +bool as_security_check_data_op(struct as_transaction_s* tr, struct as_namespace_s* ns, as_sec_perm perm); +void* as_security_filter_create(); +void as_security_filter_destroy(void* pv_filter); +void as_security_log(const struct as_file_handle_s* fd_h, uint8_t result, as_sec_perm perm, const char* action, const char* detail); +void as_security_refresh(struct as_file_handle_s* fd_h); +void as_security_transact(struct as_transaction_s* tr); diff --git a/as/include/base/security_config.h b/as/include/base/security_config.h new file mode 100644 index 00000000..6a9bae65 --- /dev/null +++ b/as/include/base/security_config.h @@ -0,0 +1,78 @@ +/* + * security_config.h + * + * Copyright (C) 2014 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. 
+ * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +#include +#include + + +//========================================================== +// Typedefs & constants. +// + +// Syslog "local" facilities. +typedef enum { + AS_SYSLOG_NONE = -1, + AS_SYSLOG_MIN = 0, + AS_SYSLOG_MAX = 7, + + // May configure any facility from "local0" to "local7". + AS_SYSLOG_LOCAL0 = 0, + AS_SYSLOG_LOCAL1 = 1, + AS_SYSLOG_LOCAL2 = 2, + AS_SYSLOG_LOCAL3 = 3, + AS_SYSLOG_LOCAL4 = 4, + AS_SYSLOG_LOCAL5 = 5, + AS_SYSLOG_LOCAL6 = 6, + AS_SYSLOG_LOCAL7 = 7, +} as_sec_syslog_local; + +// Security-related reporting sink bit-field flags. +#define AS_SEC_SINK_LOG 0x1 +#define AS_SEC_SINK_SYSLOG 0x2 + +// Security-related reporting sinks as bit-fields. +typedef struct as_sec_report_s { + uint32_t authentication; + uint32_t data_op; + uint32_t sys_admin; + uint32_t user_admin; + uint32_t violation; +} as_sec_report; + +// Security configuration. +typedef struct as_sec_config_s { + bool security_enabled; + uint32_t privilege_refresh_period; // (seconds) + as_sec_report report; // reporting sinks + as_sec_syslog_local syslog_local; // syslog local facility +} as_sec_config; + + +//========================================================== +// Public API. +// + +void as_security_config_check(); +void as_security_config_log_scope(uint32_t sink, const char* ns_name, + const char* set_name); diff --git a/as/include/base/stats.h b/as/include/base/stats.h new file mode 100644 index 00000000..5605e536 --- /dev/null +++ b/as/include/base/stats.h @@ -0,0 +1,129 @@ +/* + * stats.h + * + * Copyright (C) 2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. 
+ * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +//========================================================== +// Includes. +// + +#include + +#include "citrusleaf/cf_atomic.h" + +#include "hist.h" + +#include "fabric/fabric.h" + + +//========================================================== +// Typedefs & constants. +// + +typedef struct as_stats_s { + + // Connection stats. + cf_atomic64 proto_connections_opened; // not just a statistic + cf_atomic64 proto_connections_closed; // not just a statistic + // In ticker but not collected via info: + cf_atomic64 heartbeat_connections_opened; + cf_atomic64 heartbeat_connections_closed; + cf_atomic64 fabric_connections_opened; + cf_atomic64 fabric_connections_closed; + + // Heartbeat stats. + cf_atomic64 heartbeat_received_self; + cf_atomic64 heartbeat_received_foreign; + + // Demarshal stats. + uint64_t reaper_count; // not in ticker - incremented only in reaper thread + + // Info stats. + cf_atomic64 info_complete; + + // Early transaction errors. + cf_atomic64 n_demarshal_error; + cf_atomic64 n_tsvc_client_error; + cf_atomic64 n_tsvc_batch_sub_error; + cf_atomic64 n_tsvc_udf_sub_error; + + // Batch-index stats. 
+ cf_atomic64 batch_index_initiate; // not in ticker - not just a statistic + cf_atomic64 batch_index_complete; + cf_atomic64 batch_index_errors; + cf_atomic64 batch_index_timeout; + + // Batch-index stats. + cf_atomic64 batch_index_huge_buffers; // not in ticker + cf_atomic64 batch_index_created_buffers; // not in ticker + cf_atomic64 batch_index_destroyed_buffers; // not in ticker + + // "Old" batch stats. + cf_atomic64 batch_initiate; // not in ticker + cf_atomic64 batch_errors; // not in ticker + cf_atomic64 batch_timeout; // not in ticker + + // Query & secondary index stats. + cf_atomic64 query_false_positives; + cf_atomic64 sindex_gc_timedout; // number of times sindex gc iteration timed out waiting for partition lock + uint64_t sindex_gc_list_creation_time; // cumulative sum of list creation phase in sindex gc + uint64_t sindex_gc_list_deletion_time; // cumulative sum of list deletion phase in sindex gc + uint64_t sindex_gc_objects_validated; // cumulative sum of sindex objects validated + uint64_t sindex_gc_garbage_found; // amount of garbage found during list creation phase + uint64_t sindex_gc_garbage_cleaned; // amount of garbage deleted during list deletion phase + + // Fabric stats. + uint64_t fabric_bulk_s_rate; + uint64_t fabric_bulk_r_rate; + uint64_t fabric_ctrl_s_rate; + uint64_t fabric_ctrl_r_rate; + uint64_t fabric_meta_s_rate; + uint64_t fabric_meta_r_rate; + uint64_t fabric_rw_s_rate; + uint64_t fabric_rw_r_rate; + + //-------------------------------------------- + // Histograms. 
+ // + + histogram* batch_index_hist; + bool batch_index_hist_active; // automatically activated + + histogram* info_hist; + + histogram* svc_demarshal_hist; + histogram* svc_queue_hist; + + histogram* fabric_send_init_hists[AS_FABRIC_N_CHANNELS]; + histogram* fabric_send_fragment_hists[AS_FABRIC_N_CHANNELS]; + histogram* fabric_recv_fragment_hists[AS_FABRIC_N_CHANNELS]; + histogram* fabric_recv_cb_hists[AS_FABRIC_N_CHANNELS]; + +} as_stats; + + +//========================================================== +// Public API. +// + +// For now this is in thr_info.c, until a separate .c file is worth it. +extern as_stats g_stats; diff --git a/as/include/base/system_metadata.h b/as/include/base/system_metadata.h new file mode 100644 index 00000000..6b398cd9 --- /dev/null +++ b/as/include/base/system_metadata.h @@ -0,0 +1,236 @@ +/* + * system_metadata.h + * + * Copyright (C) 2012-2014 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ +/* + * SYNOPSIS + * The System Metadata module provides a mechanism for synchronizing + * module metadata cluster-wide. 
While each module is responsible + * for the interpretation of its own metadata, the System Metadata + * module provides persistence and automatic distribution of changes + * to that opaque metadata. + */ + +#pragma once + +#include +#include +#include + +#include "node.h" + + +/* Declare Public System Metadata Types */ + + +/* + * Type for actions to perform upon metadata items. + */ +typedef enum as_smd_action_e { + AS_SMD_ACTION_SET, // Add or modify this metadata item + AS_SMD_ACTION_DELETE // Delete this metadata item +} as_smd_action_t; + +/* + * Type for an item of metadata. + */ +typedef struct as_smd_item_s { + cf_node node_id; // Originating node ID + as_smd_action_t action; // Action to perform on this metadata item + char *module_name; // Module name of the item + char *key; // Key of the metadata item + char *value; // Value of the metadata item + uint32_t generation; // Metadata generation counter + uint64_t timestamp; // Time metadata last modified +} as_smd_item_t; + +/* + * Type for a list of metadata items for a particular node. + */ +typedef struct as_smd_item_list_s { + size_t num_items; // Number of metadata items + as_smd_item_t *item[]; // Array of pointers to metadata items +} as_smd_item_list_t; + +/* + * Opaque type representing the state of the System Metadata module. + */ +typedef struct as_smd_s as_smd_t; + +/* + * SMD is a singleton, though many class methods are passed an object pointer. + */ +extern as_smd_t *g_smd; + +/* + * Type for mutually-disjoint flag values passed by SMD to the module's accept callback + * via the "accept_opt" argument specifying the originator of the operation. 
+ */ +typedef enum as_smd_accept_option_e { + AS_SMD_ACCEPT_OPT_CREATE = (1 << 0), // Module creation-time accept event + AS_SMD_ACCEPT_OPT_MERGE = (1 << 1), // Post-cluster state change merge + AS_SMD_ACCEPT_OPT_API = (1 << 2) // User-initiated set/delete metadata via SMD API +} as_smd_accept_option_t; + +/* + * Size of the key to be used during a majority consensus merge operation. + * (Ideally this would be a module-supplied parameter rather than a constant.) + */ +#define AS_SMD_MAJORITY_CONSENSUS_KEYSIZE (1024) + + +/* Callback Function Types. */ + + +/* + * Callback function type for getting metadata items. + */ +typedef int (*as_smd_get_cb)(char *module, as_smd_item_list_t *items, void *udata); + +/* + * Callback function type for metadata merge policy functions. + * Resolve action executed on Paxos principal node to determine the cluster-wide "truth." + * Default merge policy: union + * Alternative merge policies: highest generation, latest timestamp + * Configurable via registering a per-module callback function. + */ +typedef int (*as_smd_merge_cb)(const char *module, as_smd_item_list_t **item_list_out, as_smd_item_list_t **item_lists_in, size_t num_lists, void *udata); + +/* + * Callback function type for metadata merge item conflict resolution functions. + * Use only if not using custom as_smd_merge_cb + * Default item conflict resolution picks greater SMD generation/timestamp + * Configurable via registering a per-module callback function. + * Return true to choose existing_item, false to choose new_item. + */ +typedef bool (*as_smd_conflict_cb)(char *module, as_smd_item_t *existing_item, as_smd_item_t *new_item, void *udata); + +/* + * Callback function type for metadata acceptance policy functions. + * The accept callback is executed to commit a metadata change, with + * the accept option specifying the originator of the accept action as follows: + * 1). OPT_CREATE: When a module has been created and its persisted metadata has been restored. + * 2). 
OPT_MERGE: When all cluster nodes receive and accept the truth from the Paxos principal. + * 3). OPT_API: When metadata is set via the API or restored from persistence, handled locally + * prior to cluster formation, otherwise proxied via the Paxos principal. + * Configurable via registering a per-module callback function. + */ +typedef int (*as_smd_accept_cb)(char *module, as_smd_item_list_t *items, void *udata, uint32_t accept_opt); + +/* + * Callback function type for metadata acceptance pre-check policy function. + * When a user-initiated metadata change operation is requested via the SMD API, + * the validity of operation and arguments is first checked on the Paxos principal + * to decide whether this operation should be sent to all cluster nodes. + * Configurable via registering a per-module callback function. + */ +typedef int (*as_smd_can_accept_cb)(char* module, as_smd_item_t *item, void *udata); + + +/* Constructor and destructor functions for metadata item list objects passed to/from the callback functions. */ + + +/* + * Create an empty list of reference-counted metadata items. + */ +as_smd_item_list_t *as_smd_item_list_create(size_t num_items); + +/* + * Release a list of reference-counted metadata items. + */ +void as_smd_item_list_destroy(as_smd_item_list_t *items); + + +/* System Metadata Module Startup / Shutdown */ + + +/* + * Initialize the single global System Metadata module. + */ +as_smd_t *as_smd_init(void); + +/* + * Start the System Metadata module to begin receiving Paxos state change events. + */ +int as_smd_start(as_smd_t *smd); + +/* + * Terminate the System Metadata module. + */ +int as_smd_shutdown(as_smd_t *smd); + + +/* Metadata Manipulation */ + + +/* + * Create a container for the named module's metadata and register the policy callback functions. + * (Pass a NULL callback function pointer to select the default policy.) 
+ */ +int as_smd_create_module(char *module, + as_smd_merge_cb merge_cb, void *merge_udata, + as_smd_conflict_cb conflict_cb, void *conflict_udata, + as_smd_accept_cb accept_cb, void *accept_udata, + as_smd_can_accept_cb can_accept_cb, void *can_accept_udata); + +/* + * Destroy the container for the named module's metadata, releasing all of its metadata. + */ +int as_smd_destroy_module(char *module); + +/* + * Add a new, or modify an existing, metadata item in an existing module. + */ +int as_smd_set_metadata(char *module, char *key, char *value); + +/* + * Delete an existing metadata item from an existing module. + */ +int as_smd_delete_metadata(char *module, char *key); + +/* + * Retrieve metadata item(s). (Pass NULL for module and/or key for "all".) + */ +int as_smd_get_metadata(char *module, char *key, as_smd_get_cb cb, void *udata); + + +/* Info Command Functions */ + + +/* + * Print info. about the System Metadata state to the log. + * (Verbose true prints detailed info. about the metadata values.) + */ +void as_smd_dump(bool verbose); + +/* + * Manipulate the System Metadata and log the result. + */ +void as_smd_info_cmd(char *cmd, cf_node node_id, char *module, char *key, char *value); + + +/* Pre-Defined Callback Policy Functions. */ + + +/* + * Merge callback function implementing the majority consensus merge policy. + */ +int as_smd_majority_consensus_merge(const char *module, as_smd_item_list_t **item_list_out, + as_smd_item_list_t **item_lists_in, size_t num_lists, void *udata); diff --git a/as/include/base/thr_batch.h b/as/include/base/thr_batch.h new file mode 100644 index 00000000..b80056a5 --- /dev/null +++ b/as/include/base/thr_batch.h @@ -0,0 +1,31 @@ +/* + * thr_batch.h + * + * Copyright (C) 2008-2015 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. 
+ * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +#include "base/datamodel.h" +#include "base/transaction.h" + +int as_batch_direct_init(); +int as_batch_direct_queue_task(as_transaction* tr, as_namespace *ns); +int as_batch_direct_queue_size(); +int as_batch_direct_threads_resize(uint32_t threads); diff --git a/as/include/base/thr_demarshal.h b/as/include/base/thr_demarshal.h new file mode 100644 index 00000000..a94dd879 --- /dev/null +++ b/as/include/base/thr_demarshal.h @@ -0,0 +1,46 @@ +/* + * thr_demarshal.h + * + * Copyright (C) 2015 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. 
If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +#include "socket.h" +#include "tls.h" +#include "base/cfg.h" +#include "base/transaction.h" + +typedef struct as_info_endpoint_s { + cf_addr_list addrs; + cf_ip_port port; +} as_info_endpoint; + +typedef struct as_info_access_s { + as_info_endpoint service; + as_info_endpoint alt_service; + as_info_endpoint tls_service; + as_info_endpoint alt_tls_service; +} as_info_access; + +extern as_info_access g_access; +extern cf_serv_cfg g_service_bind; +extern cf_tls_info *g_service_tls; + +void thr_demarshal_rearm(as_file_handle *fd_h); diff --git a/as/include/base/thr_info.h b/as/include/base/thr_info.h new file mode 100644 index 00000000..cfe23370 --- /dev/null +++ b/as/include/base/thr_info.h @@ -0,0 +1,88 @@ +/* + * thr_info.h + * + * Copyright (C) 2008-2014 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. 
If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +#include +#include +#include + +#include "dynbuf.h" + +#include "base/proto.h" +#include "base/security.h" +#include "base/transaction.h" + +typedef int (*as_info_get_tree_fn) (char *name, char *subtree, cf_dyn_buf *db); +typedef int (*as_info_get_value_fn) (char *name, cf_dyn_buf *db); +typedef int (*as_info_command_fn) (char *name, char *parameters, cf_dyn_buf *db); + +// Sets a static value - set to 0 to remove a previous value. +extern int as_info_set_buf(const char *name, const uint8_t *value, size_t value_sz, bool def); +extern int as_info_set(const char *name, const char *value, bool def); + +// For dynamic items - you will get called when the name is requested. The +// dynbuf will be fully set up for you - just add the information you want to +// return. +extern int as_info_set_dynamic(char *name, as_info_get_value_fn gv_fn, bool def); + +// For tree items - you will get called when the name is requested, and it will +// have the name you registered (name) and the subtree portion (value). The +// dynbuf will be fully set up for you - just add the information you want to +// return +extern int as_info_set_tree(char *name, as_info_get_tree_fn gv_fn); + +// For commands - you will be called with the parameters. +extern int as_info_set_command(char *name, as_info_command_fn command_fn, as_sec_perm required_perm); + +int as_info_parameter_get(char *param_str, char *param, char *value, int *value_len); + +typedef struct as_info_transaction_s { + as_file_handle *fd_h; + as_proto *proto; + uint64_t start_time; +} as_info_transaction; + +// Processes an info request that comes in from the network, sends the response. +extern void as_info(as_info_transaction *it); + +// Processes a pure buffer request without any info header stuff. 
+extern int as_info_buffer(uint8_t *req_buf, size_t req_buf_len, cf_dyn_buf *rsp); + +// The info unit uses the fabric to communicate with the other members of the +// cluster so it needs to register for different messages and create listener +// threads, etc. +extern int as_info_init(); + +// Needed by heartbeat: + +char *as_info_bind_to_string(const cf_serv_cfg *cfg, cf_sock_owner owner); + +// Needed by ticker: + +int as_info_queue_get_size(); +void info_log_with_datestamp(void (*log_fn)(void)); + +extern bool g_mstats_enabled; + +// Needed by main(): +extern uint64_t g_start_ms; diff --git a/as/include/base/thr_info_port.h b/as/include/base/thr_info_port.h new file mode 100644 index 00000000..97a40235 --- /dev/null +++ b/as/include/base/thr_info_port.h @@ -0,0 +1,30 @@ +/* + * thr_info_port.h + * + * Copyright (C) 2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. 
If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +#include "socket.h" + +void as_info_port_start(); + +extern cf_serv_cfg g_info_bind; +extern cf_ip_port g_info_port; diff --git a/as/include/base/thr_query.h b/as/include/base/thr_query.h new file mode 100644 index 00000000..8c21114c --- /dev/null +++ b/as/include/base/thr_query.h @@ -0,0 +1,42 @@ +/* + * thr_query.h + * + * Copyright (C) 2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +/* + * QUERY Engine Defaults + */ +// ************************************************************************************************** +#define QUERY_BATCH_SIZE 100 +#define AS_MAX_NUM_SCRIPT_PARAMS 10 +#define AS_QUERY_BUF_SIZE 1024 * 1024 * 2 // At least 2 Meg +#define AS_QUERY_MAX_BUFS 256 // That makes it 512 meg max in steady state +#define AS_QUERY_MAX_QREQ 1024 // this is 4 kb +#define AS_QUERY_MAX_QTR_POOL 128 // They are 4MB+ each ... +#define AS_QUERY_MAX_THREADS 32 +#define AS_QUERY_MAX_WORKER_THREADS 15 * AS_QUERY_MAX_THREADS +#define AS_QUERY_MAX_QREQ_INFLIGHT 100 // worker queue capping per query +#define AS_QUERY_MAX_QUERY 500 // 32 MB be little generous for now!! 
+#define AS_QUERY_MAX_SHORT_QUEUE_SZ 500 // maximum 500 outstanding short running queries +#define AS_QUERY_MAX_LONG_QUEUE_SZ 500 // maximum 500 outstanding long running queries +#define AS_QUERY_MAX_UDF_TRANSACTIONS 20 // Higher the value more aggressive it will be +#define AS_QUERY_UNTRACKED_TIME 1000 // (millisecond) 1 sec +#define AS_QUERY_WAIT_MAX_TRAN_US 1000 +// ************************************************************************************************** diff --git a/as/include/base/thr_sindex.h b/as/include/base/thr_sindex.h new file mode 100644 index 00000000..c95d0819 --- /dev/null +++ b/as/include/base/thr_sindex.h @@ -0,0 +1,78 @@ +/* + * thr_sindex.h + * + * Copyright (C) 2013-2014 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. 
If not, see http://www.gnu.org/licenses/ + */ + +/* + * secondary index function declarations + */ + +#pragma once + +#include +#include +#include + +#include "citrusleaf/cf_digest.h" +#include "citrusleaf/cf_queue.h" + +#include "ai_obj.h" +#include "dynbuf.h" +#include "hist.h" + +#include "base/datamodel.h" +#include "base/monitor.h" + +#define SINDEX_GC_QUEUE_HIGHWATER 10 +#define SINDEX_GC_NUM_OBJS_PER_ARR 20 + +typedef struct acol_digest_t { + cf_digest dig; + ai_obj acol; +} acol_digest; + +typedef struct objs_to_defrag_arr_t { + acol_digest acol_digs[SINDEX_GC_NUM_OBJS_PER_ARR]; + uint32_t num; +} objs_to_defrag_arr; + +typedef struct ll_sindex_gc_element_s { + cf_ll_element ele; + objs_to_defrag_arr * objs_to_defrag; +} ll_sindex_gc_element; + +extern pthread_rwlock_t sindex_rwlock; +extern cf_queue *g_sindex_populate_q; +extern cf_queue *g_sindex_destroy_q; +extern cf_queue *g_sindex_populateall_done_q; +extern bool g_sindex_boot_done; + +void as_sindex_thr_init(); +objs_to_defrag_arr * as_sindex_gc_get_defrag_arr(void); + +#define MAX_SINDEX_BUILDER_THREADS 32 + +void as_sbld_init(); +void as_sbld_build_all(as_namespace* ns); +void as_sbld_resize_thread_pool(uint32_t n_threads); +int as_sbld_list(char* name, cf_dyn_buf* db); +as_mon_jobstat* as_sbld_get_jobstat(uint64_t trid); +as_mon_jobstat* as_sbld_get_jobstat_all(int* size); +int as_sbld_abort(uint64_t trid); diff --git a/as/include/base/thr_tsvc.h b/as/include/base/thr_tsvc.h new file mode 100644 index 00000000..12e0d5d5 --- /dev/null +++ b/as/include/base/thr_tsvc.h @@ -0,0 +1,55 @@ +/* + * thr_tsvc.h + * + * Copyright (C) 2008-2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. 
+ * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +//========================================================== +// Includes. +// + +#include + + +//========================================================== +// Forward declarations. +// + +struct as_transaction_s; + + +//========================================================== +// Typedefs & constants. +// + +#define MAX_TRANSACTION_QUEUES 128 +#define MAX_TRANSACTION_THREADS_PER_QUEUE 256 + + +//========================================================== +// Public API. +// + +void as_tsvc_init(); +void as_tsvc_enqueue(struct as_transaction_s *tr); +void as_tsvc_set_threads_per_queue(uint32_t n_threads); +int as_tsvc_queue_get_size(); +void as_tsvc_process_transaction(struct as_transaction_s *tr); diff --git a/as/include/base/ticker.h b/as/include/base/ticker.h new file mode 100644 index 00000000..a8063944 --- /dev/null +++ b/as/include/base/ticker.h @@ -0,0 +1,29 @@ +/* + * ticker.h + * + * Copyright (C) 2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. 
+ * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +//========================================================== +// Public API. +// + +void as_ticker_start(); diff --git a/as/include/base/transaction.h b/as/include/base/transaction.h new file mode 100644 index 00000000..aa7803b2 --- /dev/null +++ b/as/include/base/transaction.h @@ -0,0 +1,378 @@ +/* + * transaction.h + * + * Copyright (C) 2008-2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. 
If not, see http://www.gnu.org/licenses/ + */ + + +#pragma once + +#include +#include +#include +#include + +#include "citrusleaf/alloc.h" +#include "citrusleaf/cf_byte_order.h" +#include "citrusleaf/cf_clock.h" +#include "citrusleaf/cf_digest.h" + +#include "msg.h" +#include "node.h" +#include "socket.h" + +#include "base/cfg.h" +#include "base/index.h" +#include "base/proto.h" +#include "base/stats.h" +#include "fabric/partition.h" +#include "storage/storage.h" + +struct as_namespace_s; + + +//========================================================== +// Histogram macros. +// + +#define G_HIST_INSERT_DATA_POINT(name, start_time) \ +{ \ + if (g_config.name##_enabled) { \ + histogram_insert_data_point(g_stats.name, start_time); \ + } \ +} + +#define G_HIST_ACTIVATE_INSERT_DATA_POINT(name, start_time) \ +{ \ + g_stats.name##_active = true; \ + histogram_insert_data_point(g_stats.name, start_time); \ +} + +#define HIST_TRACK_ACTIVATE_INSERT_DATA_POINT(trw, name) \ +{ \ + trw->rsv.ns->name##_active = true; \ + cf_hist_track_insert_data_point(trw->rsv.ns->name, trw->start_time); \ +} + +#define HIST_ACTIVATE_INSERT_DATA_POINT(trw, name) \ +{ \ + trw->rsv.ns->name##_active = true; \ + histogram_insert_data_point(trw->rsv.ns->name, trw->start_time); \ +} + +#define BENCHMARK_START(tr, name, orig) \ +{ \ + if (tr->rsv.ns->name##_benchmarks_enabled && tr->origin == orig) { \ + if (tr->benchmark_time == 0) { \ + tr->benchmark_time = histogram_insert_data_point(tr->rsv.ns->name##_start_hist, tr->start_time); \ + } \ + else { \ + tr->benchmark_time = histogram_insert_data_point(tr->rsv.ns->name##_restart_hist, tr->benchmark_time); \ + } \ + } \ +} + +#define BENCHMARK_NEXT_DATA_POINT(trw, name, tok) \ +{ \ + if (trw->rsv.ns->name##_benchmarks_enabled && trw->benchmark_time != 0) { \ + trw->benchmark_time = histogram_insert_data_point(trw->rsv.ns->name##_##tok##_hist, trw->benchmark_time); \ + } \ +} + + +//========================================================== +// Client 
socket information - as_file_handle. +// + +typedef struct as_file_handle_s { + char client[64]; // client identifier (currently ip-addr:port) + uint64_t last_used; // last ms we read or wrote + cf_socket sock; // our socket + cf_poll poll; // our epoll instance + bool reap_me; // tells the reaper to come and get us + uint32_t fh_info; // bitmap containing status info of this file handle + as_proto proto_hdr; + as_proto *proto; + uint64_t proto_unread; + void *security_filter; +} as_file_handle; + +#define FH_INFO_DONOT_REAP 0x00000001 // this bit indicates that this file handle should not be reaped +#define FH_INFO_XDR 0x00000002 // the file handle belongs to an XDR connection + +// Helpers to release transaction file handles. +void as_release_file_handle(as_file_handle *proto_fd_h); +void as_end_of_transaction(as_file_handle *proto_fd_h, bool force_close); +void as_end_of_transaction_ok(as_file_handle *proto_fd_h); +void as_end_of_transaction_force_close(as_file_handle *proto_fd_h); + + +//========================================================== +// Transaction. +// + +typedef enum { + TRANS_DONE_ERROR = -1, // tsvc frees msgp & reservation, response was sent to origin + TRANS_DONE_SUCCESS = 0, // tsvc frees msgp & reservation, response was sent to origin + TRANS_IN_PROGRESS = 1, // tsvc leaves msgp & reservation alone, rw_request now owns them + TRANS_WAITING = 2 // tsvc leaves msgp alone but frees reservation +} transaction_status; + +// How to interpret the 'from' union. +// +// NOT a generic transaction type flag, e.g. batch sub-transactions that proxy +// are FROM_PROXY on the proxyee node, hence we still need a separate +// FROM_FLAG_BATCH_SUB. 
+// +typedef enum { + // External, comes through demarshal or fabric: + FROM_CLIENT = 1, + FROM_PROXY, + + // Internal, generated on local node: + FROM_BATCH, + FROM_IUDF, + FROM_NSUP, + FROM_RE_REPL, // enterprise-only + + FROM_UNDEF = 0 +} transaction_origin; + +struct as_batch_shared_s; +struct iudf_origin_s; + +typedef struct as_transaction_s { + + //------------------------------------------------------ + // transaction 'head' - copied onto queue. + // + + cl_msg* msgp; + uint32_t msg_fields; + + uint8_t origin; + uint8_t from_flags; + + // 2 spare bytes. + + union { + void* any; + as_file_handle* proto_fd_h; + cf_node proxy_node; + struct as_batch_shared_s* batch_shared; + struct iudf_origin_s* iudf_orig; + void (*re_repl_orig_cb) (struct as_transaction_s* tr); + } from; + + union { + uint32_t any; + uint32_t proxy_tid; + uint32_t batch_index; + } from_data; + + cf_digest keyd; // only batch sub-transactions require this on queue + + uint64_t start_time; + uint64_t benchmark_time; + + //<><><><><><><><><><><> 64 bytes <><><><><><><><><><><> + + //------------------------------------------------------ + // transaction 'body' - NOT copied onto queue. 
+ // + + as_partition_reservation rsv; + + uint64_t end_time; + uint8_t result_code; + uint8_t flags; + uint16_t generation; + uint32_t void_time; + uint64_t last_update_time; + +} as_transaction; + +#define AS_TRANSACTION_HEAD_SIZE (offsetof(as_transaction, rsv)) + +// 'from_flags' bits - set before queuing transaction head: +#define FROM_FLAG_BATCH_SUB 0x0001 +#define FROM_FLAG_RESTART 0x0002 + +// 'flags' bits - set in transaction body after queuing: +#define AS_TRANSACTION_FLAG_SINDEX_TOUCHED 0x01 +#define AS_TRANSACTION_FLAG_IS_DELETE 0x02 +#define AS_TRANSACTION_FLAG_MUST_PING 0x04 // enterprise-only + + +void as_transaction_init_head(as_transaction *tr, cf_digest *, cl_msg *); +void as_transaction_init_body(as_transaction *tr); + +void as_transaction_copy_head(as_transaction *to, const as_transaction *from); + +struct rw_request_s; + +void as_transaction_init_from_rw(as_transaction *tr, struct rw_request_s *rw); +void as_transaction_init_head_from_rw(as_transaction *tr, struct rw_request_s *rw); + +bool as_transaction_set_msg_field_flag(as_transaction *tr, uint8_t type); +bool as_transaction_prepare(as_transaction *tr, bool swap); + +static inline bool +as_transaction_is_restart(const as_transaction *tr) +{ + return (tr->from_flags & FROM_FLAG_RESTART) != 0; +} + +static inline bool +as_transaction_is_batch_sub(const as_transaction *tr) +{ + return (tr->from_flags & FROM_FLAG_BATCH_SUB) != 0; +} + +static inline bool +as_transaction_has_set(const as_transaction *tr) +{ + return (tr->msg_fields & AS_MSG_FIELD_BIT_SET) != 0; +} + +static inline bool +as_transaction_has_key(const as_transaction *tr) +{ + return (tr->msg_fields & AS_MSG_FIELD_BIT_KEY) != 0; +} + +static inline bool +as_transaction_has_digest(const as_transaction *tr) +{ + return (tr->msg_fields & AS_MSG_FIELD_BIT_DIGEST_RIPE) != 0; +} + +static inline bool +as_transaction_has_no_key_or_digest(const as_transaction *tr) +{ + return (tr->msg_fields & (AS_MSG_FIELD_BIT_KEY | 
AS_MSG_FIELD_BIT_DIGEST_RIPE)) == 0; +} + +static inline bool +as_transaction_is_multi_record(const as_transaction *tr) +{ + return (tr->msg_fields & (AS_MSG_FIELD_BIT_KEY | AS_MSG_FIELD_BIT_DIGEST_RIPE)) == 0 && + (tr->from_flags & FROM_FLAG_BATCH_SUB) == 0; +} + +static inline bool +as_transaction_is_batch_direct(const as_transaction *tr) +{ + // Assumes we're already multi-record. + return (tr->msg_fields & AS_MSG_FIELD_BIT_DIGEST_RIPE_ARRAY) != 0; +} + +static inline bool +as_transaction_is_query(const as_transaction *tr) +{ + // Assumes we're already multi-record. + return (tr->msg_fields & AS_MSG_FIELD_BIT_INDEX_RANGE) != 0; +} + +static inline bool +as_transaction_is_udf(const as_transaction *tr) +{ + return (tr->msg_fields & AS_MSG_FIELD_BIT_UDF_FILENAME) != 0; +} + +static inline bool +as_transaction_has_udf_op(const as_transaction *tr) +{ + return (tr->msg_fields & AS_MSG_FIELD_BIT_UDF_OP) != 0; +} + +static inline bool +as_transaction_has_scan_options(const as_transaction *tr) +{ + return (tr->msg_fields & AS_MSG_FIELD_BIT_SCAN_OPTIONS) != 0; +} + +static inline bool +as_transaction_has_socket_timeout(const as_transaction *tr) +{ + return (tr->msg_fields & AS_MSG_FIELD_BIT_SOCKET_TIMEOUT) != 0; +} + +static inline bool +as_transaction_has_predexp(const as_transaction *tr) +{ + return (tr->msg_fields & AS_MSG_FIELD_BIT_PREDEXP) != 0; +} + +// For now it's not worth storing the trid in the as_transaction struct since we +// only parse it from the msg once per transaction anyway. 
+static inline uint64_t +as_transaction_trid(const as_transaction *tr) +{ + if ((tr->msg_fields & AS_MSG_FIELD_BIT_TRID) == 0) { + return 0; + } + + as_msg_field *f = as_msg_field_get(&tr->msgp->msg, AS_MSG_FIELD_TYPE_TRID); + + return cf_swap_from_be64(*(uint64_t*)f->data); +} + +static inline bool +as_transaction_is_delete(const as_transaction *tr) +{ + return (tr->msgp->msg.info2 & AS_MSG_INFO2_DELETE) != 0; +} + +static inline bool +as_transaction_is_durable_delete(const as_transaction *tr) +{ + return (tr->msgp->msg.info2 & AS_MSG_INFO2_DURABLE_DELETE) != 0; +} + +// TODO - where should this go? +static inline bool +as_msg_is_xdr(const as_msg *m) +{ + return (m->info1 & AS_MSG_INFO1_XDR) != 0; +} + +static inline bool +as_transaction_is_xdr(const as_transaction *tr) +{ + return (tr->msgp->msg.info1 & AS_MSG_INFO1_XDR) != 0; +} + +static inline bool +as_transaction_is_nsup_delete(const as_transaction *tr) +{ + return tr->origin == FROM_NSUP; +} + +static inline bool +as_transaction_is_linearized_read(const as_transaction *tr) +{ + return (tr->msgp->msg.info3 & AS_MSG_INFO3_LINEARIZE_READ) != 0; +} + +void as_transaction_init_iudf(as_transaction *tr, struct as_namespace_s *ns, cf_digest *keyd, struct iudf_origin_s *iudf_orig, bool is_durable_delete); + +void as_transaction_demarshal_error(as_transaction *tr, uint32_t error_code); +void as_transaction_error(as_transaction *tr, struct as_namespace_s *ns, uint32_t error_code); +void as_multi_rec_transaction_error(as_transaction *tr, uint32_t error_code); diff --git a/as/include/base/transaction_policy.h b/as/include/base/transaction_policy.h new file mode 100644 index 00000000..dcc4b66f --- /dev/null +++ b/as/include/base/transaction_policy.h @@ -0,0 +1,114 @@ +/* + * transaction_policy.h + * + * Copyright (C) 2014-2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. 
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+#pragma once
+
+//==========================================================
+// Typedefs & constants.
+//
+
+typedef enum {
+	// Server config override value only - means use policy sent by client.
+	AS_READ_CONSISTENCY_LEVEL_PROTO = -1,
+
+	// Must match AS_POLICY_CONSISTENCY_LEVEL_ONE in C Client v3 as_policy.h.
+	// Ignore duplicates - i.e. don't duplicate resolve.
+	AS_READ_CONSISTENCY_LEVEL_ONE, // == 0 (follows PROTO == -1)
+
+	// Must match AS_POLICY_CONSISTENCY_LEVEL_ALL in C Client v3 as_policy.h.
+	// Involve all duplicates in the operation - i.e. duplicate resolve.
+	AS_READ_CONSISTENCY_LEVEL_ALL, // == 1
+} as_read_consistency_level;
+
+typedef enum {
+	// Server config override value only - means use policy sent by client.
+	AS_WRITE_COMMIT_LEVEL_PROTO = -1,
+
+	// Must match AS_POLICY_COMMIT_LEVEL_ALL in C Client v3 as_policy.h.
+	// Respond to client only after successfully committing all replicas.
+	AS_WRITE_COMMIT_LEVEL_ALL, // == 0 (follows PROTO == -1)
+
+	// Must match AS_POLICY_COMMIT_LEVEL_MASTER in C Client v3 as_policy.h.
+	// Respond to client after successfully committing the master replica.
+	AS_WRITE_COMMIT_LEVEL_MASTER, // == 1
+} as_write_commit_level;
+
+
+//==========================================================
+// Public API - macros.
+//
+
+//------------------------------------------------
+// Extract levels from an as_msg (asmsg is an lvalue, evaluated twice).
+//
+
+// Not a strict check: both bits == 0 means ONE, anything else means ALL.
+#define PROTO_CONSISTENCY_LEVEL(asmsg) \
+	((((asmsg).info1 & AS_MSG_INFO1_CONSISTENCY_LEVEL_B0) == 0 && \
+	((asmsg).info1 & AS_MSG_INFO1_CONSISTENCY_LEVEL_B1) == 0) ? \
+	AS_READ_CONSISTENCY_LEVEL_ONE : AS_READ_CONSISTENCY_LEVEL_ALL)
+
+// Not a strict check: both bits == 0 means ALL, anything else means MASTER.
+#define PROTO_COMMIT_LEVEL(asmsg) \
+	((((asmsg).info3 & AS_MSG_INFO3_COMMIT_LEVEL_B0) == 0 && \
+	((asmsg).info3 & AS_MSG_INFO3_COMMIT_LEVEL_B1) == 0) ? \
+	AS_WRITE_COMMIT_LEVEL_ALL : AS_WRITE_COMMIT_LEVEL_MASTER)
+
+//------------------------------------------------
+// Get levels for a transaction with reservation (tr may be evaluated > once).
+//
+
+// Namespace override if set, else the level sent by the client in the message.
+#define TR_READ_CONSISTENCY_LEVEL(tr) \
+	((tr)->rsv.ns->read_consistency_level == AS_READ_CONSISTENCY_LEVEL_PROTO ? \
+	PROTO_CONSISTENCY_LEVEL((tr)->msgp->msg) : \
+	(tr)->rsv.ns->read_consistency_level)
+
+// Namespace override if set, else the level sent by the client in the message.
+#define TR_WRITE_COMMIT_LEVEL(tr) \
+	((tr)->rsv.ns->write_commit_level == AS_WRITE_COMMIT_LEVEL_PROTO ? \
+	PROTO_COMMIT_LEVEL((tr)->msgp->msg) : \
+	(tr)->rsv.ns->write_commit_level)
+
+//------------------------------------------------
+// Get levels without need of reservation (ns may be evaluated > once).
+//
+
+// Same as above, for use before tr->rsv has been made.
+#define READ_CONSISTENCY_LEVEL(ns, asmsg) \
+	((ns)->read_consistency_level == AS_READ_CONSISTENCY_LEVEL_PROTO ? \
+	PROTO_CONSISTENCY_LEVEL(asmsg) : \
+	(ns)->read_consistency_level)
+
+//------------------------------------------------
+// Get config override values' names. Require `ns` in the caller's scope.
+//
+
+#define NS_READ_CONSISTENCY_LEVEL_NAME() \
+	(ns->read_consistency_level == AS_READ_CONSISTENCY_LEVEL_PROTO ? \
+	"off" : (ns->read_consistency_level == AS_READ_CONSISTENCY_LEVEL_ONE ? \
+	"one" : "all"))
+
+#define NS_WRITE_COMMIT_LEVEL_NAME() \
+	(ns->write_commit_level == AS_WRITE_COMMIT_LEVEL_PROTO ? \
+	"off" : (ns->write_commit_level == AS_WRITE_COMMIT_LEVEL_ALL ? \
+	"all" : "master"))
diff --git a/as/include/base/truncate.h b/as/include/base/truncate.h
new file mode 100644
index 00000000..130b2f10
--- /dev/null
+++ b/as/include/base/truncate.h
@@ -0,0 +1,94 @@
+/*
+ * truncate.h
+ *
+ * Copyright (C) 2017 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+#pragma once
+
+//==========================================================
+// Includes.
+//
+
+#include // NOTE(review): header names missing on these four lines - restore from upstream
+#include
+#include
+#include
+
+#include "citrusleaf/cf_atomic.h"
+
+#include "shash.h"
+
+
+//==========================================================
+// Forward declarations.
+//
+
+struct as_index_s;
+struct as_namespace_s;
+
+
+//==========================================================
+// Typedefs & constants.
+//
+
+typedef enum {
+	TRUNCATE_IDLE,
+	TRUNCATE_RUNNING,
+	TRUNCATE_RESTART
+} truncate_state;
+
+typedef struct as_truncate_s {
+	uint64_t lut; // presumably a last-update-time threshold - TODO confirm
+	cf_shash* startup_set_hash; // relevant only for enterprise edition
+	truncate_state state;
+	pthread_mutex_t state_lock; // presumably guards 'state' - TODO confirm
+	cf_atomic32 n_threads_running;
+	cf_atomic32 pid;
+	cf_atomic64 n_records_this_run;
+	uint64_t n_records;
+} as_truncate;
+
+
+//==========================================================
+// Public API.
+//
+
+void as_truncate_init(struct as_namespace_s* ns);
+void as_truncate_init_smd();
+void as_truncate_list_cenotaphs(struct as_namespace_s* ns);
+void as_truncate_done_startup(struct as_namespace_s* ns);
+bool as_truncate_cmd(const char* ns_name, const char* set_name, const char* lut_str);
+void as_truncate_undo_cmd(const char* ns_name, const char* set_name);
+bool as_truncate_now_is_truncated(struct as_namespace_s* ns, uint16_t set_id);
+bool as_truncate_record_is_truncated(const struct as_index_s* r, struct as_namespace_s* ns);
+
+
+//==========================================================
+// For enterprise separation only.
+//
+
+typedef struct truncate_hval_s { // bit-fields below total 64 bits (1 + 23 + 40)
+	uint64_t cenotaph:1;
+	uint64_t unused:23;
+	uint64_t lut:40; // 40-bit field; relation to as_truncate.lut not shown here
+} truncate_hval;
+
+void truncate_startup_hash_init(struct as_namespace_s* ns);
+void truncate_action_startup(struct as_namespace_s* ns, const char* set_name, uint64_t lut);
diff --git a/as/include/base/udf_aerospike.h b/as/include/base/udf_aerospike.h
new file mode 100644
index 00000000..76510ae9
--- /dev/null
+++ b/as/include/base/udf_aerospike.h
@@ -0,0 +1,27 @@
+/*
+ * udf_aerospike.h
+ *
+ * Copyright (C) 2012-2014 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+#pragma once
+
+#include "aerospike/as_aerospike.h"
+
+extern const as_aerospike_hooks udf_aerospike_hooks; // defined outside this header
diff --git a/as/include/base/udf_arglist.h b/as/include/base/udf_arglist.h
new file mode 100644
index 00000000..42fccf59
--- /dev/null
+++ b/as/include/base/udf_arglist.h
@@ -0,0 +1,31 @@
+/*
+ * udf_arglist.h
+ *
+ * Copyright (C) 2012-2014 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+#pragma once
+
+#include "aerospike/as_list.h"
+
+/******************************************************************************
+ * VARIABLES
+ ******************************************************************************/
+
+extern const as_list_hooks udf_arglist_hooks; // defined outside this header
diff --git a/as/include/base/udf_cask.h b/as/include/base/udf_cask.h
new file mode 100644
index 00000000..42cec76c
--- /dev/null
+++ b/as/include/base/udf_cask.h
@@ -0,0 +1,70 @@
+/*
+ * udf_cask.h
+ *
+ * Copyright (C) 2013-2014 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+#pragma once
+
+#include // NOTE(review): header names missing on these two lines - restore from upstream
+#include
+
+#include "dynbuf.h"
+
+#include "base/thr_info.h"
+
+
+// UDF Types
+#define AS_UDF_TYPE_LUA 0
+#define MAX_UDF_CONTENT_LENGTH (1024 * 1024) // 1 MiB (1,048,576 bytes)
+
+extern char *as_udf_type_name[]; // presumably indexed by AS_UDF_TYPE_* - TODO confirm
+
+//------------------------------------------------
+// Registration / initialization entry point
+void udf_cask_init();
+
+//------------------------------------------------
+// these functions are "as_info_command" format
+// and called directly from there.
+// so they share one calling convention (udf_cask_info_list takes no params)
+
+int udf_cask_info_clear_cache(char * name, char * params, cf_dyn_buf * out);
+
+int udf_cask_info_get(char * name, char * params, cf_dyn_buf * out);
+
+int udf_cask_info_put(char * name, char * params, cf_dyn_buf * out);
+
+int udf_cask_info_remove(char * name, char * params, cf_dyn_buf * out);
+
+int udf_cask_info_reconfigure(char * name, char * params, cf_dyn_buf * buf);
+
+int udf_cask_info_list(char *name, cf_dyn_buf * out);
+
+//------------------------------------------------
+// these are called by the modules that need to run UDFs
+
+// called by a module to get the data associated with a udf (the file contents)
+// this will be a reference count (rc_alloc) pointer and must be dereferenced by the caller
+int udf_cask_get_udf(char *module, char *udf_type, uint8_t **buf , size_t *buf_len );
+
+// called by a module to get the data associated with a udf (the fully qualified file name)
+// caller passes in a max-size string buffer that gets filled out (null terminated)
+int udf_cask_get_udf_filename(char *module, char *udf_type, char *filename );
+
diff --git a/as/include/base/udf_memtracker.h b/as/include/base/udf_memtracker.h
new file mode 100644
index 00000000..619edd45
--- /dev/null
+++ b/as/include/base/udf_memtracker.h
@@ -0,0 +1,51 @@
+/*
+ * udf_memtracker.h
+ *
+ * Copyright (C) 2014 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+/**
+ * An as_memtracker for tests.
+ */
+
+#pragma once
+
+#include // NOTE(review): header name missing - restore from upstream
+#include "aerospike/as_memtracker.h"
+
+typedef enum {
+	MEM_RESERVE = 0,
+	MEM_RELEASE = 1,
+	MEM_RESET = 2
+} memtracker_op;
+
+typedef struct mem_tracker_s mem_tracker;
+typedef bool (*as_memtracker_op_cb)(mem_tracker *mt, uint32_t, memtracker_op);
+
+struct mem_tracker_s {
+	void *udata; // opaque user data
+	as_memtracker_op_cb cb; // callback taking (tracker, uint32_t, op)
+};
+
+/*****************************************************************************
+ * PUBLIC FUNCTIONS (declarations below have external linkage, not static)
+ *****************************************************************************/
+as_memtracker * udf_memtracker_init();
+void udf_memtracker_setup(mem_tracker *mt);
+void udf_memtracker_cleanup();
diff --git a/as/include/base/udf_record.h b/as/include/base/udf_record.h
new file mode 100644
index 00000000..e6973ef4
--- /dev/null
+++ b/as/include/base/udf_record.h
@@ -0,0 +1,110 @@
+/*
+ * udf_record.h
+ *
+ * Copyright (C) 2013-2015 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+#pragma once
+
+#include // NOTE(review): header names missing on these three lines - restore from upstream
+#include
+#include
+
+#include "aerospike/as_rec.h"
+#include "aerospike/as_hashmap.h"
+#include "aerospike/as_val.h"
+#include "citrusleaf/cf_atomic.h"
+
+#include "base/datamodel.h"
+#include "base/rec_props.h"
+#include "base/transaction.h"
+#include "base/xdr_serverside.h"
+#include "storage/storage.h"
+
+
+// Maximum number of bins that can be updated in a single UDF.
+#define UDF_RECORD_BIN_ULIMIT 512
+
+typedef struct udf_record_bin_s {
+	char name[AS_ID_BIN_SZ];
+	as_val * value;
+	as_val * oldvalue; // keeps track of old value in case rollback is required
+	bool dirty;
+	void *particle_buf;
+} udf_record_bin;
+
+typedef struct udf_record_s {
+
+	// STORAGE
+	as_index_ref *r_ref;
+	as_transaction *tr;
+	as_storage_rd *rd;
+	xdr_dirty_bins *dirty;
+	cf_digest keyd;
+	as_bin stack_bins[UDF_RECORD_BIN_ULIMIT]; // TODO increase bin limit?
+
+	// UDF CHANGE CACHE
+	udf_record_bin updates[UDF_RECORD_BIN_ULIMIT]; // stores cache bin value
+	// if dirty flag is set the bin is being modified
+	uint32_t nupdates; // reset after every cache free, incremented in every cache set
+
+	// RUNTIME ACCOUNTING
+	uint8_t *particle_data; // non-null for data-on-ssd, and lazy allocated on first bin write
+	uint8_t *cur_particle_data; // where the pointer is
+	uint8_t *end_particle_data;
+	uint32_t starting_memory_bytes;
+	cf_atomic_int udf_runtime_memory_used;
+
+	// INTERNAL UTILITY - presumably a bitmask of UDF_RECORD_FLAG_*; TODO confirm
+	uint16_t flag;
+} udf_record;
+
+#define UDF_RECORD_FLAG_ALLOW_UPDATES 0x0001 // Write/Updates Allowed
+#define UDF_RECORD_FLAG_TOO_MANY_BINS 0x0002 // UDF exceeds the bin limit
+#define UDF_RECORD_FLAG_UNUSED_4 0x0004 // was - sub-record
+#define UDF_RECORD_FLAG_OPEN 0x0008 // as_record_open done
+#define UDF_RECORD_FLAG_STORAGE_OPEN 0x0010 // as_storage_record_open done
+#define UDF_RECORD_FLAG_HAS_UPDATES 0x0020 // Write/Update done
+#define UDF_RECORD_FLAG_PREEXISTS 0x0040 // Record preexisted, was not created
+#define UDF_RECORD_FLAG_ISVALID 0x0080 // Udf is setup and in use
+#define UDF_RECORD_FLAG_METADATA_UPDATED 0x0100 // Write/Update metadata done
+
+extern const as_rec_hooks udf_record_hooks;
+
+//------------------------------------------------
+// Utility functions for all the wrapper as_record implementations
+// which use udf_record under the hood
+extern void udf_record_cache_free (udf_record *);
+extern int udf_record_open (udf_record *);
+extern int udf_storage_record_open (udf_record *);
+extern void udf_record_close (udf_record *);
+extern int udf_storage_record_close(udf_record *);
+extern void udf_record_init (udf_record *, bool);
+extern as_val * udf_record_storage_get (const udf_record *, const char *);
+
+#define UDF_ERR_INTERNAL_PARAMETER 2
+#define UDF_ERR_RECORD_NOT_VALID 3
+#define UDF_ERR_PARAMETER 4
+extern int udf_record_param_check(const as_rec *rec, char *fname, int lineno);
+extern bool udf_record_destroy(as_rec *rec);
+
+//------------------------------------------------
+// Note that the main interface routines do NOT get declared here.
+// extern int udf_record_set_flags(const as_rec *, const char *, uint8_t);
+// extern int udf_record_set_type(const as_rec *, int8_t);
diff --git a/as/include/base/udf_timer.h b/as/include/base/udf_timer.h
new file mode 100644
index 00000000..da71320b
--- /dev/null
+++ b/as/include/base/udf_timer.h
@@ -0,0 +1,47 @@
+/*
+ * udf_timer.h
+ *
+ * Copyright (C) 2013-2014 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+/*
+ * An as_timer for tests.
+ */
+
+#pragma once
+
+#include // NOTE(review): header name missing - restore from upstream
+#include "aerospike/as_timer.h"
+
+typedef struct time_tracker_s time_tracker;
+typedef uint64_t (* as_timer_end_time_cb)(time_tracker *tt);
+typedef uint64_t (* as_timer_timeslice_cb)(time_tracker *tt); // not used by time_tracker_s below
+
+struct time_tracker_s {
+	void * udata; // opaque user data
+	as_timer_end_time_cb end_time; // callback returning a uint64_t for this tracker
+};
+
+/*****************************************************************************
+ * PUBLIC FUNCTIONS & VARIABLES (external linkage, not static)
+ *****************************************************************************/
+void udf_timer_setup(time_tracker *tt);
+void udf_timer_cleanup();
+extern const as_timer_hooks udf_timer_hooks;
+
diff --git a/as/include/base/xdr_config.h b/as/include/base/xdr_config.h
new file mode 100644
index 00000000..c5faf457
--- /dev/null
+++ b/as/include/base/xdr_config.h
@@ -0,0 +1,128 @@
+/*
+ * xdr_config.h
+ *
+ * Copyright (C) 2011-2016 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+#pragma once
+
+//==========================================================
+// Includes.
+//
+
+#include "citrusleaf/cf_vector.h"
+
+#include "node.h"
+#include "tls.h"
+
+//==========================================================
+// Forward declarations.
+//
+
+//==========================================================
+// Constants & typedefs.
+//
+
+// Length definitions. These should be kept in sync with the server definitions.
+// It is bad that we are not using a common header file for all this.
+#define CLUSTER_MAX_SZ 128
+#define NAMESPACE_MAX_NUM 32
+#define DC_MAX_NUM 32
+
+typedef struct xdr_node_lst_s {
+	cf_node node;
+	uint64_t time[DC_MAX_NUM]; // one slot per possible DC (DC_MAX_NUM)
+} xdr_node_lst;
+
+typedef struct node_addr_port_s {
+	char *addr;
+	char *tls_name;
+	int port;
+} node_addr_port;
+
+// Config option in case the configuration value is changed
+typedef struct xdr_new_config_s {
+	bool skip_outstanding;
+} xdr_new_config;
+
+// Config option which is maintained both by the server and the XDR module
+typedef struct xdr_config_s {
+
+	bool xdr_section_configured;
+	bool xdr_global_enabled;
+
+	// Ring buffer configuration
+	char *xdr_digestlog_path;
+	uint64_t xdr_digestlog_file_size;
+
+	uint32_t xdr_info_port;
+	uint32_t xdr_max_ship_throughput;
+	uint32_t xdr_max_ship_bandwidth;
+	uint32_t xdr_min_dlog_free_pct;
+	uint32_t xdr_hotkey_time_ms;
+	uint32_t xdr_read_threads;
+	uint32_t xdr_write_timeout;
+	uint32_t xdr_client_threads;
+	uint32_t xdr_forward_xdrwrites;
+	uint32_t xdr_internal_shipping_delay;
+	uint32_t xdr_info_request_timeout_ms;
+	uint32_t xdr_compression_threshold;
+	uint32_t xdr_digestlog_iowait_ms;
+
+	bool xdr_shipping_enabled;
+	bool xdr_delete_shipping_enabled;
+	bool xdr_nsup_deletes_enabled;
+	bool xdr_ship_bins;
+	bool xdr_handle_failednode;
+	bool xdr_handle_linkdown;
+
+	// Internal
+	bool xdr_conf_change_flag;
+	xdr_new_config xdr_new_cfg;
+} xdr_config;
+
+typedef struct xdr_security_config_s {
+	char *sec_config_file;
+	char *username;
+	char *password;
+} xdr_security_config;
+
+typedef struct dc_config_opt_s {
+	char *dc_name;
+	int dc_id;
+	cf_vector dc_node_v;
+	cf_vector dc_addr_map_v;
+	uint32_t dc_connections;
+	uint32_t dc_connections_idle_ms;
+	xdr_security_config dc_security_cfg;
+	bool dc_use_alternate_services;
+	char *tls_our_name;
+	cf_tls_spec *tls_spec;
+} dc_config_opt;
+
+//==========================================================
+// Public API.
+//
+
+void xdr_config_defaults();
+bool xdr_read_security_configfile(xdr_security_config* sc);
+
+extern xdr_config g_xcfg;
+extern int g_dc_count; // presumably the number of g_dc_xcfg_opt entries in use - TODO confirm
+extern dc_config_opt g_dc_xcfg_opt[DC_MAX_NUM]; // capacity fixed at DC_MAX_NUM
diff --git a/as/include/base/xdr_serverside.h b/as/include/base/xdr_serverside.h
new file mode 100644
index 00000000..e325e809
--- /dev/null
+++ b/as/include/base/xdr_serverside.h
@@ -0,0 +1,87 @@
+/*
+ * xdr_serverside.h
+ *
+ * Copyright (C) 2012-2016 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+#pragma once
+
+//==========================================================
+// Includes.
+//
+
+#include // NOTE(review): header names missing on these two lines - restore from upstream
+#include
+
+#include "citrusleaf/cf_digest.h"
+
+#include "dynbuf.h"
+#include "node.h"
+#include "socket.h"
+
+#include "base/datamodel.h"
+#include "base/transaction.h"
+
+//==========================================================
+// Forward declarations.
+//
+
+//==========================================================
+// Constants & typedefs.
+//
+
+typedef enum {
+	XDR_OP_TYPE_WRITE,
+	XDR_OP_TYPE_DROP,
+	XDR_OP_TYPE_DURABLE_DELETE
+} xdr_op_type;
+
+typedef uint64_t xdr_dirty_bins[2]; // array type: two 64-bit words (128 bits)
+
+//==========================================================
+// Public API.
+//
+
+int as_xdr_init();
+void xdr_config_post_process();
+void as_xdr_start();
+int as_xdr_shutdown();
+void xdr_sig_handler(int signum);
+
+void xdr_clear_dirty_bins(xdr_dirty_bins *dirty);
+void xdr_fill_dirty_bins(xdr_dirty_bins *dirty);
+void xdr_copy_dirty_bins(xdr_dirty_bins *from, xdr_dirty_bins *to);
+void xdr_add_dirty_bin(as_namespace *ns, xdr_dirty_bins *dirty, const char *name, size_t name_len);
+void xdr_write(as_namespace *ns, cf_digest *keyd, uint16_t generation, cf_node masternode, xdr_op_type op_type, uint16_t set_id, xdr_dirty_bins *dirty);
+void as_xdr_read_txn(as_transaction *txn);
+
+void as_xdr_info_init(void);
+void as_xdr_info_port(cf_serv_cfg *serv_cfg);
+int as_info_command_xdr(char *name, char *params, cf_dyn_buf *db);
+void as_xdr_get_stats(cf_dyn_buf *db);
+void as_xdr_get_config(cf_dyn_buf *db);
+bool as_xdr_set_config(char *params);
+bool as_xdr_set_config_ns(char *ns_name, char *params);
+
+bool is_xdr_delete_shipping_enabled();
+bool is_xdr_digestlog_low(as_namespace *ns);
+bool is_xdr_forwarding_enabled();
+bool is_xdr_nsup_deletes_enabled();
+
+void xdr_cfg_add_int_ext_mapping(dc_config_opt *dc_cfg, char* orig, char* alt); // NOTE(review): dc_config_opt is not declared by any include visible above - presumably transitive; confirm
diff --git a/as/include/fabric/clustering.h b/as/include/fabric/clustering.h
new file mode 100644
index 00000000..9ac9163a
--- /dev/null
+++ b/as/include/fabric/clustering.h
@@ -0,0 +1,296 @@
+/*
+ * clustering.h
+ *
+ * Copyright (C) 2016 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+/*
+ * Aerospike cluster formation v5 based on paxos.
+ * A complete discussion of the algorithm can be found at:
+ * https://docs.google.com/document/d/1u-27aeZD9no9wiWgt1_BsTSg_6ewG9VBI2sYA0g01BE/edit#
+ */
+#pragma once
+
+#include // NOTE(review): header names missing on these two lines - restore from upstream
+#include
+
+#include "citrusleaf/cf_vector.h"
+
+#include "fault.h"
+
+#include "fabric/hlc.h"
+
+/*
+ * ----------------------------------------------------------------------------
+ * Public data structures.
+ * ----------------------------------------------------------------------------
+ */
+/**
+ * Aerospike cluster key.
+ */
+typedef uint64_t as_cluster_key;
+
+/**
+ * Aerospike clustering protocol identifier.
+ */
+typedef uint32_t as_cluster_proto_identifier;
+
+/**
+ * Configuration for the clustering algorithm.
+ */
+typedef struct as_clustering_config_s
+{
+	/**
+	 * The smallest allowed cluster size.
+	 */
+	uint32_t cluster_size_min;
+
+	/**
+	 * Indicates if clique based eviction is enabled.
+	 */
+	bool clique_based_eviction_enabled;
+
+	/**
+	 * Current protocol identifier.
+	 */
+	as_cluster_proto_identifier protocol_identifier;
+
+} as_clustering_config;
+
+/**
+ * The clustering protocol versions.
+ */
+typedef enum as_clustering_protocol_version // NOTE: tag lacks the _e suffix used by the enums below
+{
+	AS_CLUSTERING_PROTOCOL_UNDEF,
+	AS_CLUSTERING_PROTOCOL_NONE,
+	AS_CLUSTERING_PROTOCOL_V1,
+	AS_CLUSTERING_PROTOCOL_V2,
+	AS_CLUSTERING_PROTOCOL_V3,
+	AS_CLUSTERING_PROTOCOL_V4,
+	AS_CLUSTERING_PROTOCOL_V5
+} as_clustering_protocol_version;
+
+/**
+ * Clustering event type.
+ */
+typedef enum as_clustering_event_type_e
+{
+	/**
+	 * Cluster membership for this node changed.
+	 */
+	AS_CLUSTERING_CLUSTER_CHANGED,
+
+	/**
+	 * This node became an orphan node.
+	 */
+	AS_CLUSTERING_ORPHANED
+} as_clustering_event_type;
+
+/**
+ * Clustering event qualifier - refines the event type.
+ */
+typedef enum as_clustering_event_qualifier_e
+{
+	/**
+	 * The default qualifier for cases where a qualifier is not applicable.
+	 */
+	AS_CLUSTERING_QUALIFIER_NA,
+
+	/**
+	 * Cluster membership lost since the principal evicted this node or is no
+	 * longer reachable or the cluster is invalid. Relevant only for orphaned
+	 * event.
+	 */
+	AS_CLUSTERING_MEMBERSHIP_LOST,
+
+	/**
+	 * This node became an orphan node in order to attempt a merge. Relevant
+	 * only for orphaned event.
+	 */
+	AS_CLUSTERING_ATTEMPTING_MERGE,
+} as_clustering_event_qualifier;
+
+/**
+ * Clustering event.
+ */
+typedef struct as_clustering_event_s
+{
+	/**
+	 * The clustering event type.
+	 */
+	as_clustering_event_type type;
+
+	/**
+	 * The clustering event qualifier.
+	 */
+	as_clustering_event_qualifier qualifier;
+
+	/**
+	 * The cluster key. Will be non-zero if this is a cluster change event.
+	 */
+	as_cluster_key cluster_key;
+
+	/**
+	 * The new succession list. It will not be empty if this is a cluster change
+	 * event.
+	 *
+	 * The allocated space will be freed once the event processing is complete.
+	 * Listeners should always create a copy of this list if it needs to be
+	 * used later on by the listener.
+	 */
+	cf_vector* succession_list;
+} as_clustering_event;
+
+/*
+ * ----------------------------------------------------------------------------
+ * Public API.
+ * ----------------------------------------------------------------------------
+ */
+/**
+ * Initialize clustering subsystem.
+ */
+void
+as_clustering_init();
+
+/**
+ * Start clustering subsystem.
+ */
+void
+as_clustering_start();
+
+/**
+ * Stop clustering subsystem.
+ */
+void
+as_clustering_stop();
+
+/**
+ * Reform the cluster with the same succession list. This would trigger the
+ * generation of new partition info and the cluster would get a new cluster key.
+ *
+ * @return 0 if new clustering round started, -1 otherwise.
+ */
+int
+as_clustering_cluster_reform();
+
+/**
+ * Return the quantum interval, i.e., the interval at which cluster change
+ * decisions are taken. The unit is milliseconds.
+ */
+uint64_t
+as_clustering_quantum_interval();
+
+/**
+ * Log a vector of node-ids at input severity splitting long vectors over
+ * multiple lines. The call might not work if the vector is not protected
+ * against multi-threaded access.
+ *
+ * @param severity the log severity.
+ * @param context the logging context.
+ * @param file_name the source file name for the log line.
+ * @param line the source file line number for the log line.
+ * @param message the message prefix for each log line. Message and node list
+ * will be separated with a space. Can be NULL for no prefix.
+ * @param nodes the vector of nodes.
+ */
+void
+as_clustering_cf_node_vector_event(cf_fault_severity severity,
+		cf_fault_context context, char* file_name, int line, char* message,
+		cf_vector* nodes);
+
+/**
+ * Log an array of node-ids at input severity splitting long arrays over
+ * multiple lines. The call might not work if the array is not protected against
+ * multi-threaded access.
+ *
+ * @param severity the log severity.
+ * @param context the logging context.
+ * @param file_name the source file name for the log line.
+ * @param line the source file line number for the log line.
+ * @param message the message prefix for each log line. Message and node list
+ * will be separated with a space. Can be NULL for no prefix.
+ * @param nodes the array of nodes.
+ * @param node_count the count of nodes in the array.
+ */
+void
+as_clustering_cf_node_array_event(cf_fault_severity severity,
+		cf_fault_context context, char* file_name, int line, char* message,
+		cf_node* nodes, int node_count);
+
+/**
+ * Log a vector of node-ids at input severity splitting long vectors over
+ * multiple lines. The call might not work if the vector is not protected
+ * against multi-threaded access. Supplies __FILENAME__ and __LINE__ itself.
+ *
+ * @param severity the log severity.
+ * @param context the logging context.
+ * @param message the message prefix for each log line. Message and node list
+ * will be separated with a space. Can be NULL for no prefix.
+ * @param nodes the vector of nodes.
+ */
+#define as_clustering_log_cf_node_vector(severity, context, message, nodes) \
+	as_clustering_cf_node_vector_event(severity, context, __FILENAME__, \
+			__LINE__, message, nodes)
+
+/**
+ * Log an array of node-ids at input severity splitting long arrays over
+ * multiple lines. The call might not work if the array is not protected against
+ * multi-threaded access. Expands to an expression - caller adds the ';'.
+ *
+ * @param severity the log severity.
+ * @param context the logging context.
+ * @param message the message prefix for each log line. Message and node list
+ * will be separated with a space. Can be NULL for no prefix.
+ * @param nodes the array of nodes.
+ * @param node_count the count of nodes in the array.
+ */
+#define as_clustering_log_cf_node_array(severity, context, message, nodes, \
+		node_count) \
+	as_clustering_cf_node_array_event(severity, context, __FILENAME__, \
+			__LINE__, message, nodes, node_count)
+
+
+/*
+ * ---- Clustering info command functions. ----
+ */
+/**
+ * If false, it means that either this node is orphaned, or is undergoing a
+ * cluster change.
+ */
+bool
+as_clustering_has_integrity();
+
+/**
+ * Indicates if self node is orphaned.
+ */
+bool
+as_clustering_is_orphan();
+
+/**
+ * Dump clustering state to the log.
+ */
+void
+as_clustering_dump(bool verbose);
+
+/**
+ * Set the min cluster size.
+ */
+int
+as_clustering_cluster_size_min_set(uint32_t new_cluster_size_min);
diff --git a/as/include/fabric/endpoint.h b/as/include/fabric/endpoint.h
new file mode 100644
index 00000000..f10b0d0b
--- /dev/null
+++ b/as/include/fabric/endpoint.h
@@ -0,0 +1,324 @@
+/*
+ * endpoint.h
+ *
+ * Copyright (C) 2016 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+/*
+ * Overview
+ * ========
+ *
+ * An endpoint captures all information needed by a peer node to establish a
+ * connection to a service (e.g. fabric or heartbeat). The key difference
+ * between an endpoint and socket API's cf_sock_cfg, is that cf_sock_cfg captures
+ * all information needed by a service to start a server socket and accept
+ * connections on it, whereas an endpoint captures all information a peer needs
+ * to connect to the service. These two complementary structures overlap in
+ * information content, however cf_sock_cfg will carry server side configuration
+ * values (e.g. TLS configuration), which are irrelevant for the client using
+ * this service. Also an endpoint structure is oriented to be advertised over
+ * the wire.
+ */
+
+#pragma once
+
+#include // NOTE(review): header names missing on these three lines - restore from upstream
+#include
+#include
+
+#include "socket.h"
+
+/**
+ * Indicates if this endpoint supports TLS.
+ */
+#define AS_ENDPOINT_TLS_MASK 0x01
+
+/**
+ * Endpoint address type.
+ */
+typedef enum
+{
+	/**
+	 * Undefined address type.
+	 */
+	AS_ENDPOINT_ADDR_TYPE_UNDEF,
+	/**
+	 * IPv4 address.
+	 */
+	AS_ENDPOINT_ADDR_TYPE_IPv4,
+	/**
+	 * IPv6 address.
+	 */
+	AS_ENDPOINT_ADDR_TYPE_IPv6,
+	/**
+	 * Sentinel value.
+	 */
+	AS_ENDPOINT_ADDR_TYPE_SENTINEL
+} as_endpoint_addr_type;
+
+/**
+ * An endpoint definition.
+ */
+typedef struct as_endpoint_s
+{
+	/**
+	 * Bit field of capabilities. Currently carries only the TLS enabled flag.
+	 */
+	uint8_t capabilities;
+
+	/**
+	 * The type of the address (presumably an as_endpoint_addr_type value).
+	 */
+	uint8_t addr_type;
+
+	/**
+	 * The endpoint port.
+	 */
+	uint16_t port;
+
+	/**
+	 * The network formatted and ordered IPv4 / IPv6 address (or string if
+	 * we decide to support dns names). The size of this field depends on
+	 * the address type.
+	 */
+	uint8_t addr[];
+}__attribute__((__packed__)) as_endpoint;
+
+/**
+ * A list of endpoints.
+ */
+typedef struct as_endpoint_list_s
+{
+	/**
+	 * The number of endpoints contained in the list. Max of 255.
+	 */
+	uint8_t n_endpoints;
+
+	/**
+	 * The list of endpoints. Entries are variable-size (see as_endpoint.addr).
+	 */
+	as_endpoint endpoints[];
+}__attribute__((__packed__)) as_endpoint_list;
+
+/**
+ * Iterate function for iterating over endpoints in an endpoint list.
+ * @param endpoint current endpoint in the iteration.
+ * @param udata udata passed through from the invoker of the iterate function.
+ */
+typedef void
+	(*as_endpoint_iterate_fn)(const as_endpoint* endpoint, void* udata);
+
+/**
+ * Filter function for endpoints in an endpoint list.
+ * @param endpoint current endpoint in the iteration.
+ * @param udata udata passed through from the invoker of the filter function.
+ * @return should return true if this endpoint passes the filter, false if it + * fails the filter. + */ +typedef bool + (*as_endpoint_filter_fn)(const as_endpoint* endpoint, void* udata); + +/** + * Get the size of an endpoint. Accounts for variable size of the address field. + * @return the size of the endpoint. Zero if the endpoint address is + * invalid. + */ +size_t +as_endpoint_sizeof(const as_endpoint* endpoint); + +/** + * Enable a capability on an endpoint given its mask. + * @param endpoint the endpoint. + * @param capability_mask the capability mask. + */ +void +as_endpoint_capability_enable(as_endpoint* endpoint, uint8_t capability_mask); + +/** + * Disable a capability on an endpoint given its mask. + * @param endpoint the endpoint. + * @param capability_mask the capability mask. + */ +void +as_endpoint_capability_disable(as_endpoint* endpoint, uint8_t capability_mask); + +/** + * Connect to an endpoint. + * + * @param endpoint the peer endpoint to connect to. + * @param timeout the overall connect timeout. + * @param sock (output) will be populated if the connection is successful. + * @return -1 on success, 0 on failure. NOTE(review): this looks inverted; + * confirm against the implementation. + */ +int +as_endpoint_connect(const as_endpoint* endpoint, int32_t timeout, cf_socket* sock); + +/** + * Connect to the best matching endpoint in the endpoint list. + * + * @param endpoint_list the list of endpoints. + * @param filter_fn filter function to discard incompatible endpoints. Can be + * NULL. + * @param filter_udata udata passed on as is to the filter function. + * @param timeout the overall connect timeout. + * @param sock (output) will be populated if connection is successful. + * @return the connected endpoint on success, NULL if no endpoint could be + * connected. 
+ */ +const as_endpoint* +as_endpoint_connect_any(const as_endpoint_list* endpoint_list, + as_endpoint_filter_fn filter_fn, void* filter_udata, int32_t timeout, cf_socket* sock); +/** + * Convert a socket configuration to an endpoint in place. + * @param src the source socket configuration. + * @param endpoint (output) the endpoint to fill in; nothing is returned. + */ +void +as_endpoint_from_sock_cfg_fill(const cf_sock_cfg* src, as_endpoint* endpoint); + +/** + * Convert a socket configuration to an endpoint. + * @return a heap allocated, converted endpoint. Should be freed using cf_free + * once the endpoint is no longer needed. + */ +as_endpoint* +as_endpoint_from_sock_cfg(const cf_sock_cfg* src); + +/** + * Convert an endpoint to a cf_sock_addr. + * @param endpoint the source endpoint. + * @param sock_addr the target socket address. + */ +int +as_endpoint_to_sock_addr(const as_endpoint* endpoint, cf_sock_addr* sock_addr); + +/** + * Indicates if an endpoint supports listed capabilities. + * @return true if the endpoint supports the input capability. + */ +bool +as_endpoint_capability_is_supported(const as_endpoint* endpoint, uint8_t capability_mask); + +/** + * Iterate over endpoints in an endpoint list and invoke the iterate function + * for each endpoint. + * @param endpoint_list the endpoint list to iterate over. + * @param iterate_fn the iterate function invoked for each endpoint in the list. + * @param udata passed as is to the iterate function. Useful for getting results + * out of the iteration. + * (No return value.) + */ +void +as_endpoint_list_iterate(const as_endpoint_list* endpoint_list, as_endpoint_iterate_fn iterate_fn, + void* udata); + +/** + * Return the in memory size in bytes of the endpoint list. + * @param endpoint_list the endpoint list. + * @param size (output) the size of the list on success. + * @return 0 on successful size calculation, -1 otherwise. 
+ */ +int +as_endpoint_list_sizeof(const as_endpoint_list* endpoint_list, size_t* size); + +/** + * Return the in memory size in bytes of the endpoint list, but abort if the + * size of the read exceeds the input size. + * @param endpoint_list the endpoint list. + * @param size (output) the size of the list on success. + * @param size_max the maximum size until which parsing will be attempted. + * @return 0 on successful size calculation, -1 otherwise. + */ +int +as_endpoint_list_nsizeof(const as_endpoint_list* endpoint_list, size_t* size, size_t size_max); + +/** + * Convert a server configuration to an endpoint list in place into the + * destination endpoint list. + * @param serv_cfg source server configuration. + * @param endpoint_list destination endpoint list. + */ +void +as_endpoint_list_from_serv_cfg_fill(const cf_serv_cfg* serv_cfg, as_endpoint_list* endpoint_list); + +/** + * Convert a server configuration to an endpoint list. + * @param serv_cfg server configuration. + * @return a heap allocated endpoint list. Should be freed using cf_free + * once the endpoint list is no longer needed. + */ +as_endpoint_list* +as_endpoint_list_from_serv_cfg(const cf_serv_cfg* serv_cfg); + +/** + * Compare two endpoint lists for equality. + * @param list1 the first list. NULL allowed. + * @param list2 the second list. NULL allowed. + * @return true iff the lists are equal, false otherwise. + */ +bool +as_endpoint_lists_are_equal(const as_endpoint_list* list1, const as_endpoint_list* list2); + +/** + * Check if two lists overlap in at least one endpoint. + * @param list1 the first list. NULL allowed. + * @param list2 the second list. NULL allowed. + * @param ignore_capabilities set to true if the overlap match should ignore + * node capabilities, false if capabilities should also be matched. + * @return true iff the lists overlap, false otherwise. 
+ */ +bool +as_endpoint_lists_are_overlapping(const as_endpoint_list* list1, const as_endpoint_list* list2, + bool ignore_capabilities); + +/** + * Convert an endpoint list to a string. + * @param endpoint_list the input list. NULL allowed. + * @param buffer the output buffer. + * @param buffer_capacity the capacity of the output buffer. + * @return the number of characters printed (excluding the null byte used to + * end output to strings) + */ +int +as_endpoint_list_to_string(const as_endpoint_list* endpoint_list, char* buffer, + size_t buffer_capacity); + +/** + * Convert an endpoint list to a string matching capabilities. + * @param endpoint_list the input list. NULL allowed. + * @param buffer the output buffer. + * @param buffer_capacity the capacity of the output buffer. + * @param capability_mask specifies which bit to match. + * @param capabilities specifies capabilities to be matched for. + * @return the number of characters printed (excluding the null byte used to + * end output to strings) + */ +int +as_endpoint_list_to_string_match_capabilities( + const as_endpoint_list* endpoint_list, char* buffer, + size_t buffer_capacity, uint8_t capability_mask, uint8_t capabilities); + +/** + * Populate dyn buf with endpoints info. + * @param endpoint_list the input list. NULL allowed. + * @param db the dynamic buffer. + */ +void +as_endpoint_list_info(const as_endpoint_list* endpoint_list, cf_dyn_buf* db); diff --git a/as/include/fabric/exchange.h b/as/include/fabric/exchange.h new file mode 100644 index 00000000..c24cd34b --- /dev/null +++ b/as/include/fabric/exchange.h @@ -0,0 +1,158 @@ +/* + * exchange.h + * + * Copyright (C) 2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. 
+ * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +#include +#include + +#include "citrusleaf/cf_atomic.h" +#include "citrusleaf/cf_vector.h" + +#include "dynbuf.h" +#include "node.h" + +/* + * ---------------------------------------------------------------------------- + * Constants + * ---------------------------------------------------------------------------- + */ + +/** + * Number of quantum intervals in orphan state after which client transactions + * will be blocked. + */ +#define AS_EXCHANGE_REVERT_ORPHAN_INTERVALS 5 + +/* + * ---------------------------------------------------------------------------- + * Typedefs. + * ---------------------------------------------------------------------------- + */ + +/** + * Exchange event raised for every well-formed cluster change, after exchange + * concludes successfully. + */ +typedef struct as_exchange_cluster_changed_event_s +{ + /** + * The new cluster key. + */ + uint64_t cluster_key; + + /** + * The new cluster size. + */ + uint32_t cluster_size; + + /** + * The new succession list. + */ + cf_node* succession; +} as_exchange_cluster_changed_event; + +/** + * Cluster change event call back function for cluster changed event listeners. 
+ */ +typedef void +(*as_exchange_cluster_changed_cb)( + const as_exchange_cluster_changed_event* event, void* udata); + +/* + * ---------------------------------------------------------------------------- + * Public API. + * ---------------------------------------------------------------------------- + */ +/** + * Initialize exchange subsystem. + */ +void +as_exchange_init(); + +/** + * Start exchange subsystem. + */ +void +as_exchange_start(); + +/** + * Stop exchange subsystem. + */ +void +as_exchange_stop(); + +/** + * Register to receive cluster-changed events. + * TODO - may replace with simple static list someday. + */ +void +as_exchange_register_listener(as_exchange_cluster_changed_cb cb, void* udata); + +/** + * Dump exchange state to log. + */ +void +as_exchange_dump(bool verbose); + +/** + * Member-access method. + */ +uint64_t +as_exchange_cluster_key(); + +/** + * Member-access method. + */ +uint32_t +as_exchange_cluster_size(); + +/** + * Copy over the committed succession list. + * Ensure the input vector has enough capacity. + */ +void +as_exchange_succession(cf_vector* succession); + +/** + * Return the committed succession list as a string in a dyn-buf. + */ +void +as_exchange_info_get_succession(cf_dyn_buf* db); + +/** + * Member-access method. + */ +cf_node +as_exchange_principal(); + +/** + * Lock before setting or getting exchanged info from non-exchange thread. + */ +void +as_exchange_info_lock(); + +/** + * Unlock after setting or getting exchanged info from non-exchange thread. + */ +void +as_exchange_info_unlock(); diff --git a/as/include/fabric/fabric.h b/as/include/fabric/fabric.h new file mode 100644 index 00000000..20734fe5 --- /dev/null +++ b/as/include/fabric/fabric.h @@ -0,0 +1,129 @@ +/* + * fabric.h + * + * Copyright (C) 2008-2017 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. 
+ * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +//========================================================== +// Includes. +// + +#include +#include +#include + +#include "msg.h" +#include "node.h" +#include "socket.h" +#include "tls.h" + + +//========================================================== +// Forward declarations. +// + +struct as_endpoint_list_s; +struct as_hb_plugin_node_data_s; + + +//========================================================== +// Typedefs & constants. 
+// + +#define AS_FABRIC_SUCCESS (0) +#define AS_FABRIC_ERR_UNKNOWN (-1) // used by transact +#define AS_FABRIC_ERR_NO_NODE (-3) +#define AS_FABRIC_ERR_TIMEOUT (-6) // used by transact + +typedef enum { + AS_FABRIC_CHANNEL_RW = 0, // duplicate resolution and replica writes + AS_FABRIC_CHANNEL_CTRL = 1, // clustering, migration ctrl and services info + AS_FABRIC_CHANNEL_BULK = 2, // migrate records + AS_FABRIC_CHANNEL_META = 3, // smd + + AS_FABRIC_N_CHANNELS +} as_fabric_channel; + +#define MAX_FABRIC_CHANNEL_THREADS 128 +#define MAX_FABRIC_CHANNEL_SOCKETS 128 + +typedef struct fabric_rate_s { + uint64_t s_bytes[AS_FABRIC_N_CHANNELS]; + uint64_t r_bytes[AS_FABRIC_N_CHANNELS]; +} fabric_rate; + +typedef int (*as_fabric_msg_fn) (cf_node node_id, msg *m, void *udata); +typedef int (*as_fabric_transact_recv_fn) (cf_node node_id, msg *m, void *transact_data, void *udata); +typedef int (*as_fabric_transact_complete_fn) (msg *rsp, void *udata, int err); + + +//========================================================== +// Globals. +// + +extern cf_serv_cfg g_fabric_bind; +extern cf_tls_info *g_fabric_tls; + + +//========================================================== +// Public API. 
+// + +//------------------------------------------------ +// msg +// + +msg *as_fabric_msg_get(msg_type type); +void as_fabric_msg_put(msg *m); +void as_fabric_msg_queue_dump(void); + +//------------------------------------------------ +// as_fabric +// + +int as_fabric_init(void); +int as_fabric_start(void); +void as_fabric_set_recv_threads(as_fabric_channel channel, uint32_t count); +int as_fabric_send(cf_node node_id, msg *m, as_fabric_channel channel); +int as_fabric_send_list(const cf_node *nodes, uint32_t node_count, msg *m, as_fabric_channel channel); +void as_fabric_register_msg_fn(msg_type type, const msg_template *mt, size_t mt_sz, size_t scratch_sz, as_fabric_msg_fn msg_cb, void *msg_udata); +void as_fabric_info_peer_endpoints_get(cf_dyn_buf *db); +bool as_fabric_is_published_endpoint_list(const struct as_endpoint_list_s *list); +struct as_endpoint_list_s *as_fabric_hb_plugin_get_endpoint_list(struct as_hb_plugin_node_data_s *plugin_data); +void as_fabric_rate_capture(fabric_rate *rate); +void as_fabric_dump(bool verbose); + + +//============================================================================== +// Fabric transact. +// + +// Used to send a request, and receive a response, reliably. This is guaranteed +// to NEVER return an error directly, but might call the callback function +// saying that we ran out of time or had some other error. +// +// Requires field 0 be a uint64_t which will be used by the fabric system - an +// unknown error will be thrown if this is not true. 
+ +void as_fabric_transact_init(void); +void as_fabric_transact_start(cf_node node_id, msg *m, int timeout_ms, as_fabric_transact_complete_fn cb, void *userdata); +int as_fabric_transact_register(msg_type type, const msg_template *mt, size_t mt_sz, size_t scratch_sz, as_fabric_transact_recv_fn cb, void *udata); +int as_fabric_transact_reply(msg *reply_msg, void *transact_data); diff --git a/as/include/fabric/hb.h b/as/include/fabric/hb.h new file mode 100644 index 00000000..3462ef5b --- /dev/null +++ b/as/include/fabric/hb.h @@ -0,0 +1,473 @@ +/* + * hb.h + * + * Copyright (C) 2008-2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +#include +#include + +#include "citrusleaf/cf_atomic.h" +#include "citrusleaf/cf_vector.h" + +#include "msg.h" +#include "socket.h" +#include "tls.h" + +#include "fabric/hlc.h" + +/** + * Maximum number of nodes in a cluster. + */ +#ifndef AS_CLUSTER_SZ +#define AS_CLUSTER_SZ 8 +#endif + +/** + * Minimum heartbeat interval. + */ +#define AS_HB_TX_INTERVAL_MS_MIN 50 + +/** + * Maximum heartbeat interval. (10 mins) + */ +#define AS_HB_TX_INTERVAL_MS_MAX 600000 + +/** + * Minimum max-intervals-missed. + */ +#define AS_HB_MAX_INTERVALS_MISSED_MIN 3 + +/** + * Heartbeat modes. 
+ */ +typedef enum as_hb_mode_enum +{ + AS_HB_MODE_UNDEF, + AS_HB_MODE_MULTICAST, + AS_HB_MODE_MESH +} as_hb_mode; + +/** + * Heartbeat protocol versions. + */ +typedef enum as_hb_protocol_enum +{ + AS_HB_PROTOCOL_UNDEF, + AS_HB_PROTOCOL_NONE, + AS_HB_PROTOCOL_RESET, + AS_HB_PROTOCOL_V3 +} as_hb_protocol; + +/** + * Events published by the heartbeat subsystem. + */ +typedef enum +{ + AS_HB_NODE_ARRIVE, + AS_HB_NODE_DEPART, + AS_HB_NODE_ADJACENCY_CHANGED, + AS_HB_NODE_EVENT_SENTINEL +} as_hb_event_type; + +/** + * A plugin that is publishing and receiving data via the heartbeat subsystem. + * The heartbeat outgoing message buffer will be populated and parsed in the + * order of this enum. + */ +typedef enum +{ + /** + * The heartbeat subsystem itself. + */ + AS_HB_PLUGIN_HB, + /** + * The older clustering subsystem. + * TODO: Use only one plugin id and register differently based on the + * clustering version. + */ + AS_HB_PLUGIN_FABRIC, + /** + * The clustering subsystem. + */ + AS_HB_PLUGIN_CLUSTERING, + /** + * The skew monitor. + */ + AS_HB_PLUGIN_SKEW_MONITOR, + /** + * Dummy sentinel enum value. Should be the last. + */ + AS_HB_PLUGIN_SENTINEL +} as_hb_plugin_id; + +/** + * The fields in the heartbeat message. + * New field additions only at the end. + */ +typedef enum +{ + /** + * HB protocol identifier. + */ + AS_HB_MSG_ID, + + /** + * HB subsystem message type. + */ + AS_HB_MSG_TYPE, + + /** + * HB message source. + */ + AS_HB_MSG_NODE, + + /** + * Cluster Name. + */ + AS_HB_MSG_CLUSTER_NAME, + + /** + * HLC timestamp. + */ + AS_HB_MSG_HLC_TIMESTAMP, + + /** + * Heartbeats endpoints advertised by this node. + */ + AS_HB_MSG_ENDPOINTS, + + /** + * Payload for compressed messages. + */ + AS_HB_MSG_COMPRESSED_PAYLOAD, + + /** + * Mesh info request. + */ + AS_HB_MSG_INFO_REQUEST, + + /** + * Mesh info reply. + */ + AS_HB_MSG_INFO_REPLY, + + /* + * ---- Plugin data fields. Potentially extensible ---- + */ + /** + * Fabric data advertised by this node. 
Placed close to hb endpoints to + * help compression, because it would most likely match with hb endpoints. + */ + AS_HB_MSG_FABRIC_DATA, + + /** + * Valid only for pulse messages, has adjacency list and clusterid. + */ + AS_HB_MSG_HB_DATA, + + /** + * Contains the cluster key and succession list. + */ + AS_HB_MSG_PAXOS_DATA, + + /** + * Local physical clock monotonic timestamp for when the message was sent. + */ + AS_HB_MSG_SKEW_MONITOR_DATA +} as_hb_msg_fields; + +/** + * Heartbeat subsystem configuration. + */ +typedef struct as_hb_config_s +{ + /** + * Mode of operation. Mesh or Multicast for now. + */ + as_hb_mode mode; + + /** + * Binding interface config. + */ + cf_serv_cfg bind_cfg; + + /** + * Global TLS configuration. + */ + + cf_tls_info *tls; + + /** + * Multicast mode only config for multicast groups. + */ + cf_mserv_cfg multicast_group_cfg; + + /** + * The interval at which heartbeat pulse messages are sent in milliseconds. + */ + uint32_t tx_interval; + + /** + * Max number of missed heartbeat intervals after which a node is considered + * expired. + */ + uint32_t max_intervals_missed; + + /** + * The ttl for multicast packets. Set to zero for default TTL. + */ + uint8_t multicast_ttl; + + /** + * HB protocol to use. + */ + as_hb_protocol protocol; + + /** + * Set to a value > 0 to override the MTU read from the network interface. + */ + uint32_t override_mtu; + + /** + * Mesh seeds from config file. + * Only used for during config parsing and initialization. + */ + char* mesh_seed_addrs[AS_CLUSTER_SZ]; + int mesh_seed_ports[AS_CLUSTER_SZ]; + bool mesh_seed_tls[AS_CLUSTER_SZ]; + +} as_hb_config; + +/** + * Heartbeat published event structure. + */ +typedef struct as_hb_event_node_s +{ + /** + * The type of the event. + */ + as_hb_event_type evt; + + /** + * The event nodeid. + */ + cf_node nodeid; + + /** + * The monotonic timestamp when this event happened. + */ + cf_clock event_time; + + /** + * The monotonic timestamp when this event was detected. 
Will differ from + * event_time for node depart events. + */ + cf_clock event_detected_time; +} as_hb_event_node; + +/** + * A hook to allow plugin to publish its data as a part of the heartbeat + * message. + */ +typedef void (*as_hb_plugin_set_data_fn)(msg* hb_message); + +/** + * Data stored for an adjacent node for a plugin. + */ +typedef struct as_hb_plugin_node_data_s +{ + /** + * Heap allocated node specific data blob for this plugin. + */ + void* data; + + /** + * The size of the stored data. + */ + size_t data_size; + + /** + * The capacity of the allocated data structure. + */ + size_t data_capacity; +} as_hb_plugin_node_data; + +/** + * A function to parse plugin data for a node into an in memory object. Should + * be fast and never acquire locks. + * + * The parameter plugin_data->data will always be a pointer to a previously + * allocated memory location. plugin_data->data_capacity will indicate the + * capacity of this memory. Implementations should reuse this previously + * allocated data blob to avoid the overhead of heap allocations. If current + * data capacity is less than the new data size please invoke cf_realloc and + * get a new block for current data and update plugin_data->data and + * plugin_data->data_capacity accordingly. + * + * This function should always set data_size correctly before returning. Set + * plugin_data->data_size = 0 for no plugin data. + * + * @param hb_message the heartbeat message. + * @param source the source node. + * @param plugin_data (output) plugin data structure to output parsed data. + */ +typedef void (*as_hb_plugin_parse_data_fn)(msg* hb_message, cf_node source, as_hb_plugin_node_data* plugin_data); + +/** + * A listener for detecting changes to this plugin's data for a particular node. + * Does not supply old and new values of the data, because does not seem to be + * required currently and to keep implementation simple. + * + * @param nodeid the node whose plugin data changed. 
+ */ +typedef void (*as_hb_plugin_data_changed_fn)(cf_node nodeid); + +/** + * A plugin allows a module to push and read data with heartbeat pulse + * messages. + */ +typedef struct as_hb_plugin_s +{ + /** + * The plugin id. + */ + as_hb_plugin_id id; + + /** + * Fixed plugin data size on wire. + */ + size_t wire_size_fixed; + + /** + * Additional plugin data size on wire per node in the adjacency list. + */ + size_t wire_size_per_node; + + /** + * The function which adds this plugin's data to the pulse message. Can be + * NULL. This function can hold the plugin module's locks. + */ + as_hb_plugin_set_data_fn set_fn; + + /** + * A function which parses and reads this plugin's data from an incoming + * message. Can be NULL. This function SHOULD NOT hold the plugin module's + * locks to prevent deadlocks. + */ + as_hb_plugin_parse_data_fn parse_fn; + + /** + * A function invoked when plugin data for a particular node changed. + * Can be NULL. This function can hold the plugin module's locks. 
+ */ + as_hb_plugin_data_changed_fn change_listener; +} as_hb_plugin; + +/* + * ----------------------------------------------------------------- + * HB subsystem public API + * ----------------------------------------------------------------- + */ + +void as_hb_init(); + +void as_hb_start(); + +void as_hb_shutdown(); + +bool as_hb_self_is_duplicate(); + +bool as_hb_node_is_adjacent(cf_node nodeid); + +typedef void (*as_hb_event_fn)(int nevents, as_hb_event_node* events, void* udata); + +void as_hb_register_listener(as_hb_event_fn event_callback, void* udata); + +void as_hb_dump(bool verbose); + +as_hb_protocol as_hb_protocol_get(); + +int as_hb_protocol_set(as_hb_protocol protocol); + +uint32_t as_hb_node_timeout_get(); + +void as_hb_override_mtu_set(int mtu); + +uint32_t as_hb_tx_interval_get(); + +int as_hb_tx_interval_set(uint32_t new_interval); + +int as_hb_max_intervals_missed_set(uint32_t new_max); + +uint32_t as_hb_node_timeout_get(); + +bool as_hb_max_cluster_size_isvalid(uint32_t max_cluster_size); + +/* + * ----------------------------------------------------------------- + * HB plugin subsystem public API. 
+ * ----------------------------------------------------------------- + */ + +void as_hb_plugin_register(as_hb_plugin* plugin); + +bool as_hb_is_alive(cf_node nodeid); + +void as_hb_config_validate(); + +void as_hb_maximal_clique_evict(cf_vector* nodes, cf_vector* nodes_to_evict); + +int as_hb_plugin_data_get(cf_node nodeid, as_hb_plugin_id plugin, as_hb_plugin_node_data* plugin_data, as_hlc_msg_timestamp* msg_hlc_ts, cf_clock* recv_monotonic_ts); + +typedef void (*as_hb_plugin_data_iterate_fn)(cf_node nodeid, void* plugin_data, size_t plugin_data_size, cf_clock recv_monotonic_ts, as_hlc_msg_timestamp* msg_hlc_ts, void* udata); + +void as_hb_plugin_data_iterate(cf_vector* nodes, as_hb_plugin_id plugin, as_hb_plugin_data_iterate_fn iterate_fn, void* udata); + +void as_hb_plugin_data_iterate_all(as_hb_plugin_id plugin, as_hb_plugin_data_iterate_fn iterate_fn, void* udata); + +/* + * ----------------------------------------------------------------- + * Info public API + * ----------------------------------------------------------------- + */ + +void as_hb_info_config_get(cf_dyn_buf* db); + +void as_hb_info_endpoints_get(cf_dyn_buf* db); + +void as_hb_info_listen_addr_get(as_hb_mode* mode, char* addr_port, size_t addr_port_capacity); + +void as_hb_info_duplicates_get(cf_dyn_buf* db); + +/* + * ----------------------------------------------------------------- + * Mesh mode public API + * ----------------------------------------------------------------- + */ + +int as_hb_mesh_tip(char* host, int port, bool tls); + +int as_hb_mesh_tip_clear(char* host, int port); + +int as_hb_mesh_tip_clear_all(uint32_t* cleared); + +void as_hb_config_validate(); diff --git a/as/include/fabric/hlc.h b/as/include/fabric/hlc.h new file mode 100644 index 00000000..4bb7fbdf --- /dev/null +++ b/as/include/fabric/hlc.h @@ -0,0 +1,160 @@ +/* + * hlc.h + * + * Copyright (C) 2008-2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. 
under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +/* + * Hybrid logical clock as described in + * http://www.cse.buffalo.edu/tech-reports/2014-04.pdf. + * + */ + +#pragma once + +#include +#include + +#include "citrusleaf/cf_clock.h" + +#include "node.h" + +/** + * A hybrid logical clock timestamp. + * + * The most significant 48 bits represent the physical component of the hlc and + * the least significant 16 bits represent the logical component. + */ +typedef uint64_t as_hlc_timestamp; + +/** + * Timestamp for a message receive event. + */ +typedef struct as_hlc_msg_timestamp_s +{ + /** + * The sender's HLC timestamp at time when the message was sent. + */ + as_hlc_timestamp send_ts; + /** + * Local HLC timestamp at message receipt. + */ + as_hlc_timestamp recv_ts; +} as_hlc_msg_timestamp; + +/** + * Result of ordering two hlc timestamps. + */ +typedef enum as_hlc_timestamp_order_e { + /** + * The event with first timestamp happened before. + */ + AS_HLC_HAPPENS_BEFORE, + /** + * The event with first timestamp happened after. + */ + AS_HLC_HAPPENS_AFTER, + /** + * The order of the timestamps is indeterminated. + */ + AS_HLC_ORDER_INDETERMINATE +} as_hlc_timestamp_order; + +/*---------------------------------------------------------------------------- + * Public API. 
+ *----------------------------------------------------------------------------*/ +/** + * Initialize hybrid logical clock. + */ +void as_hlc_init(); + +/** + * Return a hlc timestamp representing the hlc time "now". + */ +as_hlc_timestamp as_hlc_timestamp_now(); + +/** + * Return the physical component of a hlc timestamp. + * @param hlc_ts the hybrid logical clock timestamp. + */ +cf_clock as_hlc_physical_ts_get(as_hlc_timestamp hlc_ts); + +/** + * Update the HLC on receipt of a remote message. The notion is to adjust this + * node's hlc to ensure the receive hlc ts > the send hlc ts. + * + * @param source for debugging and tracking only. + * @param send_ts the hlc timestamp when this message was sent. + * @param msg_ts (output) the message receive timestamp which will be + * populated. Can be NULL in which case it will be ignored. + */ +void as_hlc_timestamp_update(cf_node source, as_hlc_timestamp send_ts, + as_hlc_msg_timestamp* msg_ts); + +/** + * Return the difference in milliseconds between two hlc timestamps. Note this + * difference may be greater than or equal to the physical wall clock difference, + * because HLC can have non linear jumps, whenever the clock is adjusted. The + * difference should be used as an estimate rather than an absolute difference. + * For e.g. use the difference to check that the time difference is at least + * some number of milliseconds. However do not use this for interval statistics + * or to check if the difference in time is at the most some number of + * milliseconds. + * + * @param ts1 the first timestamp. + * @param ts2 the second timestamp. + * @return ts1 - ts2 in milliseconds. + */ +int64_t as_hlc_timestamp_diff_ms(as_hlc_timestamp ts1, as_hlc_timestamp ts2); + +/** + * Orders a local timestamp and remote message send timestamp. + * + * @param local_ts the local timestamp. + * @param msg_ts message receive timestamp containing the remote send and the + * local receive timestamp. 
+ * @return the order between the local and the message timestamp. + */ +as_hlc_timestamp_order as_hlc_send_timestamp_order( + as_hlc_timestamp local_ts, as_hlc_msg_timestamp* msg_ts); + +/** + * Orders two timestamps generated by the same node / process. + * + * @param ts1 the first timestamp. + * @param ts2 the second timestamp. + * @return AS_HLC_HAPPENS_BEFORE if ts1 happens before ts2 else + * AS_HLC_HAPPENS_AFTER if ts1 happens after ts2 else + * AS_HLC_ORDER_INDETERMINATE. + */ +as_hlc_timestamp_order as_hlc_timestamp_order_get(as_hlc_timestamp ts1, + as_hlc_timestamp ts2); + +/** + * Subtract milliseconds worth of time from the timestamp. + * @param timestamp the input timestamp. + * @param ms the number of milliseconds to subtract. + */ +as_hlc_timestamp as_hlc_timestamp_subtract_ms(as_hlc_timestamp timestamp, + int ms); + +/** + * Dump some debugging information to the logs. + */ +void as_hlc_dump(bool verbose); diff --git a/as/include/fabric/meta_batch.h b/as/include/fabric/meta_batch.h new file mode 100644 index 00000000..1a895f5c --- /dev/null +++ b/as/include/fabric/meta_batch.h @@ -0,0 +1,42 @@ +/* + * meta_batch.h + * + * Copyright (C) 2017 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program.
If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +//========================================================== +// Forward declarations. +// + +struct meta_in_q_s; +struct meta_out_q_s; + + +//========================================================== +// Public API. +// + +struct meta_in_q_s *meta_in_q_create(); +void meta_in_q_destroy(struct meta_in_q_s *iq); +void meta_in_q_rejected(struct meta_in_q_s *iq); + +struct meta_out_q_s *meta_out_q_create(); +void meta_out_q_destroy(struct meta_out_q_s *oq); diff --git a/as/include/fabric/migrate.h b/as/include/fabric/migrate.h new file mode 100644 index 00000000..80caa0d2 --- /dev/null +++ b/as/include/fabric/migrate.h @@ -0,0 +1,215 @@ +/* + * migrate.h + * + * Copyright (C) 2008-2014 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +//========================================================== +// Includes. 
+// + +#include +#include + +#include "citrusleaf/cf_atomic.h" +#include "citrusleaf/cf_digest.h" +#include "citrusleaf/cf_queue.h" +#include "citrusleaf/cf_rchash.h" + +#include "msg.h" +#include "node.h" +#include "shash.h" + +#include "fabric/hb.h" +#include "fabric/partition.h" +#include "fabric/partition_balance.h" + + +//========================================================== +// Forward declarations. +// + +struct as_index_s; +struct as_index_ref_s; +struct as_namespace_s; +struct as_remote_record_s; +struct meta_in_q_s; +struct meta_out_q_s; +struct pb_task_s; + + +//========================================================== +// Typedefs & constants. +// + +// For receiver-side migration flow-control. +// TODO - move to namespace? Go even lower than 4? +#define AS_MIGRATE_DEFAULT_MAX_NUM_INCOMING 4 +#define AS_MIGRATE_LIMIT_MAX_NUM_INCOMING 256 + +// Maximum permissible number of migrate xmit threads. +#define MAX_NUM_MIGRATE_XMIT_THREADS 100 + +#define TX_FLAGS_NONE ((uint32_t) 0x0) +#define TX_FLAGS_ACTING_MASTER ((uint32_t) 0x1) + + +//========================================================== +// Public API. +// + +void as_migrate_init(); +void as_migrate_emigrate(const struct pb_task_s *task); +void as_migrate_set_num_xmit_threads(uint32_t n_threads); +void as_migrate_dump(bool verbose); + + +//========================================================== +// Private API - for enterprise separation only. +// + +typedef enum { + // These values go on the wire, so mind backward compatibility if changing. 
+ MIG_FIELD_OP, + MIG_FIELD_UNUSED_1, + MIG_FIELD_EMIG_ID, + MIG_FIELD_NAMESPACE, + MIG_FIELD_PARTITION, + MIG_FIELD_DIGEST, + MIG_FIELD_GENERATION, + MIG_FIELD_RECORD, + MIG_FIELD_CLUSTER_KEY, + MIG_FIELD_UNUSED_9, + MIG_FIELD_VOID_TIME, + MIG_FIELD_UNUSED_11, + MIG_FIELD_UNUSED_12, + MIG_FIELD_INFO, + MIG_FIELD_UNUSED_14, + MIG_FIELD_UNUSED_15, + MIG_FIELD_UNUSED_16, + MIG_FIELD_UNUSED_17, + MIG_FIELD_UNUSED_18, + MIG_FIELD_LAST_UPDATE_TIME, + MIG_FIELD_FEATURES, + MIG_FIELD_UNUSED_21, + MIG_FIELD_META_RECORDS, + MIG_FIELD_META_SEQUENCE, + MIG_FIELD_META_SEQUENCE_FINAL, + MIG_FIELD_PARTITION_SIZE, + MIG_FIELD_SET_NAME, + MIG_FIELD_KEY, + MIG_FIELD_UNUSED_28, + MIG_FIELD_EMIG_INSERT_ID, + + NUM_MIG_FIELDS +} migrate_msg_fields; + +#define OPERATION_UNDEF 0 +#define OPERATION_INSERT 1 +#define OPERATION_INSERT_ACK 2 +#define OPERATION_START 3 +#define OPERATION_START_ACK_OK 4 +#define OPERATION_START_ACK_EAGAIN 5 +#define OPERATION_START_ACK_FAIL 6 +#define OPERATION_UNUSED_7 7 // deprecated +#define OPERATION_DONE 8 +#define OPERATION_DONE_ACK 9 +#define OPERATION_UNUSED_10 10 // deprecated +#define OPERATION_MERGE_META 11 +#define OPERATION_MERGE_META_ACK 12 +#define OPERATION_ALL_DONE 13 +#define OPERATION_ALL_DONE_ACK 14 + +#define MIG_INFO_UNUSED_1 0x0001 +#define MIG_INFO_UNUSED_2 0x0002 +#define MIG_INFO_UNREPLICATED 0x0004 // enterprise only +#define MIG_INFO_TOMBSTONE 0x0008 // enterprise only + +#define MIG_FEATURE_MERGE 0x00000001U +#define MIG_FEATURES_SEEN 0x80000000U // needed for backward compatibility +extern const uint32_t MY_MIG_FEATURES; + +typedef struct emigration_s { + cf_node dest; + uint64_t cluster_key; + uint32_t id; + pb_task_type type; + uint32_t tx_flags; + cf_atomic32 state; + bool aborted; + bool from_replica; + uint64_t wait_until_ms; + + cf_atomic32 bytes_emigrating; + cf_shash *reinsert_hash; + uint64_t insert_id; + cf_queue *ctrl_q; + struct meta_in_q_s *meta_q; + + as_partition_reservation rsv; +} emigration; + +typedef struct 
immigration_s { + cf_node src; + uint64_t cluster_key; + uint32_t pid; + + cf_atomic32 done_recv; // flag - 0 if not yet received, atomic counter for receives + uint64_t start_recv_ms; // time the first START event was received + uint64_t done_recv_ms; // time the first DONE event was received + + uint32_t emig_id; + struct meta_out_q_s *meta_q; + + as_migrate_result start_result; + uint32_t features; + struct as_namespace_s *ns; // for statistics only + + as_partition_reservation rsv; +} immigration; + +typedef struct immigration_hkey_s { + cf_node src; + uint32_t emig_id; +} __attribute__((__packed__)) immigration_hkey; + + +// Globals. +extern cf_rchash *g_emigration_hash; +extern cf_rchash *g_immigration_hash; + + +// Emigration, immigration, & pickled record destructors. +void emigration_release(emigration *emig); +void immigration_release(immigration *immig); + +// Emigration. +bool should_emigrate_record(emigration *emig, struct as_index_ref_s *r_ref); +uint32_t emigration_pack_info(const emigration *emig, const struct as_index_s *r); + +// Migrate fabric message handling. +void emigration_handle_meta_batch_request(cf_node src, msg *m); +bool immigration_ignore_pickle(const uint8_t *buf, uint32_t info); +void immigration_init_repl_state(struct as_remote_record_s* rr, uint32_t info); +void immigration_handle_meta_batch_ack(cf_node src, msg *m); + +// Meta sender. +bool immigration_start_meta_sender(immigration *immig, uint32_t emig_features, uint64_t emig_n_recs); diff --git a/as/include/fabric/partition.h b/as/include/fabric/partition.h new file mode 100644 index 00000000..f8e59189 --- /dev/null +++ b/as/include/fabric/partition.h @@ -0,0 +1,285 @@ +/* + * partition.h + * + * Copyright (C) 2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. 
+ * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +//========================================================== +// Includes. +// + +#include +#include +#include +#include +#include + +#include "citrusleaf/cf_atomic.h" +#include "citrusleaf/cf_digest.h" + +#include "dynbuf.h" +#include "node.h" + +#include "base/cfg.h" +#include "fabric/hb.h" + + +//========================================================== +// Forward declarations. +// + +struct as_index_tree_s; +struct as_namespace_s; + + +//========================================================== +// Typedefs & constants. 
+// + +#define AS_PARTITIONS 4096 +#define AS_PARTITION_MASK (AS_PARTITIONS - 1) + +#define VERSION_FAMILY_BITS 4 +#define VERSION_FAMILY_UNIQUE ((1 << VERSION_FAMILY_BITS) - 1) +#define AS_PARTITION_N_FAMILIES VERSION_FAMILY_UNIQUE + +typedef struct as_partition_version_s { + uint64_t ckey:48; + uint64_t family:VERSION_FAMILY_BITS; + uint64_t unused:8; + uint64_t revived:1; // enterprise only + uint64_t master:1; + uint64_t subset:1; + uint64_t evade:1; +} as_partition_version; + +COMPILER_ASSERT(sizeof(as_partition_version) == sizeof(uint64_t)); + +typedef struct as_partition_version_string_s { + char s[19 + 1]; // format CCCCccccCCCC.F.mse - F may someday be 2 characters +} as_partition_version_string; + +typedef struct as_partition_s { + pthread_mutex_t lock; + + uint32_t id; + + struct as_index_tree_s* vp; + + cf_atomic64 n_tombstones; // relevant only for enterprise edition + cf_atomic64 max_void_time; // TODO - convert to 32-bit ... + + // Replica information. + uint32_t n_nodes; // relevant only for enterprise edition + uint32_t n_replicas; + cf_node replicas[AS_CLUSTER_SZ]; + + // Rebalance & migration related: + + as_partition_version final_version; + as_partition_version version; + int pending_emigrations; + int pending_immigrations; + bool immigrators[AS_CLUSTER_SZ]; + + cf_node working_master; + + uint32_t n_dupl; + cf_node dupls[AS_CLUSTER_SZ]; + + uint32_t n_witnesses; + cf_node witnesses[AS_CLUSTER_SZ]; + + bool must_appeal; // relevant only for enterprise edition + + uint32_t regime; // relevant only for enterprise edition +} as_partition; + +typedef struct as_partition_reservation_s { + struct as_namespace_s* ns; + as_partition* p; + struct as_index_tree_s* tree; + uint32_t regime; + uint32_t n_dupl; + cf_node dupl_nodes[AS_CLUSTER_SZ]; +} as_partition_reservation; + +typedef struct repl_stats_s { + uint64_t n_master_objects; + uint64_t n_prole_objects; + uint64_t n_non_replica_objects; + uint64_t n_master_tombstones; + uint64_t 
n_prole_tombstones; + uint64_t n_non_replica_tombstones; +} repl_stats; + +#define CLIENT_BITMAP_BYTES ((AS_PARTITIONS + 7) / 8) +#define CLIENT_B64MAP_BYTES (((CLIENT_BITMAP_BYTES + 2) / 3) * 4) + +typedef struct client_replica_map_s { + pthread_mutex_t write_lock; + + volatile uint8_t bitmap[CLIENT_BITMAP_BYTES]; + volatile char b64map[CLIENT_B64MAP_BYTES]; +} client_replica_map; + +typedef enum { + AS_MIGRATE_OK, + AS_MIGRATE_FAIL, + AS_MIGRATE_AGAIN +} as_migrate_result; + + +//========================================================== +// Public API. +// + +void as_partition_init(struct as_namespace_s* ns, uint32_t pid); +void as_partition_shutdown(struct as_namespace_s* ns, uint32_t pid); + +void as_partition_isolate_version(const struct as_namespace_s* ns, as_partition* p); +int as_partition_check_source(const struct as_namespace_s* ns, as_partition* p, cf_node src, bool* from_replica); +void as_partition_freeze(as_partition* p); + +uint32_t as_partition_get_other_replicas(as_partition* p, cf_node* nv); + +cf_node as_partition_writable_node(struct as_namespace_s* ns, uint32_t pid); +cf_node as_partition_proxyee_redirect(struct as_namespace_s* ns, uint32_t pid); + +void as_partition_get_replicas_prole_str(cf_dyn_buf* db); // deprecate in "six months" +void as_partition_get_replicas_master_str(cf_dyn_buf* db); +void as_partition_get_replicas_all_str(cf_dyn_buf* db, bool include_regime); + +void as_partition_get_replica_stats(struct as_namespace_s* ns, repl_stats* p_stats); + +void as_partition_reserve(struct as_namespace_s* ns, uint32_t pid, as_partition_reservation* rsv); +int as_partition_reserve_timeout(struct as_namespace_s* ns, uint32_t pid, as_partition_reservation* rsv, int timeout_ms); +int as_partition_reserve_replica(struct as_namespace_s* ns, uint32_t pid, as_partition_reservation* rsv); +int as_partition_reserve_write(struct as_namespace_s* ns, uint32_t pid, as_partition_reservation* rsv, cf_node* node); +int as_partition_reserve_read(struct 
as_namespace_s* ns, uint32_t pid, as_partition_reservation* rsv, bool would_dup_res, cf_node* node); +int as_partition_prereserve_query(struct as_namespace_s* ns, bool can_partition_query[], as_partition_reservation rsv[]); +int as_partition_reserve_query(struct as_namespace_s* ns, uint32_t pid, as_partition_reservation* rsv); +int as_partition_reserve_xdr_read(struct as_namespace_s* ns, uint32_t pid, as_partition_reservation* rsv); +void as_partition_reservation_copy(as_partition_reservation* dst, as_partition_reservation* src); + +void as_partition_release(as_partition_reservation* rsv); + +void as_partition_getinfo_str(cf_dyn_buf* db); + +// Use VERSION_AS_STRING() - see below. +static inline as_partition_version_string +as_partition_version_as_string(const as_partition_version* version) +{ + as_partition_version_string str; + + if (version->family == VERSION_FAMILY_UNIQUE) { + sprintf(str.s, "%012lx.U.%c%c%c", (uint64_t)version->ckey, + version->master == 0 ? '-' : 'm', + version->subset == 0 ? 'p' : 's', + version->evade == 0 ? '-' : 'e'); + } + else { + sprintf(str.s, "%012lx.%X.%c%c%c", (uint64_t)version->ckey, + (uint32_t)version->family, + version->master == 0 ? '-' : 'm', + version->subset == 0 ? 'p' : 's', + version->evade == 0 ? + (version->revived == 0 ? '-' : 'r') : 'e'); + } + + return str; +} + +static inline bool +as_partition_version_is_null(const as_partition_version* version) +{ + return *(uint64_t*)version == 0; +} + +static inline bool +as_partition_version_has_data(const as_partition_version* version) +{ + return version->ckey != 0; +} + +static inline bool +as_partition_version_same(const as_partition_version* v1, const as_partition_version* v2) +{ + return v1->ckey == v2->ckey && + v1->family == v2->family && + // Note - master flag not included in definition of "same". + v1->subset == v2->subset && + // Note - could probably exclude these too... 
+ v1->evade == v2->evade && + v1->revived == v2->revived; +} + +static inline uint32_t +as_partition_getid(const cf_digest* d) +{ + return *(uint32_t*)d & AS_PARTITION_MASK; +} + +static inline int +find_self_in_replicas(const as_partition* p) +{ + return index_of_node(p->replicas, p->n_replicas, g_config.self_node); +} + +static inline bool +is_self_replica(const as_partition* p) +{ + return contains_node(p->replicas, p->n_replicas, g_config.self_node); +} + +static inline bool +contains_self(const cf_node* nodes, uint32_t n_nodes) +{ + return contains_node(nodes, n_nodes, g_config.self_node); +} + +#define AS_PARTITION_ID_UNDEF ((uint16_t)0xFFFF) + +#define AS_PARTITION_RESERVATION_INIT(__rsv) \ + __rsv.ns = NULL; \ + __rsv.p = NULL; \ + __rsv.tree = NULL; \ + __rsv.regime = 0; \ + __rsv.n_dupl = 0; + +#define VERSION_AS_STRING(v_ptr) (as_partition_version_as_string(v_ptr).s) + + +//========================================================== +// Public API - client view replica maps. +// + +void client_replica_maps_create(struct as_namespace_s* ns); +void client_replica_maps_clear(struct as_namespace_s* ns); +bool client_replica_maps_update(struct as_namespace_s* ns, uint32_t pid); +bool client_replica_maps_is_partition_queryable(const struct as_namespace_s* ns, uint32_t pid); + + +//========================================================== +// Private API - for enterprise separation only. +// + +bool partition_reserve_promote(const struct as_namespace_s* ns, const as_partition* p, bool would_dup_res); diff --git a/as/include/fabric/partition_balance.h b/as/include/fabric/partition_balance.h new file mode 100644 index 00000000..e01fa76d --- /dev/null +++ b/as/include/fabric/partition_balance.h @@ -0,0 +1,197 @@ +/* + * partition_balance.h + * + * Copyright (C) 2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. 
+ * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +//========================================================== +// Includes. +// + +#include +#include + +#include "citrusleaf/cf_atomic.h" +#include "citrusleaf/cf_queue.h" + +#include "node.h" + +#include "fabric/hb.h" +#include "fabric/partition.h" + + +//========================================================== +// Forward declarations. +// + +struct as_namespace_s; + + +//========================================================== +// Typedefs & constants. +// + +typedef enum { + PB_TASK_EMIG_TRANSFER, + PB_TASK_EMIG_SIGNAL_ALL_DONE, + PB_TASK_APPEAL +} pb_task_type; + +typedef struct pb_task_s { + cf_node dest; + struct as_namespace_s* ns; + uint32_t pid; + uint64_t cluster_key; + pb_task_type type; + uint32_t tx_flags; +} pb_task; + +#define MAX_RACK_ID 1000000 +#define MAX_RACK_ID_LEN 7 // number of decimal characters + + +//========================================================== +// Public API - regulate migrations. +// + +void as_partition_balance_disallow_migrations(); +bool as_partition_balance_are_migrations_allowed(); +void as_partition_balance_synchronize_migrations(); +void as_partition_balance_emigration_yield(); + + +//========================================================== +// Public API - balance partitions. 
+// + +void as_partition_balance_init(); +bool as_partition_balance_is_init_resolved(); +void as_partition_balance_revert_to_orphan(); +void as_partition_balance(); + +uint64_t as_partition_balance_remaining_migrations(); +bool as_partition_balance_revive(struct as_namespace_s* ns); + + +//========================================================== +// Public API - migration-related as_partition methods. +// + +bool as_partition_pending_migrations(as_partition* p); + +bool as_partition_pre_emigrate_done(struct as_namespace_s* ns, uint32_t pid, uint64_t orig_cluster_key, uint32_t tx_flags); +void as_partition_emigrate_done(struct as_namespace_s* ns, uint32_t pid, uint64_t orig_cluster_key, uint32_t tx_flags); +as_migrate_result as_partition_immigrate_start(struct as_namespace_s* ns, uint32_t pid, uint64_t orig_cluster_key, cf_node source_node); +as_migrate_result as_partition_immigrate_done(struct as_namespace_s* ns, uint32_t pid, uint64_t orig_cluster_key, cf_node source_node); +as_migrate_result as_partition_migrations_all_done(struct as_namespace_s* ns, uint32_t pid, uint64_t orig_cluster_key); + +// Counter that tells clients partition ownership has changed. +extern cf_atomic32 g_partition_generation; + + +//========================================================== +// Private API - for enterprise separation only. +// + +//------------------------------------------------ +// Typedefs & constants. +// + +COMPILER_ASSERT((AS_CLUSTER_SZ & (AS_CLUSTER_SZ - 1)) == 0); + +#define AS_CLUSTER_SZ_MASKP (-(uint64_t)AS_CLUSTER_SZ) +#define AS_CLUSTER_SZ_MASKN ((uint64_t)AS_CLUSTER_SZ - 1) + +typedef uint8_t sl_ix_t; + +COMPILER_ASSERT(AS_CLUSTER_SZ_MASKN >> (sizeof(sl_ix_t) * 8) == 0); + +typedef struct inter_hash_s { + uint64_t hashed_node; + uint64_t hashed_pid; +} inter_hash; + +extern const as_partition_version ZERO_VERSION; + + +//------------------------------------------------ +// Globals. 
+// + +extern volatile int g_allow_migrations; + +extern uint64_t g_hashed_pids[AS_PARTITIONS]; + +// Shortcuts to values set by as_exchange, for use in partition balance only. +extern uint32_t g_cluster_size; +extern cf_node* g_succession; + +extern cf_node g_full_node_seq_table[AS_CLUSTER_SZ * AS_PARTITIONS]; +extern sl_ix_t g_full_sl_ix_table[AS_CLUSTER_SZ * AS_PARTITIONS]; + + +//------------------------------------------------ +// Forward declarations. +// + +void partition_balance_init(); + +void pb_task_init(pb_task* task, cf_node dest, struct as_namespace_s* ns, uint32_t pid, uint64_t cluster_key, pb_task_type type, uint32_t tx_flags); +void drop_trees(as_partition* p, struct as_namespace_s* ns); + +void balance_namespace(struct as_namespace_s* ns, cf_queue* mq); +void prepare_for_appeals(); +void process_pb_tasks(cf_queue* tq); +void balance_namespace_ap(struct as_namespace_s* ns, cf_queue* mq); +void fill_translation(int translation[], const struct as_namespace_s* ns); +void fill_namespace_rows(const cf_node* full_node_seq, const sl_ix_t* full_sl_ix, cf_node* ns_node_seq, sl_ix_t* ns_sl_ix, const struct as_namespace_s* ns, const int translation[]); +void rack_aware_adjust_row(cf_node* ns_node_seq, sl_ix_t* ns_sl_ix, uint32_t replication_factor, const uint32_t* rack_ids, uint32_t n_ids, uint32_t n_racks, uint32_t start_n); +uint32_t find_self(const cf_node* ns_node_seq, const struct as_namespace_s* ns); +uint32_t fill_immigrators(as_partition* p, const sl_ix_t* ns_sl_ix, struct as_namespace_s* ns, uint32_t working_master_n, uint32_t n_dupl); +void queue_namespace_migrations(as_partition* p, struct as_namespace_s* ns, uint32_t self_n, cf_node working_master, uint32_t n_dupl, cf_node dupls[], cf_queue* mq); +void fill_witnesses(as_partition* p, const cf_node* ns_node_seq, const sl_ix_t* ns_sl_ix, struct as_namespace_s* ns); + +void emigrate_done_advance_non_master_version(struct as_namespace_s* ns, as_partition* p, uint32_t tx_flags); +void 
emigrate_done_advance_non_master_version_ap(struct as_namespace_s* ns, as_partition* p, uint32_t tx_flags); +void immigrate_start_advance_non_master_version(struct as_namespace_s* ns, as_partition* p); +void immigrate_start_advance_non_master_version_ap(as_partition* p); +void immigrate_done_advance_final_master_version(struct as_namespace_s* ns, as_partition* p); +void immigrate_done_advance_final_master_version_ap(struct as_namespace_s* ns, as_partition* p); +bool immigrate_yield(); + + +//------------------------------------------------ +// Inlines and macros. +// + +static inline bool +is_family_same(const as_partition_version* v1, const as_partition_version* v2) +{ + return v1->ckey == v2->ckey && v1->family == v2->family && + v1->family != VERSION_FAMILY_UNIQUE; +} + +// Define macros for accessing the full node-seq and sl-ix arrays. +#define FULL_NODE_SEQ(x, y) g_full_node_seq_table[(x * g_cluster_size) + y] +#define FULL_SL_IX(x, y) g_full_sl_ix_table[(x * g_cluster_size) + y] + +// Get the partition version that was input by exchange. +#define INPUT_VERSION(_n) (&ns->cluster_versions[ns_sl_ix[_n]][p->id]) diff --git a/as/include/fabric/roster.h b/as/include/fabric/roster.h new file mode 100644 index 00000000..4d0d11f1 --- /dev/null +++ b/as/include/fabric/roster.h @@ -0,0 +1,52 @@ +/* + * roster.h + * + * Copyright (C) 2017 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. 
See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +//========================================================== +// Includes. +// + +#include + +#include "node.h" + +#include "fabric/partition_balance.h" + + +//========================================================== +// Public API. +// + +void as_roster_init_smd(); +bool as_roster_set_nodes_cmd(const char* ns_name, const char* nodes); + + +//========================================================== +// Inlines and macros. +// + +// Format is: node-id, ROSTER_ID_PAIR_SEPARATOR, rack-id, ',' +#define ROSTER_STRING_ELE_LEN ((sizeof(cf_node) * 2) + 1 + MAX_RACK_ID_LEN + 1) + +// In string lists, separate node-id and rack-id with this character. +#define ROSTER_ID_PAIR_SEPARATOR '@' diff --git a/as/include/fabric/skew_monitor.h b/as/include/fabric/skew_monitor.h new file mode 100644 index 00000000..194ac7a5 --- /dev/null +++ b/as/include/fabric/skew_monitor.h @@ -0,0 +1,67 @@ +/* + * skew_monitor.h + * + * Copyright (C) 2008-2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program.
If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +#include +#include + +#include "citrusleaf/cf_vector.h" + +#include "dynbuf.h" + +/** + * Initialize skew monitor. + */ +void +as_skew_monitor_init(); + +/** + * Return the current estimate of the clock skew in the cluster. + */ +uint64_t +as_skew_monitor_skew(); + +/** + * Return the currently estimated outliers from our cluster. + * Outliers should have space to hold at least AS_CLUSTER_SZ nodes. + */ +uint32_t +as_skew_monitor_outliers(cf_vector* outliers); + +/** + * Print skew outliers to a dynamic buffer. + */ +uint32_t +as_skew_monitor_outliers_append(cf_dyn_buf* db); + +/** + * Print skew monitor info to a dynamic buffer. + */ +void +as_skew_monitor_info(cf_dyn_buf* db); + +/** + * Dump some debugging information to the logs. + */ +void +as_skew_monitor_dump(); diff --git a/as/include/geospatial/geojson.h b/as/include/geospatial/geojson.h new file mode 100644 index 00000000..69eee955 --- /dev/null +++ b/as/include/geospatial/geojson.h @@ -0,0 +1,56 @@ +/* + * Copyright 2015 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more + * contributor license agreements. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you + * may not use this file except in compliance with the License. You + * may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. See the License for the specific language governing + * permissions and limitations under the License. 
+ */ + +#ifndef __geojson_h +#define __geojson_h 1 + +#include + +#include + +#include +#include + +namespace GeoJSON { + +class GeometryHandler +{ +public: + virtual ~GeometryHandler() {} + + virtual void handle_point(S2CellId const & cellid); + + virtual bool handle_region(S2Region * regionp); + + virtual double earth_radius_meters() { + return 6371000.0; // Wikipedia, mean radius. + } + + void set_json(json_t * i_jsonp) { m_jsonp = i_jsonp; } + + json_t * get_json() { return m_jsonp; } + +private: + json_t * m_jsonp; +}; + +void parse(GeometryHandler & geohand, std::string const & geostr); + +} // end namespace GeoJSON + +#endif // __geojson_h diff --git a/as/include/geospatial/geospatial.h b/as/include/geospatial/geospatial.h new file mode 100644 index 00000000..dde168d3 --- /dev/null +++ b/as/include/geospatial/geospatial.h @@ -0,0 +1,61 @@ +/* + * geospatial.h + * + * Copyright (C) 2015 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. 
If not, see http://www.gnu.org/licenses/
+ */
+
+#pragma once
+
+#include
+#include
+#include
+
+#include "base/datamodel.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern bool geo_parse(as_namespace * ns,
+		const char * buf,
+		size_t bufsz,
+		uint64_t * cellidp,
+		geo_region_t * regionp);
+
+extern bool geo_region_cover(as_namespace * ns,
+		geo_region_t region,
+		int maxnumcells,
+		uint64_t * cellctrp,
+		uint64_t * cellminp,
+		uint64_t * cellmaxp,
+		int * numcellsp);
+
+extern bool geo_point_centers(as_namespace * ns,
+		uint64_t cellidval,
+		int maxnumcenters,
+		uint64_t * center,
+		int * numcentersp);
+
+extern bool geo_point_within(uint64_t cellidval, geo_region_t region);
+
+extern void geo_region_destroy(geo_region_t region);
+
+#ifdef __cplusplus
+} // end extern "C"
+#endif
diff --git a/as/include/geospatial/scoped.h b/as/include/geospatial/scoped.h
new file mode 100644
index 00000000..de857dda
--- /dev/null
+++ b/as/include/geospatial/scoped.h
@@ -0,0 +1,107 @@
+/*
+ * Copyright 2015 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more
+ * contributor license agreements.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you
+ * may not use this file except in compliance with the License. You
+ * may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ */
+
+#ifndef scoped_h__
+#define scoped_h__
+
+/// Holds a value of type T and calls a deletion function on it at destruction, unless it equals the nil value.
+template <typename T>
+class Scoped
+{
+public:
+	/// A deletion function.
+	typedef void (*Del)(T p);
+
+	/// Default constructor.
+	///
+	/// Note - the deletion function will not be called on the nil
+	/// value.
+	///
+	/// @param[in] i_nil Nil value.
+	/// @param[in] i_del Deletion function.
+	///
+	Scoped(T const & i_nil, Del i_del)
+		: m_val(i_nil)
+		, m_nil(i_nil)
+		, m_del(i_del)
+	{}
+
+	/// Constructor from value.
+	///
+	/// Note - the deletion function will not be called on the nil
+	/// value.
+	///
+	/// @param[in] i_val The value to assign.
+	/// @param[in] i_nil Nil value.
+	/// @param[in] i_del Deletion function.
+	///
+	Scoped(T const & i_val, T const & i_nil, Del i_del)
+		: m_val(i_val)
+		, m_nil(i_nil)
+		, m_del(i_del)
+	{}
+
+	/// Destructor, calls deletion function on non-nil values.
+	///
+	~Scoped()
+	{
+		if (m_val != m_nil)
+			m_del(m_val);
+	}
+
+	/// Assignment operator.
+	///
+	/// Calls deletion on the existing non-nil value and assigns the new
+	/// value. Re-assigning the value already held deletes nothing.
+	/// NOTE(review): no copy operations are declared, so a Scoped-to-Scoped copy makes both objects delete the same value.
+	/// @param[in] i_val The right-hand-side is the new value.
+	///
+	inline Scoped & operator=(T const & i_val)
+	{
+		// Delete any pre-existing value, unless that same value is being re-assigned.
+		if (m_val != m_nil && m_val != i_val)
+			m_del(m_val);
+
+		m_val = i_val;
+		return *this;
+	}
+
+	/// Pointer dereference.
+	///
+	inline T const operator->() const { return m_val; }
+
+	/// Reference.
+	///
+	inline operator T&() { return m_val; }
+
+	/// Takes value, will not be deleted.
+	///
+	T const take()
+	{
+		T tmp = m_val;
+		m_val = m_nil;
+		return tmp;
+	}
+
+private:
+	T m_val;
+	T m_nil;
+	Del m_del;
+};
+
+#endif // scoped_h__
diff --git a/as/include/geospatial/throwstream.h b/as/include/geospatial/throwstream.h
new file mode 100644
index 00000000..e5548595
--- /dev/null
+++ b/as/include/geospatial/throwstream.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright 2015 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more
+ * contributor license agreements.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you
+ * may not use this file except in compliance with the License.
You
+ * may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ */
+
+#ifndef __throwstream_h
+#define __throwstream_h 1
+
+#include
+#include
+
+// throwstream(__except, __msg) streams __msg (which may itself be a chain of << insertions) into a local
+// std::ostringstream, then throws __except constructed from the resulting C string.
+// The do { } while (false) wrapper makes the expansion a single statement, so it is safe in unbraced if/else.
+#define throwstream(__except, __msg) \
+	do { \
+		std::ostringstream __ostrm; \
+		__ostrm << __msg; \
+		throw __except(__ostrm.str().c_str()); \
+	} while (false)
+
+#endif // __throwstream_h
diff --git a/as/include/storage/drv_ssd.h b/as/include/storage/drv_ssd.h
new file mode 100644
index 00000000..fe450811
--- /dev/null
+++ b/as/include/storage/drv_ssd.h
@@ -0,0 +1,463 @@
+/*
+ * drv_ssd.h
+ *
+ * Copyright (C) 2014 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+/*
+ * Common header for drv_ssd.c, drv_ssd_cold.c, drv_ssd_warm.c.
+ */
+
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+
+#include "citrusleaf/cf_atomic.h"
+#include "citrusleaf/cf_queue.h"
+
+#include "cf_mutex.h"
+#include "hist.h"
+
+#include "base/datamodel.h"
+#include "fabric/partition.h"
+
+
+//==========================================================
+// Forward declarations.
+//
+
+struct as_index_s;
+struct as_namespace_s;
+struct as_rec_props_s;
+struct as_storage_rd_s;
+struct drv_ssd_s;
+
+
+//==========================================================
+// Typedefs & constants.
+//
+
+// Linux has removed O_DIRECT, but not its functionality.
+#ifndef O_DIRECT
+#define O_DIRECT 00040000 // octal fallback, used only when the included headers do not define O_DIRECT
+#endif
+
+#define SSD_HEADER_MAGIC (0x4349747275730707L) // high six bytes are ASCII "CItrus", followed by 0x07 0x07
+#define SSD_VERSION 2
+// Must update conversion code when bumping version.
+//
+// SSD_VERSION history:
+// 1 - original
+// 2 - minimum storage increment (RBLOCK_SIZE) from 512 to 128 bytes
+
+// Device header flags - single-bit values (NOTE(review): presumably stored in ssd_device_header.flags - confirm).
+#define SSD_HEADER_FLAG_ENCRYPTED 0x01
+#define SSD_HEADER_FLAG_CP 0x02
+#define SSD_HEADER_FLAG_TRUSTED 0x04
+
+#define MAX_SSD_THREADS 20 // sizes drv_ssd.write_worker_thread[]
+
+
+//------------------------------------------------
+// Device header. Packed (no padding between fields); info_data[] is a flexible array member.
+// NOTE(review): info_n and info_stride presumably describe the slices held in info_data[] - confirm.
+typedef struct {
+	uint64_t magic; // shows we've got the right stuff
+	uint64_t random; // a random value - good for telling all disks are of the same state
+	uint32_t write_block_size;
+	uint32_t last_evict_void_time;
+	uint8_t version; // NOTE(review): presumably checked against SSD_VERSION - confirm
+	uint8_t flags; // NOTE(review): presumably SSD_HEADER_FLAG_* bits - confirm
+	uint16_t devices_n; // number of devices
+	uint32_t header_length;
+	char namespace[32]; // ascii representation of the namespace name, null-terminated
+	uint32_t info_n; // number of info slices (should be > a reasonable partition count)
+	uint32_t info_stride; // currently 128 bytes
+	uint8_t info_data[];
+} __attribute__((__packed__)) ssd_device_header;
+
+
+//------------------------------------------------
+// A defragged wblock waiting to be freed.
+// Holds a (file_id, wblock_id) pair.
+typedef struct vacated_wblock_s {
+	uint32_t file_id;
+	uint32_t wblock_id;
+} vacated_wblock;
+
+
+//------------------------------------------------
+// Write buffer - where records accumulate until
+// (the full buffer is) flushed to a device.
+// Abbreviated "swb" in the comments and field names of this header.
+typedef struct {
+	cf_atomic32 rc; // NOTE(review): presumably a reference count - confirm
+	cf_atomic32 n_writers; // number of concurrent writers
+	bool skip_post_write_q; // NOTE(review): presumably keeps this swb off drv_ssd.post_write_q - confirm
+	uint32_t n_vacated; // NOTE(review): presumably entries in use in vacated_wblocks - confirm
+	uint32_t vacated_capacity; // NOTE(review): presumably entries allocated in vacated_wblocks - confirm
+	vacated_wblock *vacated_wblocks;
+	struct drv_ssd_s *ssd;
+	uint32_t wblock_id;
+	uint32_t pos; // NOTE(review): presumably the current fill offset within buf - confirm
+	uint8_t *buf;
+} ssd_write_buf;
+
+
+//------------------------------------------------
+// Per-wblock information.
+// One element per wblock, stored in ssd_alloc_table.wblock_state[].
+typedef struct ssd_wblock_state_s {
+	cf_atomic32 inuse_sz; // number of bytes currently used in the wblock
+	cf_mutex LOCK; // transactions, write_worker, and defrag all are interested in wblock_state
+	ssd_write_buf *swb; // pending writes for the wblock, also treated as a cache for reads
+	uint32_t state; // for now just a defrag flag
+	cf_atomic32 n_vac_dests; // number of wblocks into which this wblock defragged
+} ssd_wblock_state;
+
+// wblock state - values for ssd_wblock_state.state.
+//
+// Ultimately this may become a full-blown state, but for now it's effectively
+// just a defrag flag.
+#define WBLOCK_STATE_NONE 0
+#define WBLOCK_STATE_DEFRAG 1
+
+
+//------------------------------------------------
+// Per-device information about its wblocks.
+// wblock_state[] is a flexible array member.
+typedef struct ssd_alloc_table_s {
+	uint32_t n_wblocks; // number allocated below
+	ssd_wblock_state wblock_state[];
+} ssd_alloc_table;
+
+
+//------------------------------------------------
+// Where on free_wblock_q freed wblocks go.
+// (free_wblock_q is a member of drv_ssd.)
+typedef enum {
+	FREE_TO_HEAD,
+	FREE_TO_TAIL
+} e_free_to;
+
+
+//------------------------------------------------
+// Per-device information.
+// One drv_ssd per device; drv_ssds (below) holds them in its ssds[] array.
+typedef struct drv_ssd_s
+{
+	struct as_namespace_s *ns;
+
+	char *name; // this device's name
+	char *shadow_name; // this device's shadow's name, if any
+
+	uint32_t running; // NOTE(review): presumably a run/stop flag for this device's threads - confirm
+
+	pthread_mutex_t write_lock; // lock protects writes to current swb
+	ssd_write_buf *current_swb; // swb currently being filled by writes
+
+	int commit_fd; // relevant for enterprise edition only
+	int shadow_commit_fd; // relevant for enterprise edition only
+
+	pthread_mutex_t defrag_lock; // lock protects writes to defrag swb
+	ssd_write_buf *defrag_swb; // swb currently being filled by defrag
+
+	cf_queue *fd_q; // queue of open fds
+	cf_queue *shadow_fd_q; // queue of open fds on shadow, if any
+
+	cf_queue *free_wblock_q; // IDs of free wblocks
+	cf_queue *defrag_wblock_q; // IDs of wblocks to defrag
+
+	cf_queue *swb_write_q; // pointers to swbs ready to write
+	cf_queue *swb_shadow_q; // pointers to swbs ready to write to shadow, if any
+	cf_queue *swb_free_q; // pointers to swbs free and waiting
+	cf_queue *post_write_q; // pointers to swbs that have been written but are cached
+
+	cf_atomic64 n_defrag_wblock_reads; // total number of wblocks added to the defrag_wblock_q
+	cf_atomic64 n_defrag_wblock_writes; // total number of swbs added to the swb_write_q by defrag
+	cf_atomic64 n_wblock_writes; // total number of swbs added to the swb_write_q by writes
+
+	volatile uint64_t n_tomb_raider_reads; // relevant for enterprise edition only
+
+	cf_atomic32 defrag_sweep; // defrag sweep flag
+
+	uint64_t file_size;
+	int file_id; // NOTE(review): presumably the value stored in vacated_wblock.file_id - confirm
+
+	uint32_t open_flag; // NOTE(review): presumably open(2) flags such as O_DIRECT - confirm
+	bool data_in_memory;
+	bool started_fresh; // relevant only for warm or cool restart
+
+	uint64_t io_min_size; // device IO operations are aligned and sized in multiples of this
+	uint64_t commit_min_size; // commit (write) operations are aligned and sized in multiples of this
+
+	cf_atomic64 inuse_size; // number of bytes in actual use on this device
+
+	uint32_t write_block_size; // number of bytes to write at a time
+
+	uint32_t sweep_wblock_id; // wblocks read at startup
+	uint64_t record_add_older_counter; // records not inserted due to better existing one
+	uint64_t record_add_expired_counter; // records not inserted due to expiration
+	uint64_t record_add_max_ttl_counter; // records not inserted due to max-ttl
+	uint64_t record_add_replace_counter; // records reinserted
+	uint64_t record_add_unique_counter; // records inserted
+
+	ssd_alloc_table *alloc_table; // per-wblock state for this device (see ssd_alloc_table)
+
+	pthread_t maintenance_thread;
+	pthread_t write_worker_thread[MAX_SSD_THREADS]; // room for MAX_SSD_THREADS write workers
+	pthread_t shadow_worker_thread;
+	pthread_t defrag_thread;
+
+	histogram *hist_read;
+	histogram *hist_large_block_read;
+	histogram *hist_write;
+	histogram *hist_shadow_write;
+	histogram *hist_fsync;
+} drv_ssd;
+
+
+//------------------------------------------------
+// Per-namespace storage information.
+// ssds[] is a flexible array member (NOTE(review): n_ssds presumably gives its length - confirm).
+typedef struct drv_ssds_s
+{
+	ssd_device_header *header;
+	struct as_namespace_s *ns;
+
+	// Not a great place for this - used only at startup to determine whether to
+	// load a record.
+	bool get_state_from_storage[AS_PARTITIONS];
+
+	int n_ssds;
+	drv_ssd ssds[];
+} drv_ssds;
+
+
+//==========================================================
+// Private API - for enterprise separation only
+//
+
+// SSD_HEADER_SIZE must be a power of 2 and >= MAX_WRITE_BLOCK_SIZE.
+// Do NOT change SSD_HEADER_SIZE!
+#define SSD_HEADER_SIZE (1024 * 1024) // 1 MiB
+
+// Artificial limit on write-block-size, in case we ever move to an
+// SSD_HEADER_SIZE that's too big to be a write-block size limit.
+// MAX_WRITE_BLOCK_SIZE must be power of 2 and <= SSD_HEADER_SIZE.
+#define MAX_WRITE_BLOCK_SIZE (1024 * 1024) // 1 MiB, currently equal to SSD_HEADER_SIZE
+
+// Artificial limit on write-block-size, must be power of 2 and >= RBLOCK_SIZE.
+#define MIN_WRITE_BLOCK_SIZE (1024 * 1) // 1 KiB
+
+#define SSD_BLOCK_MAGIC 0x037AF200 // NOTE(review): presumably the expected drv_ssd_block.magic value - confirm
+#define LENGTH_BASE offsetof(struct drv_ssd_block_s, keyd) // 16 in the packed layout: sig (8) + magic (4) + length (4)
+
+#define SSD_HEADER_INFO_STRIDE 128 // matches the "currently 128 bytes" note on ssd_device_header.info_stride
+
+typedef struct ssd_load_records_info_s { // NOTE(review): presumably the udata handed to run_ssd_cool_start() - confirm
+	drv_ssds *ssds;
+	drv_ssd *ssd;
+	cf_queue *complete_q;
+	void *complete_udata;
+	void *complete_rc;
+} ssd_load_records_info;
+
+// Per-record metadata on device. Packed; data[] is a flexible array member that follows the fixed fields.
+typedef struct drv_ssd_block_s {
+	uint64_t sig; // deprecated
+	uint32_t magic;
+	uint32_t length; // total after this field - this struct's pointer + 16
+	cf_digest keyd;
+	uint32_t generation;
+	cf_clock void_time;
+	uint32_t bins_offset; // offset to bins from data
+	uint32_t n_bins;
+	uint64_t last_update_time;
+	uint8_t data[];
+} __attribute__ ((__packed__)) drv_ssd_block;
+
+// Per-bin metadata on device. Packed.
+typedef struct drv_ssd_bin_s {
+	char name[AS_ID_BIN_SZ]; // 15 aligns well
+	uint8_t version; // now unused
+	uint32_t offset; // offset of bin data within block
+	uint32_t len; // size of bin data
+	uint32_t next; // location of next bin: block offset
+} __attribute__ ((__packed__)) drv_ssd_bin;
+
+// Info slice in device header block. Packed.
+typedef struct info_buf_s {
+	uint32_t regime; // used to be len, but was never read
+	as_partition_version version;
+} __attribute__ ((__packed__)) info_buf;
+
+// Warm and cool restart.
+void ssd_resume_devices(drv_ssds *ssds);
+void *run_ssd_cool_start(void *udata);
+void ssd_load_wblock_queues(drv_ssds *ssds);
+void ssd_start_maintenance_threads(drv_ssds *ssds);
+void ssd_start_write_worker_threads(drv_ssds *ssds);
+void ssd_start_defrag_threads(drv_ssds *ssds);
+bool is_valid_record(const drv_ssd_block *block, const char *ns_name);
+void apply_rec_props(struct as_index_s *r, struct as_namespace_s *ns, const struct as_rec_props_s *p_props);
+
+// Tomb raider.
+void ssd_cold_start_adjust_cenotaph(struct as_namespace_s *ns, const drv_ssd_block *block, struct as_index_s *r); +void ssd_cold_start_transition_record(struct as_namespace_s *ns, const drv_ssd_block *block, struct as_index_s *r, bool is_create); +void ssd_cold_start_drop_cenotaphs(struct as_namespace_s *ns); + +// Record encryption. +void ssd_init_encryption_key(struct as_namespace_s *ns); +void ssd_do_encrypt(const uint8_t *key, uint64_t off, drv_ssd_block *block); +void ssd_do_decrypt(const uint8_t *key, uint64_t off, drv_ssd_block *block); + +// CP. +void ssd_adjust_versions(struct as_namespace_s *ns, ssd_device_header *header); +conflict_resolution_pol ssd_cold_start_policy(struct as_namespace_s *ns); +void ssd_cold_start_init_repl_state(struct as_namespace_s *ns, struct as_index_s* r); + +// Miscellaneous. +void ssd_header_init_cfg(const struct as_namespace_s *ns, ssd_device_header *header); +bool ssd_header_is_valid_cfg(const struct as_namespace_s *ns, const ssd_device_header *header); +bool ssd_cold_start_is_valid_n_bins(uint32_t n_bins); +bool ssd_cold_start_is_record_truncated(struct as_namespace_s *ns, const drv_ssd_block *block, const struct as_rec_props_s *p_props); +void ssd_write_header(drv_ssd *ssd, ssd_device_header *header, off_t offset, size_t size); // TODO - change name! + +// Durability. +void ssd_init_commit(drv_ssd *ssd); +uint64_t ssd_flush_max_us(const struct as_namespace_s *ns); +void ssd_post_write(drv_ssd *ssd, ssd_write_buf *swb); +int ssd_write_bins(struct as_storage_rd_s *rd); +int ssd_buffer_bins(struct as_storage_rd_s *rd); +uint32_t ssd_record_size(struct as_storage_rd_s *rd); +ssd_write_buf *swb_get(drv_ssd *ssd); +void ssd_init_trusted(struct as_namespace_s *ns); +bool ssd_is_untrusted(struct as_namespace_s *ns, uint8_t header_flags); +void ssd_set_trusted(struct as_namespace_s *ns); + +// Called in (enterprise-split) storage table function. 
+int ssd_write(struct as_storage_rd_s *rd);
+
+
+//
+// Conversions between bytes and rblocks.
+//
+
+// TODO - make checks stricter (exclude drive header, consider drive size) ???
+#define STORAGE_RBLOCK_IS_VALID(__x) ((__x) != 0)
+#define STORAGE_RBLOCK_IS_INVALID(__x) ((__x) == 0)
+
+#define RBLOCK_SIZE 128 // 2^7
+#define LOG_2_RBLOCK_SIZE 7 // must be in sync with RBLOCK_SIZE
+
+// Smallest multiple of RBLOCK_SIZE that is >= bytes (32-bit arithmetic).
+static inline uint32_t BYTES_TO_RBLOCK_BYTES(uint32_t bytes) {
+	return (bytes + RBLOCK_SIZE - 1) & ~(uint32_t)(RBLOCK_SIZE - 1);
+}
+
+// Byte offset -> rblock_id. Also byte count -> rblock count, provided 'bytes'
+// is already a whole number of rblocks (any remainder is discarded).
+static inline uint64_t BYTES_TO_RBLOCKS(uint64_t bytes) {
+	return bytes / RBLOCK_SIZE;
+}
+
+// rblock_id -> byte offset, or rblock count -> byte count.
+static inline uint64_t RBLOCKS_TO_BYTES(uint64_t rblocks) {
+	return rblocks * RBLOCK_SIZE;
+}
+
+
+//
+// Conversions between bytes/rblocks and wblocks.
+//
+
+#define STORAGE_INVALID_WBLOCK 0xFFFFffff
+
+// Byte offset -> ID of the wblock containing it (truncated to 32 bits).
+static inline uint32_t BYTES_TO_WBLOCK_ID(drv_ssd *ssd, uint64_t bytes) {
+	return (uint32_t)(bytes / (uint64_t)ssd->write_block_size);
+}
+
+// wblock_id -> byte offset of that wblock's start, computed in 64 bits.
+static inline uint64_t WBLOCK_ID_TO_BYTES(drv_ssd *ssd, uint32_t wblock_id) {
+	return (uint64_t)ssd->write_block_size * (uint64_t)wblock_id;
+}
+
+// rblock_id -> ID of the wblock containing it, via the rblock's byte offset.
+static inline uint32_t RBLOCK_ID_TO_WBLOCK_ID(drv_ssd *ssd, uint64_t rblock_id) {
+	return BYTES_TO_WBLOCK_ID(ssd, RBLOCKS_TO_BYTES(rblock_id));
+}
+
+
+//
+// Size rounding needed for direct IO.
+//
+
+// Used when determining a device's io_min_size.
+#define LO_IO_MIN_SIZE 512
+#define HI_IO_MIN_SIZE 4096
+
+// Round bytes down to a multiple of device's minimum IO operation size.
+static inline uint64_t BYTES_DOWN_TO_IO_MIN(drv_ssd *ssd, uint64_t bytes) {
+	return bytes & ~(ssd->io_min_size - 1);
+}
+
+// Smallest multiple of the device's io_min_size that is >= bytes.
+static inline uint64_t BYTES_UP_TO_IO_MIN(drv_ssd *ssd, uint64_t bytes) {
+	return (bytes + ssd->io_min_size - 1) & ~(ssd->io_min_size - 1);
+}
+
+
+//
+// Device header parsing utilities.
+//
+
+static inline bool
+can_convert_storage_version(uint8_t version)
+{
+	// Only version 1 can be converted, and only while SSD_VERSION is still 2 -
+	// this guards against bumping SSD_VERSION without updating conversion code.
+	return SSD_VERSION == 2 && version == 1;
+}
+
+
+//
+// Record encryption - both wrappers do nothing unless ns->storage_encryption_key_file is set.
+//
+
+static inline void
+ssd_encrypt(drv_ssd *ssd, uint64_t off, drv_ssd_block *block)
+{
+	struct as_namespace_s *ns = ssd->ns;
+	if (ns->storage_encryption_key_file == NULL) return;
+	ssd_do_encrypt(ns->storage_encryption_key, off, block);
+}
+
+static inline void
+ssd_decrypt(drv_ssd *ssd, uint64_t off, drv_ssd_block *block)
+{
+	struct as_namespace_s *ns = ssd->ns;
+	if (ns->storage_encryption_key_file == NULL) return;
+	ssd_do_decrypt(ns->storage_encryption_key, off, block);
+}
diff --git a/as/include/storage/storage.h b/as/include/storage/storage.h
new file mode 100644
index 00000000..eedff374
--- /dev/null
+++ b/as/include/storage/storage.h
@@ -0,0 +1,183 @@
+/*
+ * storage.h
+ *
+ * Copyright (C) 2009-2015 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+#pragma once
+
+#include
+#include
+#include
+
+#include "citrusleaf/cf_digest.h"
+#include "citrusleaf/cf_queue.h"
+
+#include "base/rec_props.h"
+
+
+// Forward declarations.
+struct as_bin_s;
+struct as_index_s;
+struct as_partition_s;
+struct as_namespace_s;
+struct drv_ssd_s;
+struct drv_ssd_block_s;
+
+
+typedef enum {
+	AS_STORAGE_ENGINE_MEMORY = 0,
+	AS_STORAGE_ENGINE_SSD = 1,
+
+	AS_NUM_STORAGE_ENGINES // count of the engines above (evaluates to 2)
+} as_storage_type;
+
+typedef struct as_storage_rd_s { // working state for one record; see the create/open ... close "usage cycle" functions below
+	struct as_index_s *r;
+	struct as_namespace_s *ns;
+
+	as_rec_props rec_props;
+
+	struct as_bin_s *bins;
+	uint16_t n_bins; // NOTE(review): presumably the number of entries in bins - confirm
+
+	bool record_on_device;
+	bool ignore_record_on_device;
+
+	// Parameters used when handling key storage:
+	uint32_t key_size;
+	uint8_t *key;
+
+	bool is_durable_delete; // enterprise only
+
+	// Specific to storage type AS_STORAGE_ENGINE_SSD:
+	struct drv_ssd_block_s *block;
+	uint8_t *must_free_block; // NOTE(review): presumably a buffer to be freed when the rd is closed - confirm
+	struct drv_ssd_s *ssd;
+} as_storage_rd;
+
+
+//------------------------------------------------
+// Generic "base class" functions that call
+// through storage-engine "v-tables".
+//
+
+extern void as_storage_init();
+extern void as_storage_start_tomb_raider();
+extern int as_storage_namespace_destroy(struct as_namespace_s *ns);
+
+extern int as_storage_record_destroy(struct as_namespace_s *ns, struct as_index_s *r); // not the counterpart of as_storage_record_create()
+
+// Start and finish an as_storage_rd usage cycle.
+extern int as_storage_record_create(struct as_namespace_s *ns, struct as_index_s *r, as_storage_rd *rd);
+extern int as_storage_record_open(struct as_namespace_s *ns, struct as_index_s *r, as_storage_rd *rd);
+extern int as_storage_record_close(as_storage_rd *rd);
+
+// Called within as_storage_rd usage cycle.
+extern int as_storage_record_load_n_bins(as_storage_rd *rd); +extern int as_storage_record_load_bins(as_storage_rd *rd); +extern bool as_storage_record_size_and_check(as_storage_rd *rd); +extern int as_storage_record_write(as_storage_rd *rd); + +// Storage capacity monitoring. +extern void as_storage_wait_for_defrag(); +extern bool as_storage_overloaded(struct as_namespace_s *ns); // returns true if write queue is too backed up +extern bool as_storage_has_space(struct as_namespace_s *ns); +extern void as_storage_defrag_sweep(struct as_namespace_s *ns); + +// Storage of generic data into device headers. +extern void as_storage_info_set(struct as_namespace_s *ns, const struct as_partition_s *p, bool flush); +extern void as_storage_info_get(struct as_namespace_s *ns, struct as_partition_s *p); +extern int as_storage_info_flush(struct as_namespace_s *ns); +extern void as_storage_save_evict_void_time(struct as_namespace_s *ns, uint32_t evict_void_time); + +// Statistics. +extern int as_storage_stats(struct as_namespace_s *ns, int *available_pct, uint64_t *inuse_disk_bytes); // available percent is that of worst device +extern int as_storage_ticker_stats(struct as_namespace_s *ns); // prints SSD histograms to the info ticker +extern int as_storage_histogram_clear_all(struct as_namespace_s *ns); // clears all SSD histograms + + +//------------------------------------------------ +// Generic functions that don't use "v-tables". +// + +// Called within as_storage_rd usage cycle. 
+extern uint64_t as_storage_record_get_n_bytes_memory(as_storage_rd *rd); +extern void as_storage_record_adjust_mem_stats(as_storage_rd *rd, uint64_t start_bytes); +extern void as_storage_record_drop_from_mem_stats(as_storage_rd *rd); +extern bool as_storage_record_get_key(as_storage_rd *rd); +extern size_t as_storage_record_rec_props_size(as_storage_rd *rd); +extern void as_storage_record_set_rec_props(as_storage_rd *rd, uint8_t* rec_props_data); + +// Called only at shutdown to flush all device write-queues. +extern void as_storage_shutdown(); + + +//------------------------------------------------ +// AS_STORAGE_ENGINE_MEMORY functions. +// + +extern int as_storage_namespace_init_memory(struct as_namespace_s *ns, cf_queue *complete_q, void *udata); +extern void as_storage_start_tomb_raider_memory(struct as_namespace_s *ns); +extern int as_storage_namespace_destroy_memory(struct as_namespace_s *ns); + +extern int as_storage_record_write_memory(as_storage_rd *rd); + +extern void as_storage_info_get_memory(struct as_namespace_s *ns, struct as_partition_s *p); + +extern int as_storage_stats_memory(struct as_namespace_s *ns, int *available_pct, uint64_t *used_disk_bytes); + + +//------------------------------------------------ +// AS_STORAGE_ENGINE_SSD functions. 
+// + +extern int as_storage_namespace_init_ssd(struct as_namespace_s *ns, cf_queue *complete_q, void *udata); +extern void as_storage_start_tomb_raider_ssd(struct as_namespace_s *ns); +extern void as_storage_loading_records_ticker_ssd(); // called directly by as_storage_init() +extern int as_storage_namespace_destroy_ssd(struct as_namespace_s *ns); + +extern int as_storage_record_destroy_ssd(struct as_namespace_s *ns, struct as_index_s *r); + +extern int as_storage_record_create_ssd(as_storage_rd *rd); +extern int as_storage_record_open_ssd(as_storage_rd *rd); +extern int as_storage_record_close_ssd(as_storage_rd *rd); + +extern int as_storage_record_load_n_bins_ssd(as_storage_rd *rd); +extern int as_storage_record_load_bins_ssd(as_storage_rd *rd); +extern bool as_storage_record_size_and_check_ssd(as_storage_rd *rd); +extern int as_storage_record_write_ssd(as_storage_rd *rd); + +extern void as_storage_wait_for_defrag_ssd(struct as_namespace_s *ns); +extern bool as_storage_overloaded_ssd(struct as_namespace_s *ns); +extern bool as_storage_has_space_ssd(struct as_namespace_s *ns); +extern void as_storage_defrag_sweep_ssd(struct as_namespace_s *ns); + +extern void as_storage_info_set_ssd(struct as_namespace_s *ns, const struct as_partition_s *p, bool flush); +extern void as_storage_info_get_ssd(struct as_namespace_s *ns, struct as_partition_s *p); +extern int as_storage_info_flush_ssd(struct as_namespace_s *ns); +extern void as_storage_save_evict_void_time_ssd(struct as_namespace_s *ns, uint32_t evict_void_time); + +extern int as_storage_stats_ssd(struct as_namespace_s *ns, int *available_pct, uint64_t *used_disk_bytes); +extern int as_storage_ticker_stats_ssd(struct as_namespace_s *ns); +extern int as_storage_histogram_clear_ssd(struct as_namespace_s *ns); + +// Called by "base class" functions but not via table. 
+extern bool as_storage_record_get_key_ssd(as_storage_rd *rd); +extern void as_storage_shutdown_ssd(struct as_namespace_s *ns); diff --git a/as/include/transaction/delete.h b/as/include/transaction/delete.h new file mode 100644 index 00000000..97cdc9de --- /dev/null +++ b/as/include/transaction/delete.h @@ -0,0 +1,56 @@ +/* + * delete.h + * + * Copyright (C) 2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +//========================================================== +// Includes. +// + +#include + +#include "base/transaction.h" + + +//========================================================== +// Forward declarations. +// + +struct as_index_ref_s; +struct as_transaction_s; +struct rw_request_s; + + +//========================================================== +// Public API. +// + +transaction_status as_delete_start(struct as_transaction_s* tr); + + +//========================================================== +// Private API - for enterprise separation only. 
+// + +bool delete_storage_overloaded(struct as_transaction_s* tr); +transaction_status delete_master(struct as_transaction_s* tr, struct rw_request_s* rw); +transaction_status drop_master(struct as_transaction_s* tr, struct as_index_ref_s* r_ref, struct rw_request_s* rw); diff --git a/as/include/transaction/duplicate_resolve.h b/as/include/transaction/duplicate_resolve.h new file mode 100644 index 00000000..72fa98a5 --- /dev/null +++ b/as/include/transaction/duplicate_resolve.h @@ -0,0 +1,50 @@ +/* + * duplicate_resolve.h + * + * Copyright (C) 2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +//========================================================== +// Includes. +// + +#include "msg.h" +#include "node.h" + +#include "transaction/rw_request.h" + + +//========================================================== +// Forward declarations. +// + +struct as_transaction_s; +struct rw_request_s; + + +//========================================================== +// Public API. 
+// + +void dup_res_make_message(struct rw_request_s* rw, struct as_transaction_s* tr); +void dup_res_setup_rw(struct rw_request_s* rw, struct as_transaction_s* tr, dup_res_done_cb dup_res_cb, timeout_done_cb timeout_cb); +void dup_res_handle_request(cf_node node, msg* m); +void dup_res_handle_ack(cf_node node, msg* m); diff --git a/as/include/transaction/proxy.h b/as/include/transaction/proxy.h new file mode 100644 index 00000000..42291df8 --- /dev/null +++ b/as/include/transaction/proxy.h @@ -0,0 +1,60 @@ +/* + * proxy.h + * + * Copyright (C) 2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +//========================================================== +// Includes. +// + +#include + +#include "dynbuf.h" +#include "node.h" + + +//========================================================== +// Forward declarations. +// + +struct as_bin_s; +struct as_msg_op_s; +struct as_namespace_s; +struct as_transaction_s; + + +//========================================================== +// Public API. 
+// + +void as_proxy_init(); + +uint32_t as_proxy_hash_count(); + +void as_proxy_divert(cf_node dst, struct as_transaction_s* tr, struct as_namespace_s* ns); +void as_proxy_return_to_sender(const struct as_transaction_s* tr, struct as_namespace_s* ns); + +void as_proxy_send_response(cf_node dst, uint32_t proxy_tid, + uint32_t result_code, uint32_t generation, uint32_t void_time, + struct as_msg_op_s** ops, struct as_bin_s** bins, uint16_t bin_count, + struct as_namespace_s* ns, uint64_t trid); +void as_proxy_send_ops_response(cf_node dst, uint32_t proxy_tid, cf_dyn_buf* db); diff --git a/as/include/transaction/re_replicate.h b/as/include/transaction/re_replicate.h new file mode 100644 index 00000000..6adfef4e --- /dev/null +++ b/as/include/transaction/re_replicate.h @@ -0,0 +1,43 @@ +/* + * re_replicate.h + * + * Copyright (C) 2017 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +//========================================================== +// Includes. +// + +#include "base/transaction.h" + + +//========================================================== +// Forward declarations. +// + +struct as_transaction_s; + + +//========================================================== +// Public API. 
+// + +transaction_status as_re_replicate_start(struct as_transaction_s* tr); diff --git a/as/include/transaction/read.h b/as/include/transaction/read.h new file mode 100644 index 00000000..dabc8270 --- /dev/null +++ b/as/include/transaction/read.h @@ -0,0 +1,36 @@ +/* + * read.h + * + * Copyright (C) 2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +//========================================================== +// Includes. +// + +#include "base/transaction.h" + + +//========================================================== +// Public API. +// + +transaction_status as_read_start(as_transaction* tr); diff --git a/as/include/transaction/replica_ping.h b/as/include/transaction/replica_ping.h new file mode 100644 index 00000000..5d5e231e --- /dev/null +++ b/as/include/transaction/replica_ping.h @@ -0,0 +1,54 @@ +/* + * replica_ping.h + * + * Copyright (C) 2017 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. 
+ * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +//========================================================== +// Includes. +// + +#include + +#include "msg.h" +#include "node.h" + +#include "transaction/rw_request.h" + + +//========================================================== +// Forward declarations. +// + +struct as_transaction_s; +struct rw_request_s; + + +//========================================================== +// Public API. +// + +bool repl_ping_check(struct as_transaction_s* tr); +void repl_ping_make_message(struct rw_request_s* rw, struct as_transaction_s* tr); +void repl_ping_setup_rw(struct rw_request_s* rw, struct as_transaction_s* tr, repl_ping_done_cb repl_ping_cb, timeout_done_cb timeout_cb); +void repl_ping_reset_rw(struct rw_request_s* rw, struct as_transaction_s* tr, repl_ping_done_cb cb); +void repl_ping_handle_op(cf_node node, msg* m); +void repl_ping_handle_ack(cf_node node, msg* m); diff --git a/as/include/transaction/replica_write.h b/as/include/transaction/replica_write.h new file mode 100644 index 00000000..5af68fde --- /dev/null +++ b/as/include/transaction/replica_write.h @@ -0,0 +1,51 @@ +/* + * replica_write.h + * + * Copyright (C) 2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. 
+ * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +//========================================================== +// Includes. +// + +#include "msg.h" +#include "node.h" + +#include "transaction/rw_request.h" + + +//========================================================== +// Forward declarations. +// + +struct as_transaction_s; +struct rw_request_s; + + +//========================================================== +// Public API. +// + +void repl_write_make_message(struct rw_request_s* rw, struct as_transaction_s* tr); +void repl_write_setup_rw(struct rw_request_s* rw, struct as_transaction_s* tr, repl_write_done_cb repl_write_cb, timeout_done_cb timeout_cb); +void repl_write_reset_rw(struct rw_request_s* rw, struct as_transaction_s* tr, repl_write_done_cb cb); +void repl_write_handle_op(cf_node node, msg* m); +void repl_write_handle_ack(cf_node node, msg* m); diff --git a/as/include/transaction/rw_request.h b/as/include/transaction/rw_request.h new file mode 100644 index 00000000..69d9fb65 --- /dev/null +++ b/as/include/transaction/rw_request.h @@ -0,0 +1,209 @@ +/* + * rw_request.h + * + * Copyright (C) 2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. 
+ * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +//========================================================== +// Includes. +// + +#include +#include +#include +#include + +#include "citrusleaf/alloc.h" +#include "citrusleaf/cf_atomic.h" +#include "citrusleaf/cf_byte_order.h" +#include "citrusleaf/cf_digest.h" + +#include "dynbuf.h" +#include "msg.h" +#include "node.h" + +#include "base/proto.h" +#include "base/rec_props.h" +#include "base/transaction.h" +#include "fabric/hb.h" +#include "fabric/partition.h" + + +//========================================================== +// Forward declarations. +// + +struct as_batch_shared_s; +struct as_file_handle_s; +struct cl_msg_s; +struct iudf_origin_s; +struct rw_request_s; + + +//========================================================== +// Typedefs & constants. +// + +typedef bool (*dup_res_done_cb) (struct rw_request_s* rw); +typedef void (*repl_write_done_cb) (struct rw_request_s* rw); +typedef void (*repl_ping_done_cb) (struct rw_request_s* rw); +typedef void (*timeout_done_cb) (struct rw_request_s* rw); + +typedef struct rw_wait_ele_s { + as_transaction tr; // TODO - only needs to be transaction head + struct rw_wait_ele_s* next; +} rw_wait_ele; + + +typedef struct rw_request_s { + + //------------------------------------------------------ + // Matches as_transaction. 
+ // + + struct cl_msg_s* msgp; + uint32_t msg_fields; + + uint8_t origin; + uint8_t from_flags; + + union { + void* any; + struct as_file_handle_s* proto_fd_h; + cf_node proxy_node; + struct iudf_origin_s* iudf_orig; + struct as_batch_shared_s* batch_shared; + } from; + + union { + uint32_t any; + uint32_t batch_index; + uint32_t proxy_tid; + } from_data; + + cf_digest keyd; + + uint64_t start_time; + uint64_t benchmark_time; + + as_partition_reservation rsv; + + uint64_t end_time; + uint8_t result_code; + uint8_t flags; + uint16_t generation; + uint32_t void_time; + uint64_t last_update_time; + + // + // End of as_transaction look-alike. + //------------------------------------------------------ + + pthread_mutex_t lock; + + rw_wait_ele* wait_queue_head; + rw_wait_ele* wait_queue_tail; + uint32_t wait_queue_depth; + + bool is_set_up; // TODO - redundant with timeout_cb + + // Store pickled data, for use in replica write. + uint8_t* pickled_buf; + size_t pickled_sz; + as_rec_props pickled_rec_props; + + // Store ops' responses here. + cf_dyn_buf response_db; + + // Manage responses for duplicate resolution and replica write requests, or + // alternatively, timeouts. + uint32_t tid; + bool dup_res_complete; + bool repl_write_complete; + bool repl_ping_complete; + dup_res_done_cb dup_res_cb; + repl_write_done_cb repl_write_cb; + repl_ping_done_cb repl_ping_cb; + timeout_done_cb timeout_cb; + + // Message being sent to dest_nodes. May be duplicate resolution or replica + // write request. Message is kept in case it needs to be retransmitted. + msg* dest_msg; + + uint64_t xmit_ms; // time of next retransmit + uint32_t retry_interval_ms; // interval to add for next retransmit + + // Destination info for duplicate resolution and replica write requests. + uint32_t n_dest_nodes; + cf_node dest_nodes[AS_CLUSTER_SZ]; + bool dest_complete[AS_CLUSTER_SZ]; + + // Duplicate resolution response messages from nodes with duplicates. 
+ msg* best_dup_msg; + // TODO - could store best dup node-id - worth it? + uint8_t best_dup_result_code; + uint16_t best_dup_gen; + uint64_t best_dup_lut; + + bool tie_was_replicated; // enterprise only + +} rw_request; + + +//========================================================== +// Public API. +// + +rw_request* rw_request_create(); +void rw_request_destroy(rw_request* rw); +void rw_request_wait_q_push(rw_request* rw, as_transaction* tr); +void rw_request_wait_q_push_head(rw_request* rw, as_transaction* tr); + + +static inline void +rw_request_hdestroy(void* pv) +{ + rw_request_destroy((rw_request*)pv); +} + + +static inline void +rw_request_release(rw_request* rw) +{ + if (cf_rc_release(rw) == 0) { + rw_request_destroy(rw); + cf_rc_free(rw); + } +} + + +// See as_transaction_trid(). +static inline uint64_t +rw_request_trid(const rw_request* rw) +{ + if ((rw->msg_fields & AS_MSG_FIELD_BIT_TRID) == 0) { + return 0; + } + + as_msg_field *f = as_msg_field_get(&rw->msgp->msg, AS_MSG_FIELD_TYPE_TRID); + + return cf_swap_from_be64(*(uint64_t*)f->data); +} diff --git a/as/include/transaction/rw_request_hash.h b/as/include/transaction/rw_request_hash.h new file mode 100644 index 00000000..1bee799b --- /dev/null +++ b/as/include/transaction/rw_request_hash.h @@ -0,0 +1,111 @@ +/* + * rw_request_hash.h + * + * Copyright (C) 2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. 
+ * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +//========================================================== +// Includes. +// + +#include + +#include "citrusleaf/cf_digest.h" + +#include "base/transaction.h" + + +//========================================================== +// Forward declarations. +// + +struct as_transaction_s; +struct rw_request_s; + + +//========================================================== +// Typedefs & constants. +// + +typedef enum { + // These values go on the wire, so mind backward compatibility if changing. + RW_FIELD_OP, + RW_FIELD_RESULT, + RW_FIELD_NAMESPACE, + RW_FIELD_NS_ID, + RW_FIELD_GENERATION, + RW_FIELD_DIGEST, + RW_FIELD_UNUSED_6, + RW_FIELD_UNUSED_7, + RW_FIELD_CLUSTER_KEY, + RW_FIELD_RECORD, + RW_FIELD_TID, + RW_FIELD_VOID_TIME, + RW_FIELD_INFO, + RW_FIELD_UNUSED_13, + RW_FIELD_UNUSED_14, + RW_FIELD_UNUSED_15, + RW_FIELD_LAST_UPDATE_TIME, + RW_FIELD_SET_NAME, + RW_FIELD_KEY, + RW_FIELD_REGIME, + + NUM_RW_FIELDS +} rw_msg_field; + +#define RW_OP_WRITE 1 +#define RW_OP_WRITE_ACK 2 +#define RW_OP_DUP 3 +#define RW_OP_DUP_ACK 4 +#define RW_OP_REPL_CONFIRM 5 +#define RW_OP_REPL_PING 6 +#define RW_OP_REPL_PING_ACK 7 + +#define RW_INFO_XDR 0x0001 +#define RW_INFO_NO_REPL_ACK 0x0002 +#define RW_INFO_NSUP_DELETE 0x0004 +#define RW_INFO_UNUSED_8 0x0008 // was LDT dummy (no data) +#define RW_INFO_UNUSED_10 0x0010 // was LDT parent record +#define RW_INFO_UNUSED_20 0x0020 // was LDT subrecord +#define RW_INFO_UNUSED_40 0x0040 // was LDT ESR +#define RW_INFO_SINDEX_TOUCHED 0x0080 // sindex was touched +#define RW_INFO_UNUSED_100 0x0100 // was LDT multi-op message +#define RW_INFO_UNREPLICATED 0x0200 // enterprise only +#define RW_INFO_TOMBSTONE 0x0400 // enterprise only + +typedef struct rw_request_hkey_s { + uint32_t ns_id; + cf_digest keyd; +} __attribute__((__packed__)) rw_request_hkey; + + 
+//========================================================== +// Public API. +// + +void as_rw_init(); + +uint32_t rw_request_hash_count(); +transaction_status rw_request_hash_insert(rw_request_hkey* hkey, struct rw_request_s* rw, struct as_transaction_s* tr); +void rw_request_hash_delete(rw_request_hkey* hkey, struct rw_request_s* rw); +struct rw_request_s* rw_request_hash_get(rw_request_hkey* hkey); + +void rw_request_hash_dump(); diff --git a/as/include/transaction/rw_utils.h b/as/include/transaction/rw_utils.h new file mode 100644 index 00000000..d6324bbe --- /dev/null +++ b/as/include/transaction/rw_utils.h @@ -0,0 +1,201 @@ +/* + * rw_utils.h + * + * Copyright (C) 2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +//========================================================== +// Includes. 
+// + +#include +#include + +#include "citrusleaf/cf_digest.h" + +#include "msg.h" +#include "node.h" + +#include "base/cfg.h" +#include "base/datamodel.h" +#include "base/secondary_index.h" +#include "base/transaction.h" +#include "base/transaction_policy.h" +#include "transaction/rw_request.h" +#include "transaction/udf.h" + + +//========================================================== +// Forward declarations. +// + +struct as_bin_s; +struct as_index_s; +struct as_index_tree_s; +struct as_msg_s; +struct as_namespace_s; +struct as_remote_record_s; +struct as_storage_rd_s; +struct as_transaction_s; +struct rw_request_s; +struct udf_record_s; + + +//========================================================== +// Typedefs & constants. +// + +typedef struct index_metadata_s { + uint32_t void_time; + uint64_t last_update_time; + uint16_t generation; +} index_metadata; + +typedef struct now_times_s { + uint64_t now_ns; + uint64_t now_ms; +} now_times; + +// For now, use only for as_msg record_ttl special values. +#define TTL_NAMESPACE_DEFAULT 0 +#define TTL_NEVER_EXPIRE ((uint32_t)-1) +#define TTL_DONT_UPDATE ((uint32_t)-2) + + +//========================================================== +// Public API. 
+// + +bool validate_delete_durability(struct as_transaction_s* tr); +bool xdr_allows_write(struct as_transaction_s* tr); +void send_rw_messages(struct rw_request_s* rw); +void send_rw_messages_forget(struct rw_request_s* rw); +int repl_state_check(struct as_index_s* r, struct as_transaction_s* tr); +void will_replicate(struct as_index_s* r, struct as_namespace_s* ns); +bool insufficient_replica_destinations(const struct as_namespace_s* ns, uint32_t n_dests); +void finished_replicated(struct as_transaction_s* tr); +void finished_not_replicated(struct rw_request_s* rw); +bool generation_check(const struct as_index_s* r, const struct as_msg_s* m, const struct as_namespace_s* ns); +int set_set_from_msg(struct as_index_s* r, struct as_namespace_s* ns, struct as_msg_s* m); +int set_delete_durablility(const struct as_transaction_s* tr, struct as_storage_rd_s* rd); +bool check_msg_key(struct as_msg_s* m, struct as_storage_rd_s* rd); +bool get_msg_key(struct as_transaction_s* tr, struct as_storage_rd_s* rd); +int handle_msg_key(struct as_transaction_s* tr, struct as_storage_rd_s* rd); +void update_metadata_in_index(struct as_transaction_s* tr, bool increment_generation, struct as_index_s* r); +void pickle_all(struct as_storage_rd_s* rd, struct rw_request_s* rw); +bool write_sindex_update(struct as_namespace_s* ns, const char* set_name, cf_digest* keyd, struct as_bin_s* old_bins, uint32_t n_old_bins, struct as_bin_s* new_bins, uint32_t n_new_bins); +void record_delete_adjust_sindex(struct as_index_s* r, struct as_namespace_s* ns); +void delete_adjust_sindex(struct as_storage_rd_s* rd); +void remove_from_sindex(struct as_namespace_s* ns, const char* set_name, cf_digest* keyd, struct as_bin_s* bins, uint32_t n_bins); +bool xdr_must_ship_delete(struct as_namespace_s* ns, bool is_nsup_delete, bool is_xdr_op); + + +// TODO - rename as as_record_... and move to record.c? +static inline bool +record_has_sindex(const as_record* r, as_namespace* ns) +{ + if (! 
as_sindex_ns_has_sindex(ns)) {
+		return false;
+	}
+
+	as_set* set = as_namespace_get_record_set(ns, r);
+
+	return set ? set->n_sindexes != 0 : ns->n_setless_sindexes != 0;
+}
+
+
+// True only when the transaction's origin is FROM_CLIENT and its write commit
+// level (per TR_WRITE_COMMIT_LEVEL) is AS_WRITE_COMMIT_LEVEL_MASTER.
+static inline bool
+respond_on_master_complete(as_transaction* tr)
+{
+	if (tr->origin != FROM_CLIENT) {
+		return false;
+	}
+
+	return TR_WRITE_COMMIT_LEVEL(tr) == AS_WRITE_COMMIT_LEVEL_MASTER;
+}
+
+
+// Calls as_bin_particle_destroy() on each of the first n_bins entries of
+// stack_bins, in ascending index order.
+static inline void
+destroy_stack_bins(as_bin* stack_bins, uint32_t n_bins)
+{
+	uint32_t bin_ix = 0;
+
+	while (bin_ix < n_bins) {
+		as_bin_particle_destroy(stack_bins + bin_ix, true);
+		bin_ix++;
+	}
+}
+
+
+// A read-all op is expressed as a nameless READ op on a message that has the
+// GET_ALL info1 bit set - an awkward encoding, dictated by backward
+// compatibility.
+// Note - must check this before checking for normal read op!
+static inline bool
+op_is_read_all(as_msg_op* op, as_msg* m)
+{
+	if (op->name_sz != 0) {
+		return false;
+	}
+
+	if (op->op != AS_MSG_OP_READ) {
+		return false;
+	}
+
+	return (m->info1 & AS_MSG_INFO1_GET_ALL) != 0;
+}
+
+
+// A ttl is accepted if it is within the namespace's max_ttl, or is one of the
+// special values TTL_NEVER_EXPIRE / TTL_DONT_UPDATE.
+static inline bool
+is_valid_ttl(as_namespace* ns, uint32_t ttl)
+{
+	// Note - for now, ttl must be as_msg record_ttl.
+	// Note - ttl <= ns->max_ttl includes ttl == TTL_NAMESPACE_DEFAULT.
+	if (ttl <= ns->max_ttl) {
+		return true;
+	}
+
+	return ttl == TTL_NEVER_EXPIRE || ttl == TTL_DONT_UPDATE;
+}
+
+
+// Zeroes generation, void_time and last_update_time on the transaction, but
+// only when its AS_TRANSACTION_FLAG_IS_DELETE flag is set.
+static inline void
+clear_delete_response_metadata(as_transaction* tr)
+{
+	if ((tr->flags & AS_TRANSACTION_FLAG_IS_DELETE) == 0) {
+		return;
+	}
+
+	// If write became delete, respond to origin with no metadata.
+	tr->generation = 0;
+	tr->void_time = 0;
+	tr->last_update_time = 0;
+}
+
+
+//==========================================================
+// Private API - for enterprise separation only.
+// + +bool create_only_check(const struct as_index_s* r, const struct as_msg_s* m); +void write_delete_record(struct as_index_s* r, struct as_index_tree_s* tree); + +udf_optype udf_finish_delete(struct udf_record_s* urecord); + +uint32_t dup_res_pack_repl_state_info(const struct as_index_s* r, struct as_namespace_s* ns); +uint32_t dup_res_pack_info(const struct as_index_s* r, struct as_namespace_s* ns); +bool dup_res_should_retry_transaction(struct rw_request_s* rw, uint32_t result_code); +void dup_res_handle_tie(struct rw_request_s* rw, const msg* m, uint32_t result_code); +void apply_if_tie(struct rw_request_s* rw); +void dup_res_translate_result_code(struct rw_request_s* rw); +bool dup_res_ignore_pickle(const uint8_t* buf, uint32_t info); +void dup_res_init_repl_state(struct as_remote_record_s* rr, uint32_t info); + +void repl_write_flag_pickle(const struct as_transaction_s* tr, const uint8_t* buf, uint32_t* info); +bool repl_write_pickle_is_drop(const uint8_t* buf, uint32_t info); +void repl_write_init_repl_state(struct as_remote_record_s* rr, bool from_replica); +conflict_resolution_pol repl_write_conflict_resolution_policy(const struct as_namespace_s* ns); +bool repl_write_should_retransmit_replicas(struct rw_request_s* rw, uint32_t result_code); +void repl_write_send_confirmation(struct rw_request_s* rw); +void repl_write_handle_confirmation(msg* m); + +int record_replace_check(struct as_index_s* r, struct as_namespace_s* ns); +void record_replaced(struct as_index_s* r, struct as_remote_record_s* rr); diff --git a/as/include/transaction/udf.h b/as/include/transaction/udf.h new file mode 100644 index 00000000..cb8a1668 --- /dev/null +++ b/as/include/transaction/udf.h @@ -0,0 +1,98 @@ +/* + * udf.h + * + * Copyright (C) 2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. 
+ * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +//========================================================== +// Includes. +// + +#include + +#include "aerospike/as_aerospike.h" +#include "aerospike/as_list.h" + +#include "base/predexp.h" +#include "base/transaction.h" + + +//========================================================== +// Forward declarations. +// + +struct as_transaction_s; +struct predexp_eval_base_s; + + +//========================================================== +// Typedefs & constants. +// + +typedef enum { + UDF_OPTYPE_NONE, + UDF_OPTYPE_WAITING, + UDF_OPTYPE_READ, + UDF_OPTYPE_WRITE, + UDF_OPTYPE_DELETE +} udf_optype; + +#define UDF_MAX_STRING_SZ 128 + +typedef struct udf_def_s { + char filename[UDF_MAX_STRING_SZ]; + char function[UDF_MAX_STRING_SZ]; + as_list* arglist; + uint8_t type; +} udf_def; + +typedef int (*iudf_cb)(void* udata, int retcode); + +typedef struct iudf_origin_s { + udf_def def; + struct predexp_eval_base_s* predexp; + iudf_cb cb; + void* udata; +} iudf_origin; + + +//========================================================== +// Public API. 
+// + +static inline void +iudf_origin_destroy(iudf_origin* origin) +{ + if (origin->def.arglist) { + as_list_destroy(origin->def.arglist); + } + + if (origin->predexp) { + predexp_destroy(origin->predexp); + } +} + +void as_udf_init(); +udf_def* udf_def_init_from_msg(udf_def* def, const struct as_transaction_s* tr); + +transaction_status as_udf_start(struct as_transaction_s* tr); + +extern as_aerospike g_as_aerospike; diff --git a/as/include/transaction/write.h b/as/include/transaction/write.h new file mode 100644 index 00000000..dfb5f210 --- /dev/null +++ b/as/include/transaction/write.h @@ -0,0 +1,43 @@ +/* + * write.h + * + * Copyright (C) 2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +//========================================================== +// Includes. +// + +#include "base/transaction.h" + + +//========================================================== +// Forward declarations. +// + +struct as_transaction_s; + + +//========================================================== +// Public API. 
+// + +transaction_status as_write_start(struct as_transaction_s* tr); diff --git a/as/src/Makefile b/as/src/Makefile new file mode 100644 index 00000000..0a775f91 --- /dev/null +++ b/as/src/Makefile @@ -0,0 +1,192 @@ +# Aerospike Server +# Makefile + +DEPTH = ../.. +include $(DEPTH)/make_in/Makefile.in + +# Use SystemTap? [By default, no.] +USE_SYSTEMTAP = 0 + +ifeq ($(USE_SYSTEMTAP),1) +CFLAGS += -DUSE_SYSTEMTAP +endif + +ifeq ($(USE_SYSTEMTAP),1) +SYSTEMTAP_PROBES_D = base/probes.d +SYSTEMTAP_PROBES_H = $(GEN_DIR)/probes.h +SYSTEMTAP_PROBES_O = $(OBJECT_DIR)/probes.o +endif + +ifeq ($(USE_EE),1) + include $(EEREPO)/as/make_in/Makefile.vars + include $(EEREPO)/xdr/make_in/Makefile.vars +endif + +BASE_HEADERS += aggr.h batch.h cdt.h cfg.h datamodel.h features.h index.h job_manager.h json_init.h +BASE_HEADERS += monitor.h packet_compression.h +BASE_HEADERS += particle.h particle_blob.h particle_integer.h predexp.h +BASE_HEADERS += proto.h rec_props.h scan.h secondary_index.h security.h security_config.h stats.h system_metadata.h +BASE_HEADERS += thr_batch.h thr_info.h thr_query.h thr_sindex.h +BASE_HEADERS += thr_tsvc.h ticker.h transaction.h transaction_policy.h truncate.h +BASE_HEADERS += udf_aerospike.h udf_arglist.h udf_cask.h +BASE_HEADERS += udf_memtracker.h udf_record.h udf_timer.h +BASE_HEADERS += xdr_serverside.h xdr_config.h + +BASE_SOURCES += aggr.c as.c batch.c bin.c cdt.c cfg.c index.c job_manager.c json_init.c +BASE_SOURCES += monitor.c namespace.c packet_compression.c +BASE_SOURCES += particle.c particle_blob.c particle_float.c particle_geojson.c particle_integer.c +BASE_SOURCES += particle_list.c particle_map.c particle_string.c predexp.c +BASE_SOURCES += proto.c rec_props.c record.c scan.c signal.c secondary_index.c system_metadata.c +BASE_SOURCES += thr_batch.c thr_demarshal.c thr_info.c thr_info_port.c thr_nsup.c +BASE_SOURCES += thr_query.c thr_sindex.c thr_tsvc.c ticker.c transaction.c truncate.c +BASE_SOURCES += udf_aerospike.c udf_arglist.c 
udf_cask.c +BASE_SOURCES += udf_memtracker.c udf_record.c udf_timer.c +BASE_SOURCES += xdr_config.c + +ifneq ($(USE_EE),1) + BASE_SOURCES += cfg_ce.c + BASE_SOURCES += features_ce.c + BASE_SOURCES += index_ce.c + BASE_SOURCES += namespace_ce.c + BASE_SOURCES += record_ce.c + BASE_SOURCES += security_ce.c + BASE_SOURCES += truncate_ce.c + BASE_SOURCES += xdr_serverside_stubs.c +endif + +FABRIC_HEADERS += clustering.h endpoint.h exchange.h fabric.h hb.h hlc.h meta_batch.h migrate.h partition.h partition_balance.h roster.h skew_monitor.h +FABRIC_SOURCES += clustering.c endpoint.c exchange.c fabric.c hb.c hlc.c migrate.c partition.c partition_balance.c skew_monitor.c +ifneq ($(USE_EE),1) + FABRIC_SOURCES += meta_batch_ce.c + FABRIC_SOURCES += migrate_ce.c + FABRIC_SOURCES += partition_balance_ce.c + FABRIC_SOURCES += partition_ce.c + FABRIC_SOURCES += roster_ce.c +endif + +GEOSPATIAL_HEADERS += geospatial.h +GEOSPATIAL_SOURCES += geospatial.cc geojson.cc + +STORAGE_HEADERS += storage.h drv_ssd.h +STORAGE_SOURCES += storage.c drv_memory.c drv_ssd.c +ifneq ($(USE_EE),1) + STORAGE_SOURCES += drv_memory_ce.c + STORAGE_SOURCES += drv_ssd_ce.c +endif + +TRANSACTION_HEADERS += delete.h duplicate_resolve.h proxy.h re_replicate.h read.h replica_ping.h replica_write.h rw_request_hash.h rw_request.h rw_utils.h udf.h write.h +TRANSACTION_SOURCES += delete.c duplicate_resolve.c proxy.c read.c replica_write.c rw_request_hash.c rw_request.c rw_utils.c udf.c write.c +ifneq ($(USE_EE),1) + TRANSACTION_SOURCES += delete_ce.c + TRANSACTION_SOURCES += re_replicate_ce.c + TRANSACTION_SOURCES += replica_ping_ce.c + TRANSACTION_SOURCES += rw_utils_ce.c +endif + +HEADERS = $(BASE_HEADERS:%=base/%) $(FABRIC_HEADERS:%=fabric/%) $(STORAGE_HEADERS:%=storage/%) $(GEOSPATIAL_HEADERS:%=geospatial/%) $(TRANSACTION_HEADERS:%=transaction/%) +SOURCES = $(BASE_SOURCES:%=base/%) $(FABRIC_SOURCES:%=fabric/%) $(STORAGE_SOURCES:%=storage/%) $(GEOSPATIAL_SOURCES:%=geospatial/%) 
$(TRANSACTION_SOURCES:%=transaction/%) + +SERVER = $(BIN_DIR)/asd + +INCLUDES += $(INCLUDE_DIR:%=-I%) +INCLUDES += -I$(CF)/include +INCLUDES += -I$(AI)/include +INCLUDES += -I$(COMMON)/target/$(PLATFORM)/include +INCLUDES += -I$(MOD_LUA)/target/$(PLATFORM)/include +INCLUDES += -I$(JANSSON)/src +INCLUDES += -I$(S2) +INCLUDES += -I$(XDR_INCLUDES) + +ifeq ($(USE_LUAJIT),1) + INCLUDES += -I$(LUAJIT)/src +else + INCLUDE_LUA_5_1 = /usr/include/lua5.1 + ifneq ($(wildcard $(INCLUDE_LUA_5_1)),) + INCLUDES += -I$(INCLUDE_LUA_5_1) + LUA_SUFFIX = 5.1 + endif +endif + +AS_LIBRARIES += $(LIBRARY_DIR)/libcf.a +AS_LIBRARIES += $(LIBRARY_DIR)/libai.a +AS_LIBRARIES += $(MOD_LUA)/target/$(PLATFORM)/lib/libmod_lua.a +AS_LIBRARIES += $(COMMON)/target/$(PLATFORM)/lib/libaerospike-common.a + +ifeq ($(DOPROFILE),1) + LIBRARIES += -pg -fprofile-arcs -lgcov +endif + +# Add either the LuaJIT or Lua library +ifeq ($(USE_LUAJIT),1) + ifeq ($(LD_LUAJIT),static) + AS_LIBRARIES += $(LUAJIT)/src/libluajit.a + else + LIBRARIES += -L$(LUAJIT)/src -lluajit + endif +else + ifeq ($(LD_LUA),static) + # Find and add the static Lua library. 
+ AS_LIBRARIES += $(or \ + $(wildcard /usr/local/lib/liblua.a), \ + $(wildcard /usr/lib64/liblua$(LUA_SUFFIX).a), \ + $(wildcard /usr/lib/x86_64-linux-gnu/liblua$(LUA_SUFFIX).a), \ + $(wildcard /usr/lib/liblua.a), \ + $(wildcard /usr/lib/powerpc64le-linux-gnu/liblua.a), \ + $(error Cannot find "liblua.a")) + else + LIBRARIES += -llua$(LUA_SUFFIX) + endif +endif + +ifeq ($(LD_JANSSON),static) + AS_LIBRARIES += $(JANSSON)/src/.libs/libjansson.a +else + LIBRARIES += -L$(JANSSON)/src/.libs -ljansson +endif + +LIBRARIES += -L$(S2) -ls2 -ls2cellid -lgoogle-strings -lgoogle-base \ + -lgoogle-util-coding -lgoogle-util-math -lstdc++ + +LIBRARIES := $(AS_LIBRARIES) $(LIBRARIES) + +AS_LIB_DEPS = $(AS_LIBRARIES) + +OBJECTS.c = $(SOURCES:%.c=$(OBJECT_DIR)/%.o) $(VERSION_OBJ) $(SYSTEMTAP_PROBES_O) +OBJECTS = $(OBJECTS.c:%.cc=$(OBJECT_DIR)/%.o) +DEPENDENCIES = $(OBJECTS:%.o=%.d) +DEPENDENCIES += $(XDR_DEPENDENCIES) + +.PHONY: all +all: $(SYSTEMTAP_PROBES_H) $(SERVER) + +.PHONY: clean +clean: + $(RM) $(OBJECTS) $(SERVER){,.stripped} + $(RM) $(DEPENDENCIES) + +# Emacs syntax check target. CHK_SOURCES is set by emacs to the files being edited. 
+.PHONY: check-syntax +check-syntax: + $(CC) -Wall -Wextra -pedantic -fsyntax-only $(CHK_SOURCES) + +$(SERVER): $(OBJECTS) $(AS_LIB_DEPS) $(XDR_LIBRARY) $(XDR_ALL_OBJECTS) + $(LINK.c) -o $(SERVER) $(OBJECTS) $(XDR_ALL_OBJECTS) $(LIBRARIES) + +ifeq ($(USE_EE),1) + include $(XDR)/make_in/Makefile.targets +endif + +include $(DEPTH)/make_in/Makefile.targets + +# Ignore S2 induced warnings +S2_WNO = -Wno-unused-local-typedefs -Wno-deprecated -Wno-sign-compare +$(OBJECT_DIR)/geospatial/%.o: CXXFLAGS += $(S2_WNO) +$(OBJECT_DIR)/geospatial/%.o: CFLAGS := $(filter-out -std=gnu99,$(CFLAGS)) + +ifeq ($(USE_SYSTEMTAP),1) +$(SYSTEMTAP_PROBES_H): $(SYSTEMTAP_PROBES_D) + dtrace -h -s $< -o $@ + +$(SYSTEMTAP_PROBES_O): $(SYSTEMTAP_PROBES_D) + dtrace -G -s $< -o $@ +endif diff --git a/as/src/base/aggr.c b/as/src/base/aggr.c new file mode 100644 index 00000000..88735ce9 --- /dev/null +++ b/as/src/base/aggr.c @@ -0,0 +1,337 @@ +/* + * aggr.c + * + * Copyright (C) 2014-2015 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. 
If not, see http://www.gnu.org/licenses/ + */ + +#include "base/aggr.h" + +#include +#include +#include +#include + + +#include "aerospike/as_val.h" +#include "aerospike/mod_lua.h" +#include "citrusleaf/cf_ll.h" + +#include "fault.h" + +#include "base/datamodel.h" +#include "base/proto.h" +#include "base/transaction.h" +#include "base/udf_arglist.h" +#include "base/udf_memtracker.h" +#include "base/udf_record.h" +#include "fabric/partition.h" + + +#define AS_AGGR_ERR -1 +#define AS_AGGR_OK 0 + +/* + * Aggregation Stream Object + */ +// ************************************************************************************************** +typedef struct { + // Iteration + cf_ll_iterator * iter; + as_index_keys_arr * keys_arr; + int keys_arr_offset; + + // Record + bool rec_open; // Record in stream open + as_rec * urec; // UDF record cloak + as_namespace * ns; + as_partition_reservation * rsv; // Reservation Object + + // Module Data + as_aggr_call * call; // Aggregation info + void * udata; // Execution context +} aggr_state; + +static as_partition_reservation * +ptn_reserve(aggr_state *astate, uint32_t pid, as_partition_reservation *rsv) +{ + as_aggr_call *call = astate->call; + if (call && call->aggr_hooks && call->aggr_hooks->ptn_reserve) { + return call->aggr_hooks->ptn_reserve(astate->udata, astate->ns, pid, rsv); + } + return NULL; +} + +static void +ptn_release(aggr_state *astate) +{ + as_aggr_call *call = astate->call; + if (call && call->aggr_hooks && call->aggr_hooks->ptn_release) { + call->aggr_hooks->ptn_release(astate->udata, astate->rsv); + } +} + +#if 0 +// In case we ever need this hook... 
+static void +set_error(aggr_state *astate, int err) +{ + as_aggr_call *call = astate->call; + if (call && call->aggr_hooks && call->aggr_hooks->set_error) { + call->aggr_hooks->set_error(astate->udata, err); + } +} +#endif // 0 + +static bool +pre_check(aggr_state *astate, void *skey) +{ + as_aggr_call *call = astate->call; + if (call && call->aggr_hooks && call->aggr_hooks->pre_check) { + return call->aggr_hooks->pre_check(astate->udata, as_rec_source(astate->urec), skey); + } + return true; // if not defined pre_check succeeds +} + +static int +aopen(aggr_state *astate, const cf_digest *digest) +{ + udf_record * urecord = as_rec_source(astate->urec); + as_index_ref * r_ref = urecord->r_ref; + as_transaction * tr = urecord->tr; + + int pid = as_partition_getid(digest); + urecord->keyd = *digest; + + astate->rsv = ptn_reserve(astate, pid, &tr->rsv); + if (!astate->rsv) { + cf_debug(AS_AGGR, "Reservation not done for partition %d", pid); + return -1; + } + + // NB: Partial Initialization due to heaviness. Not everything needed + // TODO: Make such initialization Commodity + tr->rsv.ns = astate->rsv->ns; + tr->rsv.p = astate->rsv->p; + tr->rsv.tree = astate->rsv->tree; + tr->keyd = urecord->keyd; + + r_ref->skip_lock = false; + if (udf_record_open(urecord) == 0) { + astate->rec_open = true; + return 0; + } + ptn_release(astate); + return -1; +} + +void +aclose(aggr_state *astate) +{ + // Bypassing doing the direct destroy because we need to + // avoid reducing the ref count. This rec (query_record + // implementation of as_rec) is ref counted when passed from + // here to Lua. If Lua access it even after moving to next + // element in the stream it does it at its own risk. Record + // may have changed under the hood. 
+ if (astate->rec_open) { + udf_record_close(as_rec_source(astate->urec)); + ptn_release(astate); + astate->rec_open = false; + } + return; +} + +void +acleanup(aggr_state *astate) +{ + if (astate->iter) { + cf_ll_releaseIterator(astate->iter); + astate->iter = NULL; + } + aclose(astate); + + as_rec_destroy(astate->urec); +} + +// ************************************************************************************************** + +/* + * Aggregation Input Stream + */ +// ************************************************************************************************** +cf_digest * +get_next(aggr_state *astate) +{ + astate->keys_arr_offset++; + if (!astate->keys_arr || (astate->keys_arr_offset == astate->keys_arr->num)) { + + cf_ll_element * ele = cf_ll_getNext(astate->iter); + + // if NULL or number of element 0. No holes expected + if (!ele) { + return NULL; + } + + astate->keys_arr = ((as_index_keys_ll_element*)ele)->keys_arr; + if (!astate->keys_arr || (astate->keys_arr->num < 1)) { + astate->keys_arr = NULL; + return NULL; + } + + astate->keys_arr_offset = 0; + } + return &astate->keys_arr->pindex_digs[astate->keys_arr_offset]; +} + +// only operates on the record as_val in the stream points to +// and updates the references ... this function has to acquire +// partition reservation and also the object lock. So if the UDF +// does something stupid the object lock is gonna get held for +// a while ... 
there has to be timeout mechanism in here I think +static as_val * +istream_read(const as_stream *s) +{ + aggr_state *astate = as_stream_source(s); + + aclose(astate); + + // Iterate through stream to get next digest and + // populate record with it + while (!astate->rec_open) { + + if (get_next(astate) == NULL) { + return NULL; + } + + if (!aopen(astate, &astate->keys_arr->pindex_digs[astate->keys_arr_offset])) { + if (!pre_check(astate, &astate->keys_arr->sindex_keys[astate->keys_arr_offset])) { + aclose(astate); + } + } + } + return (as_val *)astate->urec; +} + +const as_stream_hooks istream_hooks = { + .destroy = NULL, + .read = istream_read, + .write = NULL +}; +// ************************************************************************************************** + + + +/* + * Aggregation Output Stream + */ +// ************************************************************************************************** +as_stream_status +ostream_write(const as_stream *s, as_val *val) +{ + aggr_state *astate = (aggr_state *)as_stream_source(s); + return astate->call->aggr_hooks->ostream_write(astate->udata, val); +} + +const as_stream_hooks ostream_hooks = { + .destroy = NULL, + .read = NULL, + .write = ostream_write +}; +// ************************************************************************************************** + + +/* + * Aggregation AS_AEROSPIKE interface for LUA + */ +// ************************************************************************************************** +static int +as_aggr_aerospike_log(const as_aerospike * a, const char * file, const int line, const int lvl, const char * msg) +{ + cf_fault_event(AS_AGGR, lvl, file, line, "%s", (char *) msg); + return 0; +} + +static const as_aerospike_hooks as_aggr_aerospike_hooks = { + .rec_update = NULL, + .rec_remove = NULL, + .rec_exists = NULL, + .log = as_aggr_aerospike_log, + .get_current_time = NULL, + .destroy = NULL +}; +// 
************************************************************************************************** + + + +int +as_aggr_process(as_namespace *ns, as_aggr_call * ag_call, cf_ll * ap_recl, void * udata, as_result * ap_res) +{ + as_index_ref r_ref; + r_ref.skip_lock = false; + as_storage_rd rd; + bzero(&rd, sizeof(as_storage_rd)); + as_transaction tr; + + + udf_record urecord; + udf_record_init(&urecord, false); + urecord.tr = &tr; + urecord.r_ref = &r_ref; + urecord.rd = &rd; + as_rec * urec = as_rec_new(&urecord, &udf_record_hooks); + + aggr_state astate = { + .iter = cf_ll_getIterator(ap_recl, true /*forward*/), + .urec = urec, + .keys_arr = NULL, + .keys_arr_offset = 0, + .call = ag_call, + .udata = udata, + .rec_open = false, + .rsv = &tr.rsv, + .ns = ns + }; + + if (!astate.iter) { + cf_warning (AS_AGGR, "Could not set up iterator .. possibly out of memory .. Aborting Query !!"); + as_rec_destroy(urec); + return AS_AGGR_ERR; + } + + as_aerospike as; + as_aerospike_init(&as, NULL, &as_aggr_aerospike_hooks); + + // Input Stream + as_stream istream; + as_stream_init(&istream, &astate, &istream_hooks); + + // Output stream + as_stream ostream; + as_stream_init(&ostream, &astate, &ostream_hooks); + + as_udf_context ctx = { + .as = &as, + .timer = NULL, + .memtracker = NULL + }; + int ret = as_module_apply_stream(&mod_lua, &ctx, ag_call->def.filename, ag_call->def.function, &istream, ag_call->def.arglist, &ostream, ap_res); + + acleanup(&astate); + return ret; +} diff --git a/as/src/base/as.c b/as/src/base/as.c new file mode 100644 index 00000000..88fe431b --- /dev/null +++ b/as/src/base/as.c @@ -0,0 +1,520 @@ +/* + * as.c + * + * Copyright (C) 2008-2017 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. 
+ * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "citrusleaf/alloc.h" + +#include "daemon.h" +#include "fault.h" +#include "hardware.h" +#include "tls.h" + +#include "base/batch.h" +#include "base/cfg.h" +#include "base/datamodel.h" +#include "base/index.h" +#include "base/json_init.h" +#include "base/monitor.h" +#include "base/scan.h" +#include "base/secondary_index.h" +#include "base/security.h" +#include "base/system_metadata.h" +#include "base/stats.h" +#include "base/thr_batch.h" +#include "base/thr_info.h" +#include "base/thr_info_port.h" +#include "base/thr_sindex.h" +#include "base/thr_tsvc.h" +#include "base/ticker.h" +#include "base/xdr_serverside.h" +#include "fabric/clustering.h" +#include "fabric/exchange.h" +#include "fabric/fabric.h" +#include "fabric/hb.h" +#include "fabric/migrate.h" +#include "fabric/skew_monitor.h" +#include "storage/storage.h" +#include "transaction/proxy.h" +#include "transaction/rw_request_hash.h" +#include "transaction/udf.h" + + +//========================================================== +// Constants. +// + +// String constants in version.c, generated by make. 
+extern const char aerospike_build_type[]; +extern const char aerospike_build_id[]; + +// Command line options for the Aerospike server. +static const struct option CMD_OPTS[] = { + { "help", no_argument, NULL, 'h' }, + { "version", no_argument, NULL, 'v' }, + { "config-file", required_argument, NULL, 'f' }, + { "foreground", no_argument, NULL, 'd' }, + { "fgdaemon", no_argument, NULL, 'F' }, + { "cold-start", no_argument, NULL, 'c' }, + { "instance", required_argument, NULL, 'n' }, + { NULL, 0, NULL, 0 } +}; + +static const char HELP[] = + "\n" + "Aerospike server installation installs the script /etc/init.d/aerospike which\n" + "is normally used to start and stop the server. The script is also found as\n" + "as/etc/init-script in the source tree.\n" + "\n" + "asd informative command-line options:\n" + "\n" + "--help" + "\n" + "Print this message and exit.\n" + "\n" + "--version" + "\n" + "Print edition and build version information and exit.\n" + "\n" + "asd runtime command-line options:\n" + "\n" + "--config-file " + "\n" + "Specify the location of the Aerospike server config file. If this option is not\n" + "specified, the default location /etc/aerospike/aerospike.conf is used.\n" + "\n" + "--foreground" + "\n" + "Specify that Aerospike not be daemonized. This is useful for running Aerospike\n" + "in gdb. Alternatively, add 'run-as-daemon false' in the service context of the\n" + "Aerospike config file.\n" + "\n" + "--fgdaemon" + "\n" + "Specify that Aerospike is to be run as a \"new-style\" (foreground) daemon. This\n" + "is useful for running Aerospike under systemd or Docker.\n" + "\n" + "--cold-start" + "\n" + "(Enterprise edition only.) At startup, force the Aerospike server to read all\n" + "records from storage devices to rebuild the index.\n" + "\n" + "--instance <0-15>" + "\n" + "(Enterprise edition only.) 
//==========================================================
// Aerospike server entry point.
//

// Server start-up, run and shutdown sequence, in order:
//  1. Bring up allocation, fault (logging), signal and TLS support.
//  2. Parse the command line (options are listed in CMD_OPTS).
//  3. Load the configuration, optionally drop privileges, activate log sinks
//     and daemonize.
//  4. Validate directories, initialize subsystems, then start them.
//  5. Block on g_main_deadlock until a shutdown signal releases it.
//  6. Shut down storage, XDR and SMD, then exit the process.
//
// Returns 0 for --help/--version and 1 for an unrecognized option. Otherwise
// the process is terminated via exit()/_exit() and this function does not
// return.
int
main(int argc, char **argv)
{
	g_start_ms = cf_getms();

	// Initialize memory allocation.
	cf_alloc_init();

	// Initialize fault management framework.
	cf_fault_init();

	// Setup signal handlers.
	as_signal_setup();

	// Initialize TLS library.
	tls_check_init();

	int opt;
	int opt_i;
	const char *config_file = DEFAULT_CONFIG_FILE;
	bool run_in_foreground = false;
	bool new_style_daemon = false;
	bool cold_start_cmd = false;
	uint32_t instance = 0;

	// Parse command line options. The short-option string is empty, so only
	// the long options in CMD_OPTS are accepted; 'opt' is the table's 'val'.
	while ((opt = getopt_long(argc, argv, "", CMD_OPTS, &opt_i)) != -1) {
		switch (opt) {
		case 'h':
			// printf() since we want stdout and don't want cf_fault's prefix.
			printf("%s\n", HELP);
			return 0;
		case 'v':
			// printf() since we want stdout and don't want cf_fault's prefix.
			printf("%s build %s\n", aerospike_build_type, aerospike_build_id);
			return 0;
		case 'f':
			// Heap copy - freed below once configuration has been processed.
			config_file = cf_strdup(optarg);
			break;
		case 'F':
			// As a "new-style" daemon(*), asd runs in the foreground and
			// ignores the following configuration items:
			// - user ('user')
			// - group ('group')
			// - PID file ('pidfile')
			//
			// If ignoring configuration items, or if the 'console' sink is not
			// specified, warnings will appear in stderr.
			//
			// (*) http://0pointer.de/public/systemd-man/daemon.html#New-Style%20Daemons
			run_in_foreground = true;
			new_style_daemon = true;
			break;
		case 'd':
			run_in_foreground = true;
			break;
		case 'c':
			cold_start_cmd = true;
			break;
		case 'n':
			// NOTE(review): the value is not range-checked here although the
			// help text documents 0-15, and a non-numeric argument parses as
			// 0 - presumably validated downstream; confirm.
			instance = (uint32_t)strtol(optarg, NULL, 0);
			break;
		default:
			// fprintf() since we don't want cf_fault's prefix.
			fprintf(stderr, "%s\n", USAGE);
			return 1;
		}
	}

	// Set all fields in the global runtime configuration instance. This parses
	// the configuration file, and creates as_namespace objects. (Return value
	// is a shortcut pointer to the global runtime configuration instance.)
	as_config *c = as_config_init(config_file);

	// Detect NUMA topology and, if requested, prepare for CPU and NUMA pinning.
	cf_topo_config(c->auto_pin, (cf_topo_numa_node_index)instance,
			&c->service.bind);

	// Perform privilege separation as necessary. If configured user & group
	// don't have root privileges, all resources created or reopened past this
	// point must be set up so that they are accessible without root privileges.
	// If not, the process will self-terminate with (hopefully!) a log message
	// indicating which resource is not set up properly.
	if (0 != c->uid && 0 == geteuid()) {
		if (! new_style_daemon) {
			// To see this log, change NO_SINKS_LIMIT in fault.c:
			cf_info(AS_AS, "privsep to %d %d", c->uid, c->gid);
			cf_process_privsep(c->uid, c->gid);
		}
		else {
			cf_warning(AS_AS, "will not do privsep in new-style daemon mode");
		}
	}

	//
	// All resources such as files, devices, and shared memory must be created
	// or reopened below this line! (The configuration file is the only thing
	// that must be opened above, in order to parse the user & group.)
	//==========================================================================

	// A "new-style" daemon expects console logging to be configured. (If not,
	// log messages won't be seen via the standard path.)
	if (new_style_daemon) {
		if (! cf_fault_console_is_held()) {
			cf_warning(AS_AS, "in new-style daemon mode, console logging is not configured");
		}
	}

	// Activate log sinks. Up to this point, 'cf_' log output goes to stderr,
	// filtered according to NO_SINKS_LIMIT in fault.c. After this point, 'cf_'
	// log output will appear in all log file sinks specified in configuration,
	// with specified filtering. If console sink is specified in configuration,
	// 'cf_' log output will continue going to stderr, but filtering will switch
	// from NO_SINKS_LIMIT to that specified in console sink configuration.
	if (0 != cf_fault_sink_activate_all_held()) {
		// Specifics of failure are logged in cf_fault_sink_activate_all_held().
		cf_crash_nostack(AS_AS, "can't open log sink(s)");
	}

	// Daemonize asd if specified. After daemonization, output to stderr will no
	// longer appear in terminal. Instead, check /tmp/aerospike-console.
	// for console output.
	// The command-line foreground flags take precedence over the config file's
	// 'run-as-daemon' setting.
	if (! run_in_foreground && c->run_as_daemon) {
		// Don't close any open files when daemonizing. At this point only log
		// sink files are open - instruct cf_process_daemonize() to ignore them.
		int open_fds[CF_FAULT_SINKS_MAX];
		int num_open_fds = cf_fault_sink_get_fd_list(open_fds);

		cf_process_daemonize(open_fds, num_open_fds);
	}

	// Log which build this is - should be the first line in the log file.
	cf_info(AS_AS, "<><><><><><><><><><> %s build %s <><><><><><><><><><>",
			aerospike_build_type, aerospike_build_id);

	// Includes echoing the configuration file to log.
	as_config_post_process(c, config_file);

	xdr_config_post_process();

	// If we allocated a non-default config file name, free it. (Pointer
	// comparison: config_file either still aliases the static default or
	// points at the cf_strdup() copy made during option parsing.)
	if (config_file != DEFAULT_CONFIG_FILE) {
		cf_free((void*)config_file);
	}

	// Write the pid file, if specified.
	if (! new_style_daemon) {
		write_pidfile(c->pidfile);
	}
	else {
		if (c->pidfile) {
			cf_warning(AS_AS, "will not write PID file in new-style daemon mode");
		}
	}

	// Check that required directories are set up properly.
	validate_directory(c->work_directory, "work");
	validate_directory(c->mod_lua.system_path, "Lua system");
	validate_directory(c->mod_lua.user_path, "Lua user");
	validate_smd_directory();

	// Initialize subsystems. At this point we're allocating local resources,
	// starting worker threads, etc. (But no communication with other server
	// nodes or clients yet.)

	as_json_init();				// Jansson JSON API used by System Metadata
	as_smd_init();				// System Metadata first - others depend on it
	as_index_tree_gc_init();	// thread to purge dropped index trees
	as_sindex_thr_init();		// defrag secondary index (ok during population)

	// Initialize namespaces. Each namespace decides here whether it will do a
	// warm or cold start. Index arenas, partition structures and index tree
	// structures are initialized. Secondary index system metadata is restored.
	as_namespaces_init(cold_start_cmd, instance);

	// Initialize the storage system. For cold starts, this includes reading
	// all the objects off the drives. This may block for a long time. The
	// defrag subsystem starts operating at the end of this call.
	as_storage_init();

	// Migrate memory to correct NUMA node (includes restored index arenas).
	cf_topo_migrate_memory();

	// Populate all secondary indexes. This may block for a long time.
	as_sindex_boot_populateall();

	cf_info(AS_AS, "initializing services...");

	as_netio_init();
	as_security_init();			// security features
	as_tsvc_init();				// all transaction handling
	as_hb_init();				// inter-node heartbeat
	as_skew_monitor_init();		// clock skew monitor
	as_fabric_init();			// inter-node communications
	as_exchange_init();			// initialize the cluster exchange subsystem
	as_clustering_init();		// clustering-v5 start
	as_info_init();				// info transaction handling
	as_migrate_init();			// move data between nodes
	as_proxy_init();			// do work on behalf of others
	as_rw_init();				// read & write service
	as_query_init();			// query transaction handling
	as_udf_init();				// user-defined functions
	as_scan_init();				// scan a namespace or set
	as_batch_init();			// batch transaction handling
	as_batch_direct_init();		// low priority transaction handling
	as_xdr_init();				// cross data-center replication
	as_mon_init();				// monitor

	// Wait for enough available storage. We've been defragging all along, but
	// here we wait until it's enough. This may block for a long time.
	as_storage_wait_for_defrag();

	// Start subsystems. At this point we may begin communicating with other
	// cluster nodes, and ultimately with clients.

	as_smd_start(g_smd);		// enables receiving cluster state change events
	as_fabric_start();			// may send & receive fabric messages
	as_xdr_start();				// XDR should start before it joins other nodes
	as_hb_start();				// start inter-node heartbeat
	as_exchange_start();		// start the cluster exchange subsystem
	as_clustering_start();		// clustering-v5 start
	as_nsup_start();			// may send delete transactions to other nodes
	as_demarshal_start();		// server will now receive client transactions
	as_info_port_start();		// server will now receive info transactions
	as_ticker_start();			// only after everything else is started

	// Relevant for enterprise edition only.
	as_storage_start_tomb_raider();

	// Log a service-ready message.
	cf_info(AS_AS, "service ready: soon there will be cake!");

	//--------------------------------------------
	// Startup is done. This thread will now wait
	// quietly for a shutdown signal.
	//

	// Stop this thread from finishing. Intentionally deadlocking on a mutex is
	// a remarkably efficient way to do this.
	// The first lock acquires the mutex; the second one blocks because this
	// thread already holds it. g_startup_complete is set in between.
	pthread_mutex_lock(&g_main_deadlock);
	g_startup_complete = true;
	pthread_mutex_lock(&g_main_deadlock);

	// When the service is running, you are here (deadlocked) - the signals that
	// stop the service (yes, these signals always occur in this thread) will
	// unlock the mutex, allowing us to continue.

	g_shutdown_started = true;
	pthread_mutex_unlock(&g_main_deadlock);
	pthread_mutex_destroy(&g_main_deadlock);

	//--------------------------------------------
	// Received a shutdown signal.
	//

	as_storage_shutdown();
	as_xdr_shutdown();
	as_smd_shutdown(g_smd);

	cf_info(AS_AS, "finished clean shutdown - exiting");

	// If shutdown was totally clean (all threads joined) we could just return,
	// but for now we exit to make sure all threads die.
#ifdef DOPROFILE
	exit(0);	// exit(0) so profile build actually dumps gmon.out
#else
	_exit(0);
#endif

	// Not reached - both branches above terminate the process.
	return 0;
}


//==========================================================
// Local helpers.
//

// Write this process's pid, followed by a newline, to 'pidfile'.
// Does nothing when 'pidfile' is NULL. Failure to open the file is fatal;
// failure to write it only logs a warning.
static void
write_pidfile(char *pidfile)
{
	if (! pidfile) {
		// If there's no pid file specified in the config file, just move on.
		return;
	}

	// Note - the directory the pid file is in must already exist.

	// Remove any existing file first, so open() below creates a fresh one
	// (no O_TRUNC is used). The result of remove() is deliberately ignored.
	remove(pidfile);

	// Created with mode rw-r--r--.
	int pid_fd = open(pidfile, O_CREAT | O_RDWR,
			S_IWUSR | S_IRUSR | S_IRGRP | S_IROTH);

	if (pid_fd < 0) {
		cf_crash_nostack(AS_AS, "failed to open pid file %s: %s", pidfile,
				cf_strerror(errno));
	}

	// 16 bytes is enough: at most 10 digits for a uint32_t, '\n' and NUL.
	char pidstr[16];
	sprintf(pidstr, "%u\n", (uint32_t)getpid());

	// If we can't access this resource, just log a warning and continue -
	// it is not critical to the process.
	if (write(pid_fd, pidstr, strlen(pidstr)) == -1) {
		cf_warning(AS_AS, "failed write to pid file %s: %s", pidfile,
				cf_strerror(errno));
	}

	close(pid_fd);
}

// Verify that 'path' exists and is a directory; otherwise log the reason,
// tagged with 'log_tag', via cf_crash_nostack().
static void
validate_directory(const char *path, const char *log_tag)
{
	struct stat buf;

	if (stat(path, &buf) != 0) {
		cf_crash_nostack(AS_AS, "%s directory '%s' is not set up properly: %s",
				log_tag, path, cf_strerror(errno));
	}
	else if (! S_ISDIR(buf.st_mode)) {
		cf_crash_nostack(AS_AS, "%s directory '%s' is not set up properly: Not a directory",
				log_tag, path);
	}
}

// Validate the system metadata directory: the configured work directory with
// SMD_DIR_NAME appended.
static void
validate_smd_directory()
{
	size_t len = strlen(g_config.work_directory);
	// sizeof(SMD_DIR_NAME) counts the terminating NUL, so the buffer exactly
	// fits the concatenated path.
	char smd_path[len + sizeof(SMD_DIR_NAME)];

	strcpy(smd_path, g_config.work_directory);
	strcpy(smd_path + len, SMD_DIR_NAME);
	validate_directory(smd_path, "system metadata");
}
under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ +#include "base/batch.h" +#include "aerospike/as_buffer_pool.h" +#include "aerospike/as_thread_pool.h" +#include "citrusleaf/alloc.h" +#include "citrusleaf/cf_atomic.h" +#include "citrusleaf/cf_byte_order.h" +#include "citrusleaf/cf_clock.h" +#include "citrusleaf/cf_digest.h" +#include "citrusleaf/cf_queue.h" +#include "base/cfg.h" +#include "base/datamodel.h" +#include "base/index.h" +#include "base/proto.h" +#include "base/security.h" +#include "base/stats.h" +#include "base/thr_tsvc.h" +#include "base/transaction.h" +#include "hardware.h" +#include "socket.h" +#include +#include + +//--------------------------------------------------------- +// MACROS +//--------------------------------------------------------- + +#define BATCH_BLOCK_SIZE (1024 * 128) // 128K +#define BATCH_MAX_TRANSACTION_SIZE (1024 * 1024 * 10) // 10MB +#define BATCH_REPEAT_SIZE 25 // index(4),digest(20) and repeat(1) + +//--------------------------------------------------------- +// TYPES +//--------------------------------------------------------- + +// Pad batch input header to 30 bytes which is also the size of a transaction header. +// This allows the input memory to be used as transaction cl_msg memory. 
+// This saves a large number of memory allocations while allowing different +// namespaces/bin name filters to be in the same batch. +typedef struct { + uint32_t index; + cf_digest keyd; + uint8_t repeat; + uint8_t info1; + uint16_t n_fields; + uint16_t n_ops; +} __attribute__((__packed__)) as_batch_input; + +typedef struct { + uint32_t capacity; + uint32_t size; + uint32_t tran_count; + cf_atomic32 writers; + as_proto proto; + uint8_t data[]; +} __attribute__((__packed__)) as_batch_buffer; + +struct as_batch_shared_s { + pthread_mutex_t lock; + cf_queue* response_queue; + as_file_handle* fd_h; + cl_msg* msgp; + as_batch_buffer* buffer; + uint64_t start; + uint32_t tran_count_response; + uint32_t tran_count; + uint32_t tran_max; + int result_code; + bool bad_response_fd; +}; + +typedef struct { + as_batch_shared* shared; + as_batch_buffer* buffer; +} as_batch_response; + +typedef struct { + cf_queue* response_queue; + cf_queue* complete_queue; + cf_atomic32 count; + volatile bool active; +} as_batch_queue; + +typedef struct { + as_batch_queue* batch_queue; + bool complete; +} as_batch_work; + +//--------------------------------------------------------- +// STATIC DATA +//--------------------------------------------------------- + +static as_thread_pool batch_thread_pool; +static as_buffer_pool batch_buffer_pool; + +static as_batch_queue batch_queues[MAX_BATCH_THREADS]; +static pthread_mutex_t batch_resize_lock; + +//--------------------------------------------------------- +// STATIC FUNCTIONS +//--------------------------------------------------------- + +static int +as_batch_send(cf_socket *sock, uint8_t* buf, size_t len, int flags) +{ + if (cf_socket_send_all(sock, buf, len, flags, CF_SOCKET_TIMEOUT) < 0) { + // Common when a client aborts. 
+ cf_debug(AS_BATCH, "Batch send response error, errno %d fd %d", errno, CSFD(sock)); + return -1; + } + + return 0; +} + +static int +as_batch_send_error(as_transaction* btr, int result_code) +{ + cl_msg m; + m.proto.version = PROTO_VERSION; + m.proto.type = PROTO_TYPE_AS_MSG; + m.proto.sz = sizeof(as_msg); + as_proto_swap(&m.proto); + m.msg.header_sz = sizeof(as_msg); + m.msg.info1 = 0; + m.msg.info2 = 0; + m.msg.info3 = AS_MSG_INFO3_LAST; + m.msg.unused = 0; + m.msg.result_code = result_code; + m.msg.generation = 0; + m.msg.record_ttl = 0; + m.msg.transaction_ttl = 0; + m.msg.n_fields = 0; + m.msg.n_ops = 0; + as_msg_swap_header(&m.msg); + + int status = as_batch_send(&btr->from.proto_fd_h->sock, (uint8_t*)&m, sizeof(m), MSG_NOSIGNAL); + + as_end_of_transaction(btr->from.proto_fd_h, status != 0); + btr->from.proto_fd_h = NULL; + + cf_free(btr->msgp); + btr->msgp = 0; + + if (result_code == AS_PROTO_RESULT_FAIL_TIMEOUT) { + cf_atomic64_incr(&g_stats.batch_index_timeout); + } + else { + cf_atomic64_incr(&g_stats.batch_index_errors); + } + return status; +} + +static void +as_batch_send_buffer(as_batch_shared* shared, as_batch_buffer* buffer) +{ + // Don't send buffer if an error has already occurred. + if (shared->bad_response_fd || shared->result_code) { + return; + } + + // Send buffer block to client socket. + buffer->proto.version = PROTO_VERSION; + buffer->proto.type = PROTO_TYPE_AS_MSG; + buffer->proto.sz = buffer->size; + as_proto_swap(&buffer->proto); + + int status = as_batch_send(&shared->fd_h->sock, (uint8_t*)&buffer->proto, sizeof(as_proto) + buffer->size, MSG_NOSIGNAL | MSG_MORE); + + if (status) { + // Socket error. Release shared->fd_h after all sub-transactions are + // complete - shared->fd_h needed for security filter. + shared->bad_response_fd = true; + cf_atomic64_incr(&g_stats.batch_index_errors); + } +} + +static void +as_batch_send_final(as_batch_shared* shared) +{ + // Send protocol trailer to client socket. 
+ if (shared->bad_response_fd) { + as_end_of_transaction_force_close(shared->fd_h); + shared->fd_h = NULL; + return; + } + + cl_msg m; + m.proto.version = PROTO_VERSION; + m.proto.type = PROTO_TYPE_AS_MSG; + m.proto.sz = sizeof(as_msg); + as_proto_swap(&m.proto); + m.msg.header_sz = sizeof(as_msg); + m.msg.info1 = 0; + m.msg.info2 = 0; + m.msg.info3 = AS_MSG_INFO3_LAST; + m.msg.unused = 0; + m.msg.result_code = shared->result_code; + m.msg.generation = 0; + m.msg.record_ttl = 0; + m.msg.transaction_ttl = 0; + m.msg.n_fields = 0; + m.msg.n_ops = 0; + as_msg_swap_header(&m.msg); + + int status = as_batch_send(&shared->fd_h->sock, (uint8_t*) &m, sizeof(m), MSG_NOSIGNAL); + + as_end_of_transaction(shared->fd_h, status != 0); + shared->fd_h = NULL; + + // For now the model is timeouts don't appear in histograms. + if (shared->result_code != AS_PROTO_RESULT_FAIL_TIMEOUT) { + G_HIST_ACTIVATE_INSERT_DATA_POINT(batch_index_hist, shared->start); + } + + // Check final return code in order to update statistics. + if (status == 0 && shared->result_code == 0) { + cf_atomic64_incr(&g_stats.batch_index_complete); + } + else { + if (shared->result_code == AS_PROTO_RESULT_FAIL_TIMEOUT) { + cf_atomic64_incr(&g_stats.batch_index_timeout); + } + else { + cf_atomic64_incr(&g_stats.batch_index_errors); + } + } +} + +static inline void +as_batch_free(as_batch_shared* shared, as_batch_queue* batch_queue) +{ + // Destroy lock + pthread_mutex_destroy(&shared->lock); + + // Release memory + cf_free(shared->msgp); + cf_free(shared); + + // It's critical that this count is decremented after the transaction is + // completely finished with the queue because "shutdown threads" relies + // on this information when performing graceful shutdown. + cf_atomic32_decr(&batch_queue->count); +} + +static void +as_batch_worker(void* udata) +{ + // Send batch data to client, one buffer block at a time. 
+ as_batch_work* work = (as_batch_work*)udata; + as_batch_queue* batch_queue = work->batch_queue; + cf_queue* response_queue = batch_queue->response_queue; + as_batch_response response; + as_batch_shared* shared; + as_batch_buffer* buffer; + + while (cf_queue_pop(response_queue, &response, CF_QUEUE_FOREVER) == CF_QUEUE_OK) { + // Check if this thread task should end. + shared = response.shared; + if (! shared) { + break; + } + + buffer = response.buffer; + shared->tran_count_response += buffer->tran_count; + + if (buffer->capacity) { + // Send buffer block to client. + as_batch_send_buffer(shared, buffer); + + if (as_buffer_pool_push_limit(&batch_buffer_pool, buffer, buffer->capacity, g_config.batch_max_unused_buffers) != 0) { + cf_atomic64_incr(&g_stats.batch_index_destroyed_buffers); + } + } + else { + // Server error buffers should not be put into buffer pool. + cf_free(buffer); + cf_atomic64_incr(&g_stats.batch_index_destroyed_buffers); + } + + // Wait till all transactions have been received before sending + // final batch entry and releasing memory. + if (shared->tran_count_response == shared->tran_max) { + as_batch_send_final(shared); + as_batch_free(shared, batch_queue); + } + } + + // Send back completion notification. + uint32_t complete = 1; + cf_queue_push(work->batch_queue->complete_queue, &complete); +} + +static int +as_batch_create_thread_queues(uint32_t begin, uint32_t end) +{ + // Allocate one queue per batch response worker thread. 
+ int status = 0; + + as_batch_work work; + work.complete = false; + + for (uint32_t i = begin; i < end; i++) { + work.batch_queue = &batch_queues[i]; + work.batch_queue->response_queue = cf_queue_create(sizeof(as_batch_response), true); + work.batch_queue->complete_queue = cf_queue_create(sizeof(uint32_t), true); + work.batch_queue->count = 0; + work.batch_queue->active = true; + + int rc = as_thread_pool_queue_task_fixed(&batch_thread_pool, &work); + + if (rc) { + cf_warning(AS_BATCH, "Failed to create batch thread %u: %d", i, rc); + status = rc; + } + } + return status; +} + +static bool +as_batch_wait(uint32_t begin, uint32_t end) +{ + for (uint32_t i = begin; i < end; i++) { + if (batch_queues[i].count > 0) { + return false; + } + } + return true; +} + +static int +as_batch_shutdown_thread_queues(uint32_t begin, uint32_t end) +{ + // Set excess queues to inactive. + // Existing batch transactions will be allowed to complete. + for (uint32_t i = begin; i < end; i++) { + batch_queues[i].active = false; + } + + // Wait till there are no more active batch transactions on the queues. + // Timeout after 30 seconds. + uint64_t limitus = cf_getus() + (1000 * 1000 * 30); + usleep(50 * 1000); // Sleep 50ms + do { + if (as_batch_wait(begin, end)) { + break; + } + usleep(500 * 1000); // Sleep 500ms + + if (cf_getus() > limitus) { + cf_warning(AS_BATCH, "Batch shutdown threads failed on timeout. Transactions remain on queue."); + // Reactivate queues. + for (uint32_t i = begin; i < end; i++) { + batch_queues[i].active = true; + } + return -1; + } + } while (true); + + // Send stop command to excess queues. + as_batch_response response; + memset(&response, 0, sizeof(as_batch_response)); + + for (uint32_t i = begin; i < end; i++) { + cf_queue_push(batch_queues[i].response_queue, &response); + } + + // Wait for completion events. 
+ uint32_t complete; + for (uint32_t i = begin; i < end; i++) { + as_batch_queue* bq = &batch_queues[i]; + cf_queue_pop(bq->complete_queue, &complete, CF_QUEUE_FOREVER); + cf_queue_destroy(bq->complete_queue); + bq->complete_queue = 0; + cf_queue_destroy(bq->response_queue); + bq->response_queue = 0; + } + return 0; +} + +static as_batch_queue* +as_batch_find_queue(int queue_index) +{ + // Search backwards for an active queue. + for (int index = queue_index - 1; index >= 0; index--) { + as_batch_queue* bq = &batch_queues[index]; + + if (bq->active && cf_queue_sz(bq->response_queue) < g_config.batch_max_buffers_per_queue) { + return bq; + } + } + + // Search forwards. + for (int index = queue_index + 1; index < MAX_BATCH_THREADS; index++) { + as_batch_queue* bq = &batch_queues[index]; + + // If current queue is not active, future queues will not be active either. + if (! bq->active) { + break; + } + + if (cf_queue_sz(bq->response_queue) < g_config.batch_max_buffers_per_queue) { + return bq; + } + } + return 0; +} + +static as_batch_buffer* +as_batch_buffer_create(uint32_t size) +{ + as_batch_buffer* buffer = cf_malloc(size); + buffer->capacity = size - batch_buffer_pool.header_size; + cf_atomic64_incr(&g_stats.batch_index_created_buffers); + return buffer; +} + +static uint8_t* +as_batch_buffer_pop(as_batch_shared* shared, uint32_t size) +{ + as_batch_buffer* buffer; + uint32_t mem_size = size + batch_buffer_pool.header_size; + + if (mem_size > batch_buffer_pool.buffer_size) { + // Requested size is greater than fixed buffer size. + // Allocate new buffer, but don't put back into pool. + buffer = as_batch_buffer_create(mem_size); + cf_atomic64_incr(&g_stats.batch_index_huge_buffers); + } + else { + // Pop existing buffer from queue. + // The extra lock here is unavoidable. 
+ int status = cf_queue_pop(batch_buffer_pool.queue, &buffer, CF_QUEUE_NOWAIT); + + if (status == CF_QUEUE_OK) { + buffer->capacity = batch_buffer_pool.buffer_size - batch_buffer_pool.header_size; + } + else if (status == CF_QUEUE_EMPTY) { + // Queue is empty. Create new buffer. + buffer = as_batch_buffer_create(batch_buffer_pool.buffer_size); + } + else { + cf_warning(AS_BATCH, "Failed to pop new batch buffer: %d", status); + // Try to allocate small buffer with just header. + as_batch_buffer* buffer = cf_malloc(sizeof(as_batch_buffer)); + buffer->capacity = 0; + buffer->size = 0; + buffer->tran_count = 1; + buffer->writers = 2; + shared->buffer = buffer; + shared->result_code = AS_PROTO_RESULT_FAIL_UNKNOWN; + return 0; + } + } + + // Reserve a slot in new buffer. + buffer->size = size; + buffer->tran_count = 1; + buffer->writers = 2; + shared->buffer = buffer; + return buffer->data; +} + +static inline void +as_batch_buffer_complete(as_batch_shared* shared, as_batch_buffer* buffer) +{ + // Flush when all writers have finished writing into the buffer. + if (cf_atomic32_decr(&buffer->writers) == 0) { + as_batch_response response = {.shared = shared, .buffer = buffer}; + cf_queue_push(shared->response_queue, &response); + } +} + +static uint8_t* +as_batch_reserve(as_batch_shared* shared, uint32_t size, int result_code, as_batch_buffer** buffer_out, bool* complete) +{ + as_batch_buffer* buffer; + uint8_t* data; + + pthread_mutex_lock(&shared->lock); + *complete = (++shared->tran_count == shared->tran_max); + buffer = shared->buffer; + + if (! buffer) { + // No previous buffer. Get new buffer. + data = as_batch_buffer_pop(shared, size); + *buffer_out = shared->buffer; + pthread_mutex_unlock(&shared->lock); + } + else if (buffer->size + size <= buffer->capacity) { + // Result fits into existing block. Reserve a slot. 
+ data = buffer->data + buffer->size; + buffer->size += size; + buffer->tran_count++; + cf_atomic32_incr(&buffer->writers); + *buffer_out = buffer; + pthread_mutex_unlock(&shared->lock); + } + else { + // Result does not fit into existing block. + // Make copy of existing buffer. + as_batch_buffer* prev_buffer = buffer; + + // Get new buffer. + data = as_batch_buffer_pop(shared, size); + *buffer_out = shared->buffer; + pthread_mutex_unlock(&shared->lock); + + as_batch_buffer_complete(shared, prev_buffer); + } + + if (! (result_code == AS_PROTO_RESULT_OK || result_code == AS_PROTO_RESULT_FAIL_NOT_FOUND)) { + // Result code can be set outside of lock because it doesn't matter which transaction's + // result code is used as long as it's an error. + shared->result_code = result_code; + } + return data; +} + +static inline void +as_batch_transaction_end(as_batch_shared* shared, as_batch_buffer* buffer, bool complete) +{ + // This flush can only be triggered when the buffer is full. + as_batch_buffer_complete(shared, buffer); + + if (complete) { + // This flush only occurs when all transactions in batch have been processed. + as_batch_buffer_complete(shared, buffer); + } +} + +static void +as_batch_terminate(as_batch_shared* shared, uint32_t tran_count, int result_code) +{ + // Terminate batch by adding phantom transactions to shared and buffer tran counts. + // This is done so the memory is released at the end only once. + as_batch_buffer* buffer; + bool complete; + + pthread_mutex_lock(&shared->lock); + buffer = shared->buffer; + shared->result_code = result_code; + shared->tran_count += tran_count; + complete = (shared->tran_count == shared->tran_max); + + if (! buffer) { + // No previous buffer. Get new buffer. + as_batch_buffer_pop(shared, 0); + buffer = shared->buffer; + buffer->tran_count = tran_count; // Override tran_count. + } + else { + // Buffer exists. Add phantom transactions. 
+ buffer->tran_count += tran_count; + cf_atomic32_incr(&buffer->writers); + } + pthread_mutex_unlock(&shared->lock); + as_batch_transaction_end(shared, buffer, complete); +} + +//--------------------------------------------------------- +// FUNCTIONS +//--------------------------------------------------------- + +int +as_batch_init() +{ + if (pthread_mutex_init(&batch_resize_lock, NULL)) { + cf_warning(AS_BATCH, "Failed to initialize batch resize lock"); + return -1; + } + + // Default 'batch-index-threads' can't be set before call to cf_topo_init(). + if (g_config.n_batch_index_threads == 0) { + g_config.n_batch_index_threads = cf_topo_count_cpus(); + } + + cf_info(AS_BATCH, "starting %u batch-index-threads", g_config.n_batch_index_threads); + + int rc = as_thread_pool_init_fixed(&batch_thread_pool, g_config.n_batch_index_threads, as_batch_worker, + sizeof(as_batch_work), offsetof(as_batch_work,complete)); + + if (rc) { + cf_warning(AS_BATCH, "Failed to initialize batch-index-threads to %u: %d", g_config.n_batch_index_threads, rc); + return rc; + } + + rc = as_buffer_pool_init(&batch_buffer_pool, sizeof(as_batch_buffer), BATCH_BLOCK_SIZE); + + if (rc) { + cf_warning(AS_BATCH, "Failed to initialize batch buffer pool: %d", rc); + return rc; + } + + rc = as_batch_create_thread_queues(0, g_config.n_batch_index_threads); + + if (rc) { + return rc; + } + + return 0; +} + +int +as_batch_queue_task(as_transaction* btr) +{ + uint64_t counter = cf_atomic64_incr(&g_stats.batch_index_initiate); + uint32_t thread_size = batch_thread_pool.thread_size; + + if (thread_size == 0 || thread_size > MAX_BATCH_THREADS) { + cf_warning(AS_BATCH, "batch-index-threads has been disabled: %d", thread_size); + return as_batch_send_error(btr, AS_PROTO_RESULT_FAIL_BATCH_DISABLED); + } + uint32_t queue_index = counter % thread_size; + + // Validate batch transaction + as_proto* bproto = &btr->msgp->proto; + + if (bproto->sz > PROTO_SIZE_MAX) { + cf_warning(AS_BATCH, "can't process message: 
invalid size %lu should be %d or less", + (uint64_t)bproto->sz, PROTO_SIZE_MAX); + return as_batch_send_error(btr, AS_PROTO_RESULT_FAIL_PARAMETER); + } + + if (bproto->type != PROTO_TYPE_AS_MSG) { + cf_warning(AS_BATCH, "Invalid proto type. Expected %d Received %d", PROTO_TYPE_AS_MSG, bproto->type); + return as_batch_send_error(btr, AS_PROTO_RESULT_FAIL_PARAMETER); + } + + // Check that the socket is authenticated. + uint8_t result = as_security_check(btr->from.proto_fd_h, PERM_NONE); + + if (result != AS_PROTO_RESULT_OK) { + as_security_log(btr->from.proto_fd_h, result, PERM_NONE, NULL, NULL); + return as_batch_send_error(btr, result); + } + + // Parse header + as_msg* bmsg = &btr->msgp->msg; + as_msg_swap_header(bmsg); + + // Parse fields + uint8_t* limit = (uint8_t*)bmsg + bproto->sz; + as_msg_field* mf = (as_msg_field*)bmsg->data; + as_msg_field* end; + as_msg_field* bf = 0; + + for (int i = 0; i < bmsg->n_fields; i++) { + if ((uint8_t*)mf >= limit) { + cf_warning(AS_BATCH, "Batch field limit reached"); + return as_batch_send_error(btr, AS_PROTO_RESULT_FAIL_PARAMETER); + } + as_msg_swap_field(mf); + end = as_msg_field_get_next(mf); + + if (mf->type == AS_MSG_FIELD_TYPE_BATCH || mf->type == AS_MSG_FIELD_TYPE_BATCH_WITH_SET) { + bf = mf; + } + mf = end; + } + + if (! 
bf) { + cf_warning(AS_BATCH, "Batch index field not found"); + return as_batch_send_error(btr, AS_PROTO_RESULT_FAIL_PARAMETER); + } + + // Parse batch field + uint8_t* data = bf->data; + uint32_t tran_count = cf_swap_from_be32(*(uint32_t*)data); + data += sizeof(uint32_t); + + if (tran_count == 0) { + cf_warning(AS_BATCH, "Batch request size is zero"); + return as_batch_send_error(btr, AS_PROTO_RESULT_FAIL_PARAMETER); + } + + if (tran_count > g_config.batch_max_requests) { + cf_warning(AS_BATCH, "Batch request size %u exceeds max %u", tran_count, g_config.batch_max_requests); + return as_batch_send_error(btr, AS_PROTO_RESULT_FAIL_BATCH_MAX_REQUESTS); + } + + // Initialize shared data + as_batch_shared* shared = cf_malloc(sizeof(as_batch_shared)); + + memset(shared, 0, sizeof(as_batch_shared)); + + if (pthread_mutex_init(&shared->lock, NULL)) { + cf_warning(AS_BATCH, "Failed to initialize batch lock"); + cf_free(shared); + return as_batch_send_error(btr, AS_PROTO_RESULT_FAIL_UNKNOWN); + } + + shared->start = btr->start_time; + shared->fd_h = btr->from.proto_fd_h; + shared->msgp = btr->msgp; + shared->tran_max = tran_count; + + // Find batch queue to send transaction responses. + as_batch_queue* batch_queue = &batch_queues[queue_index]; + + // batch_max_buffers_per_queue is a soft limit, but still must be checked under lock. + if (! (batch_queue->active && cf_queue_sz(batch_queue->response_queue) < g_config.batch_max_buffers_per_queue)) { + // Queue buffer limit has been exceeded or thread has been shutdown (probably due to + // downwards thread resize). Search for an available queue. + // cf_warning(AS_BATCH, "Queue %u full %d", queue_index, cf_queue_sz(batch_queue->response_queue)); + batch_queue = as_batch_find_queue(queue_index); + + if (! 
batch_queue) { + cf_warning(AS_BATCH, "Failed to find active batch queue that is not full"); + cf_free(shared); + return as_batch_send_error(btr, AS_PROTO_RESULT_FAIL_BATCH_QUEUES_FULL); + } + } + // Increment batch queue transaction count. + cf_atomic32_incr(&batch_queue->count); + shared->response_queue = batch_queue->response_queue; + + // Initialize generic transaction. + as_transaction tr; + as_transaction_init_head(&tr, 0, 0); + + tr.origin = FROM_BATCH; + tr.from_flags |= FROM_FLAG_BATCH_SUB; + tr.start_time = btr->start_time; + + // Read batch keys and initialize generic transactions. + as_batch_input* in; + cl_msg* out = NULL; + cl_msg* prev_msgp = NULL; + as_msg_op* op; + uint32_t tran_row = 0; + uint8_t info = *data++; // allow transaction inline. + + bool allow_inline = (g_config.n_namespaces_inlined != 0 && info); + bool check_inline = (allow_inline && g_config.n_namespaces_not_inlined != 0); + bool should_inline = (allow_inline && g_config.n_namespaces_not_inlined == 0); + + // Split batch rows into separate single record read transactions. + // The read transactions are located in the same memory block as + // the original batch transactions. This allows us to avoid performing + // an extra malloc for each transaction. + while (tran_row < tran_count && data + BATCH_REPEAT_SIZE <= limit) { + // Copy transaction data before memory gets overwritten. + in = (as_batch_input*)data; + + tr.from.batch_shared = shared; // is set NULL after sub-transaction + tr.from_data.batch_index = cf_swap_from_be32(in->index); + tr.keyd = in->keyd; + tr.benchmark_time = btr->benchmark_time; // must reset for each usage + + if (in->repeat) { + if (! prev_msgp) { + break; // bad bytes from client - repeat set on first item + } + + // Row should use previous namespace and bin names. 
+ data += BATCH_REPEAT_SIZE; + tr.msgp = prev_msgp; + } + else { + tr.msg_fields = 0; // erase previous AS_MSG_FIELD_BIT_SET flag, if any + as_transaction_set_msg_field_flag(&tr, AS_MSG_FIELD_TYPE_NAMESPACE); + + // Row contains full namespace/bin names. + out = (cl_msg*)data; + + if (data + sizeof(cl_msg) + sizeof(as_msg_field) > limit) { + break; + } + + out->msg.header_sz = sizeof(as_msg); + out->msg.info1 = in->info1; + out->msg.info2 = 0; + out->msg.info3 = bmsg->info3 & AS_MSG_INFO3_LINEARIZE_READ; + out->msg.unused = 0; + out->msg.result_code = 0; + out->msg.generation = 0; + out->msg.record_ttl = 0; + out->msg.transaction_ttl = bmsg->transaction_ttl; // already swapped + // n_fields/n_ops is in exact same place on both input/output, but the value still + // needs to be swapped. + out->msg.n_fields = cf_swap_from_be16(in->n_fields); + + // Older clients sent zero, but always sent namespace. Adjust this. + if (out->msg.n_fields == 0) { + out->msg.n_fields = 1; + } + + out->msg.n_ops = cf_swap_from_be16(in->n_ops); + + // Namespace input is same as namespace field, so just leave in place and swap. + data += sizeof(cl_msg); + mf = (as_msg_field*)data; + as_msg_swap_field(mf); + if (check_inline) { + as_namespace* ns = as_namespace_get_bymsgfield(mf); + should_inline = ns && ns->storage_data_in_memory; + } + mf = as_msg_field_get_next(mf); + data = (uint8_t*)mf; + + // Swap remaining fields. + for (uint16_t j = 1; j < out->msg.n_fields; j++) { + if (data + sizeof(as_msg_field) > limit) { + goto TranEnd; + } + + if (mf->type == AS_MSG_FIELD_TYPE_SET) { + as_transaction_set_msg_field_flag(&tr, AS_MSG_FIELD_TYPE_SET); + } + + as_msg_swap_field(mf); + mf = as_msg_field_get_next(mf); + data = (uint8_t*)mf; + } + + if (out->msg.n_ops) { + // Bin names input is same as transaction ops, so just leave in place and swap. 
+ uint16_t n_ops = out->msg.n_ops; + for (uint16_t j = 0; j < n_ops; j++) { + if (data + sizeof(as_msg_op) > limit) { + goto TranEnd; + } + op = (as_msg_op*)data; + as_msg_swap_op(op); + op = as_msg_op_get_next(op); + data = (uint8_t*)op; + } + } + + // Initialize msg header. + out->proto.version = PROTO_VERSION; + out->proto.type = PROTO_TYPE_AS_MSG; + out->proto.sz = (data - (uint8_t*)&out->msg); + tr.msgp = out; + prev_msgp = out; + } + + if (data > limit) { + break; + } + + // Submit transaction. + if (should_inline) { + as_tsvc_process_transaction(&tr); + } + else { + // Queue transaction to be processed by a transaction thread. + as_tsvc_enqueue(&tr); + } + tran_row++; + } + +TranEnd: + if (tran_row < tran_count) { + // Mismatch between tran_count and actual data. Terminate transaction. + cf_warning(AS_BATCH, "Batch keys mismatch. Expected %u Received %u", tran_count, tran_row); + as_batch_terminate(shared, tran_count - tran_row, AS_PROTO_RESULT_FAIL_PARAMETER); + } + + // Reset original socket because socket now owned by batch shared. + btr->from.proto_fd_h = NULL; + return 0; +} + +void +as_batch_add_result(as_transaction* tr, uint16_t n_bins, as_bin** bins, + as_msg_op** ops) +{ + as_namespace* ns = tr->rsv.ns; + + // Calculate size. + size_t size = sizeof(as_msg); + size += sizeof(as_msg_field) + sizeof(cf_digest); + + uint16_t n_fields = 1; + + for (uint16_t i = 0; i < n_bins; i++) { + as_bin* bin = bins[i]; + size += sizeof(as_msg_op); + + if (ops) { + size += ops[i]->name_sz; + } + else if (bin) { + size += ns->single_bin ? 
0 : strlen(as_bin_get_name_from_id(ns, bin->id)); + } + else { + cf_crash(AS_BATCH, "making response message with null bin and op"); + } + + if (bin) { + size += as_bin_particle_client_value_size(bin); + } + } + + as_batch_shared* shared = tr->from.batch_shared; + + if (size > BATCH_MAX_TRANSACTION_SIZE) { + cf_warning(AS_BATCH, "Record size %zu exceeds max %d", size, BATCH_MAX_TRANSACTION_SIZE); + as_batch_add_error(shared, tr->from_data.batch_index, AS_PROTO_RESULT_FAIL_RECORD_TOO_BIG); + return; + } + + as_batch_buffer* buffer; + bool complete; + uint8_t* data = as_batch_reserve(shared, size, tr->result_code, &buffer, &complete); + + if (data) { + // Write header. + uint8_t* p = data; + as_msg* m = (as_msg*)p; + m->header_sz = sizeof(as_msg); + m->info1 = 0; + m->info2 = 0; + m->info3 = 0; + m->unused = 0; + m->result_code = tr->result_code; + m->generation = plain_generation(tr->generation, ns); + m->record_ttl = tr->void_time; + + // Overload transaction_ttl to store batch index. + m->transaction_ttl = tr->from_data.batch_index; + + m->n_fields = n_fields; + m->n_ops = n_bins; + as_msg_swap_header(m); + p += sizeof(as_msg); + + as_msg_field* field = (as_msg_field*)p; + field->field_sz = sizeof(cf_digest) + 1; + field->type = AS_MSG_FIELD_TYPE_DIGEST_RIPE; + memcpy(field->data, &tr->keyd, sizeof(cf_digest)); + as_msg_swap_field(field); + p += sizeof(as_msg_field) + sizeof(cf_digest); + + for (uint16_t i = 0; i < n_bins; i++) { + as_bin* bin = bins[i]; + as_msg_op* op = (as_msg_op*)p; + op->op = AS_MSG_OP_READ; + op->version = 0; + + if (ops) { + as_msg_op* src = ops[i]; + memcpy(op->name, src->name, src->name_sz); + op->name_sz = src->name_sz; + } + else { + op->name_sz = as_bin_memcpy_name(ns, op->name, bin); + } + + op->op_sz = 4 + op->name_sz; + p += sizeof(as_msg_op) + op->name_sz; + p += as_bin_particle_to_client(bin, op); + as_msg_swap_op(op); + } + } + as_batch_transaction_end(shared, buffer, complete); +} + +void 
+as_batch_add_proxy_result(as_batch_shared* shared, uint32_t index, cf_digest* digest, cl_msg* cmsg, size_t proxy_size) +{ + as_msg* msg = &cmsg->msg; + size_t size = proxy_size + sizeof(as_msg_field) + sizeof(cf_digest) - sizeof(as_proto); + + if (size > BATCH_MAX_TRANSACTION_SIZE) { + cf_warning(AS_BATCH, "Record size %zu exceeds max %d", size, BATCH_MAX_TRANSACTION_SIZE); + as_batch_add_error(shared, index, AS_PROTO_RESULT_FAIL_RECORD_TOO_BIG); + return; + } + + as_batch_buffer* buffer; + bool complete; + uint8_t* data = as_batch_reserve(shared, size, msg->result_code, &buffer, &complete); + + if (data) { + // Overload transaction_ttl to store batch index. + msg->transaction_ttl = htonl(index); + + // Write header + uint16_t n_fields = ntohs(msg->n_fields); + msg->n_fields = htons(n_fields + 1); + memcpy(data, msg, sizeof(as_msg)); + uint8_t* trg = data + sizeof(as_msg); + + // Write digest field + as_msg_field* field = (as_msg_field*)trg; + field->field_sz = sizeof(cf_digest) + 1; + field->type = AS_MSG_FIELD_TYPE_DIGEST_RIPE; + memcpy(field->data, digest, sizeof(cf_digest)); + as_msg_swap_field(field); + trg += sizeof(as_msg_field) + sizeof(cf_digest); + + // Copy others fields and ops. + size = ((uint8_t*)cmsg + proxy_size) - msg->data; + memcpy(trg, msg->data, size); + } + as_batch_transaction_end(shared, buffer, complete); +} + +void +as_batch_add_error(as_batch_shared* shared, uint32_t index, int result_code) +{ + as_batch_buffer* buffer; + bool complete; + uint8_t* data = as_batch_reserve(shared, sizeof(as_msg), result_code, &buffer, &complete); + + if (data) { + // Write error. + as_msg* m = (as_msg*)data; + m->header_sz = sizeof(as_msg); + m->info1 = 0; + m->info2 = 0; + m->info3 = 0; + m->unused = 0; + m->result_code = result_code; + m->generation = 0; + m->record_ttl = 0; + // Overload transaction_ttl to store batch index. 
+ m->transaction_ttl = index; + m->n_fields = 0; + m->n_ops = 0; + as_msg_swap_header(m); + } + as_batch_transaction_end(shared, buffer, complete); +} + +int +as_batch_threads_resize(uint32_t threads) +{ + if (threads > MAX_BATCH_THREADS) { + cf_warning(AS_BATCH, "batch-index-threads %u exceeds max %u", threads, MAX_BATCH_THREADS); + return -1; + } + + if (pthread_mutex_lock(&batch_resize_lock)) { + cf_warning(AS_BATCH, "Batch resize lock failed"); + return -2; + } + + // Resize thread pool. The threads will wait for graceful shutdown on downwards resize. + uint32_t threads_orig = batch_thread_pool.thread_size; + cf_info(AS_BATCH, "Resize batch-index-threads from %u to %u", threads_orig, threads); + int status = 0; + + if (threads != threads_orig) { + if (threads > threads_orig) { + // Increase threads before initializing queues. + status = as_thread_pool_resize(&batch_thread_pool, threads); + + if (status == 0) { + g_config.n_batch_index_threads = threads; + // Adjust queues to match new thread size. + status = as_batch_create_thread_queues(threads_orig, threads); + } + else { + // Show warning, but keep going as some threads may have been successfully added/removed. + cf_warning(AS_BATCH, "Failed to resize batch-index-threads. status=%d, batch-index-threads=%u", + status, g_config.n_batch_index_threads); + threads = batch_thread_pool.thread_size; + + if (threads > threads_orig) { + g_config.n_batch_index_threads = threads; + // Adjust queues to match new thread size. + status = as_batch_create_thread_queues(threads_orig, threads); + } + } + } + else { + // Shutdown queues before shutting down threads. + status = as_batch_shutdown_thread_queues(threads, threads_orig); + + if (status == 0) { + // Adjust threads to match new queue size. + status = as_thread_pool_resize(&batch_thread_pool, threads); + g_config.n_batch_index_threads = batch_thread_pool.thread_size; + + if (status) { + cf_warning(AS_BATCH, "Failed to resize batch-index-threads. 
status=%d, batch-index-threads=%u", + status, g_config.n_batch_index_threads); + } + } + } + } + pthread_mutex_unlock(&batch_resize_lock); + return status; +} + +void +as_batch_queues_info(cf_dyn_buf* db) +{ + if (pthread_mutex_lock(&batch_resize_lock)) { + cf_warning(AS_BATCH, "Batch info resize lock failed"); + return; + } + + uint32_t max = batch_thread_pool.thread_size; + + for (uint32_t i = 0; i < max; i++) { + if (i > 0) { + cf_dyn_buf_append_char(db, ','); + } + as_batch_queue* bq = &batch_queues[i]; + cf_dyn_buf_append_uint32(db, bq->count); // Batch count + cf_dyn_buf_append_char(db, ':'); + cf_dyn_buf_append_int(db, cf_queue_sz(bq->response_queue)); // Buffer count + } + pthread_mutex_unlock(&batch_resize_lock); +} + +int +as_batch_unused_buffers() +{ + return cf_queue_sz(batch_buffer_pool.queue); +} + +// Not currently called. Put in this place holder in case server decides to +// implement clean shutdowns in the future. +void +as_batch_destroy() +{ + as_thread_pool_destroy(&batch_thread_pool); + as_buffer_pool_destroy(&batch_buffer_pool); + + pthread_mutex_lock(&batch_resize_lock); + as_batch_shutdown_thread_queues(0, batch_thread_pool.thread_size); + pthread_mutex_unlock(&batch_resize_lock); + pthread_mutex_destroy(&batch_resize_lock); +} + +as_file_handle* +as_batch_get_fd_h(as_batch_shared* shared) +{ + return shared->fd_h; +} diff --git a/as/src/base/bin.c b/as/src/base/bin.c new file mode 100644 index 00000000..a04c9d96 --- /dev/null +++ b/as/src/base/bin.c @@ -0,0 +1,685 @@ +/* + * bin.c + * + * Copyright (C) 2008-2014 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. 
+ * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +//========================================================== +// Includes. +// + +#include +#include +#include +#include + +#include "citrusleaf/alloc.h" + +#include "fault.h" +#include "vmapx.h" + +#include "base/datamodel.h" +#include "base/index.h" +#include "base/proto.h" +#include "storage/storage.h" + + +//========================================================== +// Inlines & macros. +// + +// Never called if single-bin. +static inline bool +as_bin_get_id_w_len(as_namespace *ns, const uint8_t *name, size_t len, + uint32_t *p_id) +{ + return cf_vmapx_get_index_w_len(ns->p_bin_name_vmap, (const char *)name, + len, p_id) == CF_VMAPX_OK; +} + +static inline void +as_bin_init_nameless(as_bin *b) +{ + as_bin_state_set(b, AS_BIN_STATE_UNUSED); + b->particle = NULL; +} + +static inline as_bin_space * +safe_bin_space(const as_record *r) +{ + return r->dim ? as_index_get_bin_space(r) : NULL; +} + +static inline uint16_t +safe_n_bins(const as_record *r) +{ + as_bin_space* bin_space = safe_bin_space(r); + + return bin_space ? bin_space->n_bins : 0; +} + +static inline as_bin * +safe_bins(const as_record *r) +{ + as_bin_space* bin_space = safe_bin_space(r); + + return bin_space ? bin_space->bins : NULL; +} + +static inline void +as_bin_init_w_len(as_namespace *ns, as_bin *b, const uint8_t *name, size_t len) +{ + as_bin_init_nameless(b); + as_bin_set_id_from_name_buf(ns, b, name, len); + // Don't touch b->unused - like b->id, it's past the end of its enclosing + // as_index if single-bin, data-in-memory. 
+} + + +//========================================================== +// Public API. +// + +// Caller-beware, name cannot be null, must be null-terminated. +int16_t +as_bin_get_id(as_namespace *ns, const char *name) +{ + cf_assert(! ns->single_bin, AS_BIN, "unexpected single-bin call"); + + uint32_t idx; + + if (cf_vmapx_get_index(ns->p_bin_name_vmap, name, &idx) == CF_VMAPX_OK) { + return (uint16_t)idx; + } + + return -1; +} + + +uint16_t +as_bin_get_or_assign_id(as_namespace *ns, const char *name) +{ + cf_assert(! ns->single_bin, AS_BIN, "unexpected single-bin call"); + + uint32_t idx; + + if (cf_vmapx_get_index(ns->p_bin_name_vmap, name, &idx) == CF_VMAPX_OK) { + return (uint16_t)idx; + } + + cf_vmapx_err result = cf_vmapx_put_unique(ns->p_bin_name_vmap, name, &idx); + + if (! (result == CF_VMAPX_OK || result == CF_VMAPX_ERR_NAME_EXISTS)) { + // Tedious to handle safely for all usage paths, so for now... + cf_crash(AS_BIN, "couldn't add bin name %s, vmap err %d", name, result); + } + + return (uint16_t)idx; +} + + +uint16_t +as_bin_get_or_assign_id_w_len(as_namespace *ns, const char *name, size_t len) +{ + cf_assert(! ns->single_bin, AS_BIN, "unexpected single-bin call"); + + uint32_t idx; + + if (cf_vmapx_get_index_w_len(ns->p_bin_name_vmap, name, len, &idx) == + CF_VMAPX_OK) { + return (uint16_t)idx; + } + + cf_vmapx_err result = cf_vmapx_put_unique_w_len(ns->p_bin_name_vmap, name, + len, &idx); + + if (! (result == CF_VMAPX_OK || result == CF_VMAPX_ERR_NAME_EXISTS)) { + // Tedious to handle safely for all usage paths, so for now... + cf_crash(AS_BIN, "couldn't add bin name %s, vmap err %d", name, result); + } + + return (uint16_t)idx; +} + + +const char * +as_bin_get_name_from_id(as_namespace *ns, uint16_t id) +{ + cf_assert(! ns->single_bin, AS_BIN, "unexpected single-bin call"); + + const char* name = NULL; + + if (cf_vmapx_get_by_index(ns->p_bin_name_vmap, id, (void**)&name) != + CF_VMAPX_OK) { + // Should be impossible since id originates from vmap. 
+ cf_crash(AS_BIN, "no bin name for id %u", id); + } + + return name; +} + + +bool +as_bin_name_within_quota(as_namespace *ns, const char *name) +{ + // Won't exceed quota if single-bin or currently below quota. + if (ns->single_bin || + cf_vmapx_count(ns->p_bin_name_vmap) < BIN_NAMES_QUOTA) { + return true; + } + + // Won't exceed quota if name is found (and so would NOT be added to vmap). + if (cf_vmapx_get_index(ns->p_bin_name_vmap, name, NULL) == CF_VMAPX_OK) { + return true; + } + + cf_warning(AS_BIN, "{%s} bin-name quota full - can't add new bin-name %s", + ns->name, name); + + return false; +} + + +void +as_bin_init(as_namespace *ns, as_bin *b, const char *name) +{ + as_bin_init_nameless(b); + as_bin_set_id_from_name(ns, b, name); + // Don't touch b->unused - like b->id, it's past the end of its enclosing + // as_index if single-bin, data-in-memory. +} + + +void +as_bin_copy(as_namespace *ns, as_bin *to, const as_bin *from) +{ + if (ns->single_bin) { + as_single_bin_copy(to, from); + } + else { + *to = *from; + } +} + + +// - Seems like an as_storage_record method, but leaving it here for now. +// - sets rd->n_bins! +int +as_storage_rd_load_n_bins(as_storage_rd *rd) +{ + if (rd->ns->single_bin) { + rd->n_bins = 1; + return 0; + } + + if (rd->ns->storage_data_in_memory) { + rd->n_bins = safe_n_bins(rd->r); + return 0; + } + + rd->n_bins = 0; + + if (rd->record_on_device && ! rd->ignore_record_on_device) { + return as_storage_record_load_n_bins(rd); // sets rd->n_bins + } + + return 0; +} + + +// - Seems like an as_storage_record method, but leaving it here for now. +// - sets rd->bins! +int +as_storage_rd_load_bins(as_storage_rd *rd, as_bin *stack_bins) +{ + if (rd->ns->storage_data_in_memory) { + rd->bins = rd->ns->single_bin ? as_index_get_single_bin(rd->r) : + safe_bins(rd->r); + return 0; + } + + // Data NOT in-memory. + + rd->bins = stack_bins; + as_bin_set_all_empty(rd); + + if (rd->record_on_device && ! 
rd->ignore_record_on_device) { + return as_storage_record_load_bins(rd); + } + + return 0; +} + + +uint16_t +as_bin_inuse_count(as_storage_rd *rd) +{ + for (uint16_t i = 0; i < rd->n_bins; i++) { + if (! as_bin_inuse(&rd->bins[i])) { + return i; + } + } + + return rd->n_bins; +} + + +void +as_bin_get_all_p(as_storage_rd *rd, as_bin **bin_ptrs) +{ + for (uint16_t i = 0; i < rd->n_bins; i++) { + bin_ptrs[i] = &rd->bins[i]; + } +} + + +as_bin * +as_bin_get_by_id(as_storage_rd *rd, uint32_t id) +{ + for (uint16_t i = 0; i < rd->n_bins; i++) { + as_bin *b = &rd->bins[i]; + + if (! as_bin_inuse(b)) { + break; + } + + if ((uint32_t)b->id == id) { + return b; + } + } + + return NULL; +} + + +as_bin * +as_bin_get(as_storage_rd *rd, const char *name) +{ + if (rd->ns->single_bin) { + return as_bin_inuse_has(rd) ? rd->bins : NULL; + } + + uint32_t id; + + if (cf_vmapx_get_index(rd->ns->p_bin_name_vmap, name, &id) != CF_VMAPX_OK) { + return NULL; + } + + return as_bin_get_by_id(rd, id); +} + + +as_bin * +as_bin_get_from_buf(as_storage_rd *rd, const uint8_t *name, size_t len) +{ + if (rd->ns->single_bin) { + return as_bin_inuse_has(rd) ? rd->bins : NULL; + } + + uint32_t id; + + if (! as_bin_get_id_w_len(rd->ns, name, len, &id)) { + return NULL; + } + + for (uint16_t i = 0; i < rd->n_bins; i++) { + as_bin *b = &rd->bins[i]; + + if (! as_bin_inuse(b)) { + break; + } + + if ((uint32_t)b->id == id) { + return b; + } + } + + return NULL; +} + + +// Does not check bin name length or quota. +as_bin * +as_bin_create(as_storage_rd *rd, const char *name) +{ + if (rd->ns->single_bin) { + if (as_bin_inuse(rd->bins)) { + cf_crash(AS_BIN, "single bin create found bin in use"); + } + + as_bin_init_nameless(rd->bins); + + return rd->bins; + } + + as_bin *b = NULL; + + for (uint16_t i = 0; i < rd->n_bins; i++) { + if (! 
as_bin_inuse(&rd->bins[i])) { + b = &rd->bins[i]; + break; + } + } + + if (b) { + as_bin_init(rd->ns, b, name); + } + + return b; +} + + +as_bin * +as_bin_create_from_buf(as_storage_rd *rd, const uint8_t *name, size_t len, + int *result) +{ + as_namespace *ns = rd->ns; + + if (ns->single_bin) { + if (as_bin_inuse(rd->bins)) { + cf_crash(AS_BIN, "single bin create found bin in use"); + } + + as_bin_init_nameless(rd->bins); + + return rd->bins; + } + + if (len >= AS_ID_BIN_SZ) { + cf_warning(AS_BIN, "bin name too long (%lu)", len); + *result = AS_PROTO_RESULT_FAIL_BIN_NAME; + return NULL; + } + + uint32_t id = (uint32_t)-1; + + if (cf_vmapx_get_index_w_len(ns->p_bin_name_vmap, (const char *)name, len, + &id) != CF_VMAPX_OK && + cf_vmapx_count(ns->p_bin_name_vmap) >= BIN_NAMES_QUOTA) { + CF_ZSTR_DEFINE(zname, AS_ID_BIN_SZ, name, len); + + cf_warning(AS_BIN, "{%s} bin-name quota full - can't add new bin-name %s", + ns->name, zname); + + *result = AS_PROTO_RESULT_FAIL_BIN_NAME; + return NULL; + } + + as_bin *b = NULL; + + for (uint16_t i = 0; i < rd->n_bins; i++) { + if (! as_bin_inuse(&rd->bins[i])) { + b = &rd->bins[i]; + break; + } + } + + cf_assert(b, AS_BIN, "ran out of allocated bins in rd"); + + if (id == (uint32_t)-1) { + as_bin_init_w_len(ns, b, name, len); + } + else { + as_bin_init_nameless(b); + b->id = (uint16_t)id; + } + + return b; +} + + +// Does not check bin name length. +// Checks bin name quota - use appropriately. +as_bin * +as_bin_get_or_create(as_storage_rd *rd, const char *name) +{ + as_namespace *ns = rd->ns; + + if (ns->single_bin) { + if (! as_bin_inuse_has(rd)) { + as_bin_init_nameless(rd->bins); + } + + return rd->bins; + } + + uint32_t id = (uint32_t)-1; + uint16_t i; + as_bin *b; + + if (cf_vmapx_get_index(ns->p_bin_name_vmap, name, &id) == CF_VMAPX_OK) { + for (i = 0; i < rd->n_bins; i++) { + b = &rd->bins[i]; + + if (! 
as_bin_inuse(b)) { + break; + } + + if ((uint32_t)b->id == id) { + return b; + } + } + } + else { + if (cf_vmapx_count(ns->p_bin_name_vmap) >= BIN_NAMES_QUOTA) { + cf_warning(AS_BIN, "{%s} bin-name quota full - can't add new bin-name %s", + ns->name, name); + return NULL; + } + + i = as_bin_inuse_count(rd); + } + + cf_assert(i < rd->n_bins, AS_BIN, "ran out of allocated bins in rd"); + + b = &rd->bins[i]; + + if (id == (uint32_t)-1) { + as_bin_init(ns, b, name); + } + else { + as_bin_init_nameless(b); + b->id = (uint16_t)id; + } + + return b; +} + + +// Does not check bin name length. +// Checks bin name quota - use appropriately. +as_bin * +as_bin_get_or_create_from_buf(as_storage_rd *rd, const uint8_t *name, + size_t len, int *result) +{ + as_namespace *ns = rd->ns; + + if (ns->single_bin) { + if (! as_bin_inuse_has(rd)) { + as_bin_init_nameless(rd->bins); + } + + return rd->bins; + } + + uint32_t id = (uint32_t)-1; + uint16_t i; + as_bin *b; + + if (cf_vmapx_get_index_w_len(ns->p_bin_name_vmap, (const char *)name, len, + &id) == CF_VMAPX_OK) { + for (i = 0; i < rd->n_bins; i++) { + b = &rd->bins[i]; + + if (! as_bin_inuse(b)) { + break; + } + + if ((uint32_t)b->id == id) { + return b; + } + } + } + else { + if (cf_vmapx_count(ns->p_bin_name_vmap) >= BIN_NAMES_QUOTA) { + CF_ZSTR_DEFINE(zname, AS_ID_BIN_SZ, name, len); + + cf_warning(AS_BIN, "{%s} bin-name quota full - can't add new bin-name %s", + ns->name, zname); + + *result = AS_PROTO_RESULT_FAIL_BIN_NAME; + return NULL; + } + + i = as_bin_inuse_count(rd); + } + + cf_assert(i < rd->n_bins, AS_BIN, "ran out of allocated bins in rd"); + + b = &rd->bins[i]; + + if (id == (uint32_t)-1) { + as_bin_init_w_len(ns, b, name, len); + } + else { + as_bin_init_nameless(b); + b->id = (uint16_t)id; + } + + return b; +} + + +int32_t +as_bin_get_index(as_storage_rd *rd, const char *name) +{ + if (rd->ns->single_bin) { + return as_bin_inuse_has(rd) ? 
0 : -1; + } + + uint32_t id; + + if (cf_vmapx_get_index(rd->ns->p_bin_name_vmap, name, &id) != CF_VMAPX_OK) { + return -1; + } + + for (uint16_t i = 0; i < rd->n_bins; i++) { + as_bin *b = &rd->bins[i]; + + if (! as_bin_inuse(b)) { + break; + } + + if ((uint32_t)b->id == id) { + return (int32_t)i; + } + } + + return -1; +} + + +int32_t +as_bin_get_index_from_buf(as_storage_rd *rd, const uint8_t *name, size_t len) +{ + if (rd->ns->single_bin) { + return as_bin_inuse_has(rd) ? 0 : -1; + } + + uint32_t id; + + if (! as_bin_get_id_w_len(rd->ns, name, len, &id)) { + return -1; + } + + for (uint16_t i = 0; i < rd->n_bins; i++) { + as_bin *b = &rd->bins[i]; + + if (! as_bin_inuse(b)) { + break; + } + + if ((uint32_t)b->id == id) { + return (int32_t)i; + } + } + + return -1; +} + + +void +as_bin_destroy(as_storage_rd *rd, uint16_t i) +{ + as_bin_particle_destroy(&rd->bins[i], rd->ns->storage_data_in_memory); + as_bin_set_empty_shift(rd, i); +} + + +void +as_bin_allocate_bin_space(as_storage_rd *rd, int32_t delta) +{ + as_record *r = rd->r; + + if (rd->n_bins == 0) { + rd->n_bins = (uint16_t)delta; + + size_t size = sizeof(as_bin_space) + (rd->n_bins * sizeof(as_bin)); + as_bin_space* bin_space = (as_bin_space*)cf_malloc_ns(size); + + rd->bins = bin_space->bins; + as_bin_set_all_empty(rd); + + bin_space->n_bins = rd->n_bins; + as_index_set_bin_space(r, bin_space); + + return; + } + // else - there were bins before. 
+ + uint16_t new_n_bins = (uint16_t)((int32_t)rd->n_bins + delta); + + if (delta < 0) { + as_record_destroy_bins_from(rd, new_n_bins); + } + + uint16_t old_n_bins = rd->n_bins; + + rd->n_bins = new_n_bins; + + if (new_n_bins != 0) { + size_t size = sizeof(as_bin_space) + (rd->n_bins * sizeof(as_bin)); + as_bin_space* bin_space = (as_bin_space*) + cf_realloc_ns((void*)as_index_get_bin_space(r), size); + + rd->bins = bin_space->bins; + + if (delta > 0) { + as_bin_set_empty_from(rd, old_n_bins); + } + + bin_space->n_bins = rd->n_bins; + as_index_set_bin_space(r, bin_space); + } + else { + cf_free((void*)as_index_get_bin_space(r)); + as_index_set_bin_space(r, NULL); + rd->bins = NULL; + } +} diff --git a/as/src/base/cdt.c b/as/src/base/cdt.c new file mode 100644 index 00000000..02cbd9d9 --- /dev/null +++ b/as/src/base/cdt.c @@ -0,0 +1,2607 @@ +/* + * cdt.c + * + * Copyright (C) 2015-2018 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. 
If not, see http://www.gnu.org/licenses/ + */ + +#include "base/cdt.h" + +#include +#include +#include +#include + +#include "citrusleaf/cf_byte_order.h" + +#include "bits.h" +#include "dynbuf.h" +#include "fault.h" + +#include "base/cfg.h" +#include "base/particle.h" + + +//========================================================== +// Typedefs & constants. +// + +#define VA_FIRST(first, ...) first +#define VA_REST(first, ...) __VA_ARGS__ + +#define CDT_OP_ENTRY(op, type, ...) [op].name = # op, [op].args = (const as_cdt_paramtype[]){VA_REST(__VA_ARGS__, 0)}, [op].count = VA_NARGS(__VA_ARGS__) - 1, [op].opt_args = VA_FIRST(__VA_ARGS__) + +const cdt_op_table_entry cdt_op_table[] = { + + //============================================ + // LIST + + //-------------------------------------------- + // Modify OPs + + CDT_OP_ENTRY(AS_CDT_OP_LIST_SET_TYPE, AS_OPERATOR_CDT_MODIFY, 0, AS_CDT_PARAM_FLAGS), + + // Adds + CDT_OP_ENTRY(AS_CDT_OP_LIST_APPEND, AS_OPERATOR_CDT_MODIFY, 2, AS_CDT_PARAM_PAYLOAD, AS_CDT_PARAM_FLAGS, AS_CDT_PARAM_FLAGS), + CDT_OP_ENTRY(AS_CDT_OP_LIST_APPEND_ITEMS, AS_OPERATOR_CDT_MODIFY, 2, AS_CDT_PARAM_PAYLOAD, AS_CDT_PARAM_FLAGS, AS_CDT_PARAM_FLAGS), + CDT_OP_ENTRY(AS_CDT_OP_LIST_INSERT, AS_OPERATOR_CDT_MODIFY, 1, AS_CDT_PARAM_INDEX, AS_CDT_PARAM_PAYLOAD, AS_CDT_PARAM_FLAGS), + CDT_OP_ENTRY(AS_CDT_OP_LIST_INSERT_ITEMS, AS_OPERATOR_CDT_MODIFY, 1, AS_CDT_PARAM_INDEX, AS_CDT_PARAM_PAYLOAD, AS_CDT_PARAM_FLAGS), + + // Removes + CDT_OP_ENTRY(AS_CDT_OP_LIST_POP, AS_OPERATOR_CDT_MODIFY, 0, AS_CDT_PARAM_INDEX), + CDT_OP_ENTRY(AS_CDT_OP_LIST_POP_RANGE, AS_OPERATOR_CDT_MODIFY, 1, AS_CDT_PARAM_INDEX, AS_CDT_PARAM_COUNT), + CDT_OP_ENTRY(AS_CDT_OP_LIST_REMOVE, AS_OPERATOR_CDT_MODIFY, 0, AS_CDT_PARAM_INDEX), + CDT_OP_ENTRY(AS_CDT_OP_LIST_REMOVE_RANGE, AS_OPERATOR_CDT_MODIFY, 1, AS_CDT_PARAM_INDEX, AS_CDT_PARAM_COUNT), + + // Modifies + CDT_OP_ENTRY(AS_CDT_OP_LIST_SET, AS_OPERATOR_CDT_MODIFY, 1, AS_CDT_PARAM_INDEX, AS_CDT_PARAM_PAYLOAD, AS_CDT_PARAM_FLAGS), + 
CDT_OP_ENTRY(AS_CDT_OP_LIST_TRIM, AS_OPERATOR_CDT_MODIFY, 0, AS_CDT_PARAM_INDEX, AS_CDT_PARAM_COUNT), + CDT_OP_ENTRY(AS_CDT_OP_LIST_CLEAR, AS_OPERATOR_CDT_MODIFY, 0), + CDT_OP_ENTRY(AS_CDT_OP_LIST_INCREMENT, AS_OPERATOR_CDT_MODIFY, 3, AS_CDT_PARAM_INDEX, AS_CDT_PARAM_PAYLOAD, AS_CDT_PARAM_FLAGS, AS_CDT_PARAM_FLAGS), + + CDT_OP_ENTRY(AS_CDT_OP_LIST_SORT, AS_OPERATOR_CDT_MODIFY, 1, AS_CDT_PARAM_FLAGS), + + //-------------------------------------------- + // Read OPs + + CDT_OP_ENTRY(AS_CDT_OP_LIST_SIZE, AS_OPERATOR_CDT_READ, 0), + CDT_OP_ENTRY(AS_CDT_OP_LIST_GET, AS_OPERATOR_CDT_READ, 0, AS_CDT_PARAM_INDEX), + CDT_OP_ENTRY(AS_CDT_OP_LIST_GET_RANGE, AS_OPERATOR_CDT_READ, 1, AS_CDT_PARAM_INDEX, AS_CDT_PARAM_COUNT), + + //-------------------------------------------- + // GET/REMOVE + + // GET_BYs + CDT_OP_ENTRY(AS_CDT_OP_LIST_GET_BY_INDEX, AS_OPERATOR_CDT_READ, 0, AS_CDT_PARAM_FLAGS, AS_CDT_PARAM_INDEX), + CDT_OP_ENTRY(AS_CDT_OP_LIST_GET_BY_VALUE, AS_OPERATOR_CDT_READ, 0, AS_CDT_PARAM_FLAGS, AS_CDT_PARAM_PAYLOAD), + CDT_OP_ENTRY(AS_CDT_OP_LIST_GET_BY_RANK, AS_OPERATOR_CDT_READ, 0, AS_CDT_PARAM_FLAGS, AS_CDT_PARAM_INDEX), + + CDT_OP_ENTRY(AS_CDT_OP_LIST_GET_ALL_BY_VALUE, AS_OPERATOR_CDT_READ, 0, AS_CDT_PARAM_FLAGS, AS_CDT_PARAM_PAYLOAD), + CDT_OP_ENTRY(AS_CDT_OP_LIST_GET_ALL_BY_VALUE_LIST, AS_OPERATOR_CDT_READ, 0, AS_CDT_PARAM_FLAGS, AS_CDT_PARAM_PAYLOAD), + + CDT_OP_ENTRY(AS_CDT_OP_LIST_GET_BY_INDEX_RANGE, AS_OPERATOR_CDT_READ, 1, AS_CDT_PARAM_FLAGS, AS_CDT_PARAM_INDEX, AS_CDT_PARAM_COUNT), + CDT_OP_ENTRY(AS_CDT_OP_LIST_GET_BY_VALUE_INTERVAL, AS_OPERATOR_CDT_READ, 1, AS_CDT_PARAM_FLAGS, AS_CDT_PARAM_PAYLOAD, AS_CDT_PARAM_PAYLOAD), + CDT_OP_ENTRY(AS_CDT_OP_LIST_GET_BY_RANK_RANGE, AS_OPERATOR_CDT_READ, 1, AS_CDT_PARAM_FLAGS, AS_CDT_PARAM_INDEX, AS_CDT_PARAM_COUNT), + + // REMOVE_BYs + CDT_OP_ENTRY(AS_CDT_OP_LIST_REMOVE_BY_INDEX, AS_OPERATOR_CDT_MODIFY, 0, AS_CDT_PARAM_FLAGS, AS_CDT_PARAM_INDEX), + CDT_OP_ENTRY(AS_CDT_OP_LIST_REMOVE_BY_VALUE, AS_OPERATOR_CDT_MODIFY, 0, 
AS_CDT_PARAM_FLAGS, AS_CDT_PARAM_PAYLOAD), + CDT_OP_ENTRY(AS_CDT_OP_LIST_REMOVE_BY_RANK, AS_OPERATOR_CDT_MODIFY, 0, AS_CDT_PARAM_FLAGS, AS_CDT_PARAM_INDEX), + + CDT_OP_ENTRY(AS_CDT_OP_LIST_REMOVE_ALL_BY_VALUE, AS_OPERATOR_CDT_MODIFY, 0, AS_CDT_PARAM_FLAGS, AS_CDT_PARAM_PAYLOAD), + CDT_OP_ENTRY(AS_CDT_OP_LIST_REMOVE_ALL_BY_VALUE_LIST, AS_OPERATOR_CDT_MODIFY, 0, AS_CDT_PARAM_FLAGS, AS_CDT_PARAM_PAYLOAD), + + CDT_OP_ENTRY(AS_CDT_OP_LIST_REMOVE_BY_INDEX_RANGE, AS_OPERATOR_CDT_MODIFY, 1, AS_CDT_PARAM_FLAGS, AS_CDT_PARAM_INDEX, AS_CDT_PARAM_COUNT), + CDT_OP_ENTRY(AS_CDT_OP_LIST_REMOVE_BY_VALUE_INTERVAL, AS_OPERATOR_CDT_MODIFY, 1, AS_CDT_PARAM_FLAGS, AS_CDT_PARAM_PAYLOAD, AS_CDT_PARAM_PAYLOAD), + CDT_OP_ENTRY(AS_CDT_OP_LIST_REMOVE_BY_RANK_RANGE, AS_OPERATOR_CDT_MODIFY, 1, AS_CDT_PARAM_FLAGS, AS_CDT_PARAM_INDEX, AS_CDT_PARAM_COUNT), + + //============================================ + // MAP + + //-------------------------------------------- + // Create and flags + + CDT_OP_ENTRY(AS_CDT_OP_MAP_SET_TYPE, AS_OPERATOR_MAP_MODIFY, 0, AS_CDT_PARAM_FLAGS), + + //-------------------------------------------- + // Modify OPs + + CDT_OP_ENTRY(AS_CDT_OP_MAP_ADD, AS_OPERATOR_MAP_MODIFY, 1, AS_CDT_PARAM_PAYLOAD, AS_CDT_PARAM_PAYLOAD, AS_CDT_PARAM_FLAGS), + CDT_OP_ENTRY(AS_CDT_OP_MAP_ADD_ITEMS, AS_OPERATOR_MAP_MODIFY, 1, AS_CDT_PARAM_PAYLOAD, AS_CDT_PARAM_FLAGS), + CDT_OP_ENTRY(AS_CDT_OP_MAP_PUT, AS_OPERATOR_MAP_MODIFY, 1, AS_CDT_PARAM_PAYLOAD, AS_CDT_PARAM_PAYLOAD, AS_CDT_PARAM_FLAGS), + CDT_OP_ENTRY(AS_CDT_OP_MAP_PUT_ITEMS, AS_OPERATOR_MAP_MODIFY, 1, AS_CDT_PARAM_PAYLOAD, AS_CDT_PARAM_FLAGS), + CDT_OP_ENTRY(AS_CDT_OP_MAP_REPLACE, AS_OPERATOR_MAP_MODIFY, 0, AS_CDT_PARAM_PAYLOAD, AS_CDT_PARAM_PAYLOAD), + CDT_OP_ENTRY(AS_CDT_OP_MAP_REPLACE_ITEMS, AS_OPERATOR_MAP_MODIFY, 0, AS_CDT_PARAM_PAYLOAD), + + CDT_OP_ENTRY(AS_CDT_OP_MAP_INCREMENT, AS_OPERATOR_MAP_MODIFY, 2, AS_CDT_PARAM_PAYLOAD, AS_CDT_PARAM_PAYLOAD, AS_CDT_PARAM_FLAGS), + CDT_OP_ENTRY(AS_CDT_OP_MAP_DECREMENT, 
AS_OPERATOR_MAP_MODIFY, 2, AS_CDT_PARAM_PAYLOAD, AS_CDT_PARAM_PAYLOAD, AS_CDT_PARAM_FLAGS), + + CDT_OP_ENTRY(AS_CDT_OP_MAP_CLEAR, AS_OPERATOR_MAP_MODIFY, 0), + + CDT_OP_ENTRY(AS_CDT_OP_MAP_REMOVE_BY_KEY, AS_OPERATOR_MAP_MODIFY, 0, AS_CDT_PARAM_INDEX, AS_CDT_PARAM_PAYLOAD), + CDT_OP_ENTRY(AS_CDT_OP_MAP_REMOVE_BY_VALUE, AS_OPERATOR_MAP_MODIFY, 0, AS_CDT_PARAM_INDEX, AS_CDT_PARAM_PAYLOAD), + CDT_OP_ENTRY(AS_CDT_OP_MAP_REMOVE_BY_INDEX, AS_OPERATOR_MAP_MODIFY, 0, AS_CDT_PARAM_INDEX, AS_CDT_PARAM_INDEX), + CDT_OP_ENTRY(AS_CDT_OP_MAP_REMOVE_BY_RANK, AS_OPERATOR_MAP_MODIFY, 0, AS_CDT_PARAM_INDEX, AS_CDT_PARAM_INDEX), + + CDT_OP_ENTRY(AS_CDT_OP_MAP_REMOVE_BY_KEY_LIST, AS_OPERATOR_MAP_MODIFY, 0, AS_CDT_PARAM_INDEX, AS_CDT_PARAM_PAYLOAD), + CDT_OP_ENTRY(AS_CDT_OP_MAP_REMOVE_ALL_BY_VALUE, AS_OPERATOR_MAP_MODIFY, 0, AS_CDT_PARAM_INDEX, AS_CDT_PARAM_PAYLOAD), + CDT_OP_ENTRY(AS_CDT_OP_MAP_REMOVE_BY_VALUE_LIST, AS_OPERATOR_MAP_MODIFY, 0, AS_CDT_PARAM_INDEX, AS_CDT_PARAM_PAYLOAD), + + CDT_OP_ENTRY(AS_CDT_OP_MAP_REMOVE_BY_KEY_INTERVAL, AS_OPERATOR_MAP_MODIFY, 1, AS_CDT_PARAM_INDEX, AS_CDT_PARAM_PAYLOAD, AS_CDT_PARAM_PAYLOAD), + CDT_OP_ENTRY(AS_CDT_OP_MAP_REMOVE_BY_INDEX_RANGE, AS_OPERATOR_MAP_MODIFY, 1, AS_CDT_PARAM_INDEX, AS_CDT_PARAM_INDEX, AS_CDT_PARAM_COUNT), + CDT_OP_ENTRY(AS_CDT_OP_MAP_REMOVE_BY_VALUE_INTERVAL, AS_OPERATOR_MAP_MODIFY, 1, AS_CDT_PARAM_INDEX, AS_CDT_PARAM_PAYLOAD, AS_CDT_PARAM_PAYLOAD), + CDT_OP_ENTRY(AS_CDT_OP_MAP_REMOVE_BY_RANK_RANGE, AS_OPERATOR_MAP_MODIFY, 1, AS_CDT_PARAM_INDEX, AS_CDT_PARAM_INDEX, AS_CDT_PARAM_COUNT), + + //-------------------------------------------- + // Read OPs + + CDT_OP_ENTRY(AS_CDT_OP_MAP_SIZE, AS_OPERATOR_MAP_READ, 0), + + CDT_OP_ENTRY(AS_CDT_OP_MAP_GET_BY_KEY, AS_OPERATOR_MAP_READ, 0, AS_CDT_PARAM_INDEX, AS_CDT_PARAM_PAYLOAD), + CDT_OP_ENTRY(AS_CDT_OP_MAP_GET_BY_INDEX, AS_OPERATOR_MAP_READ, 0, AS_CDT_PARAM_INDEX, AS_CDT_PARAM_INDEX), + CDT_OP_ENTRY(AS_CDT_OP_MAP_GET_BY_VALUE, AS_OPERATOR_MAP_READ, 0, AS_CDT_PARAM_INDEX, 
AS_CDT_PARAM_PAYLOAD), + CDT_OP_ENTRY(AS_CDT_OP_MAP_GET_BY_RANK, AS_OPERATOR_MAP_READ, 0, AS_CDT_PARAM_INDEX, AS_CDT_PARAM_INDEX), + + CDT_OP_ENTRY(AS_CDT_OP_MAP_GET_ALL_BY_VALUE, AS_OPERATOR_MAP_READ, 0, AS_CDT_PARAM_INDEX, AS_CDT_PARAM_PAYLOAD), + + CDT_OP_ENTRY(AS_CDT_OP_MAP_GET_BY_KEY_INTERVAL, AS_OPERATOR_MAP_READ, 1, AS_CDT_PARAM_INDEX, AS_CDT_PARAM_PAYLOAD, AS_CDT_PARAM_PAYLOAD), + CDT_OP_ENTRY(AS_CDT_OP_MAP_GET_BY_INDEX_RANGE, AS_OPERATOR_MAP_READ, 1, AS_CDT_PARAM_INDEX, AS_CDT_PARAM_INDEX, AS_CDT_PARAM_COUNT), + CDT_OP_ENTRY(AS_CDT_OP_MAP_GET_BY_VALUE_INTERVAL, AS_OPERATOR_MAP_READ, 1, AS_CDT_PARAM_INDEX, AS_CDT_PARAM_PAYLOAD, AS_CDT_PARAM_PAYLOAD), + CDT_OP_ENTRY(AS_CDT_OP_MAP_GET_BY_RANK_RANGE, AS_OPERATOR_MAP_READ, 1, AS_CDT_PARAM_INDEX, AS_CDT_PARAM_INDEX, AS_CDT_PARAM_COUNT), + + CDT_OP_ENTRY(AS_CDT_OP_MAP_GET_BY_KEY_LIST, AS_OPERATOR_MAP_READ, 0, AS_CDT_PARAM_INDEX, AS_CDT_PARAM_PAYLOAD), + CDT_OP_ENTRY(AS_CDT_OP_MAP_GET_BY_VALUE_LIST, AS_OPERATOR_MAP_READ, 0, AS_CDT_PARAM_INDEX, AS_CDT_PARAM_PAYLOAD), + +}; + +static const size_t cdt_op_table_size = sizeof(cdt_op_table) / sizeof(cdt_op_table_entry); + +extern const as_particle_vtable *particle_vtable[]; + +typedef struct index_pack24_s { + uint32_t value:24; +} __attribute__ ((__packed__)) index_pack24; + +typedef struct { + const order_index *ordidx; + bool error; +} index_sort_userdata; + + +//========================================================== +// Forward declares. +// + +static bool unpack_list_value(as_unpacker *pk, cdt_payload *payload_r); +static bool unpack_map_key(as_unpacker *pk, cdt_payload *payload_r); +static bool unpack_map_value(as_unpacker *pk, cdt_payload *payload_r); + +inline static void cdt_payload_pack_val(cdt_payload *value, const as_val *val); + +static inline uint32_t order_index_ele_sz(uint32_t max_idx); + + +//========================================================== +// CDT helpers. +// + +// Calculate count given index and max_index. +// Assumes index < ele_count. 
+static uint32_t +calc_count(uint32_t index, uint64_t in_count, uint32_t max_index) +{ + // Since we assume index < ele_count, (max - index) will never overflow. + if (in_count >= (uint64_t)max_index - index) { + return max_index - index; + } + + return (uint32_t)in_count; +} + +static void +calc_index_count_multi(int64_t in_index, uint64_t in_count, uint32_t ele_count, + uint32_t *out_index, uint32_t *out_count) +{ + if (in_index >= ele_count) { + *out_index = ele_count; + *out_count = 0; + } + else if ((in_index = calc_index(in_index, ele_count)) < 0) { + if ((uint64_t)(-in_index) < in_count) { + uint64_t out64 = in_count + in_index; + + if (out64 > (uint64_t)ele_count) { + out64 = ele_count; + } + + *out_count = (uint32_t)out64; + } + else { + *out_count = 0; + } + + *out_index = 0; + } + else { + *out_index = (uint32_t)in_index; + *out_count = calc_count((uint32_t)in_index, in_count, ele_count); + } +} + +// Transform to absolute (uint32_t) index/count bounded by ele_count. +bool +calc_index_count(int64_t in_index, uint64_t in_count, uint32_t ele_count, + uint32_t *out_index, uint32_t *out_count, bool is_multi) +{ + if (is_multi) { + calc_index_count_multi(in_index, in_count, ele_count, out_index, + out_count); + return true; + } + + if (in_index >= (int64_t)ele_count || + (in_index = calc_index(in_index, ele_count)) < 0) { + return false; + } + + *out_index = (uint32_t)in_index; + *out_count = calc_count((uint32_t)in_index, in_count, ele_count); + + return true; +} + +static bool +unpack_list_value(as_unpacker *pk, cdt_payload *payload_r) +{ + payload_r->ptr = pk->buffer + pk->offset; + + int64_t sz = as_unpack_size(pk); + + if (sz <= 0) { + cf_warning(AS_PARTICLE, "unpack_list_value() invalid msgpack"); + return false; + } + + payload_r->sz = (uint32_t)sz; + + return true; +} + +static bool +unpack_map_key(as_unpacker *pk, cdt_payload *payload_r) +{ + payload_r->ptr = pk->buffer + pk->offset; + + int64_t sz = as_unpack_size(pk); + + if (sz <= 0) { + 
cf_warning(AS_PARTICLE, "unpack_map_key() invalid msgpack"); + return false; + } + + payload_r->sz = (uint32_t)sz; + + if (as_unpack_size(pk) <= 0) { // skip value + cf_warning(AS_PARTICLE, "unpack_map_key() invalid msgpack"); + return false; + } + + return true; +} + +static bool +unpack_map_value(as_unpacker *pk, cdt_payload *payload_r) +{ + if (as_unpack_size(pk) <= 0) { // skip key + cf_warning(AS_PARTICLE, "unpack_map_value() invalid msgpack"); + return false; + } + + payload_r->ptr = pk->buffer + pk->offset; + + int64_t sz = as_unpack_size(pk); + + if (sz <= 0) { + cf_warning(AS_PARTICLE, "unpack_map_value() invalid msgpack"); + return false; + } + + payload_r->sz = (uint32_t)sz; + + return true; +} + + +//========================================================== +// cdt_result_data +// + +bool +result_data_set_not_found(cdt_result_data *rd, int64_t index) +{ + switch (rd->type) { + case RESULT_TYPE_NONE: + break; + case RESULT_TYPE_REVINDEX_RANGE: + case RESULT_TYPE_INDEX_RANGE: + case RESULT_TYPE_RANK_RANGE: + case RESULT_TYPE_REVRANK_RANGE: + result_data_set_list_int2x(rd, index, 0); + break; + case RESULT_TYPE_INDEX: + case RESULT_TYPE_REVINDEX: + case RESULT_TYPE_RANK: + case RESULT_TYPE_REVRANK: + if (rd->is_multi) { + as_bin_set_unordered_empty_list(rd->result, rd->alloc); + break; + } + + as_bin_set_int(rd->result, -1); + break; + case RESULT_TYPE_COUNT: + as_bin_set_int(rd->result, 0); + break; + case RESULT_TYPE_KEY: + case RESULT_TYPE_VALUE: + if (rd->is_multi) { + as_bin_set_unordered_empty_list(rd->result, rd->alloc); + } + break; + case RESULT_TYPE_MAP: + as_bin_set_empty_packed_map(rd->result, rd->alloc, + AS_PACKED_MAP_FLAG_PRESERVE_ORDER); + break; + default: + cf_warning(AS_PARTICLE, "result_data_set_not_found() invalid result type %d", rd->type); + return false; + } + + return true; +} + +void +result_data_set_list_int2x(cdt_result_data *rd, int64_t i1, int64_t i2) +{ + define_int_list_builder(builder, rd->alloc, 2); + + 
cdt_container_builder_add_int64(&builder, i1); + cdt_container_builder_add_int64(&builder, i2); + cdt_container_builder_set_result(&builder, rd); +} + +int +result_data_set_index_rank_count(cdt_result_data *rd, uint32_t start, + uint32_t count, uint32_t ele_count) +{ + bool is_rev = false; + bool inverted = result_data_is_inverted(rd); + + switch (rd->type) { + case RESULT_TYPE_NONE: + break; + case RESULT_TYPE_COUNT: + as_bin_set_int(rd->result, inverted ? ele_count - count : count); + break; + case RESULT_TYPE_REVINDEX: + case RESULT_TYPE_REVRANK: + is_rev = true; + /* no break */ + case RESULT_TYPE_INDEX: + case RESULT_TYPE_RANK: { + if (! rd->is_multi) { + if (count == 0) { + as_bin_set_int(rd->result, -1); + break; + } + + if (is_rev) { + start = ele_count - start - 1; + } + + as_bin_set_int(rd->result, start); + break; + } + + cdt_container_builder builder; + + if (inverted) { + uint32_t inv_count = ele_count - count; + + cdt_int_list_builder_start(&builder, rd->alloc, inv_count); + cdt_container_builder_add_int_range(&builder, 0, start, ele_count, + is_rev); + cdt_container_builder_add_int_range(&builder, start + count, + ele_count - start - count, ele_count, is_rev); + } + else { + cdt_int_list_builder_start(&builder, rd->alloc, count); + cdt_container_builder_add_int_range(&builder, start, count, + ele_count, is_rev); + } + + cdt_container_builder_set_result(&builder, rd); + break; + } + default: + cf_warning(AS_PARTICLE, "result_data_set_index_rank_count() invalid return type %d", rd->type); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + return AS_PROTO_RESULT_OK; +} + +int +result_data_set_range(cdt_result_data *rd, uint32_t start, uint32_t count, + uint32_t ele_count) +{ + switch (rd->type) { + case RESULT_TYPE_NONE: + break; + case RESULT_TYPE_COUNT: + case RESULT_TYPE_REVINDEX: + case RESULT_TYPE_REVRANK: + case RESULT_TYPE_INDEX: + case RESULT_TYPE_RANK: + return result_data_set_index_rank_count(rd, start, count, ele_count); + case 
RESULT_TYPE_REVINDEX_RANGE: + case RESULT_TYPE_REVRANK_RANGE: + start = ele_count - start - count; + /* no break */ + case RESULT_TYPE_INDEX_RANGE: + case RESULT_TYPE_RANK_RANGE: { + if (result_data_is_inverted(rd)) { + cf_warning(AS_PARTICLE, "result_data_set_range() result_type %d not supported with INVERTED flag", rd->type); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + result_data_set_list_int2x(rd, start, count); + break; + } + default: + cf_warning(AS_PARTICLE, "result_data_set_range() invalid return type %d", rd->type); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + return AS_PROTO_RESULT_OK; +} + +// Does not respect inverted flag. +void +result_data_set_by_irc(cdt_result_data *rd, + const order_index *irc, const order_index *idx_map, + uint32_t total_count) +{ + bool is_rev = rd->type == RESULT_TYPE_REVINDEX || + rd->type == RESULT_TYPE_REVRANK; + uint32_t items_count = irc->_.ele_count / 2; + define_int_list_builder(builder, rd->alloc, total_count); + + for (uint32_t i = 0; i < items_count; i++) { + uint32_t count = order_index_get(irc, (2 * i) + 1); + + if (count == 0) { + continue; + } + + uint32_t rank = order_index_get(irc, 2 * i); + + if (idx_map) { + for (uint32_t j = rank; j < rank + count; j++) { + cdt_container_builder_add_int_range(&builder, + order_index_get(idx_map, j), 1, irc->max_idx, is_rev); + } + } + else { + cdt_container_builder_add_int_range(&builder, rank, count, + irc->max_idx, is_rev); + } + } + + cdt_container_builder_set_result(&builder, rd); +} + +void +result_data_set_by_itemlist_irc(cdt_result_data *rd, + const order_index *items_ord, order_index *irc, + uint32_t total_count) +{ + cdt_container_builder builder; + bool inverted = result_data_is_inverted(rd); + uint32_t items_count = items_ord->_.ele_count; + uint32_t ele_count = irc->max_idx; + bool is_rev = rd->type == RESULT_TYPE_REVINDEX || + rd->type == RESULT_TYPE_REVRANK; + + if (! 
inverted) { + cdt_int_list_builder_start(&builder, rd->alloc, total_count); + + for (uint32_t i = 0; i < items_count; i++) { + uint32_t count = order_index_get(irc, (i * 2) + 1); + + if (count == 0) { + continue; + } + + uint32_t rank = order_index_get(irc, i * 2); + + for (uint32_t j = 0; j < count; j++) { + cdt_container_builder_add_int_range(&builder, + rank + j, 1, ele_count, is_rev); + } + } + } + else { + cdt_int_list_builder_start(&builder, rd->alloc, total_count); + + uint32_t prev = 0; + + for (uint32_t i = 0; i < items_count; i++) { + uint32_t kl_idx = order_index_get(items_ord, i); + uint32_t count = order_index_get(irc, (kl_idx * 2) + 1); + + if (count == 0) { + continue; + } + + uint32_t index = order_index_get(irc, kl_idx * 2); + + cdt_container_builder_add_int_range(&builder, prev, + index - prev, ele_count, is_rev); + prev = index + count; + } + + cdt_container_builder_add_int_range(&builder, prev, + ele_count - prev, ele_count, is_rev); + } + + cdt_container_builder_set_result(&builder, rd); +} + +// Does not respect inverted flag. +void +result_data_set_int_list_by_mask(cdt_result_data *rd, const uint64_t *mask, + uint32_t count, uint32_t ele_count) +{ + bool is_rev = rd->type == RESULT_TYPE_REVINDEX || + rd->type == RESULT_TYPE_REVRANK; + + if (! rd->is_multi) { + uint32_t idx = cdt_idx_mask_find(mask, 0, ele_count, false); + + if (is_rev) { + idx = ele_count - idx - 1; + } + + as_bin_set_int(rd->result, (int64_t)idx); + return; + } + + define_int_list_builder(builder, rd->alloc, count); + uint32_t idx = 0; + + for (uint32_t i = 0; i < count; i++) { + idx = cdt_idx_mask_find(mask, idx, ele_count, false); + + int64_t val = (is_rev ? ele_count - idx - 1 : idx); + + cdt_container_builder_add_int64(&builder, val); + idx++; + } + + cdt_container_builder_set_result(&builder, rd); +} + + +//========================================================== +// as_bin functions. 
+// + +void +as_bin_set_int(as_bin *b, int64_t value) +{ + b->particle = (as_particle *)value; + as_bin_state_set_from_type(b, AS_PARTICLE_TYPE_INTEGER); +} + +void +as_bin_set_double(as_bin *b, double value) +{ + *((double *)(&b->particle)) = value; + as_bin_state_set_from_type(b, AS_PARTICLE_TYPE_FLOAT); +} + + +//========================================================== +//cdt_calc_delta +// + +bool +cdt_calc_delta_init(cdt_calc_delta *cdv, const cdt_payload *delta_value, + bool is_decrement) +{ + if (delta_value && delta_value->ptr) { + as_unpacker pk_delta_value = { + .buffer = delta_value->ptr, + .length = delta_value->sz + }; + + cdv->type = as_unpack_peek_type(&pk_delta_value); + + if (cdv->type == AS_INTEGER) { + if (as_unpack_int64(&pk_delta_value, &cdv->incr_int) != 0) { + cf_warning(AS_PARTICLE, "cdt_delta_value_init() invalid packed delta value"); + return false; + } + } + else if (cdv->type == AS_DOUBLE) { + if (as_unpack_double(&pk_delta_value, &cdv->incr_double) != 0) { + cf_warning(AS_PARTICLE, "cdt_delta_value_init() invalid packed delta value"); + return false; + } + } + else { + cf_warning(AS_PARTICLE, "cdt_delta_value_init() delta is not int/double"); + return false; + } + } + else { + cdv->type = AS_UNDEF; + cdv->incr_int = 1; + cdv->incr_double = 1; + } + + if (is_decrement) { + cdv->incr_int = -cdv->incr_int; + cdv->incr_double = -cdv->incr_double; + } + + cdv->value_int = 0; + cdv->value_double = 0; + + return true; +} + +bool +cdt_calc_delta_add(cdt_calc_delta *cdv, as_unpacker *pk_value) +{ + if (pk_value) { + as_val_t packed_value_type = as_unpack_peek_type(pk_value); + + if (packed_value_type == AS_INTEGER) { + if (as_unpack_int64(pk_value, &cdv->value_int) != 0) { + cf_warning(AS_PARTICLE, "cdt_delta_value_add() invalid packed int"); + return false; + } + + if (cdv->type == AS_DOUBLE) { + cdv->value_int += (int64_t)cdv->incr_double; + } + else { + cdv->value_int += cdv->incr_int; + } + } + else if (packed_value_type == AS_DOUBLE) { + 
if (as_unpack_double(pk_value, &cdv->value_double) != 0) { + cf_warning(AS_PARTICLE, "cdt_delta_value_add() invalid packed double"); + return false; + } + + if (cdv->type == AS_DOUBLE) { + cdv->value_double += cdv->incr_double; + } + else { + cdv->value_double += (double)cdv->incr_int; + } + } + else { + cf_warning(AS_PARTICLE, "cdt_delta_value_add() only valid for int/double"); + return false; + } + + cdv->type = packed_value_type; + } + else if (cdv->type == AS_DOUBLE) { + cdv->value_double += cdv->incr_double; + } + else { + cdv->type = AS_INTEGER; // default to AS_INTEGER if UNDEF + cdv->value_int += cdv->incr_int; + } + + return true; +} + +void +cdt_calc_delta_pack_and_result(cdt_calc_delta *cdv, cdt_payload *value, + as_bin *result) +{ + if (cdv->type == AS_DOUBLE) { + cdt_payload_pack_double(value, cdv->value_double); + as_bin_set_double(result, cdv->value_double); + } + else { + cdt_payload_pack_int(value, cdv->value_int); + as_bin_set_int(result, cdv->value_int); + } +} + + +//========================================================== +// cdt_payload functions. 
+// + +bool +cdt_payload_is_int(const cdt_payload *payload) +{ + return as_unpack_buf_peek_type(payload->ptr, payload->sz) == AS_INTEGER; +} + +int64_t +cdt_payload_get_int64(const cdt_payload *payload) +{ + int64_t ret = 0; + as_unpacker pk = { + .buffer = payload->ptr, + .offset = 0, + .length = payload->sz + }; + + as_unpack_int64(&pk, &ret); + + return ret; +} + +inline static void +cdt_payload_pack_val(cdt_payload *value, const as_val *val) +{ + as_serializer ser; + as_msgpack_init(&ser); + + value->sz = as_serializer_serialize_presized(&ser, val, + (uint8_t *)value->ptr); + + as_serializer_destroy(&ser); +} + +void +cdt_payload_pack_int(cdt_payload *packed, int64_t value) +{ + as_integer val; + as_integer_init(&val, value); + + cdt_payload_pack_val(packed, (as_val *)&val); +} + +void +cdt_payload_pack_double(cdt_payload *packed, double value) +{ + as_double val; + as_double_init(&val, value); + + return cdt_payload_pack_val(packed, (as_val *)&val); +} + + +//========================================================== +// cdt_container_builder functions. 
+// + +void +cdt_container_builder_add(cdt_container_builder *builder, const uint8_t *buf, + uint32_t sz) +{ + memcpy(builder->write_ptr, buf, sz); + builder->write_ptr += sz; + *builder->sz += sz; + builder->ele_count++; +} + +void +cdt_container_builder_add_n(cdt_container_builder *builder, const uint8_t *buf, + uint32_t count, uint32_t sz) +{ + if (buf) { + memcpy(builder->write_ptr, buf, sz); + } + + builder->write_ptr += sz; + *builder->sz += sz; + builder->ele_count += count; +} + +void +cdt_container_builder_add_int64(cdt_container_builder *builder, int64_t value) +{ + as_integer val64; + + as_packer pk = { + .buffer = builder->write_ptr, + .capacity = INT_MAX + }; + + as_integer_init(&val64, value); + as_pack_val(&pk, (const as_val *)&val64); + builder->write_ptr += pk.offset; + *builder->sz += (uint32_t)pk.offset; + builder->ele_count++; +} + +void +cdt_container_builder_add_int_range(cdt_container_builder *builder, + uint32_t start, uint32_t count, uint32_t ele_count, bool is_rev) +{ + if (is_rev) { + start = ele_count - start - count; + } + + for (uint32_t i = 0; i < count; i++) { + cdt_container_builder_add_int64(builder, (int64_t)(start + i)); + } +} + +void +cdt_container_builder_set_result(cdt_container_builder *builder, + cdt_result_data *result) +{ + result->result->particle = builder->particle; + as_bin_state_set_from_type(result->result, (as_particle_type)((uint8_t *)builder->particle)[0]); +} + + +//========================================================== +// cdt_process_state functions. 
+// + +bool +cdt_process_state_init(cdt_process_state *cdt_state, const as_msg_op *op) +{ + const uint8_t *data = op->name + op->name_sz; + uint32_t sz = op->op_sz - 4 - op->name_sz; + + if (data[0] == 0) { // TODO - deprecate this in "6 months" + if (sz < sizeof(uint16_t)) { + cf_warning(AS_PARTICLE, "cdt_parse_state_init() as_msg_op data too small to be valid: size=%u", sz); + return false; + } + + const uint16_t *type_ptr = (const uint16_t *)data; + + cdt_state->type = cf_swap_from_be16(*type_ptr); + cdt_state->pk.buffer = data + sizeof(uint16_t); + cdt_state->pk.length = sz - sizeof(uint16_t); + cdt_state->pk.offset = 0; + + int64_t ele_count = (cdt_state->pk.length == 0) ? + 0 : as_unpack_list_header_element_count(&cdt_state->pk); + + if (ele_count < 0) { + cf_warning(AS_PARTICLE, "cdt_parse_state_init() unpack list header failed: size=%u type=%u ele_count=%ld", sz, cdt_state->type, ele_count); + return false; + } + + cdt_state->ele_count = (uint32_t)ele_count; + + return true; + } + + cdt_state->pk.buffer = data; + cdt_state->pk.length = sz; + cdt_state->pk.offset = 0; + + int64_t ele_count = as_unpack_list_header_element_count(&cdt_state->pk); + uint64_t type64; + + if (ele_count < 1 || as_unpack_uint64(&cdt_state->pk, &type64) != 0) { + cf_warning(AS_PARTICLE, "cdt_parse_state_init() unpack parameters failed: size=%u ele_count=%ld", sz, ele_count); + return false; + } + + cdt_state->type = (as_cdt_optype)type64; + cdt_state->ele_count = (uint32_t)ele_count; + + return true; +} + +bool +cdt_process_state_get_params(cdt_process_state *state, size_t n, ...) 
+{ + as_cdt_optype op = state->type; + + if (op >= cdt_op_table_size) { + return false; + } + + const cdt_op_table_entry *entry = &cdt_op_table[op]; + uint32_t required_count = entry->count - entry->opt_args; + + cf_assert(n >= (size_t)required_count, AS_PARTICLE, "cdt_process_state_get_params() called with %zu params, require at least %u - %u = %u params", n, entry->count, entry->opt_args, required_count); + + if (n == 0 || entry->args[0] == 0) { + return true; + } + + if (state->ele_count < required_count) { + cf_warning(AS_PARTICLE, "cdt_process_state_get_params() count mismatch: got %u from client < expected %u", state->ele_count, required_count); + return false; + } + + if (state->ele_count > (uint32_t)entry->count) { + cf_warning(AS_PARTICLE, "cdt_process_state_get_params() count mismatch: got %u from client > expected %u", state->ele_count, entry->count); + return false; + } + + va_list vl; + va_start(vl, n); + + for (uint32_t i = 0; i < state->ele_count; i++) { + switch (entry->args[i]) { + case AS_CDT_PARAM_PAYLOAD: { + cdt_payload *arg = va_arg(vl, cdt_payload *); + + arg->ptr = state->pk.buffer + state->pk.offset; + + int64_t sz = as_unpack_size(&state->pk); + + if (sz <= 0) { + va_end(vl); + return false; + } + + arg->sz = (uint32_t)sz; + + break; + } + case AS_CDT_PARAM_FLAGS: + case AS_CDT_PARAM_COUNT: { + uint64_t *arg = va_arg(vl, uint64_t *); + + if (as_unpack_uint64(&state->pk, arg) != 0) { + va_end(vl); + return false; + } + + break; + } + case AS_CDT_PARAM_INDEX: { + int64_t *arg = va_arg(vl, int64_t *); + + if (as_unpack_int64(&state->pk, arg) != 0) { + va_end(vl); + return false; + } + + break; + } + default: + va_end(vl); + return false; + } + } + + va_end(vl); + + return true; +} + +const char * +cdt_process_state_get_op_name(const cdt_process_state *state) +{ + as_cdt_optype op = state->type; + + if (op >= cdt_op_table_size) { + return NULL; + } + + const cdt_op_table_entry *entry = &cdt_op_table[op]; + + return entry->name; +} + + 
+//========================================================== +// rollback_alloc functions. +// + +void +rollback_alloc_push(rollback_alloc *packed_alloc, void *ptr) +{ + if (packed_alloc->malloc_list_sz >= packed_alloc->malloc_list_cap) { + cf_crash(AS_PARTICLE, "rollback_alloc_push() need to make rollback list larger: cap=%zu", packed_alloc->malloc_list_cap); + } + + packed_alloc->malloc_list[packed_alloc->malloc_list_sz++] = ptr; +} + +uint8_t * +rollback_alloc_reserve(rollback_alloc *alloc_buf, size_t size) +{ + cf_assert(alloc_buf, AS_PARTICLE, "alloc_buf NULL"); + + uint8_t *ptr; + + if (alloc_buf->ll_buf) { + cf_ll_buf_reserve(alloc_buf->ll_buf, size, &ptr); + } + else { + ptr = alloc_buf->malloc_ns ? cf_malloc_ns(size) : cf_malloc(size); + rollback_alloc_push(alloc_buf, ptr); + } + + return ptr; +} + +void +rollback_alloc_rollback(rollback_alloc *alloc_buf) +{ + if (alloc_buf->ll_buf) { + return; + } + + for (size_t i = 0; i < alloc_buf->malloc_list_sz; i++) { + cf_free(alloc_buf->malloc_list[i]); + } + + alloc_buf->malloc_list_sz = 0; +} + +bool +rollback_alloc_from_msgpack(rollback_alloc *alloc_buf, as_bin *b, + const cdt_payload *seg) +{ + // We assume the bin is empty. + + as_particle_type type = as_particle_type_from_msgpack(seg->ptr, seg->sz); + + if (type == AS_PARTICLE_TYPE_BAD) { + return false; + } + + if (type == AS_PARTICLE_TYPE_NULL) { + return true; + } + + uint32_t sz = + particle_vtable[type]->size_from_msgpack_fn(seg->ptr, seg->sz); + + if (sz != 0) { + b->particle = (as_particle *)rollback_alloc_reserve(alloc_buf, sz); + + if (! b->particle) { + return false; + } + } + + particle_vtable[type]->from_msgpack_fn(seg->ptr, seg->sz, &b->particle); + + // Set the bin's iparticle metadata. + as_bin_state_set_from_type(b, type); + + return true; +} + + +//========================================================== +// as_bin_cdt_packed functions. 
+// + +int +as_bin_cdt_packed_modify(as_bin *b, const as_msg_op *op, as_bin *result, + cf_ll_buf *particles_llb) +{ + cdt_process_state state; + + if (! cdt_process_state_init(&state, op)) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + cdt_modify_data udata = { + .b = b, + .result = result, + .alloc_buf = particles_llb, + .ret_code = AS_PROTO_RESULT_OK, + }; + + bool success; + + if ((int)state.type <= (int)AS_CDT_OP_LIST_LAST) { + success = cdt_process_state_packed_list_modify_optype(&state, &udata); + } + else { + success = cdt_process_state_packed_map_modify_optype(&state, &udata); + } + + if (! success) { + as_bin_set_empty(b); + as_bin_set_empty(result); + } + + return udata.ret_code; +} + +int +as_bin_cdt_packed_read(const as_bin *b, const as_msg_op *op, as_bin *result) +{ + cdt_process_state state; + + if (! cdt_process_state_init(&state, op)) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + cdt_read_data udata = { + .b = b, + .result = result, + .ret_code = AS_PROTO_RESULT_OK, + }; + + bool success; + + if ((int)state.type <= AS_CDT_OP_LIST_LAST) { + success = cdt_process_state_packed_list_read_optype(&state, &udata); + } + else { + success = cdt_process_state_packed_map_read_optype(&state, &udata); + } + + if (! 
success) { + as_bin_set_empty(result); + } + + return udata.ret_code; +} + + +//========================================================== +// msgpacked_index +// + +void +msgpacked_index_set(msgpacked_index *idxs, uint32_t index, uint32_t value) +{ + switch (idxs->ele_sz) { + case 1: + idxs->ptr[index] = (uint8_t)value; + break; + case 2: + ((uint16_t *)idxs->ptr)[index] = (uint16_t)value; + break; + case 3: + ((index_pack24 *)idxs->ptr)[index].value = value; + break; + default: + ((uint32_t *)idxs->ptr)[index] = value; + break; + } +} + +void +msgpacked_index_incr(msgpacked_index *idxs, uint32_t index) +{ + switch (idxs->ele_sz) { + case 1: + idxs->ptr[index]++; + break; + case 2: + ((uint16_t *)idxs->ptr)[index]++; + break; + case 3: + ((index_pack24 *)idxs->ptr)[index].value++; + break; + default: + ((uint32_t *)idxs->ptr)[index]++; + break; + } +} + +void +msgpacked_index_set_ptr(msgpacked_index *idxs, uint8_t *ptr) +{ + idxs->ptr = ptr; +} + +// Get pointer at index. +void * +msgpacked_index_get_mem(const msgpacked_index *idxs, uint32_t index) +{ + return (void *)(idxs->ptr + idxs->ele_sz * index); +} + +uint32_t +msgpacked_index_size(const msgpacked_index *idxs) +{ + return idxs->ele_sz * idxs->ele_count; +} + +uint32_t +msgpacked_index_ptr2value(const msgpacked_index *idxs, const void *ptr) +{ + switch (idxs->ele_sz) { + case 1: + return *((const uint8_t *)ptr); + case 2: + return *((const uint16_t *)ptr); + case 3: + return ((const index_pack24 *)ptr)->value; + default: + break; + } + + return *((const uint32_t *)ptr); +} + +uint32_t +msgpacked_index_get(const msgpacked_index *idxs, uint32_t index) +{ + switch (idxs->ele_sz) { + case 1: + return idxs->ptr[index]; + case 2: + return ((const uint16_t *)idxs->ptr)[index]; + case 3: + return ((const index_pack24 *)idxs->ptr)[index].value; + default: + break; + } + + return ((const uint32_t *)idxs->ptr)[index]; +} + +// Find find_index in a list of sorted_indexes. 
+// *where will be the location where find_index is (if exist) or is suppose to +// be (if not exist). +// Return true if find_index is in sorted_indexes. +bool +msgpacked_index_find_index_sorted(const msgpacked_index *sorted_indexes, + uint32_t find_index, uint32_t count, uint32_t *where) +{ + if (count == 0) { + *where = 0; + return false; + } + + uint32_t upper = count; + uint32_t lower = 0; + uint32_t i = count / 2; + + while (true) { + uint32_t index = msgpacked_index_get(sorted_indexes, i); + + if (find_index == index) { + *where = i; + return true; + } + + if (find_index > index) { + if (i >= upper - 1) { + *where = i + 1; + break; + } + + lower = i + 1; + i += upper; + i /= 2; + } + else { + if (i <= lower) { + *where = i; + break; + } + + upper = i; + i += lower; + i /= 2; + } + } + + return false; +} + +void +msgpacked_index_print(const msgpacked_index *idxs, const char *name) +{ + size_t ele_count = idxs->ele_count; + char buf[1024]; + char *ptr = buf; + + if (idxs->ptr) { + for (size_t i = 0; i < ele_count; i++) { + if (buf + 1024 - ptr < 12) { + break; + } + + ptr += sprintf(ptr, "%u, ", msgpacked_index_get(idxs, i)); + } + + if (ele_count > 0) { + ptr -= 2; + } + + *ptr = '\0'; + } + else { + strcpy(buf, "(null)"); + } + + cf_warning(AS_PARTICLE, "%s: index[%zu]={%s}", name, ele_count, buf); +} + + +//========================================================== +// offset_index +// + +void +offset_index_init(offset_index *offidx, uint8_t *idx_mem_ptr, + uint32_t ele_count, const uint8_t *contents, uint32_t content_sz) +{ + offidx->_.ele_count = ele_count; + offidx->content_sz = content_sz; + + if (content_sz < (1 << 8)) { + offidx->_.ele_sz = 1; + } + else if (content_sz < (1 << 16)) { + offidx->_.ele_sz = 2; + } + else if (content_sz < (1 << 24)) { + offidx->_.ele_sz = 3; + } + else { + offidx->_.ele_sz = 4; + } + + offidx->_.ptr = idx_mem_ptr; + offidx->contents = contents; + offidx->is_partial = false; +} + +void +offset_index_set(offset_index 
*offidx, uint32_t index, uint32_t value) +{ + if (index == 0 || index == offidx->_.ele_count) { + return; + } + + msgpacked_index_set((msgpacked_index *)offidx, index, value); +} + +bool +offset_index_set_next(offset_index *offidx, uint32_t index, uint32_t value) +{ + if (index >= offidx->_.ele_count) { + return true; + } + + uint32_t filled = offset_index_get_filled(offidx); + + if (index == filled) { + offset_index_set(offidx, index, value); + offset_index_set_filled(offidx, filled + 1); + + return true; + } + + if (index < filled) { + return value == offset_index_get_const(offidx, index); + } + + return false; +} + +void +offset_index_set_filled(offset_index *offidx, uint32_t ele_filled) +{ + if (offidx->_.ele_count == 0) { + return; + } + + cf_assert(ele_filled <= offidx->_.ele_count, AS_PARTICLE, "ele_filled(%u) > ele_count(%u)", ele_filled, offidx->_.ele_count); + msgpacked_index_set((msgpacked_index *)offidx, 0, ele_filled); +} + +void +offset_index_set_ptr(offset_index *offidx, uint8_t *idx_mem, + const uint8_t *packed_mem) +{ + msgpacked_index_set_ptr((msgpacked_index *)offidx, idx_mem); + offidx->contents = packed_mem; +} + +void +offset_index_copy(offset_index *dest, const offset_index *src, uint32_t d_start, + uint32_t s_start, uint32_t count, int delta) +{ + cf_assert(d_start + count <= dest->_.ele_count, AS_PARTICLE, "d_start(%u) + count(%u) > dest.ele_count(%u)", d_start, count, dest->_.ele_count); + cf_assert(s_start + count <= src->_.ele_count, AS_PARTICLE, "s_start(%u) + count(%u) > src.ele_count(%u)", s_start, count, src->_.ele_count); + + if (dest->_.ele_sz == src->_.ele_sz && delta == 0) { + memcpy(offset_index_get_mem(dest, d_start), + offset_index_get_mem(src, s_start), + dest->_.ele_sz * count); + } + else { + for (size_t i = 0; i < count; i++) { + uint32_t value = offset_index_get_const(src, s_start + i); + + value += delta; + offset_index_set(dest, d_start + i, value); + } + } +} + +void +offset_index_append_size(offset_index *offidx, 
uint32_t delta) +{ + uint32_t filled = offset_index_get_filled(offidx); + + if (filled == offidx->_.ele_count) { + return; + } + + uint32_t last = offset_index_get_const(offidx, filled - 1); + + offset_index_set_filled(offidx, filled + 1); + offset_index_set(offidx, filled, last + delta); +} + +bool +offset_index_find_items(offset_index *full_offidx, + cdt_find_items_idxs_type find_type, as_unpacker *items_pk, + order_index *items_ordidx_r, bool inverted, uint64_t *rm_mask, + uint32_t *rm_count_r, order_index *rm_ranks_r) +{ + bool (*unpack_fn)(as_unpacker *pk, cdt_payload *payload_r); + uint32_t items_count = items_ordidx_r->_.ele_count; + define_offset_index(items_offidx, items_pk->buffer + items_pk->offset, + items_pk->length - items_pk->offset, items_count); + + switch (find_type) { + case CDT_FIND_ITEMS_IDXS_FOR_LIST_VALUE: + unpack_fn = unpack_list_value; + break; + case CDT_FIND_ITEMS_IDXS_FOR_MAP_KEY: + unpack_fn = unpack_map_key; + break; + case CDT_FIND_ITEMS_IDXS_FOR_MAP_VALUE: + unpack_fn = unpack_map_value; + break; + default: + cf_crash(AS_PARTICLE, "bad input"); + return false; // dummy return to quash warning + } + + if (! list_full_offset_index_fill_all(&items_offidx)) { + cf_warning(AS_PARTICLE, "offset_index_find_items() invalid parameter key list"); + return false; + } + + bool success = list_order_index_sort(items_ordidx_r, &items_offidx, + AS_CDT_SORT_ASCENDING); + + cf_assert(success, AS_PARTICLE, "offset_index_find_items() sort failed after index filled"); + + uint32_t rm_count = 0; + + as_unpacker pk = { + .buffer = full_offidx->contents, + .length = full_offidx->content_sz + }; + + if (rm_ranks_r) { + order_index_clear(rm_ranks_r); + } + + for (uint32_t i = 0; i < full_offidx->_.ele_count; i++) { + cdt_payload value; + + if (! unpack_fn(&pk, &value)) { + cf_warning(AS_PARTICLE, "offset_index_find_items() invalid msgpack in unpack_fn()"); + return false; + } + + if (! 
offset_index_set_next(full_offidx, i + 1, (uint32_t)pk.offset)) { + cf_warning(AS_PARTICLE, "offset_index_find_items() invalid msgpack in offset_index_set_next()"); + return false; + } + + order_index_find find = { + .count = items_count, + .target = items_count + (rm_ranks_r != NULL ? 0 : 1) + }; + + if (! order_index_find_rank_by_value(items_ordidx_r, &value, + &items_offidx, &find)) { + cf_warning(AS_PARTICLE, "offset_index_find_items() invalid items list"); + return false; + } + + if (rm_ranks_r) { + uint32_t vl_rank = find.result; + + if (find.found) { + uint32_t idx = order_index_get(items_ordidx_r, find.result); + + order_index_incr(rm_ranks_r, (idx * 2) + 1); + vl_rank++; + } + + if (vl_rank != items_count) { + uint32_t idx = order_index_get(items_ordidx_r, vl_rank); + + order_index_incr(rm_ranks_r, idx * 2); + } + } + + if (! inverted) { + if (find.found) { + cdt_idx_mask_set(rm_mask, i); + rm_count++; + } + } + else if (! find.found) { + cdt_idx_mask_set(rm_mask, i); + rm_count++; + } + } + + if (rm_ranks_r) { + for (uint32_t i = 1; i < items_count; i++) { + uint32_t idx0 = order_index_get(items_ordidx_r, i - 1); + uint32_t idx1 = order_index_get(items_ordidx_r, i); + uint32_t rank0 = order_index_get(rm_ranks_r, idx0 * 2); + uint32_t rank1 = order_index_get(rm_ranks_r, idx1 * 2); + + order_index_set(rm_ranks_r, idx1 * 2, rank0 + rank1); + } + } + + *rm_count_r = rm_count; + + return true; +} + +void * +offset_index_get_mem(const offset_index *offidx, uint32_t index) +{ + return msgpacked_index_get_mem((msgpacked_index *)offidx, index); +} + +uint32_t +offset_index_size(const offset_index *offidx) +{ + return msgpacked_index_size((const msgpacked_index *)offidx); +} + +bool +offset_index_is_null(const offset_index *offidx) +{ + return offidx->_.ptr == NULL; +} + +bool +offset_index_is_valid(const offset_index *offidx) +{ + return offidx->_.ptr != NULL; +} + +bool +offset_index_is_full(const offset_index *offidx) +{ + if (offset_index_is_null(offidx)) { + 
return false; + } + + if (offidx->_.ele_count == 0) { + return true; + } + + uint32_t filled = offset_index_get_filled(offidx); + + cf_assert(filled <= offidx->_.ele_count, AS_PARTICLE, "filled(%u) > ele_count(%u)", filled, offidx->_.ele_count); + + if (filled == offidx->_.ele_count) { + return true; + } + + return false; +} + +uint32_t +offset_index_get_const(const offset_index *offidx, uint32_t idx) +{ + if (idx == 0) { + return 0; + } + + if (idx == offidx->_.ele_count) { + return offidx->content_sz; + } + + if (idx >= offset_index_get_filled(offidx)) { + offset_index_print(offidx, "offset_index_get_const() offidx"); + print_packed(offidx->contents, offidx->content_sz, "offset_index_get_const() offidx->ele_start"); + cf_crash(AS_PARTICLE, "offset_index_get_const() idx=%u >= filled=%u ele_count=%u", idx, offset_index_get_filled(offidx), offidx->_.ele_count); + } + + return msgpacked_index_get((const msgpacked_index *)offidx, idx); +} + +uint32_t +offset_index_get_delta_const(const offset_index *offidx, uint32_t index) +{ + uint32_t offset = offset_index_get_const(offidx, index); + + if (index == offidx->_.ele_count - 1) { + return offidx->content_sz - offset; + } + + return offset_index_get_const(offidx, index + 1) - offset; +} + +uint32_t +offset_index_get_filled(const offset_index *offidx) +{ + if (offidx->_.ele_count == 0) { + return 1; + } + + return msgpacked_index_get((const msgpacked_index *)offidx, 0); +} + +void +offset_index_print(const offset_index *offidx, const char *name) +{ + if (! 
name) { + name = "offset"; + } + + msgpacked_index_print((msgpacked_index *)offidx, name); +} + +void +offset_index_delta_print(const offset_index *offidx, const char *name) +{ + size_t ele_count = offidx->_.ele_count; + char buf[1024]; + char *ptr = buf; + + if (offidx->_.ptr) { + for (size_t i = 0; i < ele_count; i++) { + if (buf + 1024 - ptr < 12) { + break; + } + + ptr += sprintf(ptr, "%u, ", offset_index_get_delta_const(offidx, i)); + } + + if (ele_count > 0) { + ptr -= 2; + } + + *ptr = '\0'; + } + else { + strcpy(buf, "(null)"); + } + + cf_warning(AS_PARTICLE, "%s: delta_off[%zu]={%s} %u", name, ele_count, buf, offidx->content_sz); +} + + +//========================================================== +// order_index +// + +static inline uint32_t +order_index_ele_sz(uint32_t max_idx) +{ + // Allow for values [0, ele_count] for ele_count to indicate invalid values. + if (max_idx < (1 << 8)) { + return 1; + } + else if (max_idx < (1 << 16)) { + return 2; + } + else if (max_idx < (1 << 24)) { + return 3; + } + + return 4; +} + +void +order_index_init(order_index *ordidx, uint8_t *ptr, uint32_t ele_count) +{ + ordidx->_.ele_count = ele_count; + ordidx->_.ele_sz = order_index_ele_sz(ele_count); + ordidx->_.ptr = ptr; + ordidx->max_idx = ele_count; +} + +void +order_index_init2(order_index *ordidx, uint8_t *ptr, uint32_t max_idx, + uint32_t ele_count) +{ + ordidx->_.ele_count = ele_count; + ordidx->_.ele_sz = order_index_ele_sz(max_idx); + ordidx->_.ptr = ptr; + ordidx->max_idx = max_idx; +} + +void +order_index_init_ref(order_index *dst, const order_index *src, uint32_t start, + uint32_t count) +{ + order_index_init2(dst, order_index_get_mem(src, start), src->max_idx, + count); +} + +void +order_index_set(order_index *ordidx, uint32_t idx, uint32_t value) +{ + msgpacked_index_set((msgpacked_index *)ordidx, idx, value); +} + +void +order_index_set_ptr(order_index *ordidx, uint8_t *ptr) +{ + msgpacked_index_set_ptr((msgpacked_index *)ordidx, ptr); +} + +void 
+order_index_incr(order_index *ordidx, uint32_t idx) +{ + msgpacked_index_incr((msgpacked_index *)ordidx, idx); +} + +void +order_index_clear(order_index *ordidx) +{ + memset(ordidx->_.ptr, 0, order_index_size(ordidx)); +} + +bool +order_index_sorted_mark_dup_eles(order_index *ordidx, + const offset_index *full_offidx, uint32_t *count_r, uint32_t *sz_r) +{ + cf_assert(count_r, AS_PARTICLE, "count_r NULL"); + cf_assert(sz_r, AS_PARTICLE, "sz_r NULL"); + + as_unpacker pk = { + .buffer = full_offidx->contents, + .length = full_offidx->content_sz + }; + + as_unpacker prev = pk; + uint32_t prev_idx = order_index_get(ordidx, 0); + uint32_t ele_count = full_offidx->_.ele_count; + + prev.offset = offset_index_get_const(full_offidx, prev_idx); + *count_r = 0; + *sz_r = 0; + + for (uint32_t i = 1; i < ele_count; i++) { + uint32_t idx = order_index_get(ordidx, i); + uint32_t off = offset_index_get_const(full_offidx, idx); + + pk.offset = off; + + msgpack_compare_t cmp = as_unpack_compare(&prev, &pk); + + if (cmp == MSGPACK_COMPARE_EQUAL) { + (*sz_r) += pk.offset - off; + (*count_r)++; + order_index_set(ordidx, i, ele_count); + } + else if (cmp == MSGPACK_COMPARE_LESS) { + // no-op + } + else { + return false; + } + + prev.offset = off; + } + + return true; +} + +uint32_t +order_index_size(const order_index *ordidx) +{ + return msgpacked_index_size((const msgpacked_index *)ordidx); +} + +bool +order_index_is_null(const order_index *ordidx) +{ + return ordidx->_.ptr == NULL; +} + +bool +order_index_is_valid(const order_index *ordidx) +{ + return ordidx->_.ptr != NULL; +} + +bool +order_index_is_filled(const order_index *ordidx) +{ + if (! order_index_is_valid(ordidx)) { + return false; + } + + if (ordidx->_.ele_count > 0 && + order_index_get(ordidx, 0) >= ordidx->_.ele_count) { + return false; + } + + return true; +} + +// Get pointer at index. 
+void * +order_index_get_mem(const order_index *ordidx, uint32_t index) +{ + return msgpacked_index_get_mem((const msgpacked_index *)ordidx, index); +} + +uint32_t +order_index_ptr2value(const order_index *ordidx, const void *ptr) +{ + return msgpacked_index_ptr2value((const msgpacked_index *)ordidx, ptr); +} + +uint32_t +order_index_get(const order_index *ordidx, uint32_t index) +{ + return msgpacked_index_get((const msgpacked_index *)ordidx, index); +} + +// Find (closest) rank given value. +// Find closest rank for find->idx. +// target == 0 means find first instance of value. +// target == ele_count means find last instance of value. +// target > ele_count means don't check idx. +// Return true success. +bool +order_index_find_rank_by_value(const order_index *ordidx, + const cdt_payload *value, const offset_index *full_offidx, + order_index_find *find) +{ + uint32_t ele_count = full_offidx->_.ele_count; + + find->found = false; + + if (ele_count == 0 || find->count == 0) { + find->result = ele_count; + return true; + } + + uint32_t lower = find->start; + uint32_t upper = find->start + find->count; + uint32_t rank = find->start + find->count / 2; + + as_unpacker pk_value = { + .buffer = value->ptr, + .length = value->sz + }; + + as_unpacker pk_buf = { + .buffer = full_offidx->contents, + .length = full_offidx->content_sz + }; + + while (true) { + uint32_t idx = ordidx ? 
order_index_get(ordidx, rank) : rank; + + pk_value.offset = 0; // reset + pk_buf.offset = offset_index_get_const(full_offidx, idx); + + msgpack_compare_t cmp = as_unpack_compare(&pk_value, &pk_buf); + + if (cmp == MSGPACK_COMPARE_EQUAL) { + find->found = true; + + if (find->target > ele_count) { // means don't check + break; + } + + if (find->target < idx) { + cmp = MSGPACK_COMPARE_LESS; + } + else if (find->target > idx) { + if (rank == upper - 1) { + break; + } + + cmp = MSGPACK_COMPARE_GREATER; + } + else { + break; + } + } + + if (cmp == MSGPACK_COMPARE_GREATER) { + if (rank >= upper - 1) { + rank++; + break; + } + + lower = rank + (find->found ? 0 : 1); + rank += upper; + rank /= 2; + } + else if (cmp == MSGPACK_COMPARE_LESS) { + if (rank == lower) { + break; + } + + upper = rank; + rank += lower; + rank /= 2; + } + else { + return false; + } + } + + find->result = rank; + + return true; +} + +uint32_t +order_index_get_ele_size(const order_index *ordidx, uint32_t count, + const offset_index *full_offidx) +{ + uint32_t sz = 0; + + for (uint32_t i = 0; i < count; i++) { + uint32_t idx = order_index_get(ordidx, i); + + if (idx == ordidx->max_idx) { + continue; + } + + sz += offset_index_get_delta_const(full_offidx, idx); + } + + return sz; +} + +uint8_t * +order_index_write_eles(const order_index *ordidx, uint32_t count, + const offset_index *full_offidx, uint8_t *ptr, bool invert) +{ + uint32_t start = 0; + uint32_t offset = 0; + uint32_t sz = 0; + + for (uint32_t i = 0; i < count; i++) { + uint32_t idx = order_index_get(ordidx, i); + + if (idx == ordidx->max_idx) { + continue; + } + + offset = offset_index_get_const(full_offidx, idx); + sz = offset_index_get_delta_const(full_offidx, idx); + + if (! 
invert) { + memcpy(ptr, full_offidx->contents + offset, sz); + ptr += sz; + } + else { + uint32_t invert_sz = offset - start; + + if (invert_sz != 0) { + memcpy(ptr, full_offidx->contents + start, invert_sz); + ptr += invert_sz; + } + } + + start = offset + sz; + } + + if (! invert) { + return ptr; + } + + uint32_t invert_sz = full_offidx->content_sz - start; + + memcpy(ptr, full_offidx->contents + start, invert_sz); + + return ptr + invert_sz; +} + +uint32_t +order_index_adjust_value(const order_index_adjust *via, uint32_t src) +{ + if (via) { + return via->f(via, src); + } + + return src; +} + +void +order_index_copy(order_index *dest, const order_index *src, uint32_t d_start, + uint32_t s_start, uint32_t count, const order_index_adjust *adjust) +{ + if (dest->_.ele_sz == src->_.ele_sz && ! adjust) { + memcpy(order_index_get_mem(dest, d_start), + order_index_get_mem(src, s_start), + src->_.ele_sz * count); + } + else { + for (uint32_t i = 0; i < count; i++) { + uint32_t value = order_index_get(src, s_start + i); + + value = order_index_adjust_value(adjust, value); + order_index_set(dest, d_start + i, value); + } + } +} + +size_t +order_index_calc_size(uint32_t max_idx, uint32_t ele_count) +{ + return order_index_ele_sz(max_idx) * ele_count; +} + +void +order_index_print(const order_index *ordidx, const char *name) +{ + if (! 
name) { + name = "value"; + } + + msgpacked_index_print(&ordidx->_, name); +} + + +//========================================================== +// order_heap +// + +bool +order_heap_init_build_by_range(order_heap *heap, uint8_t *heap_mem, + uint32_t idx, uint32_t count, uint32_t ele_count, + order_heap_compare_fn cmp_fn, const void *udata) +{ + uint32_t tail_distance = ele_count - idx - count; + uint32_t discard; + msgpack_compare_t cmp; + + if (idx <= tail_distance) { + cmp = MSGPACK_COMPARE_LESS; // min k + discard = idx; + } + else { + cmp = MSGPACK_COMPARE_GREATER; // max k + discard = tail_distance; + } + + order_index_init(&heap->_, heap_mem, ele_count); + heap->filled = 0; + heap->userdata = udata; + heap->cmp = cmp; + heap->cmp_fn = cmp_fn; + order_heap_build(heap, true); + + if (! order_heap_order_at_end(heap, count + discard)) { + return false; + } + + return true; +} + +void +order_heap_swap(order_heap *heap, uint32_t index1, uint32_t index2) +{ + uint32_t temp = order_heap_get(heap, index1); + order_heap_set(heap, index1, order_heap_get(heap, index2)); + order_heap_set(heap, index2, temp); +} + +bool +order_heap_remove_top(order_heap *heap) +{ + if (heap->filled == 0) { + return true; + } + + uint32_t index = order_heap_get(heap, (heap->filled--) - 1); + + return order_heap_replace_top(heap, index); +} + +bool +order_heap_replace_top(order_heap *heap, uint32_t value) +{ + order_heap_set(heap, 0, value); + + return order_heap_heapify(heap, 0); +} + +bool +order_heap_heapify(order_heap *heap, uint32_t index) +{ + while (true) { + uint32_t child1 = 2 * index + 1; + uint32_t child2 = 2 * index + 2; + uint32_t child; + + if (child1 >= heap->filled) { + break; + } + + if (child2 >= heap->filled) { + child = child1; + } + else { + msgpack_compare_t cmp = heap->cmp_fn(heap->userdata, + order_heap_get(heap, child1), + order_heap_get(heap, child2)); + + if (cmp == MSGPACK_COMPARE_ERROR) { + return false; + } + + if (cmp == heap->cmp || cmp == 
MSGPACK_COMPARE_EQUAL) { + child = child1; + } + else { + child = child2; + } + } + + msgpack_compare_t cmp = heap->cmp_fn(heap->userdata, + order_heap_get(heap, child), + order_heap_get(heap, index)); + + if (cmp == MSGPACK_COMPARE_ERROR) { + return false; + } + + if (cmp == heap->cmp) { + order_heap_swap(heap, index, child); + index = child; + } + else { + break; + } + } + + return true; +} + +// O(n) +bool +order_heap_build(order_heap *heap, bool init) +{ + if (init) { + heap->filled = heap->_._.ele_count; + + for (size_t i = 0; i < heap->filled; i++) { + order_heap_set(heap, i, i); + } + } + + int64_t start = (int64_t)heap->filled / 2 - 1; + + for (int64_t i = start; i >= 0; i--) { + if (! order_heap_heapify(heap, (uint32_t)i)) { + return false; + } + } + + return true; +} + +bool +order_heap_order_at_end(order_heap *heap, uint32_t count) +{ + uint32_t end_index = heap->filled - 1; + + for (uint32_t i = 0; i < count; i++) { + uint32_t value = order_heap_get(heap, 0); + + if (! order_heap_remove_top(heap)) { + return false; + } + + order_heap_set(heap, end_index--, value); + } + + cf_assert(heap->filled == end_index + 1, AS_PARTICLE, "FIXME"); // FIXME + heap->filled = end_index + 1; + + return true; +} + +// Reverse order of end indexes. 
+void +order_heap_reverse_end(order_heap *heap, uint32_t count) +{ + uint32_t start = heap->filled; + uint32_t end = start + count; + uint32_t stop = (start + end) / 2; + + end--; + + for (uint32_t i = start; i < stop; i++) { + uint32_t left = order_heap_get(heap, i); + uint32_t right = order_heap_get(heap, end); + + order_heap_set(heap, end--, left); + order_heap_set(heap, i, right); + } +} + +void +order_heap_print(const order_heap *heap) +{ + order_index_print(&heap->_, "heap"); +} + + +//========================================================== +// cdt_idx_mask +// + +size_t +cdt_idx_mask_count(uint32_t ele_count) +{ + return (ele_count + 63) / 64; +} + +void +cdt_idx_mask_init(uint64_t *mask, uint32_t ele_count) +{ + memset(mask, 0, cdt_idx_mask_count(ele_count) * sizeof(uint64_t)); +} + +void +cdt_idx_mask_set(uint64_t *mask, uint32_t idx) +{ + uint32_t shift = idx % 64; + + mask[idx / 64] |= 1ULL << shift; +} + +void +cdt_idx_mask_set_by_ordidx(uint64_t *mask, const order_index *ordidx, + uint32_t start, uint32_t count, bool inverted) +{ + for (uint32_t i = 0; i < count; i++) { + cdt_idx_mask_set(mask, order_index_get(ordidx, start + i)); + } + + if (inverted) { + cdt_idx_mask_invert(mask, ordidx->max_idx); + } +} + +void +cdt_idx_mask_set_by_irc(uint64_t *mask, const order_index *irc, + const order_index *idx_map, bool inverted) +{ + uint32_t items_count = irc->_.ele_count / 2; + + for (uint32_t i = 0; i < items_count; i++) { + uint32_t rank = order_index_get(irc, 2 * i); + uint32_t count = order_index_get(irc, (2 * i) + 1); + + if (count == 0) { + continue; + } + + uint32_t end = rank + count; + + for (uint32_t j = rank; j < end; j++) { + cdt_idx_mask_set(mask, idx_map ? 
order_index_get(idx_map, j) : j); + } + } + + if (inverted) { + cdt_idx_mask_invert(mask, irc->max_idx); + } +} + +void +cdt_idx_mask_invert(uint64_t *mask, uint32_t ele_count) +{ + uint32_t mask_count = cdt_idx_mask_count(ele_count); + + for (uint32_t i = 0; i < mask_count; i++) { + mask[i] = ~mask[i]; + } +} + +uint64_t +cdt_idx_mask_get(const uint64_t *mask, uint32_t idx) +{ + return mask[idx / 64]; +} + +bool +cdt_idx_mask_is_set(const uint64_t *mask, uint32_t idx) +{ + uint32_t shift = idx % 64; + + return (mask[idx / 64] & (1ULL << shift)) != 0; +} + +// Find first 1 or 0. +uint32_t +cdt_idx_mask_find(const uint64_t *mask, uint32_t start, uint32_t end, + bool is_find0) +{ + cf_assert(start <= end, AS_PARTICLE, "start %u > end %u", start, end); + + if (start == end) { + return end; + } + + uint32_t offset = start % 64; + uint32_t i = start / 64; + uint64_t bit_mask = ~((1ULL << offset) - 1); + uint64_t bits = (is_find0 ? ~mask[i] : mask[i]) & bit_mask; + uint32_t count = cf_lsb64(bits); + + if (count != 64) { + offset = start - offset + count; + + if (offset > end) { + return end; + } + + return offset; + } + + uint32_t i_end = (end + 63) / 64; + + for (i++; i < i_end; i++) { + count = cf_lsb64(is_find0 ? ~mask[i] : mask[i]); + + if (count != 64) { + break; + } + } + + offset = (i * 64) + count; + + if (offset > end) { + return end; + } + + return offset; +} + +uint8_t * +cdt_idx_mask_write_eles(const uint64_t *mask, uint32_t count, + const offset_index *full_offidx, uint8_t *ptr, bool invert) +{ + if (count == 0) { + if (! 
invert) { + return ptr; + } + + memcpy(ptr, full_offidx->contents, full_offidx->content_sz); + return ptr + full_offidx->content_sz; + } + + uint32_t ele_count = full_offidx->_.ele_count; + uint32_t start_offset = 0; + uint32_t idx = 0; + uint32_t count_left = count; + + while (idx < ele_count) { + uint32_t idx0 = cdt_idx_mask_find(mask, idx, ele_count, false); + + cf_assert(idx0 < ele_count, AS_PARTICLE, "idx0 %u out of bounds from idx %u ele_count %u", idx0, idx, ele_count); + idx = cdt_idx_mask_find(mask, idx0 + 1, ele_count, true); + + if (idx - idx0 > count_left) { + idx = idx0 + count_left; + } + + uint32_t offset0 = offset_index_get_const(full_offidx, idx0); + uint32_t offset1 = offset_index_get_const(full_offidx, idx); + + if (invert) { + uint32_t sz = offset0 - start_offset; + + memcpy(ptr, full_offidx->contents + start_offset, sz); + ptr += sz; + start_offset = offset1; + } + else { + uint32_t sz = offset1 - offset0; + + memcpy(ptr, full_offidx->contents + offset0, sz); + ptr += sz; + } + + count_left -= idx - idx0; + + if (count_left == 0) { + break; + } + + idx++; + } + + if (invert) { + uint32_t sz = full_offidx->content_sz - start_offset; + + memcpy(ptr, full_offidx->contents + start_offset, sz); + ptr += sz; + } + + return ptr; +} + +uint32_t +cdt_idx_mask_get_content_sz(const uint64_t *mask, uint32_t count, + const offset_index *full_offidx) +{ + uint32_t sz = 0; + uint32_t idx = 0; + uint32_t ele_count = full_offidx->_.ele_count; + + for (uint32_t i = 0; i < count; i++) { + idx = cdt_idx_mask_find(mask, idx, ele_count, false); + sz += offset_index_get_delta_const(full_offidx, idx); + idx++; + } + + return sz; +} + +void +cdt_idx_mask_print(const uint64_t *mask, uint32_t ele_count, const char *name) +{ + if (! 
name) { + name = "mask"; + } + + size_t max = (ele_count + 63) / 64; + char buf[1024]; + char *ptr = buf; + + for (size_t i = 0; i < max; i++) { + if (buf + 1024 - ptr < 18) { + break; + } + + ptr += sprintf(ptr, "%016lX, ", mask[i]); + } + + if (ele_count != 0) { + ptr -= 2; + } + + *ptr = '\0'; + + cf_warning(AS_PARTICLE, "%s: index[%u]={%s}", name, ele_count, buf); +} + + +//========================================================== +// list +// + +bool +list_param_parse(const cdt_payload *items, as_unpacker *pk, uint32_t *count_r) +{ + pk->buffer = items->ptr; + pk->offset = 0; + pk->length = items->sz; + + int64_t items_hdr = as_unpack_list_header_element_count(pk); + + if (items_hdr > 0 && as_unpack_peek_is_ext(pk)) { + if (as_unpack_size(pk) <= 0) { + cf_warning(AS_PARTICLE, "list_param_parse() invalid parameter"); + return false; + } + + items_hdr--; + } + + if (items_hdr < 0 || items_hdr > CDT_MAX_PARAM_LIST_COUNT) { + cf_warning(AS_PARTICLE, "list_param_parse() invalid param items_hdr %ld", items_hdr); + return false; + } + + *count_r = (uint32_t)items_hdr; + + return true; +} + + +//========================================================== +// Debugging support. 
+// + +void +print_hex(const uint8_t *packed, uint32_t packed_sz, char *buf, uint32_t buf_sz) +{ + uint32_t n = (buf_sz - 3) / 2; + + if (n > packed_sz) { + n = packed_sz; + buf[buf_sz - 3] = '.'; + buf[buf_sz - 2] = '.'; + buf[buf_sz - 1] = '\0'; + } + + char *ptr = (char *)buf; + + for (int i = 0; i < n; i++) { + sprintf(ptr, "%02X", packed[i]); + ptr += 2; + } +} + +void +print_packed(const uint8_t *packed, uint32_t sz, const char *name) +{ + cf_warning(AS_PARTICLE, "%s: data=%p sz=%u", name, packed, sz); + + const uint32_t limit = 256; + uint32_t n = (sz + limit - 1) / limit; + uint32_t line_sz = limit; + char mem[1024]; + + for (uint32_t i = 0; i < n; i++) { + if (i == n - 1) { + line_sz = sz % limit; + } + + print_hex(packed + limit * i, line_sz, mem, sizeof(mem)); + cf_warning(AS_PARTICLE, "%s:%0X: [%s]", name, i, mem); + } +} + +void +cdt_bin_print(const as_bin *b, const char *name) +{ + typedef struct { + uint8_t type; + uint32_t sz; + uint8_t data[]; + } __attribute__ ((__packed__)) cdt_mem; + + const cdt_mem *p = (const cdt_mem *)b->particle; + uint8_t bintype = as_bin_get_particle_type(b); + + if (! p || (bintype != AS_PARTICLE_TYPE_MAP && + bintype != AS_PARTICLE_TYPE_LIST)) { + cf_warning(AS_PARTICLE, "%s: particle NULL type %u", name, bintype); + return; + } + + cf_warning(AS_PARTICLE, "%s: btype %u data=%p sz=%u type=%d", name, bintype, p->data, p->sz, p->type); + char buf[4096]; + print_hex(p->data, p->sz, buf, 4096); + cf_warning(AS_PARTICLE, "%s: buf=%s", name, buf); +} diff --git a/as/src/base/cfg.c b/as/src/base/cfg.c new file mode 100644 index 00000000..b1f54c2d --- /dev/null +++ b/as/src/base/cfg.c @@ -0,0 +1,4671 @@ +/* + * cfg.c + * + * Copyright (C) 2008-2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. 
+ * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#include "base/cfg.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "aerospike/mod_lua_config.h" +#include "citrusleaf/alloc.h" +#include "citrusleaf/cf_atomic.h" +#include "citrusleaf/cf_clock.h" +#include "citrusleaf/cf_vector.h" + +#include "bits.h" +#include "cf_str.h" +#include "dynbuf.h" +#include "fault.h" +#include "hardware.h" +#include "hist.h" +#include "hist_track.h" +#include "msg.h" +#include "node.h" +#include "olock.h" +#include "socket.h" +#include "tls.h" + +#include "base/datamodel.h" +#include "base/proto.h" +#include "base/secondary_index.h" +#include "base/security_config.h" +#include "base/thr_demarshal.h" +#include "base/thr_info.h" +#include "base/thr_info_port.h" +#include "base/thr_query.h" +#include "base/thr_sindex.h" +#include "base/thr_tsvc.h" +#include "base/transaction_policy.h" +#include "base/xdr_config.h" +#include "base/xdr_serverside.h" +#include "fabric/fabric.h" +#include "fabric/hb.h" +#include "fabric/migrate.h" +#include "fabric/partition_balance.h" +#include "storage/drv_ssd.h" + + +//========================================================== +// Globals. +// + +// The runtime configuration instance. 
+as_config g_config; + + +//========================================================== +// Forward declarations. +// + +void init_addr_list(cf_addr_list* addrs); +void add_addr(const char* name, cf_addr_list* addrs); +void add_tls_peer_name(const char* name, cf_serv_spec* spec); +void copy_addrs(const cf_addr_list* from, cf_addr_list* to); +void default_addrs(cf_addr_list* one, cf_addr_list* two); +void bind_to_access(const cf_serv_spec* from, cf_addr_list* to); +void cfg_add_addr_bind(const char* name, cf_serv_spec* spec); +void cfg_add_addr_std(const char* name, cf_serv_spec* spec); +void cfg_add_addr_alt(const char* name, cf_serv_spec* spec); +void cfg_mserv_config_from_addrs(cf_addr_list* addrs, cf_addr_list* bind_addrs, cf_mserv_cfg* serv_cfg, cf_ip_port port, cf_sock_owner owner, uint8_t ttl); +void cfg_serv_spec_to_bind(const cf_serv_spec* spec, const cf_serv_spec* def_spec, cf_serv_cfg* bind, cf_sock_owner owner); +void cfg_serv_spec_std_to_access(const cf_serv_spec* spec, cf_addr_list* access); +void cfg_serv_spec_alt_to_access(const cf_serv_spec* spec, cf_addr_list* access); +void cfg_add_mesh_seed_addr_port(char* addr, cf_ip_port port, bool tls); +as_set* cfg_add_set(as_namespace* ns); +void cfg_add_storage_file(as_namespace* ns, char* file_name); +void cfg_add_storage_device(as_namespace* ns, char* device_name, char* shadow_name); +uint32_t cfg_obj_size_hist_max(uint32_t hist_max); +void cfg_set_cluster_name(char* cluster_name); +void create_and_check_hist_track(cf_hist_track** h, const char* name, histogram_scale scale); +void cfg_create_all_histograms(); +void cfg_init_serv_spec(cf_serv_spec* spec_p); +cf_tls_spec* cfg_create_tls_spec(as_config* cfg, char* name); +char* cfg_resolve_tls_name(char* tls_name, const char* cluster_name, const char* which); + +void xdr_cfg_add_datacenter(char* dc, uint32_t nsid); +void xdr_cfg_add_node_addr_port(dc_config_opt *dc_cfg, char* addr, int port); +void xdr_cfg_add_tls_node(dc_config_opt *dc_cfg, char* addr, char 
*tls_name, int port); + + +//========================================================== +// Helper - set as_config defaults. +// + +void +cfg_set_defaults() +{ + as_config* c = &g_config; + + memset(c, 0, sizeof(as_config)); + + cfg_init_serv_spec(&c->service); + cfg_init_serv_spec(&c->tls_service); + cfg_init_serv_spec(&c->hb_serv_spec); + cfg_init_serv_spec(&c->hb_tls_serv_spec); + cfg_init_serv_spec(&c->fabric); + cfg_init_serv_spec(&c->tls_fabric); + cfg_init_serv_spec(&c->info); + + c->paxos_single_replica_limit = 1; // by default all clusters obey replication counts + c->n_proto_fd_max = 15000; + c->n_batch_threads = 4; + c->batch_max_buffers_per_queue = 255; // maximum number of buffers allowed in a single queue + c->batch_max_requests = 5000; // maximum requests/digests in a single batch + c->batch_max_unused_buffers = 256; // maximum number of buffers allowed in batch buffer pool + c->batch_priority = 200; // # of rows between a quick context switch? + c->feature_key_file = "/etc/aerospike/features.conf"; + c->hist_track_back = 300; + c->hist_track_slice = 10; + c->n_info_threads = 16; + c->migrate_max_num_incoming = AS_MIGRATE_DEFAULT_MAX_NUM_INCOMING; // for receiver-side migration flow-control + c->n_migrate_threads = 1; + c->nsup_delete_sleep = 100; // 100 microseconds means a delete rate of 10k TPS + c->nsup_period = 120; // run nsup once every 2 minutes + c->nsup_startup_evict = true; + c->proto_fd_idle_ms = 60000; // 1 minute reaping of proto file descriptors + c->proto_slow_netio_sleep_ms = 1; // 1 ms sleep between retry for slow queries + c->run_as_daemon = true; // set false only to run in debugger & see console output + c->scan_max_active = 100; + c->scan_max_done = 100; + c->scan_max_udf_transactions = 32; + c->scan_threads = 4; + c->ticker_interval = 10; + c->transaction_max_ns = 1000 * 1000 * 1000; // 1 second + c->transaction_pending_limit = 20; + c->transaction_retry_ms = 1000 + 2; // 1 second + epsilon, so default timeout happens first + 
c->n_transaction_threads_per_queue = 4; + as_sindex_gconfig_default(c); + as_query_gconfig_default(c); + c->work_directory = "/opt/aerospike"; + c->debug_allocations = CF_ALLOC_DEBUG_NONE; + c->fabric_dump_msgs = false; + + // Network heartbeat defaults. + c->hb_config.mode = AS_HB_MODE_UNDEF; + c->hb_config.tx_interval = 150; + c->hb_config.max_intervals_missed = 10; + c->hb_config.protocol = AS_HB_PROTOCOL_V3; + c->hb_config.override_mtu = 0; + + // Fabric defaults. + c->n_fabric_channel_fds[AS_FABRIC_CHANNEL_BULK] = 2; + c->n_fabric_channel_recv_threads[AS_FABRIC_CHANNEL_BULK] = 4; + c->n_fabric_channel_fds[AS_FABRIC_CHANNEL_CTRL] = 1; + c->n_fabric_channel_recv_threads[AS_FABRIC_CHANNEL_CTRL] = 4; + c->n_fabric_channel_fds[AS_FABRIC_CHANNEL_META] = 1; + c->n_fabric_channel_recv_threads[AS_FABRIC_CHANNEL_META] = 4; + c->n_fabric_channel_fds[AS_FABRIC_CHANNEL_RW] = 8; + c->n_fabric_channel_recv_threads[AS_FABRIC_CHANNEL_RW] = 16; + c->fabric_keepalive_enabled = true; + c->fabric_keepalive_intvl = 1; // seconds + c->fabric_keepalive_probes = 10; // tries + c->fabric_keepalive_time = 1; // seconds + c->fabric_latency_max_ms = 5; // assume a one way latency of 5 milliseconds by default + c->fabric_recv_rearm_threshold = 1024; + c->n_fabric_send_threads = 8; + + // Clustering defaults. + c->clustering_config.cluster_size_min = 1; + c->clustering_config.clique_based_eviction_enabled = true; + + // XDR defaults. + for (int i = 0; i < AS_CLUSTER_SZ ; i++) { + c->xdr_peers_lst[i].node = 0; + + for (int j = 0; j < DC_MAX_NUM; j++) { + c->xdr_peers_lst[i].time[j] = 0; + } + + c->xdr_clmap[i] = 0; + } + + for (int j = 0; j < DC_MAX_NUM; j++) { + c->xdr_self_lastshiptime[j] = 0; + } + + // Mod-lua defaults. + c->mod_lua.server_mode = true; + c->mod_lua.cache_enabled = true; + strcpy(c->mod_lua.system_path, "/opt/aerospike/sys/udf/lua"); + strcpy(c->mod_lua.user_path, "/opt/aerospike/usr/udf/lua"); + + // TODO - security set default config API? 
+ c->sec_cfg.privilege_refresh_period = 60 * 5; // refresh socket privileges every 5 minutes + c->sec_cfg.syslog_local = AS_SYSLOG_NONE; +} + +//========================================================== +// All configuration items must have a switch case +// identifier somewhere in this enum. The order is not +// important, other than for organizational sanity. +// + +typedef enum { + // Generic: + // Token not found: + CASE_NOT_FOUND, + // Start of parsing context: + CASE_CONTEXT_BEGIN, + // End of parsing context: + CASE_CONTEXT_END, + + // Top-level options: + // In canonical configuration file order: + CASE_SERVICE_BEGIN, + CASE_LOGGING_BEGIN, + CASE_NETWORK_BEGIN, + CASE_NAMESPACE_BEGIN, + CASE_MOD_LUA_BEGIN, + CASE_CLUSTER_BEGIN, + // Enterprise-only: + CASE_SECURITY_BEGIN, + CASE_XDR_BEGIN, + + // Service options: + // Normally visible, in canonical configuration file order: + CASE_SERVICE_USER, + CASE_SERVICE_GROUP, + CASE_SERVICE_PAXOS_SINGLE_REPLICA_LIMIT, + CASE_SERVICE_PIDFILE, + CASE_SERVICE_CLIENT_FD_MAX, // renamed + CASE_SERVICE_PROTO_FD_MAX, + // Normally hidden: + CASE_SERVICE_ADVERTISE_IPV6, + CASE_SERVICE_AUTO_PIN, + CASE_SERVICE_BATCH_THREADS, + CASE_SERVICE_BATCH_MAX_BUFFERS_PER_QUEUE, + CASE_SERVICE_BATCH_MAX_REQUESTS, + CASE_SERVICE_BATCH_MAX_UNUSED_BUFFERS, + CASE_SERVICE_BATCH_PRIORITY, + CASE_SERVICE_BATCH_INDEX_THREADS, + CASE_SERVICE_CLUSTER_NAME, + CASE_SERVICE_ENABLE_BENCHMARKS_FABRIC, + CASE_SERVICE_ENABLE_BENCHMARKS_SVC, + CASE_SERVICE_ENABLE_HIST_INFO, + CASE_SERVICE_FEATURE_KEY_FILE, + CASE_SERVICE_HIST_TRACK_BACK, + CASE_SERVICE_HIST_TRACK_SLICE, + CASE_SERVICE_HIST_TRACK_THRESHOLDS, + CASE_SERVICE_INFO_THREADS, + CASE_SERVICE_LOG_LOCAL_TIME, + CASE_SERVICE_LOG_MILLIS, + CASE_SERVICE_MIGRATE_MAX_NUM_INCOMING, + CASE_SERVICE_MIGRATE_THREADS, + CASE_SERVICE_MIN_CLUSTER_SIZE, + CASE_SERVICE_NODE_ID, + CASE_SERVICE_NODE_ID_INTERFACE, + CASE_SERVICE_NSUP_DELETE_SLEEP, + CASE_SERVICE_NSUP_PERIOD, + CASE_SERVICE_NSUP_STARTUP_EVICT, + 
CASE_SERVICE_PROTO_FD_IDLE_MS, + CASE_SERVICE_QUERY_BATCH_SIZE, + CASE_SERVICE_QUERY_BUFPOOL_SIZE, + CASE_SERVICE_QUERY_IN_TRANSACTION_THREAD, + CASE_SERVICE_QUERY_LONG_Q_MAX_SIZE, + CASE_SERVICE_QUERY_PRE_RESERVE_PARTITIONS, + CASE_SERVICE_QUERY_PRIORITY, + CASE_SERVICE_QUERY_PRIORITY_SLEEP_US, + CASE_SERVICE_QUERY_REC_COUNT_BOUND, + CASE_SERVICE_QUERY_REQ_IN_QUERY_THREAD, + CASE_SERVICE_QUERY_REQ_MAX_INFLIGHT, + CASE_SERVICE_QUERY_SHORT_Q_MAX_SIZE, + CASE_SERVICE_QUERY_THREADS, + CASE_SERVICE_QUERY_THRESHOLD, + CASE_SERVICE_QUERY_UNTRACKED_TIME_MS, + CASE_SERVICE_QUERY_WORKER_THREADS, + CASE_SERVICE_RUN_AS_DAEMON, + CASE_SERVICE_SCAN_MAX_ACTIVE, + CASE_SERVICE_SCAN_MAX_DONE, + CASE_SERVICE_SCAN_MAX_UDF_TRANSACTIONS, + CASE_SERVICE_SCAN_THREADS, + CASE_SERVICE_SERVICE_THREADS, + CASE_SERVICE_SINDEX_BUILDER_THREADS, + CASE_SERVICE_SINDEX_GC_MAX_RATE, + CASE_SERVICE_SINDEX_GC_PERIOD, + CASE_SERVICE_TICKER_INTERVAL, + CASE_SERVICE_TRANSACTION_MAX_MS, + CASE_SERVICE_TRANSACTION_PENDING_LIMIT, + CASE_SERVICE_TRANSACTION_QUEUES, + CASE_SERVICE_TRANSACTION_RETRY_MS, + CASE_SERVICE_TRANSACTION_THREADS_PER_QUEUE, + CASE_SERVICE_WORK_DIRECTORY, + // For special debugging or bug-related repair: + CASE_SERVICE_DEBUG_ALLOCATIONS, + CASE_SERVICE_FABRIC_DUMP_MSGS, + CASE_SERVICE_PROLE_EXTRA_TTL, + // Obsoleted: + CASE_SERVICE_ALLOW_INLINE_TRANSACTIONS, + CASE_SERVICE_RESPOND_CLIENT_ON_MASTER_COMPLETION, + CASE_SERVICE_TRANSACTION_REPEATABLE_READ, + // Deprecated: + CASE_SERVICE_AUTO_DUN, + CASE_SERVICE_AUTO_UNDUN, + CASE_SERVICE_BATCH_RETRANSMIT, + CASE_SERVICE_CLIB_LIBRARY, + CASE_SERVICE_DEFRAG_QUEUE_ESCAPE, + CASE_SERVICE_DEFRAG_QUEUE_HWM, + CASE_SERVICE_DEFRAG_QUEUE_LWM, + CASE_SERVICE_DEFRAG_QUEUE_PRIORITY, + CASE_SERVICE_DUMP_MESSAGE_ABOVE_SIZE, + CASE_SERVICE_FABRIC_WORKERS, + CASE_SERVICE_FB_HEALTH_BAD_PCT, + CASE_SERVICE_FB_HEALTH_GOOD_PCT, + CASE_SERVICE_FB_HEALTH_MSG_PER_BURST, + CASE_SERVICE_FB_HEALTH_MSG_TIMEOUT, + CASE_SERVICE_GENERATION_DISABLE, + 
CASE_SERVICE_MAX_MSGS_PER_TYPE, + CASE_SERVICE_MIGRATE_READ_PRIORITY, + CASE_SERVICE_MIGRATE_READ_SLEEP, + CASE_SERVICE_MIGRATE_RX_LIFETIME_MS, + CASE_SERVICE_MIGRATE_XMIT_HWM, + CASE_SERVICE_MIGRATE_XMIT_LWM, + CASE_SERVICE_MIGRATE_PRIORITY, // renamed + CASE_SERVICE_MIGRATE_XMIT_PRIORITY, + CASE_SERVICE_MIGRATE_XMIT_SLEEP, + CASE_SERVICE_NSUP_AUTO_HWM, + CASE_SERVICE_NSUP_AUTO_HWM_PCT, + CASE_SERVICE_NSUP_MAX_DELETES, + CASE_SERVICE_NSUP_QUEUE_HWM, + CASE_SERVICE_NSUP_QUEUE_LWM, + CASE_SERVICE_NSUP_QUEUE_ESCAPE, + CASE_SERVICE_NSUP_REDUCE_PRIORITY, + CASE_SERVICE_NSUP_REDUCE_SLEEP, + CASE_SERVICE_NSUP_THREADS, + CASE_SERVICE_PAXOS_MAX_CLUSTER_SIZE, + CASE_SERVICE_PAXOS_PROTOCOL, + CASE_SERVICE_PAXOS_RECOVERY_POLICY, + CASE_SERVICE_PAXOS_RETRANSMIT_PERIOD, + CASE_SERVICE_REPLICATION_FIRE_AND_FORGET, + CASE_SERVICE_SCAN_MEMORY, + CASE_SERVICE_SCAN_PRIORITY, + CASE_SERVICE_SCAN_RETRANSMIT, + CASE_SERVICE_SCHEDULER_PRIORITY, + CASE_SERVICE_SCHEDULER_TYPE, + CASE_SERVICE_TRANSACTION_DUPLICATE_THREADS, + CASE_SERVICE_TRIAL_ACCOUNT_KEY, + CASE_SERVICE_UDF_RUNTIME_MAX_GMEMORY, + CASE_SERVICE_UDF_RUNTIME_MAX_MEMORY, + CASE_SERVICE_USE_QUEUE_PER_DEVICE, + CASE_SERVICE_WRITE_DUPLICATE_RESOLUTION_DISABLE, + + // Service auto-pin options (value tokens): + CASE_SERVICE_AUTO_PIN_NONE, + CASE_SERVICE_AUTO_PIN_CPU, + CASE_SERVICE_AUTO_PIN_NUMA, + + // Service debug-allocations options (value tokens): + CASE_SERVICE_DEBUG_ALLOCATIONS_NONE, + CASE_SERVICE_DEBUG_ALLOCATIONS_TRANSIENT, + CASE_SERVICE_DEBUG_ALLOCATIONS_PERSISTENT, + CASE_SERVICE_DEBUG_ALLOCATIONS_ALL, + + // Logging options: + // Normally visible: + CASE_LOG_FILE_BEGIN, + // Normally hidden: + CASE_LOG_CONSOLE_BEGIN, + + // Logging file options: + // Normally visible: + CASE_LOG_FILE_CONTEXT, + + // Logging console options: + // Normally visible: + CASE_LOG_CONSOLE_CONTEXT, + + // Network options: + // Normally visible, in canonical configuration file order: + CASE_NETWORK_SERVICE_BEGIN, + 
CASE_NETWORK_HEARTBEAT_BEGIN, + CASE_NETWORK_FABRIC_BEGIN, + CASE_NETWORK_INFO_BEGIN, + // Normally hidden: + CASE_NETWORK_TLS_BEGIN, + + // Network service options: + // Normally visible, in canonical configuration file order: + CASE_NETWORK_SERVICE_ADDRESS, + CASE_NETWORK_SERVICE_PORT, + // Normally hidden: + CASE_NETWORK_SERVICE_EXTERNAL_ADDRESS, // renamed + CASE_NETWORK_SERVICE_ACCESS_ADDRESS, + CASE_NETWORK_SERVICE_ACCESS_PORT, + CASE_NETWORK_SERVICE_ALTERNATE_ACCESS_ADDRESS, + CASE_NETWORK_SERVICE_ALTERNATE_ACCESS_PORT, + CASE_NETWORK_SERVICE_TLS_ACCESS_ADDRESS, + CASE_NETWORK_SERVICE_TLS_ACCESS_PORT, + CASE_NETWORK_SERVICE_TLS_ADDRESS, + CASE_NETWORK_SERVICE_TLS_ALTERNATE_ACCESS_ADDRESS, + CASE_NETWORK_SERVICE_TLS_ALTERNATE_ACCESS_PORT, + CASE_NETWORK_SERVICE_TLS_AUTHENTICATE_CLIENT, + CASE_NETWORK_SERVICE_TLS_NAME, + CASE_NETWORK_SERVICE_TLS_PORT, + // Obsoleted: + CASE_NETWORK_SERVICE_ALTERNATE_ADDRESS, + CASE_NETWORK_SERVICE_NETWORK_INTERFACE_NAME, + // Deprecated: + CASE_NETWORK_SERVICE_REUSE_ADDRESS, + + // Network heartbeat options: + // Normally visible, in canonical configuration file order: + CASE_NETWORK_HEARTBEAT_MODE, + CASE_NETWORK_HEARTBEAT_ADDRESS, + CASE_NETWORK_HEARTBEAT_MULTICAST_GROUP, + CASE_NETWORK_HEARTBEAT_PORT, + CASE_NETWORK_HEARTBEAT_MESH_SEED_ADDRESS_PORT, + CASE_NETWORK_HEARTBEAT_INTERVAL, + CASE_NETWORK_HEARTBEAT_TIMEOUT, + // Normally hidden: + CASE_NETWORK_HEARTBEAT_MTU, + CASE_NETWORK_HEARTBEAT_MCAST_TTL, // renamed + CASE_NETWORK_HEARTBEAT_MULTICAST_TTL, + CASE_NETWORK_HEARTBEAT_PROTOCOL, + CASE_NETWORK_HEARTBEAT_TLS_ADDRESS, + CASE_NETWORK_HEARTBEAT_TLS_MESH_SEED_ADDRESS_PORT, + CASE_NETWORK_HEARTBEAT_TLS_NAME, + CASE_NETWORK_HEARTBEAT_TLS_PORT, + // Obsoleted: + CASE_NETWORK_HEARTBEAT_INTERFACE_ADDRESS, + + // Network heartbeat mode options (value tokens): + CASE_NETWORK_HEARTBEAT_MODE_MESH, + CASE_NETWORK_HEARTBEAT_MODE_MULTICAST, + + // Network heartbeat protocol options (value tokens): + 
CASE_NETWORK_HEARTBEAT_PROTOCOL_NONE, + CASE_NETWORK_HEARTBEAT_PROTOCOL_V3, + + // Network fabric options: + // Normally visible, in canonical configuration file order: + CASE_NETWORK_FABRIC_ADDRESS, + CASE_NETWORK_FABRIC_PORT, + // Normally hidden: + CASE_NETWORK_FABRIC_CHANNEL_BULK_FDS, + CASE_NETWORK_FABRIC_CHANNEL_BULK_RECV_THREADS, + CASE_NETWORK_FABRIC_CHANNEL_CTRL_FDS, + CASE_NETWORK_FABRIC_CHANNEL_CTRL_RECV_THREADS, + CASE_NETWORK_FABRIC_CHANNEL_META_FDS, + CASE_NETWORK_FABRIC_CHANNEL_META_RECV_THREADS, + CASE_NETWORK_FABRIC_CHANNEL_RW_FDS, + CASE_NETWORK_FABRIC_CHANNEL_RW_RECV_THREADS, + CASE_NETWORK_FABRIC_KEEPALIVE_ENABLED, + CASE_NETWORK_FABRIC_KEEPALIVE_INTVL, + CASE_NETWORK_FABRIC_KEEPALIVE_PROBES, + CASE_NETWORK_FABRIC_KEEPALIVE_TIME, + CASE_NETWORK_FABRIC_LATENCY_MAX_MS, + CASE_NETWORK_FABRIC_RECV_REARM_THRESHOLD, + CASE_NETWORK_FABRIC_SEND_THREADS, + CASE_NETWORK_FABRIC_TLS_ADDRESS, + CASE_NETWORK_FABRIC_TLS_NAME, + CASE_NETWORK_FABRIC_TLS_PORT, + + // Network info options: + // Normally visible, in canonical configuration file order: + CASE_NETWORK_INFO_ADDRESS, + CASE_NETWORK_INFO_PORT, + // Deprecated: + CASE_NETWORK_INFO_ENABLE_FASTPATH, + + // Network TLS options: + CASE_NETWORK_TLS_CA_FILE, + CASE_NETWORK_TLS_CA_PATH, + CASE_NETWORK_TLS_CERT_BLACKLIST, + CASE_NETWORK_TLS_CERT_FILE, + CASE_NETWORK_TLS_CIPHER_SUITE, + CASE_NETWORK_TLS_KEY_FILE, + CASE_NETWORK_TLS_PROTOCOLS, + + // Namespace options: + // Normally visible, in canonical configuration file order: + CASE_NAMESPACE_REPLICATION_FACTOR, + CASE_NAMESPACE_LIMIT_SIZE, // renamed + CASE_NAMESPACE_MEMORY_SIZE, + CASE_NAMESPACE_DEFAULT_TTL, + CASE_NAMESPACE_STORAGE_ENGINE_BEGIN, + // For XDR only: + CASE_NAMESPACE_ENABLE_XDR, + CASE_NAMESPACE_SETS_ENABLE_XDR, + CASE_NAMESPACE_XDR_REMOTE_DATACENTER, + CASE_NAMESPACE_FORWARD_XDR_WRITES, + CASE_NAMESPACE_ALLOW_NONXDR_WRITES, + CASE_NAMESPACE_ALLOW_XDR_WRITES, + // Normally hidden: + CASE_NAMESPACE_COLD_START_EVICT_TTL, + 
CASE_NAMESPACE_CONFLICT_RESOLUTION_POLICY, + CASE_NAMESPACE_DATA_IN_INDEX, + CASE_NAMESPACE_DISABLE_WRITE_DUP_RES, + CASE_NAMESPACE_DISALLOW_NULL_SETNAME, + CASE_NAMESPACE_ENABLE_BENCHMARKS_BATCH_SUB, + CASE_NAMESPACE_ENABLE_BENCHMARKS_READ, + CASE_NAMESPACE_ENABLE_BENCHMARKS_UDF, + CASE_NAMESPACE_ENABLE_BENCHMARKS_UDF_SUB, + CASE_NAMESPACE_ENABLE_BENCHMARKS_WRITE, + CASE_NAMESPACE_ENABLE_HIST_PROXY, + CASE_NAMESPACE_EVICT_HIST_BUCKETS, + CASE_NAMESPACE_EVICT_TENTHS_PCT, + CASE_NAMESPACE_HIGH_WATER_DISK_PCT, + CASE_NAMESPACE_HIGH_WATER_MEMORY_PCT, + CASE_NAMESPACE_MAX_TTL, + CASE_NAMESPACE_MIGRATE_ORDER, + CASE_NAMESPACE_MIGRATE_RETRANSMIT_MS, + CASE_NAMESPACE_MIGRATE_SLEEP, + CASE_NAMESPACE_OBJ_SIZE_HIST_MAX, + CASE_NAMESPACE_PARTITION_TREE_LOCKS, + CASE_NAMESPACE_PARTITION_TREE_SPRIGS, + CASE_NAMESPACE_RACK_ID, + CASE_NAMESPACE_READ_CONSISTENCY_LEVEL_OVERRIDE, + CASE_NAMESPACE_SET_BEGIN, + CASE_NAMESPACE_SINDEX_BEGIN, + CASE_NAMESPACE_GEO2DSPHERE_WITHIN_BEGIN, + CASE_NAMESPACE_SINGLE_BIN, + CASE_NAMESPACE_STOP_WRITES_PCT, + CASE_NAMESPACE_STRONG_CONSISTENCY, + CASE_NAMESPACE_STRONG_CONSISTENCY_ALLOW_EXPUNGE, + CASE_NAMESPACE_TOMB_RAIDER_ELIGIBLE_AGE, + CASE_NAMESPACE_TOMB_RAIDER_PERIOD, + CASE_NAMESPACE_WRITE_COMMIT_LEVEL_OVERRIDE, + // Deprecated: + CASE_NAMESPACE_ALLOW_VERSIONS, + CASE_NAMESPACE_DEMO_READ_MULTIPLIER, + CASE_NAMESPACE_DEMO_WRITE_MULTIPLIER, + CASE_NAMESPACE_HIGH_WATER_PCT, + CASE_NAMESPACE_LOW_WATER_PCT, + CASE_NAMESPACE_SI_BEGIN, + + // Namespace conflict-resolution-policy options (value tokens): + CASE_NAMESPACE_CONFLICT_RESOLUTION_GENERATION, + CASE_NAMESPACE_CONFLICT_RESOLUTION_LAST_UPDATE_TIME, + + // Namespace read consistency level options: + CASE_NAMESPACE_READ_CONSISTENCY_ALL, + CASE_NAMESPACE_READ_CONSISTENCY_OFF, + CASE_NAMESPACE_READ_CONSISTENCY_ONE, + + // Namespace write commit level options: + CASE_NAMESPACE_WRITE_COMMIT_ALL, + CASE_NAMESPACE_WRITE_COMMIT_MASTER, + CASE_NAMESPACE_WRITE_COMMIT_OFF, + + // Namespace storage-engine 
options (value tokens): + CASE_NAMESPACE_STORAGE_MEMORY, + CASE_NAMESPACE_STORAGE_SSD, + CASE_NAMESPACE_STORAGE_DEVICE, + + // Namespace storage-engine device options: + // Normally visible, in canonical configuration file order: + CASE_NAMESPACE_STORAGE_DEVICE_DEVICE, + CASE_NAMESPACE_STORAGE_DEVICE_FILE, + CASE_NAMESPACE_STORAGE_DEVICE_FILESIZE, + CASE_NAMESPACE_STORAGE_DEVICE_SCHEDULER_MODE, + CASE_NAMESPACE_STORAGE_DEVICE_WRITE_BLOCK_SIZE, + CASE_NAMESPACE_STORAGE_DEVICE_MEMORY_ALL, // renamed + CASE_NAMESPACE_STORAGE_DEVICE_DATA_IN_MEMORY, + // Normally hidden: + CASE_NAMESPACE_STORAGE_DEVICE_COLD_START_EMPTY, + CASE_NAMESPACE_STORAGE_DEVICE_COMMIT_TO_DEVICE, + CASE_NAMESPACE_STORAGE_DEVICE_COMMIT_MIN_SIZE, + CASE_NAMESPACE_STORAGE_DEVICE_DEFRAG_LWM_PCT, + CASE_NAMESPACE_STORAGE_DEVICE_DEFRAG_QUEUE_MIN, + CASE_NAMESPACE_STORAGE_DEVICE_DEFRAG_SLEEP, + CASE_NAMESPACE_STORAGE_DEVICE_DEFRAG_STARTUP_MINIMUM, + CASE_NAMESPACE_STORAGE_DEVICE_DISABLE_ODIRECT, + CASE_NAMESPACE_STORAGE_DEVICE_ENABLE_BENCHMARKS_STORAGE, + CASE_NAMESPACE_STORAGE_DEVICE_ENABLE_OSYNC, + CASE_NAMESPACE_STORAGE_DEVICE_ENCRYPTION_KEY_FILE, + CASE_NAMESPACE_STORAGE_DEVICE_FLUSH_MAX_MS, + CASE_NAMESPACE_STORAGE_DEVICE_FSYNC_MAX_SEC, + CASE_NAMESPACE_STORAGE_DEVICE_MAX_WRITE_CACHE, + CASE_NAMESPACE_STORAGE_DEVICE_MIN_AVAIL_PCT, + CASE_NAMESPACE_STORAGE_DEVICE_POST_WRITE_QUEUE, + CASE_NAMESPACE_STORAGE_DEVICE_TOMB_RAIDER_SLEEP, + CASE_NAMESPACE_STORAGE_DEVICE_WRITE_THREADS, + // Deprecated: + CASE_NAMESPACE_STORAGE_DEVICE_DEFRAG_MAX_BLOCKS, + CASE_NAMESPACE_STORAGE_DEVICE_DEFRAG_PERIOD, + CASE_NAMESPACE_STORAGE_DEVICE_LOAD_AT_STARTUP, + CASE_NAMESPACE_STORAGE_DEVICE_PERSIST, + CASE_NAMESPACE_STORAGE_DEVICE_READONLY, + CASE_NAMESPACE_STORAGE_DEVICE_SIGNATURE, + CASE_NAMESPACE_STORAGE_DEVICE_WRITE_SMOOTHING_PERIOD, + + // Namespace set options: + CASE_NAMESPACE_SET_DISABLE_EVICTION, + CASE_NAMESPACE_SET_ENABLE_XDR, + CASE_NAMESPACE_SET_STOP_WRITES_COUNT, + // Deprecated: + 
CASE_NAMESPACE_SET_EVICT_HWM_COUNT, + CASE_NAMESPACE_SET_EVICT_HWM_PCT, + CASE_NAMESPACE_SET_STOP_WRITE_COUNT, + CASE_NAMESPACE_SET_STOP_WRITE_PCT, + + // Namespace set set-enable-xdr options (value tokens): + CASE_NAMESPACE_SET_ENABLE_XDR_USE_DEFAULT, + CASE_NAMESPACE_SET_ENABLE_XDR_FALSE, + CASE_NAMESPACE_SET_ENABLE_XDR_TRUE, + + // Namespace secondary-index options: + // Deprecated: + CASE_NAMESPACE_SI_GC_PERIOD, + CASE_NAMESPACE_SI_GC_MAX_UNITS, + CASE_NAMESPACE_SI_HISTOGRAM, + CASE_NAMESPACE_SI_IGNORE_NOT_SYNC, + + // Namespace sindex options: + CASE_NAMESPACE_SINDEX_NUM_PARTITIONS, + + // Namespace geo2dsphere within options: + CASE_NAMESPACE_GEO2DSPHERE_WITHIN_STRICT, + CASE_NAMESPACE_GEO2DSPHERE_WITHIN_MIN_LEVEL, + CASE_NAMESPACE_GEO2DSPHERE_WITHIN_MAX_LEVEL, + CASE_NAMESPACE_GEO2DSPHERE_WITHIN_MAX_CELLS, + CASE_NAMESPACE_GEO2DSPHERE_WITHIN_LEVEL_MOD, + CASE_NAMESPACE_GEO2DSPHERE_WITHIN_EARTH_RADIUS_METERS, + + // Mod-lua options: + CASE_MOD_LUA_CACHE_ENABLED, + CASE_MOD_LUA_SYSTEM_PATH, + CASE_MOD_LUA_USER_PATH, + + // Security options: + CASE_SECURITY_ENABLE_SECURITY, + CASE_SECURITY_PRIVILEGE_REFRESH_PERIOD, + CASE_SECURITY_LOG_BEGIN, + CASE_SECURITY_SYSLOG_BEGIN, + + // Security (Aerospike) log options: + CASE_SECURITY_LOG_REPORT_AUTHENTICATION, + CASE_SECURITY_LOG_REPORT_DATA_OP, + CASE_SECURITY_LOG_REPORT_SYS_ADMIN, + CASE_SECURITY_LOG_REPORT_USER_ADMIN, + CASE_SECURITY_LOG_REPORT_VIOLATION, + + // Security syslog options: + CASE_SECURITY_SYSLOG_LOCAL, + CASE_SECURITY_SYSLOG_REPORT_AUTHENTICATION, + CASE_SECURITY_SYSLOG_REPORT_DATA_OP, + CASE_SECURITY_SYSLOG_REPORT_SYS_ADMIN, + CASE_SECURITY_SYSLOG_REPORT_USER_ADMIN, + CASE_SECURITY_SYSLOG_REPORT_VIOLATION, + + // XDR options: + // Normally visible, in canonical configuration file order: + CASE_XDR_ENABLE_XDR, + CASE_XDR_DIGESTLOG_PATH, + CASE_XDR_DATACENTER_BEGIN, + // Normally hidden: + CASE_XDR_CLIENT_THREADS, + CASE_XDR_COMPRESSION_THRESHOLD, + CASE_XDR_DELETE_SHIPPING_ENABLED, + 
CASE_XDR_DIGESTLOG_IOWAIT_MS, + CASE_XDR_FORWARD_XDR_WRITES, + CASE_XDR_HOTKEY_TIME_MS, + CASE_XDR_INFO_PORT, + CASE_XDR_INFO_TIMEOUT, + CASE_XDR_MAX_SHIP_BANDWIDTH, + CASE_XDR_MAX_SHIP_THROUGHPUT, + CASE_XDR_MIN_DIGESTLOG_FREE_PCT, + CASE_XDR_NSUP_DELETES_ENABLED, + CASE_XDR_READ_THREADS, + CASE_XDR_SHIP_BINS, + CASE_XDR_SHIP_DELAY, + CASE_XDR_SHIPPING_ENABLED, + CASE_XDR_WRITE_TIMEOUT, + + // XDR (remote) datacenter options: + // Normally visible, in canonical configuration file order: + CASE_XDR_DATACENTER_DC_NODE_ADDRESS_PORT, + // Normally hidden: + CASE_XDR_DATACENTER_DC_CONNECTIONS, + CASE_XDR_DATACENTER_DC_CONNECTIONS_IDLE_MS, + CASE_XDR_DATACENTER_DC_INT_EXT_IPMAP, + CASE_XDR_DATACENTER_DC_SECURITY_CONFIG_FILE, + CASE_XDR_DATACENTER_DC_USE_ALTERNATE_SERVICES, + CASE_XDR_DATACENTER_TLS_NAME, + CASE_XDR_DATACENTER_TLS_NODE, + + // Used parsing separate file, but share this enum: + + // XDR security top-level options: + XDR_SEC_CASE_CREDENTIALS_BEGIN, + + // XDR security credentials options: + // Normally visible, in canonical configuration file order: + XDR_SEC_CASE_CREDENTIALS_USERNAME, + XDR_SEC_CASE_CREDENTIALS_PASSWORD + +} cfg_case_id; + + +//========================================================== +// All configuration items must appear below as a cfg_opt +// struct in the appropriate array. Order within an array is +// not important, other than for organizational sanity. 
+// + +typedef struct cfg_opt_s { + const char* tok; + cfg_case_id case_id; +} cfg_opt; + +const cfg_opt GLOBAL_OPTS[] = { + { "service", CASE_SERVICE_BEGIN }, + { "logging", CASE_LOGGING_BEGIN }, + { "network", CASE_NETWORK_BEGIN }, + { "namespace", CASE_NAMESPACE_BEGIN }, + { "mod-lua", CASE_MOD_LUA_BEGIN }, + { "cluster", CASE_CLUSTER_BEGIN }, + { "security", CASE_SECURITY_BEGIN }, + { "xdr", CASE_XDR_BEGIN } +}; + +const cfg_opt SERVICE_OPTS[] = { + { "user", CASE_SERVICE_USER }, + { "group", CASE_SERVICE_GROUP }, + { "paxos-single-replica-limit", CASE_SERVICE_PAXOS_SINGLE_REPLICA_LIMIT }, + { "pidfile", CASE_SERVICE_PIDFILE }, + { "client-fd-max", CASE_SERVICE_CLIENT_FD_MAX }, + { "proto-fd-max", CASE_SERVICE_PROTO_FD_MAX }, + { "advertise-ipv6", CASE_SERVICE_ADVERTISE_IPV6 }, + { "auto-pin", CASE_SERVICE_AUTO_PIN }, + { "batch-threads", CASE_SERVICE_BATCH_THREADS }, + { "batch-max-buffers-per-queue", CASE_SERVICE_BATCH_MAX_BUFFERS_PER_QUEUE }, + { "batch-max-requests", CASE_SERVICE_BATCH_MAX_REQUESTS }, + { "batch-max-unused-buffers", CASE_SERVICE_BATCH_MAX_UNUSED_BUFFERS }, + { "batch-priority", CASE_SERVICE_BATCH_PRIORITY }, + { "batch-index-threads", CASE_SERVICE_BATCH_INDEX_THREADS }, + { "cluster-name", CASE_SERVICE_CLUSTER_NAME }, + { "enable-benchmarks-fabric", CASE_SERVICE_ENABLE_BENCHMARKS_FABRIC }, + { "enable-benchmarks-svc", CASE_SERVICE_ENABLE_BENCHMARKS_SVC }, + { "enable-hist-info", CASE_SERVICE_ENABLE_HIST_INFO }, + { "feature-key-file", CASE_SERVICE_FEATURE_KEY_FILE }, + { "hist-track-back", CASE_SERVICE_HIST_TRACK_BACK }, + { "hist-track-slice", CASE_SERVICE_HIST_TRACK_SLICE }, + { "hist-track-thresholds", CASE_SERVICE_HIST_TRACK_THRESHOLDS }, + { "info-threads", CASE_SERVICE_INFO_THREADS }, + { "log-local-time", CASE_SERVICE_LOG_LOCAL_TIME }, + { "log-millis", CASE_SERVICE_LOG_MILLIS}, + { "migrate-max-num-incoming", CASE_SERVICE_MIGRATE_MAX_NUM_INCOMING }, + { "migrate-threads", CASE_SERVICE_MIGRATE_THREADS }, + { "min-cluster-size", 
CASE_SERVICE_MIN_CLUSTER_SIZE }, + { "node-id", CASE_SERVICE_NODE_ID }, + { "node-id-interface", CASE_SERVICE_NODE_ID_INTERFACE }, + { "nsup-delete-sleep", CASE_SERVICE_NSUP_DELETE_SLEEP }, + { "nsup-period", CASE_SERVICE_NSUP_PERIOD }, + { "nsup-startup-evict", CASE_SERVICE_NSUP_STARTUP_EVICT }, + { "proto-fd-idle-ms", CASE_SERVICE_PROTO_FD_IDLE_MS }, + { "query-batch-size", CASE_SERVICE_QUERY_BATCH_SIZE }, + { "query-bufpool-size", CASE_SERVICE_QUERY_BUFPOOL_SIZE }, + { "query-in-transaction-thread", CASE_SERVICE_QUERY_IN_TRANSACTION_THREAD }, + { "query-long-q-max-size", CASE_SERVICE_QUERY_LONG_Q_MAX_SIZE }, + { "query-pre-reserve-partitions", CASE_SERVICE_QUERY_PRE_RESERVE_PARTITIONS }, + { "query-priority", CASE_SERVICE_QUERY_PRIORITY }, + { "query-priority-sleep-us", CASE_SERVICE_QUERY_PRIORITY_SLEEP_US }, + { "query-rec-count-bound", CASE_SERVICE_QUERY_REC_COUNT_BOUND }, + { "query-req-in-query-thread", CASE_SERVICE_QUERY_REQ_IN_QUERY_THREAD }, + { "query-req-max-inflight", CASE_SERVICE_QUERY_REQ_MAX_INFLIGHT }, + { "query-short-q-max-size", CASE_SERVICE_QUERY_SHORT_Q_MAX_SIZE }, + { "query-threads", CASE_SERVICE_QUERY_THREADS }, + { "query-threshold", CASE_SERVICE_QUERY_THRESHOLD }, + { "query-untracked-time-ms", CASE_SERVICE_QUERY_UNTRACKED_TIME_MS }, + { "query-worker-threads", CASE_SERVICE_QUERY_WORKER_THREADS }, + { "run-as-daemon", CASE_SERVICE_RUN_AS_DAEMON }, + { "scan-max-active", CASE_SERVICE_SCAN_MAX_ACTIVE }, + { "scan-max-done", CASE_SERVICE_SCAN_MAX_DONE }, + { "scan-max-udf-transactions", CASE_SERVICE_SCAN_MAX_UDF_TRANSACTIONS }, + { "scan-threads", CASE_SERVICE_SCAN_THREADS }, + { "service-threads", CASE_SERVICE_SERVICE_THREADS }, + { "sindex-builder-threads", CASE_SERVICE_SINDEX_BUILDER_THREADS }, + { "sindex-gc-max-rate", CASE_SERVICE_SINDEX_GC_MAX_RATE }, + { "sindex-gc-period", CASE_SERVICE_SINDEX_GC_PERIOD }, + { "ticker-interval", CASE_SERVICE_TICKER_INTERVAL }, + { "transaction-max-ms", CASE_SERVICE_TRANSACTION_MAX_MS }, + { 
"transaction-pending-limit", CASE_SERVICE_TRANSACTION_PENDING_LIMIT }, + { "transaction-queues", CASE_SERVICE_TRANSACTION_QUEUES }, + { "transaction-retry-ms", CASE_SERVICE_TRANSACTION_RETRY_MS }, + { "transaction-threads-per-queue", CASE_SERVICE_TRANSACTION_THREADS_PER_QUEUE }, + { "work-directory", CASE_SERVICE_WORK_DIRECTORY }, + { "debug-allocations", CASE_SERVICE_DEBUG_ALLOCATIONS }, + { "fabric-dump-msgs", CASE_SERVICE_FABRIC_DUMP_MSGS }, + { "prole-extra-ttl", CASE_SERVICE_PROLE_EXTRA_TTL }, + { "allow-inline-transactions", CASE_SERVICE_ALLOW_INLINE_TRANSACTIONS }, + { "respond-client-on-master-completion", CASE_SERVICE_RESPOND_CLIENT_ON_MASTER_COMPLETION }, + { "transaction-repeatable-read", CASE_SERVICE_TRANSACTION_REPEATABLE_READ }, + { "auto-dun", CASE_SERVICE_AUTO_DUN }, + { "auto-undun", CASE_SERVICE_AUTO_UNDUN }, + { "batch-retransmit", CASE_SERVICE_BATCH_RETRANSMIT }, + { "clib-library", CASE_SERVICE_CLIB_LIBRARY }, + { "defrag-queue-escape", CASE_SERVICE_DEFRAG_QUEUE_ESCAPE }, + { "defrag-queue-hwm", CASE_SERVICE_DEFRAG_QUEUE_HWM }, + { "defrag-queue-lwm", CASE_SERVICE_DEFRAG_QUEUE_LWM }, + { "defrag-queue-priority", CASE_SERVICE_DEFRAG_QUEUE_PRIORITY }, + { "dump-message-above-size", CASE_SERVICE_DUMP_MESSAGE_ABOVE_SIZE }, + { "fabric-workers", CASE_SERVICE_FABRIC_WORKERS }, + { "fb-health-bad-pct", CASE_SERVICE_FB_HEALTH_BAD_PCT }, + { "fb-health-good-pct", CASE_SERVICE_FB_HEALTH_GOOD_PCT }, + { "fb-health-msg-per-burst", CASE_SERVICE_FB_HEALTH_MSG_PER_BURST }, + { "fb-health-msg-timeout", CASE_SERVICE_FB_HEALTH_MSG_TIMEOUT }, + { "generation-disable", CASE_SERVICE_GENERATION_DISABLE }, + { "max-msgs-per-type", CASE_SERVICE_MAX_MSGS_PER_TYPE }, + { "migrate-read-priority", CASE_SERVICE_MIGRATE_READ_PRIORITY }, + { "migrate-read-sleep", CASE_SERVICE_MIGRATE_READ_SLEEP }, + { "migrate-rx-lifetime-ms", CASE_SERVICE_MIGRATE_RX_LIFETIME_MS }, + { "migrate-xmit-hwm", CASE_SERVICE_MIGRATE_XMIT_HWM }, + { "migrate-xmit-lwm", CASE_SERVICE_MIGRATE_XMIT_LWM 
}, + { "migrate-priority", CASE_SERVICE_MIGRATE_PRIORITY }, + { "migrate-xmit-priority", CASE_SERVICE_MIGRATE_XMIT_PRIORITY }, + { "migrate-xmit-sleep", CASE_SERVICE_MIGRATE_XMIT_SLEEP }, + { "nsup-auto-hwm", CASE_SERVICE_NSUP_AUTO_HWM }, + { "nsup-auto-hwm-pct", CASE_SERVICE_NSUP_AUTO_HWM_PCT }, + { "nsup-max-deletes", CASE_SERVICE_NSUP_MAX_DELETES }, + { "nsup-queue-escape", CASE_SERVICE_NSUP_QUEUE_ESCAPE }, + { "nsup-queue-hwm", CASE_SERVICE_NSUP_QUEUE_HWM }, + { "nsup-queue-lwm", CASE_SERVICE_NSUP_QUEUE_LWM }, + { "nsup-reduce-priority", CASE_SERVICE_NSUP_REDUCE_PRIORITY }, + { "nsup-reduce-sleep", CASE_SERVICE_NSUP_REDUCE_SLEEP }, + { "nsup-threads", CASE_SERVICE_NSUP_THREADS }, + { "paxos-max-cluster-size", CASE_SERVICE_PAXOS_MAX_CLUSTER_SIZE }, + { "paxos-protocol", CASE_SERVICE_PAXOS_PROTOCOL }, + { "paxos-recovery-policy", CASE_SERVICE_PAXOS_RECOVERY_POLICY }, + { "paxos-retransmit-period", CASE_SERVICE_PAXOS_RETRANSMIT_PERIOD }, + { "replication-fire-and-forget", CASE_SERVICE_REPLICATION_FIRE_AND_FORGET }, + { "scan-memory", CASE_SERVICE_SCAN_MEMORY }, + { "scan-priority", CASE_SERVICE_SCAN_PRIORITY }, + { "scan-retransmit", CASE_SERVICE_SCAN_RETRANSMIT }, + { "scheduler-priority", CASE_SERVICE_SCHEDULER_PRIORITY }, + { "scheduler-type", CASE_SERVICE_SCHEDULER_TYPE }, + { "transaction-duplicate-threads", CASE_SERVICE_TRANSACTION_DUPLICATE_THREADS }, + { "trial-account-key", CASE_SERVICE_TRIAL_ACCOUNT_KEY }, + { "udf-runtime-max-gmemory", CASE_SERVICE_UDF_RUNTIME_MAX_GMEMORY }, + { "udf-runtime-max-memory", CASE_SERVICE_UDF_RUNTIME_MAX_MEMORY }, + { "use-queue-per-device", CASE_SERVICE_USE_QUEUE_PER_DEVICE }, + { "write-duplicate-resolution-disable", CASE_SERVICE_WRITE_DUPLICATE_RESOLUTION_DISABLE }, + { "}", CASE_CONTEXT_END } +}; + +const cfg_opt SERVICE_AUTO_PIN_OPTS[] = { + { "none", CASE_SERVICE_AUTO_PIN_NONE }, + { "cpu", CASE_SERVICE_AUTO_PIN_CPU }, + { "numa", CASE_SERVICE_AUTO_PIN_NUMA } +}; + +const cfg_opt SERVICE_DEBUG_ALLOCATIONS_OPTS[] = { + 
{ "none", CASE_SERVICE_DEBUG_ALLOCATIONS_NONE }, + { "transient", CASE_SERVICE_DEBUG_ALLOCATIONS_TRANSIENT }, + { "persistent", CASE_SERVICE_DEBUG_ALLOCATIONS_PERSISTENT }, + { "all", CASE_SERVICE_DEBUG_ALLOCATIONS_ALL } +}; + +const cfg_opt LOGGING_OPTS[] = { + { "file", CASE_LOG_FILE_BEGIN }, + { "console", CASE_LOG_CONSOLE_BEGIN }, + { "}", CASE_CONTEXT_END } +}; + +const cfg_opt LOGGING_FILE_OPTS[] = { + { "context", CASE_LOG_FILE_CONTEXT }, + { "}", CASE_CONTEXT_END } +}; + +const cfg_opt LOGGING_CONSOLE_OPTS[] = { + { "context", CASE_LOG_CONSOLE_CONTEXT }, + { "}", CASE_CONTEXT_END } +}; + +const cfg_opt NETWORK_OPTS[] = { + { "service", CASE_NETWORK_SERVICE_BEGIN }, + { "heartbeat", CASE_NETWORK_HEARTBEAT_BEGIN }, + { "fabric", CASE_NETWORK_FABRIC_BEGIN }, + { "info", CASE_NETWORK_INFO_BEGIN }, + { "tls", CASE_NETWORK_TLS_BEGIN }, + { "}", CASE_CONTEXT_END } +}; + +const cfg_opt NETWORK_SERVICE_OPTS[] = { + { "address", CASE_NETWORK_SERVICE_ADDRESS }, + { "port", CASE_NETWORK_SERVICE_PORT }, + { "external-address", CASE_NETWORK_SERVICE_EXTERNAL_ADDRESS }, + { "access-address", CASE_NETWORK_SERVICE_ACCESS_ADDRESS }, + { "access-port", CASE_NETWORK_SERVICE_ACCESS_PORT }, + { "alternate-access-address", CASE_NETWORK_SERVICE_ALTERNATE_ACCESS_ADDRESS }, + { "alternate-access-port", CASE_NETWORK_SERVICE_ALTERNATE_ACCESS_PORT }, + { "tls-access-address", CASE_NETWORK_SERVICE_TLS_ACCESS_ADDRESS }, + { "tls-access-port", CASE_NETWORK_SERVICE_TLS_ACCESS_PORT }, + { "tls-address", CASE_NETWORK_SERVICE_TLS_ADDRESS }, + { "tls-alternate-access-address", CASE_NETWORK_SERVICE_TLS_ALTERNATE_ACCESS_ADDRESS }, + { "tls-alternate-access-port", CASE_NETWORK_SERVICE_TLS_ALTERNATE_ACCESS_PORT }, + { "tls-authenticate-client", CASE_NETWORK_SERVICE_TLS_AUTHENTICATE_CLIENT }, + { "tls-name", CASE_NETWORK_SERVICE_TLS_NAME }, + { "tls-port", CASE_NETWORK_SERVICE_TLS_PORT }, + { "alternate-address", CASE_NETWORK_SERVICE_ALTERNATE_ADDRESS }, + { "network-interface-name", 
CASE_NETWORK_SERVICE_NETWORK_INTERFACE_NAME }, + { "reuse-address", CASE_NETWORK_SERVICE_REUSE_ADDRESS }, + { "}", CASE_CONTEXT_END } +}; + +const cfg_opt NETWORK_HEARTBEAT_OPTS[] = { + { "mode", CASE_NETWORK_HEARTBEAT_MODE }, + { "address", CASE_NETWORK_HEARTBEAT_ADDRESS }, + { "multicast-group", CASE_NETWORK_HEARTBEAT_MULTICAST_GROUP }, + { "port", CASE_NETWORK_HEARTBEAT_PORT }, + { "mesh-seed-address-port", CASE_NETWORK_HEARTBEAT_MESH_SEED_ADDRESS_PORT }, + { "interval", CASE_NETWORK_HEARTBEAT_INTERVAL }, + { "timeout", CASE_NETWORK_HEARTBEAT_TIMEOUT }, + { "mtu", CASE_NETWORK_HEARTBEAT_MTU }, + { "mcast-ttl", CASE_NETWORK_HEARTBEAT_MCAST_TTL }, + { "multicast-ttl", CASE_NETWORK_HEARTBEAT_MULTICAST_TTL }, + { "protocol", CASE_NETWORK_HEARTBEAT_PROTOCOL }, + { "tls-address", CASE_NETWORK_HEARTBEAT_TLS_ADDRESS }, + { "tls-mesh-seed-address-port", CASE_NETWORK_HEARTBEAT_TLS_MESH_SEED_ADDRESS_PORT }, + { "tls-name", CASE_NETWORK_HEARTBEAT_TLS_NAME }, + { "tls-port", CASE_NETWORK_HEARTBEAT_TLS_PORT }, + { "interface-address", CASE_NETWORK_HEARTBEAT_INTERFACE_ADDRESS }, + { "}", CASE_CONTEXT_END } +}; + +const cfg_opt NETWORK_HEARTBEAT_MODE_OPTS[] = { + { "mesh", CASE_NETWORK_HEARTBEAT_MODE_MESH }, + { "multicast", CASE_NETWORK_HEARTBEAT_MODE_MULTICAST } +}; + +const cfg_opt NETWORK_HEARTBEAT_PROTOCOL_OPTS[] = { + { "none", CASE_NETWORK_HEARTBEAT_PROTOCOL_NONE }, + { "v3", CASE_NETWORK_HEARTBEAT_PROTOCOL_V3} +}; + +const cfg_opt NETWORK_FABRIC_OPTS[] = { + { "address", CASE_NETWORK_FABRIC_ADDRESS }, + { "port", CASE_NETWORK_FABRIC_PORT }, + { "channel-bulk-fds", CASE_NETWORK_FABRIC_CHANNEL_BULK_FDS }, + { "channel-bulk-recv-threads", CASE_NETWORK_FABRIC_CHANNEL_BULK_RECV_THREADS }, + { "channel-ctrl-fds", CASE_NETWORK_FABRIC_CHANNEL_CTRL_FDS }, + { "channel-ctrl-recv-threads", CASE_NETWORK_FABRIC_CHANNEL_CTRL_RECV_THREADS }, + { "channel-meta-fds", CASE_NETWORK_FABRIC_CHANNEL_META_FDS }, + { "channel-meta-recv-threads", CASE_NETWORK_FABRIC_CHANNEL_META_RECV_THREADS }, 
+ { "channel-rw-fds", CASE_NETWORK_FABRIC_CHANNEL_RW_FDS }, + { "channel-rw-recv-threads", CASE_NETWORK_FABRIC_CHANNEL_RW_RECV_THREADS }, + { "keepalive-enabled", CASE_NETWORK_FABRIC_KEEPALIVE_ENABLED }, + { "keepalive-intvl", CASE_NETWORK_FABRIC_KEEPALIVE_INTVL }, + { "keepalive-probes", CASE_NETWORK_FABRIC_KEEPALIVE_PROBES }, + { "keepalive-time", CASE_NETWORK_FABRIC_KEEPALIVE_TIME }, + { "latency-max-ms", CASE_NETWORK_FABRIC_LATENCY_MAX_MS }, + { "recv-rearm-threshold", CASE_NETWORK_FABRIC_RECV_REARM_THRESHOLD }, + { "send-threads", CASE_NETWORK_FABRIC_SEND_THREADS }, + { "tls-address", CASE_NETWORK_FABRIC_TLS_ADDRESS }, + { "tls-name", CASE_NETWORK_FABRIC_TLS_NAME }, + { "tls-port", CASE_NETWORK_FABRIC_TLS_PORT }, + { "}", CASE_CONTEXT_END } +}; + +const cfg_opt NETWORK_INFO_OPTS[] = { + { "address", CASE_NETWORK_INFO_ADDRESS }, + { "port", CASE_NETWORK_INFO_PORT }, + { "enable-fastpath", CASE_NETWORK_INFO_ENABLE_FASTPATH }, + { "}", CASE_CONTEXT_END } +}; + +const cfg_opt NETWORK_TLS_OPTS[] = { + { "ca-file", CASE_NETWORK_TLS_CA_FILE }, + { "ca-path", CASE_NETWORK_TLS_CA_PATH }, + { "cert-blacklist", CASE_NETWORK_TLS_CERT_BLACKLIST }, + { "cert-file", CASE_NETWORK_TLS_CERT_FILE }, + { "cipher-suite", CASE_NETWORK_TLS_CIPHER_SUITE }, + { "key-file", CASE_NETWORK_TLS_KEY_FILE }, + { "protocols", CASE_NETWORK_TLS_PROTOCOLS }, + { "}", CASE_CONTEXT_END } +}; + +const cfg_opt NAMESPACE_OPTS[] = { + { "replication-factor", CASE_NAMESPACE_REPLICATION_FACTOR }, + { "limit-size", CASE_NAMESPACE_LIMIT_SIZE }, + { "memory-size", CASE_NAMESPACE_MEMORY_SIZE }, + { "default-ttl", CASE_NAMESPACE_DEFAULT_TTL }, + { "storage-engine", CASE_NAMESPACE_STORAGE_ENGINE_BEGIN }, + { "enable-xdr", CASE_NAMESPACE_ENABLE_XDR }, + { "sets-enable-xdr", CASE_NAMESPACE_SETS_ENABLE_XDR }, + { "xdr-remote-datacenter", CASE_NAMESPACE_XDR_REMOTE_DATACENTER }, + { "ns-forward-xdr-writes", CASE_NAMESPACE_FORWARD_XDR_WRITES }, + { "allow-nonxdr-writes", CASE_NAMESPACE_ALLOW_NONXDR_WRITES }, + { 
"allow-xdr-writes", CASE_NAMESPACE_ALLOW_XDR_WRITES }, + { "cold-start-evict-ttl", CASE_NAMESPACE_COLD_START_EVICT_TTL }, + { "conflict-resolution-policy", CASE_NAMESPACE_CONFLICT_RESOLUTION_POLICY }, + { "data-in-index", CASE_NAMESPACE_DATA_IN_INDEX }, + { "disable-write-dup-res", CASE_NAMESPACE_DISABLE_WRITE_DUP_RES }, + { "disallow-null-setname", CASE_NAMESPACE_DISALLOW_NULL_SETNAME }, + { "enable-benchmarks-batch-sub", CASE_NAMESPACE_ENABLE_BENCHMARKS_BATCH_SUB }, + { "enable-benchmarks-read", CASE_NAMESPACE_ENABLE_BENCHMARKS_READ }, + { "enable-benchmarks-udf", CASE_NAMESPACE_ENABLE_BENCHMARKS_UDF }, + { "enable-benchmarks-udf-sub", CASE_NAMESPACE_ENABLE_BENCHMARKS_UDF_SUB }, + { "enable-benchmarks-write", CASE_NAMESPACE_ENABLE_BENCHMARKS_WRITE }, + { "enable-hist-proxy", CASE_NAMESPACE_ENABLE_HIST_PROXY }, + { "evict-hist-buckets", CASE_NAMESPACE_EVICT_HIST_BUCKETS }, + { "evict-tenths-pct", CASE_NAMESPACE_EVICT_TENTHS_PCT }, + { "high-water-disk-pct", CASE_NAMESPACE_HIGH_WATER_DISK_PCT }, + { "high-water-memory-pct", CASE_NAMESPACE_HIGH_WATER_MEMORY_PCT }, + { "max-ttl", CASE_NAMESPACE_MAX_TTL }, + { "migrate-order", CASE_NAMESPACE_MIGRATE_ORDER }, + { "migrate-retransmit-ms", CASE_NAMESPACE_MIGRATE_RETRANSMIT_MS }, + { "migrate-sleep", CASE_NAMESPACE_MIGRATE_SLEEP }, + { "obj-size-hist-max", CASE_NAMESPACE_OBJ_SIZE_HIST_MAX }, + { "partition-tree-locks", CASE_NAMESPACE_PARTITION_TREE_LOCKS }, + { "partition-tree-sprigs", CASE_NAMESPACE_PARTITION_TREE_SPRIGS }, + { "rack-id", CASE_NAMESPACE_RACK_ID }, + { "read-consistency-level-override", CASE_NAMESPACE_READ_CONSISTENCY_LEVEL_OVERRIDE }, + { "set", CASE_NAMESPACE_SET_BEGIN }, + { "sindex", CASE_NAMESPACE_SINDEX_BEGIN }, + { "geo2dsphere-within", CASE_NAMESPACE_GEO2DSPHERE_WITHIN_BEGIN }, + { "single-bin", CASE_NAMESPACE_SINGLE_BIN }, + { "stop-writes-pct", CASE_NAMESPACE_STOP_WRITES_PCT }, + { "strong-consistency", CASE_NAMESPACE_STRONG_CONSISTENCY }, + { "strong-consistency-allow-expunge", 
CASE_NAMESPACE_STRONG_CONSISTENCY_ALLOW_EXPUNGE }, + { "tomb-raider-eligible-age", CASE_NAMESPACE_TOMB_RAIDER_ELIGIBLE_AGE }, + { "tomb-raider-period", CASE_NAMESPACE_TOMB_RAIDER_PERIOD }, + { "write-commit-level-override", CASE_NAMESPACE_WRITE_COMMIT_LEVEL_OVERRIDE }, + { "allow-versions", CASE_NAMESPACE_ALLOW_VERSIONS }, + { "demo-read-multiplier", CASE_NAMESPACE_DEMO_READ_MULTIPLIER }, + { "demo-write-multiplier", CASE_NAMESPACE_DEMO_WRITE_MULTIPLIER }, + { "high-water-pct", CASE_NAMESPACE_HIGH_WATER_PCT }, + { "low-water-pct", CASE_NAMESPACE_LOW_WATER_PCT }, + { "si", CASE_NAMESPACE_SI_BEGIN }, + { "}", CASE_CONTEXT_END } +}; + +const cfg_opt NAMESPACE_CONFLICT_RESOLUTION_OPTS[] = { + { "generation", CASE_NAMESPACE_CONFLICT_RESOLUTION_GENERATION }, + { "last-update-time", CASE_NAMESPACE_CONFLICT_RESOLUTION_LAST_UPDATE_TIME } +}; + +const cfg_opt NAMESPACE_READ_CONSISTENCY_OPTS[] = { + { "all", CASE_NAMESPACE_READ_CONSISTENCY_ALL }, + { "off", CASE_NAMESPACE_READ_CONSISTENCY_OFF }, + { "one", CASE_NAMESPACE_READ_CONSISTENCY_ONE } +}; + +const cfg_opt NAMESPACE_WRITE_COMMIT_OPTS[] = { + { "all", CASE_NAMESPACE_WRITE_COMMIT_ALL }, + { "master", CASE_NAMESPACE_WRITE_COMMIT_MASTER }, + { "off", CASE_NAMESPACE_WRITE_COMMIT_OFF } +}; + +const cfg_opt NAMESPACE_STORAGE_OPTS[] = { + { "memory", CASE_NAMESPACE_STORAGE_MEMORY }, + { "ssd", CASE_NAMESPACE_STORAGE_SSD }, + { "device", CASE_NAMESPACE_STORAGE_DEVICE } +}; + +const cfg_opt NAMESPACE_STORAGE_DEVICE_OPTS[] = { + { "device", CASE_NAMESPACE_STORAGE_DEVICE_DEVICE }, + { "file", CASE_NAMESPACE_STORAGE_DEVICE_FILE }, + { "filesize", CASE_NAMESPACE_STORAGE_DEVICE_FILESIZE }, + { "scheduler-mode", CASE_NAMESPACE_STORAGE_DEVICE_SCHEDULER_MODE }, + { "write-block-size", CASE_NAMESPACE_STORAGE_DEVICE_WRITE_BLOCK_SIZE }, + { "memory-all", CASE_NAMESPACE_STORAGE_DEVICE_MEMORY_ALL }, + { "data-in-memory", CASE_NAMESPACE_STORAGE_DEVICE_DATA_IN_MEMORY }, + { "cold-start-empty", CASE_NAMESPACE_STORAGE_DEVICE_COLD_START_EMPTY }, 
+ { "commit-to-device", CASE_NAMESPACE_STORAGE_DEVICE_COMMIT_TO_DEVICE }, + { "commit-min-size", CASE_NAMESPACE_STORAGE_DEVICE_COMMIT_MIN_SIZE }, + { "defrag-lwm-pct", CASE_NAMESPACE_STORAGE_DEVICE_DEFRAG_LWM_PCT }, + { "defrag-queue-min", CASE_NAMESPACE_STORAGE_DEVICE_DEFRAG_QUEUE_MIN }, + { "defrag-sleep", CASE_NAMESPACE_STORAGE_DEVICE_DEFRAG_SLEEP }, + { "defrag-startup-minimum", CASE_NAMESPACE_STORAGE_DEVICE_DEFRAG_STARTUP_MINIMUM }, + { "disable-odirect", CASE_NAMESPACE_STORAGE_DEVICE_DISABLE_ODIRECT }, + { "enable-benchmarks-storage", CASE_NAMESPACE_STORAGE_DEVICE_ENABLE_BENCHMARKS_STORAGE }, + { "enable-osync", CASE_NAMESPACE_STORAGE_DEVICE_ENABLE_OSYNC }, + { "encryption-key-file", CASE_NAMESPACE_STORAGE_DEVICE_ENCRYPTION_KEY_FILE }, + { "flush-max-ms", CASE_NAMESPACE_STORAGE_DEVICE_FLUSH_MAX_MS }, + { "fsync-max-sec", CASE_NAMESPACE_STORAGE_DEVICE_FSYNC_MAX_SEC }, + { "max-write-cache", CASE_NAMESPACE_STORAGE_DEVICE_MAX_WRITE_CACHE }, + { "min-avail-pct", CASE_NAMESPACE_STORAGE_DEVICE_MIN_AVAIL_PCT }, + { "post-write-queue", CASE_NAMESPACE_STORAGE_DEVICE_POST_WRITE_QUEUE }, + { "tomb-raider-sleep", CASE_NAMESPACE_STORAGE_DEVICE_TOMB_RAIDER_SLEEP }, + { "write-threads", CASE_NAMESPACE_STORAGE_DEVICE_WRITE_THREADS }, + { "defrag-max-blocks", CASE_NAMESPACE_STORAGE_DEVICE_DEFRAG_MAX_BLOCKS }, + { "defrag-period", CASE_NAMESPACE_STORAGE_DEVICE_DEFRAG_PERIOD }, + { "load-at-startup", CASE_NAMESPACE_STORAGE_DEVICE_LOAD_AT_STARTUP }, + { "persist", CASE_NAMESPACE_STORAGE_DEVICE_PERSIST }, + { "readonly", CASE_NAMESPACE_STORAGE_DEVICE_READONLY }, + { "signature", CASE_NAMESPACE_STORAGE_DEVICE_SIGNATURE }, + { "write-smoothing-period", CASE_NAMESPACE_STORAGE_DEVICE_WRITE_SMOOTHING_PERIOD }, + { "}", CASE_CONTEXT_END } +}; + +const cfg_opt NAMESPACE_SET_OPTS[] = { + { "set-disable-eviction", CASE_NAMESPACE_SET_DISABLE_EVICTION }, + { "set-enable-xdr", CASE_NAMESPACE_SET_ENABLE_XDR }, + { "set-stop-writes-count", CASE_NAMESPACE_SET_STOP_WRITES_COUNT }, + { 
"set-evict-hwm-count", CASE_NAMESPACE_SET_EVICT_HWM_COUNT }, + { "set-evict-hwm-pct", CASE_NAMESPACE_SET_EVICT_HWM_PCT }, + { "set-stop-write-count", CASE_NAMESPACE_SET_STOP_WRITE_COUNT }, + { "set-stop-write-pct", CASE_NAMESPACE_SET_STOP_WRITE_PCT }, + { "}", CASE_CONTEXT_END } +}; + +const cfg_opt NAMESPACE_SET_ENABLE_XDR_OPTS[] = { + { "use-default", CASE_NAMESPACE_SET_ENABLE_XDR_USE_DEFAULT }, + { "false", CASE_NAMESPACE_SET_ENABLE_XDR_FALSE }, + { "true", CASE_NAMESPACE_SET_ENABLE_XDR_TRUE } +}; + +const cfg_opt NAMESPACE_SI_OPTS[] = { + { "si-gc-period", CASE_NAMESPACE_SI_GC_PERIOD }, + { "si-gc-max-units", CASE_NAMESPACE_SI_GC_MAX_UNITS }, + { "si-histogram", CASE_NAMESPACE_SI_HISTOGRAM }, + { "si-ignore-not-sync", CASE_NAMESPACE_SI_IGNORE_NOT_SYNC }, + { "}", CASE_CONTEXT_END } +}; + +const cfg_opt NAMESPACE_SINDEX_OPTS[] = { + { "num-partitions", CASE_NAMESPACE_SINDEX_NUM_PARTITIONS }, + { "}", CASE_CONTEXT_END } +}; + +const cfg_opt NAMESPACE_GEO2DSPHERE_WITHIN_OPTS[] = { + { "strict", CASE_NAMESPACE_GEO2DSPHERE_WITHIN_STRICT }, + { "min-level", CASE_NAMESPACE_GEO2DSPHERE_WITHIN_MIN_LEVEL }, + { "max-level", CASE_NAMESPACE_GEO2DSPHERE_WITHIN_MAX_LEVEL }, + { "max-cells", CASE_NAMESPACE_GEO2DSPHERE_WITHIN_MAX_CELLS }, + { "level-mod", CASE_NAMESPACE_GEO2DSPHERE_WITHIN_LEVEL_MOD }, + { "earth-radius-meters", CASE_NAMESPACE_GEO2DSPHERE_WITHIN_EARTH_RADIUS_METERS }, + { "}", CASE_CONTEXT_END } +}; + +const cfg_opt MOD_LUA_OPTS[] = { + { "cache-enabled", CASE_MOD_LUA_CACHE_ENABLED }, + { "system-path", CASE_MOD_LUA_SYSTEM_PATH }, + { "user-path", CASE_MOD_LUA_USER_PATH }, + { "}", CASE_CONTEXT_END } +}; + +const cfg_opt SECURITY_OPTS[] = { + { "enable-security", CASE_SECURITY_ENABLE_SECURITY }, + { "privilege-refresh-period", CASE_SECURITY_PRIVILEGE_REFRESH_PERIOD }, + { "log", CASE_SECURITY_LOG_BEGIN }, + { "syslog", CASE_SECURITY_SYSLOG_BEGIN }, + { "}", CASE_CONTEXT_END } +}; + +const cfg_opt SECURITY_LOG_OPTS[] = { + { "report-authentication", 
CASE_SECURITY_LOG_REPORT_AUTHENTICATION }, + { "report-data-op", CASE_SECURITY_LOG_REPORT_DATA_OP }, + { "report-sys-admin", CASE_SECURITY_LOG_REPORT_SYS_ADMIN }, + { "report-user-admin", CASE_SECURITY_LOG_REPORT_USER_ADMIN }, + { "report-violation", CASE_SECURITY_LOG_REPORT_VIOLATION }, + { "}", CASE_CONTEXT_END } +}; + +const cfg_opt SECURITY_SYSLOG_OPTS[] = { + { "local", CASE_SECURITY_SYSLOG_LOCAL }, + { "report-authentication", CASE_SECURITY_SYSLOG_REPORT_AUTHENTICATION }, + { "report-data-op", CASE_SECURITY_SYSLOG_REPORT_DATA_OP }, + { "report-sys-admin", CASE_SECURITY_SYSLOG_REPORT_SYS_ADMIN }, + { "report-user-admin", CASE_SECURITY_SYSLOG_REPORT_USER_ADMIN }, + { "report-violation", CASE_SECURITY_SYSLOG_REPORT_VIOLATION }, + { "}", CASE_CONTEXT_END } +}; + +const cfg_opt XDR_OPTS[] = { + { "{", CASE_CONTEXT_BEGIN }, + { "enable-xdr", CASE_XDR_ENABLE_XDR }, + { "xdr-digestlog-path", CASE_XDR_DIGESTLOG_PATH }, + { "datacenter", CASE_XDR_DATACENTER_BEGIN }, + { "xdr-client-threads", CASE_XDR_CLIENT_THREADS }, + { "xdr-compression-threshold", CASE_XDR_COMPRESSION_THRESHOLD }, + { "xdr-delete-shipping-enabled", CASE_XDR_DELETE_SHIPPING_ENABLED }, + { "xdr-digestlog-iowait-ms", CASE_XDR_DIGESTLOG_IOWAIT_MS }, + { "forward-xdr-writes", CASE_XDR_FORWARD_XDR_WRITES }, + { "xdr-hotkey-time-ms", CASE_XDR_HOTKEY_TIME_MS }, + { "xdr-info-port", CASE_XDR_INFO_PORT }, + { "xdr-info-timeout", CASE_XDR_INFO_TIMEOUT }, + { "xdr-max-ship-bandwidth", CASE_XDR_MAX_SHIP_BANDWIDTH }, + { "xdr-max-ship-throughput", CASE_XDR_MAX_SHIP_THROUGHPUT }, + { "xdr-min-digestlog-free-pct", CASE_XDR_MIN_DIGESTLOG_FREE_PCT }, + { "xdr-nsup-deletes-enabled", CASE_XDR_NSUP_DELETES_ENABLED }, + { "xdr-read-threads", CASE_XDR_READ_THREADS}, + { "xdr-ship-bins", CASE_XDR_SHIP_BINS }, + { "xdr-ship-delay", CASE_XDR_SHIP_DELAY }, // hidden + { "xdr-shipping-enabled", CASE_XDR_SHIPPING_ENABLED }, + { "xdr-write-timeout", CASE_XDR_WRITE_TIMEOUT }, + { "}", CASE_CONTEXT_END } +}; + +const cfg_opt 
XDR_DATACENTER_OPTS[] = { + { "{", CASE_CONTEXT_BEGIN }, + { "dc-node-address-port", CASE_XDR_DATACENTER_DC_NODE_ADDRESS_PORT }, + { "dc-connections", CASE_XDR_DATACENTER_DC_CONNECTIONS }, + { "dc-connections-idle-ms", CASE_XDR_DATACENTER_DC_CONNECTIONS_IDLE_MS }, + { "dc-int-ext-ipmap", CASE_XDR_DATACENTER_DC_INT_EXT_IPMAP }, + { "dc-security-config-file", CASE_XDR_DATACENTER_DC_SECURITY_CONFIG_FILE }, + { "dc-use-alternate-services", CASE_XDR_DATACENTER_DC_USE_ALTERNATE_SERVICES }, + { "tls-name", CASE_XDR_DATACENTER_TLS_NAME }, + { "tls-node", CASE_XDR_DATACENTER_TLS_NODE }, + { "}", CASE_CONTEXT_END } +}; + +// Used parsing separate file, but share cfg_case_id enum. + +const cfg_opt XDR_SEC_GLOBAL_OPTS[] = { + { "credentials", XDR_SEC_CASE_CREDENTIALS_BEGIN } +}; + +const cfg_opt XDR_SEC_CREDENTIALS_OPTS[] = { + { "{", CASE_CONTEXT_BEGIN }, + { "username", XDR_SEC_CASE_CREDENTIALS_USERNAME }, + { "password", XDR_SEC_CASE_CREDENTIALS_PASSWORD }, + { "}", CASE_CONTEXT_END } +}; + +const int NUM_GLOBAL_OPTS = sizeof(GLOBAL_OPTS) / sizeof(cfg_opt); +const int NUM_SERVICE_OPTS = sizeof(SERVICE_OPTS) / sizeof(cfg_opt); +const int NUM_SERVICE_AUTO_PIN_OPTS = sizeof(SERVICE_AUTO_PIN_OPTS) / sizeof(cfg_opt); +const int NUM_SERVICE_DEBUG_ALLOCATIONS_OPTS = sizeof(SERVICE_DEBUG_ALLOCATIONS_OPTS) / sizeof(cfg_opt); +const int NUM_LOGGING_OPTS = sizeof(LOGGING_OPTS) / sizeof(cfg_opt); +const int NUM_LOGGING_FILE_OPTS = sizeof(LOGGING_FILE_OPTS) / sizeof(cfg_opt); +const int NUM_LOGGING_CONSOLE_OPTS = sizeof(LOGGING_CONSOLE_OPTS) / sizeof(cfg_opt); +const int NUM_NETWORK_OPTS = sizeof(NETWORK_OPTS) / sizeof(cfg_opt); +const int NUM_NETWORK_SERVICE_OPTS = sizeof(NETWORK_SERVICE_OPTS) / sizeof(cfg_opt); +const int NUM_NETWORK_HEARTBEAT_OPTS = sizeof(NETWORK_HEARTBEAT_OPTS) / sizeof(cfg_opt); +const int NUM_NETWORK_HEARTBEAT_MODE_OPTS = sizeof(NETWORK_HEARTBEAT_MODE_OPTS) / sizeof(cfg_opt); +const int NUM_NETWORK_HEARTBEAT_PROTOCOL_OPTS = sizeof(NETWORK_HEARTBEAT_PROTOCOL_OPTS) 
/ sizeof(cfg_opt); +const int NUM_NETWORK_FABRIC_OPTS = sizeof(NETWORK_FABRIC_OPTS) / sizeof(cfg_opt); +const int NUM_NETWORK_INFO_OPTS = sizeof(NETWORK_INFO_OPTS) / sizeof(cfg_opt); +const int NUM_NETWORK_TLS_OPTS = sizeof(NETWORK_TLS_OPTS) / sizeof(cfg_opt); +const int NUM_NAMESPACE_OPTS = sizeof(NAMESPACE_OPTS) / sizeof(cfg_opt); +const int NUM_NAMESPACE_CONFLICT_RESOLUTION_OPTS = sizeof(NAMESPACE_CONFLICT_RESOLUTION_OPTS) / sizeof(cfg_opt); +const int NUM_NAMESPACE_READ_CONSISTENCY_OPTS = sizeof(NAMESPACE_READ_CONSISTENCY_OPTS) / sizeof(cfg_opt); +const int NUM_NAMESPACE_WRITE_COMMIT_OPTS = sizeof(NAMESPACE_WRITE_COMMIT_OPTS) / sizeof(cfg_opt); +const int NUM_NAMESPACE_STORAGE_OPTS = sizeof(NAMESPACE_STORAGE_OPTS) / sizeof(cfg_opt); +const int NUM_NAMESPACE_STORAGE_DEVICE_OPTS = sizeof(NAMESPACE_STORAGE_DEVICE_OPTS) / sizeof(cfg_opt); +const int NUM_NAMESPACE_SET_OPTS = sizeof(NAMESPACE_SET_OPTS) / sizeof(cfg_opt); +const int NUM_NAMESPACE_SET_ENABLE_XDR_OPTS = sizeof(NAMESPACE_SET_ENABLE_XDR_OPTS) / sizeof(cfg_opt); +const int NUM_NAMESPACE_SI_OPTS = sizeof(NAMESPACE_SI_OPTS) / sizeof(cfg_opt); +const int NUM_NAMESPACE_SINDEX_OPTS = sizeof(NAMESPACE_SINDEX_OPTS) / sizeof(cfg_opt); +const int NUM_NAMESPACE_GEO2DSPHERE_WITHIN_OPTS = sizeof(NAMESPACE_GEO2DSPHERE_WITHIN_OPTS) / sizeof(cfg_opt); +const int NUM_MOD_LUA_OPTS = sizeof(MOD_LUA_OPTS) / sizeof(cfg_opt); +const int NUM_SECURITY_OPTS = sizeof(SECURITY_OPTS) / sizeof(cfg_opt); +const int NUM_SECURITY_LOG_OPTS = sizeof(SECURITY_LOG_OPTS) / sizeof(cfg_opt); +const int NUM_SECURITY_SYSLOG_OPTS = sizeof(SECURITY_SYSLOG_OPTS) / sizeof(cfg_opt); +const int NUM_XDR_OPTS = sizeof(XDR_OPTS) / sizeof(cfg_opt); +const int NUM_XDR_DATACENTER_OPTS = sizeof(XDR_DATACENTER_OPTS) / sizeof(cfg_opt); + +// Used parsing separate file, but share cfg_case_id enum. 
+ +const int NUM_XDR_SEC_GLOBAL_OPTS = sizeof(XDR_SEC_GLOBAL_OPTS) / sizeof(cfg_opt); +const int NUM_XDR_SEC_CREDENTIALS_OPTS = sizeof(XDR_SEC_CREDENTIALS_OPTS) / sizeof(cfg_opt); + + +//========================================================== +// Configuration value constants not for switch cases. +// + +const char* DEVICE_SCHEDULER_MODES[] = { + "anticipatory", + "cfq", // best for rotational drives + "deadline", + "noop" // best for SSDs +}; + +const int NUM_DEVICE_SCHEDULER_MODES = sizeof(DEVICE_SCHEDULER_MODES) / sizeof(const char*); + + +//========================================================== +// Generic parsing utilities. +// + +// Don't use these functions. Use the cf_str functions, which have better error +// handling, and support K, M, B/G, etc. +#undef atoi +#define atoi() DO_NOT_USE +#undef atol +#define atol() DO_NOT_USE +#undef atoll +#define atol() DO_NOT_USE + +//------------------------------------------------ +// Parsing state (context) tracking & switching. +// + +typedef enum { + GLOBAL, + SERVICE, + LOGGING, LOGGING_FILE, LOGGING_CONSOLE, + NETWORK, NETWORK_SERVICE, NETWORK_HEARTBEAT, NETWORK_FABRIC, NETWORK_INFO, NETWORK_TLS, + NAMESPACE, NAMESPACE_STORAGE_DEVICE, NAMESPACE_SET, NAMESPACE_SI, NAMESPACE_SINDEX, NAMESPACE_GEO2DSPHERE_WITHIN, + MOD_LUA, + SECURITY, SECURITY_LOG, SECURITY_SYSLOG, + XDR, XDR_DATACENTER, + // Used parsing separate file, but shares this enum: + XDR_SEC_CREDENTIALS, + // Must be last, use for sanity-checking: + PARSER_STATE_MAX_PLUS_1 +} as_config_parser_state; + +// For detail logging only - keep in sync with as_config_parser_state. 
+const char* CFG_PARSER_STATES[] = { + "GLOBAL", + "SERVICE", + "LOGGING", "LOGGING_FILE", "LOGGING_CONSOLE", + "NETWORK", "NETWORK_SERVICE", "NETWORK_HEARTBEAT", "NETWORK_FABRIC", "NETWORK_INFO", "NETWORK_TLS", + "NAMESPACE", "NAMESPACE_STORAGE_DEVICE", "NAMESPACE_SET", "NAMESPACE_SI", "NAMESPACE_SINDEX", "NAMESPACE_GEO2DSPHERE_WITHIN", + "MOD_LUA", + "SECURITY", "SECURITY_LOG", "SECURITY_SYSLOG", + "XDR", "XDR_DATACENTER", + // Used parsing separate file, but shares corresponding enum: + "XDR_SEC_CREDENTIALS" +}; + +#define MAX_STACK_DEPTH 8 + +typedef struct cfg_parser_state_s { + as_config_parser_state current; + as_config_parser_state stack[MAX_STACK_DEPTH]; + int depth; +} cfg_parser_state; + +void +cfg_parser_state_init(cfg_parser_state* p_state) +{ + p_state->current = p_state->stack[0] = GLOBAL; + p_state->depth = 0; +} + +void +cfg_begin_context(cfg_parser_state* p_state, as_config_parser_state context) +{ + if (context < 0 || context >= PARSER_STATE_MAX_PLUS_1) { + cf_crash(AS_CFG, "parsing - unknown context"); + } + + as_config_parser_state prev_context = p_state->stack[p_state->depth]; + + if (++p_state->depth >= MAX_STACK_DEPTH) { + cf_crash(AS_CFG, "parsing - context too deep"); + } + + p_state->current = p_state->stack[p_state->depth] = context; + + // To see this log, change NO_SINKS_LIMIT in fault.c: + cf_detail(AS_CFG, "begin context: %s -> %s", CFG_PARSER_STATES[prev_context], CFG_PARSER_STATES[context]); +} + +void +cfg_end_context(cfg_parser_state* p_state) +{ + as_config_parser_state prev_context = p_state->stack[p_state->depth]; + + if (--p_state->depth < 0) { + cf_crash(AS_CFG, "parsing - can't end context depth 0"); + } + + p_state->current = p_state->stack[p_state->depth]; + + // To see this log, change NO_SINKS_LIMIT in fault.c: + cf_detail(AS_CFG, "end context: %s -> %s", CFG_PARSER_STATES[prev_context], CFG_PARSER_STATES[p_state->current]); +} + +//------------------------------------------------ +// Given a token, return switch case 
identifier. +// + +cfg_case_id +cfg_find_tok(const char* tok, const cfg_opt opts[], int num_opts) +{ + for (int i = 0; i < num_opts; i++) { + if (strcmp(tok, opts[i].tok) == 0) { + return opts[i].case_id; + } + } + + return CASE_NOT_FOUND; +} + +//------------------------------------------------ +// Value parsing and sanity-checking utilities. +// + +void +cfg_renamed_name_tok(const cfg_line* p_line, const char* new_tok) +{ + cf_warning(AS_CFG, "line %d :: %s was renamed - please use '%s'", + p_line->num, p_line->name_tok, new_tok); +} + +void +cfg_renamed_val_tok_1(const cfg_line* p_line, const char* new_tok) +{ + cf_warning(AS_CFG, "line %d :: %s value '%s' was renamed - please use '%s'", + p_line->num, p_line->name_tok, p_line->val_tok_1, new_tok); +} + +void +cfg_deprecated_name_tok(const cfg_line* p_line) +{ + cf_warning(AS_CFG, "line %d :: %s is deprecated - please remove", + p_line->num, p_line->name_tok); +} + +void +cfg_deprecated_val_tok_1(const cfg_line* p_line) +{ + cf_warning(AS_CFG, "line %d :: %s value '%s' is deprecated - please remove", + p_line->num, p_line->name_tok, p_line->val_tok_1); +} + +void +cfg_unknown_name_tok(const cfg_line* p_line) +{ + cf_crash_nostack(AS_CFG, "line %d :: unknown config parameter name '%s'", + p_line->num, p_line->name_tok); +} + +void +cfg_unknown_val_tok_1(const cfg_line* p_line) +{ + cf_crash_nostack(AS_CFG, "line %d :: %s has unknown value '%s'", + p_line->num, p_line->name_tok, p_line->val_tok_1); +} + +void +cfg_obsolete(const cfg_line* p_line, const char* message) +{ + cf_crash_nostack(AS_CFG, "line %d :: '%s' is obsolete%s%s", + p_line->num, p_line->name_tok, message ? 
" - " : "", message); +} + +void +cfg_not_supported(const cfg_line* p_line, const char* feature) +{ + cf_crash_nostack(AS_CFG, "line %d :: illegal value '%s' for config parameter '%s' - feature %s is not supported", + p_line->num, p_line->val_tok_1, p_line->name_tok, feature); +} + +char* +cfg_strdup_no_checks(const cfg_line* p_line) +{ + return cf_strdup(p_line->val_tok_1); +} + +char* +cfg_strdup_val2_no_checks(const cfg_line* p_line) +{ + return cf_strdup(p_line->val_tok_2); +} + +char* +cfg_strdup_anyval(const cfg_line* p_line, const char* val_tok, bool is_required) +{ + if (val_tok[0] == 0) { + if (is_required) { + cf_crash_nostack(AS_CFG, "line %d :: %s must have a value specified", + p_line->num, p_line->name_tok); + } + + // Do not duplicate empty strings. + return NULL; + } + + return cf_strdup(val_tok); +} + +char* +cfg_strdup(const cfg_line* p_line, bool is_required) +{ + return cfg_strdup_anyval(p_line, p_line->val_tok_1, is_required); +} + +char* +cfg_strdup_val2(const cfg_line* p_line, bool is_required) +{ + return cfg_strdup_anyval(p_line, p_line->val_tok_2, is_required); +} + +char* +cfg_strdup_one_of(const cfg_line* p_line, const char* toks[], int num_toks) +{ + for (int i = 0; i < num_toks; i++) { + if (strcmp(p_line->val_tok_1, toks[i]) == 0) { + return cfg_strdup_no_checks(p_line); + } + } + + uint32_t valid_toks_size = (num_toks * 2) + 1; + + for (int i = 0; i < num_toks; i++) { + valid_toks_size += strlen(toks[i]); + } + + char valid_toks[valid_toks_size]; + + valid_toks[0] = 0; + + for (int i = 0; i < num_toks; i++) { + strcat(valid_toks, toks[i]); + strcat(valid_toks, ", "); + } + + cf_crash_nostack(AS_CFG, "line %d :: %s must be one of: %snot %s", + p_line->num, p_line->name_tok, valid_toks, p_line->val_tok_1); + + // Won't get here, but quiet warnings... 
+ return NULL; +} + +void +cfg_strcpy(const cfg_line* p_line, char* p_str, size_t max_size) +{ + size_t tok1_len = strlen(p_line->val_tok_1); + + if (tok1_len == 0) { + cf_crash_nostack(AS_CFG, "line %d :: %s must have a value specified", + p_line->num, p_line->name_tok); + } + + if (tok1_len >= max_size) { + cf_crash_nostack(AS_CFG, "line %d :: %s must be < %lu characters long, not %s", + p_line->num, p_line->name_tok, max_size, p_line->val_tok_1); + } + + strcpy(p_str, p_line->val_tok_1); +} + +bool +cfg_bool(const cfg_line* p_line) +{ + if (strcasecmp(p_line->val_tok_1, "true") == 0 || strcasecmp(p_line->val_tok_1, "yes") == 0) { + return true; + } + + if (strcasecmp(p_line->val_tok_1, "false") == 0 || strcasecmp(p_line->val_tok_1, "no") == 0) { + return false; + } + + if (*p_line->val_tok_1 == '\0') { + cf_crash_nostack(AS_CFG, "line %d :: %s must be true or false or yes or no", + p_line->num, p_line->name_tok); + } + + cf_crash_nostack(AS_CFG, "line %d :: %s must be true or false or yes or no, not %s", + p_line->num, p_line->name_tok, p_line->val_tok_1); + + // Won't get here, but quiet warnings... + return false; +} + +bool +cfg_bool_no_value_is_true(const cfg_line* p_line) +{ + return (*p_line->val_tok_1 == '\0') ? 
true : cfg_bool(p_line); +} + +int64_t +cfg_i64_anyval_no_checks(const cfg_line* p_line, char* val_tok) +{ + if (*val_tok == '\0') { + cf_crash_nostack(AS_CFG, "line %d :: %s must specify an integer value", + p_line->num, p_line->name_tok); + } + + int64_t value; + + if (0 != cf_str_atoi_64(val_tok, &value)) { + cf_crash_nostack(AS_CFG, "line %d :: %s must be a number, not %s", + p_line->num, p_line->name_tok, val_tok); + } + + return value; +} + +int64_t +cfg_i64_no_checks(const cfg_line* p_line) +{ + return cfg_i64_anyval_no_checks(p_line, p_line->val_tok_1); +} + +int64_t +cfg_i64_val2_no_checks(const cfg_line* p_line) +{ + return cfg_i64_anyval_no_checks(p_line, p_line->val_tok_2); +} + +int64_t +cfg_i64_val3_no_checks(const cfg_line* p_line) +{ + return cfg_i64_anyval_no_checks(p_line, p_line->val_tok_3); +} + +int64_t +cfg_i64(const cfg_line* p_line, int64_t min, int64_t max) +{ + int64_t value = cfg_i64_no_checks(p_line); + + if (value < min || value > max) { + cf_crash_nostack(AS_CFG, "line %d :: %s must be >= %ld and <= %ld, not %ld", + p_line->num, p_line->name_tok, min, max, value); + } + + return value; +} + +int +cfg_int_no_checks(const cfg_line* p_line) +{ + int64_t value = cfg_i64_no_checks(p_line); + + if (value < INT_MIN || value > INT_MAX) { + cf_crash_nostack(AS_CFG, "line %d :: %s %ld overflows int", + p_line->num, p_line->name_tok, value); + } + + return (int)value; +} + +int +cfg_int(const cfg_line* p_line, int min, int max) +{ + int value = cfg_int_no_checks(p_line); + + if (value < min || value > max) { + cf_crash_nostack(AS_CFG, "line %d :: %s must be >= %d and <= %d, not %d", + p_line->num, p_line->name_tok, min, max, value); + } + + return value; +} + +int +cfg_int_val2_no_checks(const cfg_line* p_line) +{ + int64_t value = cfg_i64_val2_no_checks(p_line); + + if (value < INT_MIN || value > INT_MAX) { + cf_crash_nostack(AS_CFG, "line %d :: %s %ld overflows int", + p_line->num, p_line->name_tok, value); + } + + return (int)value; +} + +int 
+cfg_int_val3_no_checks(const cfg_line* p_line) +{ + int64_t value = cfg_i64_val3_no_checks(p_line); + + if (value < INT_MIN || value > INT_MAX) { + cf_crash_nostack(AS_CFG, "line %d :: %s %ld overflows int", + p_line->num, p_line->name_tok, value); + } + + return (int)value; +} +int +cfg_int_val2(const cfg_line* p_line, int min, int max) +{ + int value = cfg_int_val2_no_checks(p_line); + + if (value < min || value > max) { + cf_crash_nostack(AS_CFG, "line %d :: %s must be >= %d and <= %d, not %d", + p_line->num, p_line->name_tok, min, max, value); + } + + return value; +} + +int +cfg_int_val3(const cfg_line* p_line, int min, int max) +{ + int value = cfg_int_val3_no_checks(p_line); + + if (value < min || value > max) { + cf_crash_nostack(AS_CFG, "line %d :: %s must be >= %d and <= %d, not %d", + p_line->num, p_line->name_tok, min, max, value); + } + + return value; +} + +uint64_t +cfg_x64_anyval_no_checks(const cfg_line* p_line, char* val_tok) +{ + if (*val_tok == '\0') { + cf_crash_nostack(AS_CFG, "line %d :: %s must specify a hex value", + p_line->num, p_line->name_tok); + } + + uint64_t value; + + if (0 != cf_str_atoi_x64(val_tok, &value)) { + cf_crash_nostack(AS_CFG, "line %d :: %s must be a 64-bit hex number, not %s", + p_line->num, p_line->name_tok, val_tok); + } + + return value; +} + +uint64_t +cfg_x64_no_checks(const cfg_line* p_line) +{ + return cfg_x64_anyval_no_checks(p_line, p_line->val_tok_1); +} + +uint64_t +cfg_x64(const cfg_line* p_line, uint64_t min, uint64_t max) +{ + uint64_t value = cfg_x64_no_checks(p_line); + + if (min == 0) { + if (value > max) { + cf_crash_nostack(AS_CFG, "line %d :: %s must be <= %lx, not %lx", + p_line->num, p_line->name_tok, max, value); + } + } + else if (value < min || value > max) { + cf_crash_nostack(AS_CFG, "line %d :: %s must be >= %lx and <= %lx, not %lx", + p_line->num, p_line->name_tok, min, max, value); + } + + return value; +} + +uint64_t +cfg_u64_anyval_no_checks(const cfg_line* p_line, char* val_tok) +{ + 
if (*val_tok == '\0') { + cf_crash_nostack(AS_CFG, "line %d :: %s must specify an unsigned integer value", + p_line->num, p_line->name_tok); + } + + uint64_t value; + + if (0 != cf_str_atoi_u64(val_tok, &value)) { + cf_crash_nostack(AS_CFG, "line %d :: %s must be an unsigned number, not %s", + p_line->num, p_line->name_tok, val_tok); + } + + return value; +} + +uint64_t +cfg_u64_no_checks(const cfg_line* p_line) +{ + return cfg_u64_anyval_no_checks(p_line, p_line->val_tok_1); +} + +uint64_t +cfg_u64_val2_no_checks(const cfg_line* p_line) +{ + return cfg_u64_anyval_no_checks(p_line, p_line->val_tok_2); +} + +uint64_t +cfg_u64(const cfg_line* p_line, uint64_t min, uint64_t max) +{ + uint64_t value = cfg_u64_no_checks(p_line); + + if (min == 0) { + if (value > max) { + cf_crash_nostack(AS_CFG, "line %d :: %s must be <= %lu, not %lu", + p_line->num, p_line->name_tok, max, value); + } + } + else if (value < min || value > max) { + cf_crash_nostack(AS_CFG, "line %d :: %s must be >= %lu and <= %lu, not %lu", + p_line->num, p_line->name_tok, min, max, value); + } + + return value; +} + +uint32_t +cfg_u32_no_checks(const cfg_line* p_line) +{ + uint64_t value = cfg_u64_no_checks(p_line); + + if (value > UINT_MAX) { + cf_crash_nostack(AS_CFG, "line %d :: %s %lu overflows unsigned int", + p_line->num, p_line->name_tok, value); + } + + return (uint32_t)value; +} + +uint32_t +cfg_u32(const cfg_line* p_line, uint32_t min, uint32_t max) +{ + uint32_t value = cfg_u32_no_checks(p_line); + + if (min == 0) { + if (value > max) { + cf_crash_nostack(AS_CFG, "line %d :: %s must be <= %u, not %u", + p_line->num, p_line->name_tok, max, value); + } + } + else if (value < min || value > max) { + cf_crash_nostack(AS_CFG, "line %d :: %s must be >= %u and <= %u, not %u", + p_line->num, p_line->name_tok, min, max, value); + } + + return value; +} + +// Note - accepts 0 if min is 0. 
+uint32_t +cfg_u32_power_of_2(const cfg_line* p_line, uint32_t min, uint32_t max) +{ + uint32_t value = cfg_u32(p_line, min, max); + + if ((value & (value - 1)) != 0) { + cf_crash_nostack(AS_CFG, "line %d :: %s must be an exact power of 2, not %u", + p_line->num, p_line->name_tok, value); + } + + return value; +} + +uint16_t +cfg_u16_no_checks(const cfg_line* p_line) +{ + uint64_t value = cfg_u64_no_checks(p_line); + + if (value > USHRT_MAX) { + cf_crash_nostack(AS_CFG, "line %d :: %s %lu overflows unsigned short", + p_line->num, p_line->name_tok, value); + } + + return (uint16_t)value; +} + +uint16_t +cfg_u16(const cfg_line* p_line, uint16_t min, uint16_t max) +{ + uint16_t value = cfg_u16_no_checks(p_line); + + if (min == 0) { + if (value > max) { + cf_crash_nostack(AS_CFG, "line %d :: %s must be <= %u, not %u", + p_line->num, p_line->name_tok, max, value); + } + } + else if (value < min || value > max) { + cf_crash_nostack(AS_CFG, "line %d :: %s must be >= %u and <= %u, not %u", + p_line->num, p_line->name_tok, min, max, value); + } + + return value; +} + +uint8_t +cfg_u8_no_checks(const cfg_line* p_line) +{ + uint64_t value = cfg_u64_no_checks(p_line); + + if (value > UCHAR_MAX) { + cf_crash_nostack(AS_CFG, "line %d :: %s %lu overflows unsigned char", + p_line->num, p_line->name_tok, value); + } + + return (uint8_t)value; +} + +uint8_t +cfg_u8(const cfg_line* p_line, uint8_t min, uint8_t max) +{ + uint8_t value = cfg_u8_no_checks(p_line); + + if (min == 0) { + if (value > max) { + cf_crash_nostack(AS_CFG, "line %d :: %s must be <= %u, not %u", + p_line->num, p_line->name_tok, max, value); + } + } + else if (value < min || value > max) { + cf_crash_nostack(AS_CFG, "line %d :: %s must be >= %u and <= %u, not %u", + p_line->num, p_line->name_tok, min, max, value); + } + + return value; +} + +uint32_t +cfg_seconds_no_checks(const cfg_line* p_line) +{ + if (*p_line->val_tok_1 == '\0') { + cf_crash_nostack(AS_CFG, "line %d :: %s must specify an unsigned integer value 
with time unit (s, m, h, or d)", + p_line->num, p_line->name_tok); + } + + uint64_t value; + + // TODO - should fix this to guard against overflow, give uint32_t. + if (0 != cf_str_atoi_seconds(p_line->val_tok_1, &value)) { + cf_crash_nostack(AS_CFG, "line %d :: %s must be an unsigned number with time unit (s, m, h, or d), not %s", + p_line->num, p_line->name_tok, p_line->val_tok_1); + } + + return (uint32_t)value; +} + +uint32_t +cfg_seconds(const cfg_line* p_line, uint32_t min, uint32_t max) +{ + uint32_t value = cfg_seconds_no_checks(p_line); + + if (min == 0) { + if (value > max) { + cf_crash_nostack(AS_CFG, "line %d :: %s must be <= %u seconds, not %u seconds", + p_line->num, p_line->name_tok, max, value); + } + } + else if (value < min || value > max) { + cf_crash_nostack(AS_CFG, "line %d :: %s must be >= %u seconds and <= %u seconds, not %u seconds", + p_line->num, p_line->name_tok, min, max, value); + } + + return value; +} + +// Minimum & maximum port numbers: +const int CFG_MIN_PORT = 1024; +const int CFG_MAX_PORT = USHRT_MAX; + +cf_ip_port +cfg_port(const cfg_line* p_line) +{ + return (cf_ip_port)cfg_int(p_line, CFG_MIN_PORT, CFG_MAX_PORT); +} + +cf_ip_port +cfg_port_val2(const cfg_line* p_line) +{ + return (cf_ip_port)cfg_int_val2(p_line, CFG_MIN_PORT, CFG_MAX_PORT); +} + +cf_ip_port +cfg_port_val3(const cfg_line* p_line) +{ + return (cf_ip_port)cfg_int_val3(p_line, CFG_MIN_PORT, CFG_MAX_PORT); +} + +//------------------------------------------------ +// Constants used in parsing. +// + +// Token delimiter characters: +const char CFG_WHITESPACE[] = " \t\n\r\f\v"; + + +//========================================================== +// Public API - parse the configuration file. +// + +as_config* +as_config_init(const char* config_file) +{ + as_config* c = &g_config; // shortcut pointer + + // Set the service context defaults. Values parsed from the config file will + // override the defaults. 
+ cfg_set_defaults(); + xdr_config_defaults(); + + FILE* FD; + char iobuf[256]; + int line_num = 0; + cfg_parser_state state; + + cfg_parser_state_init(&state); + + as_namespace* ns = NULL; + dc_config_opt *cur_dc_cfg = NULL; + cf_tls_spec* tls_spec = NULL; + cf_fault_sink* sink = NULL; + as_set* p_set = NULL; // local variable used for set initialization + + // Open the configuration file for reading. + if (NULL == (FD = fopen(config_file, "r"))) { + cf_crash_nostack(AS_CFG, "couldn't open configuration file %s: %s", config_file, cf_strerror(errno)); + } + + // Parse the configuration file, line by line. + while (fgets(iobuf, sizeof(iobuf), FD)) { + line_num++; + + // First chop the comment off, if there is one. + + char* p_comment = strchr(iobuf, '#'); + + if (p_comment) { + *p_comment = '\0'; + } + + // Find (and null-terminate) up to three whitespace-delimited tokens in + // the line, a 'name' token and up to two 'value' tokens. + + cfg_line line = { line_num, NULL, NULL, NULL, NULL }; + + line.name_tok = strtok(iobuf, CFG_WHITESPACE); + + // If there are no tokens, ignore this line, get the next line. + if (! line.name_tok) { + continue; + } + + line.val_tok_1 = strtok(NULL, CFG_WHITESPACE); + + if (! line.val_tok_1) { + line.val_tok_1 = ""; // in case it's used where NULL can't be used + } + else { + line.val_tok_2 = strtok(NULL, CFG_WHITESPACE); + } + + if (! line.val_tok_2) { + line.val_tok_2 = ""; // in case it's used where NULL can't be used + } + else { + line.val_tok_3 = strtok(NULL, CFG_WHITESPACE); + } + + if (! line.val_tok_3) { + line.val_tok_3 = ""; // in case it's used where NULL can't be used + } + + // Note that we can't see this output until a logging sink is specified. + cf_detail(AS_CFG, "line %d :: %s %s %s %s", line_num, line.name_tok, line.val_tok_1, line.val_tok_2, line.val_tok_3); + + // Parse the directive. + switch (state.current) { + + //================================================== + // Parse top-level items. 
+ // + case GLOBAL: + switch (cfg_find_tok(line.name_tok, GLOBAL_OPTS, NUM_GLOBAL_OPTS)) { + case CASE_SERVICE_BEGIN: + cfg_begin_context(&state, SERVICE); + break; + case CASE_LOGGING_BEGIN: + cfg_begin_context(&state, LOGGING); + break; + case CASE_NETWORK_BEGIN: + cfg_begin_context(&state, NETWORK); + break; + case CASE_NAMESPACE_BEGIN: + // Create the namespace objects. + ns = as_namespace_create(line.val_tok_1); + cfg_begin_context(&state, NAMESPACE); + break; + case CASE_MOD_LUA_BEGIN: + cfg_begin_context(&state, MOD_LUA); + break; + case CASE_SECURITY_BEGIN: + cfg_enterprise_only(&line); + cfg_begin_context(&state, SECURITY); + break; + case CASE_XDR_BEGIN: + g_xcfg.xdr_section_configured = true; + cfg_enterprise_only(&line); + cfg_begin_context(&state, XDR); + break; + case CASE_NOT_FOUND: + default: + cfg_unknown_name_tok(&line); + break; + } + break; + + //================================================== + // Parse service context items. + // + case SERVICE: + switch (cfg_find_tok(line.name_tok, SERVICE_OPTS, NUM_SERVICE_OPTS)) { + case CASE_SERVICE_USER: + { + struct passwd* pwd; + if (NULL == (pwd = getpwnam(line.val_tok_1))) { + cf_crash_nostack(AS_CFG, "line %d :: user not found: %s", line_num, line.val_tok_1); + } + c->uid = pwd->pw_uid; + endpwent(); + } + break; + case CASE_SERVICE_GROUP: + { + struct group* grp; + if (NULL == (grp = getgrnam(line.val_tok_1))) { + cf_crash_nostack(AS_CFG, "line %d :: group not found: %s", line_num, line.val_tok_1); + } + c->gid = grp->gr_gid; + endgrent(); + } + break; + case CASE_SERVICE_PAXOS_SINGLE_REPLICA_LIMIT: + c->paxos_single_replica_limit = cfg_u32_no_checks(&line); + break; + case CASE_SERVICE_PIDFILE: + c->pidfile = cfg_strdup_no_checks(&line); + break; + case CASE_SERVICE_CLIENT_FD_MAX: + cfg_renamed_name_tok(&line, "proto-fd-max"); + // No break. 
+ case CASE_SERVICE_PROTO_FD_MAX: + c->n_proto_fd_max = cfg_int_no_checks(&line); + break; + case CASE_SERVICE_ADVERTISE_IPV6: + cf_socket_set_advertise_ipv6(cfg_bool(&line)); + break; + case CASE_SERVICE_AUTO_PIN: + switch (cfg_find_tok(line.val_tok_1, SERVICE_AUTO_PIN_OPTS, NUM_SERVICE_AUTO_PIN_OPTS)) { + case CASE_SERVICE_AUTO_PIN_NONE: + c->auto_pin = CF_TOPO_AUTO_PIN_NONE; + break; + case CASE_SERVICE_AUTO_PIN_CPU: + c->auto_pin = CF_TOPO_AUTO_PIN_CPU; + break; + case CASE_SERVICE_AUTO_PIN_NUMA: + c->auto_pin = CF_TOPO_AUTO_PIN_NUMA; + break; + case CASE_NOT_FOUND: + default: + cfg_unknown_val_tok_1(&line); + break; + } + break; + case CASE_SERVICE_BATCH_THREADS: + c->n_batch_threads = cfg_int(&line, 0, MAX_BATCH_THREADS); + break; + case CASE_SERVICE_BATCH_MAX_BUFFERS_PER_QUEUE: + c->batch_max_buffers_per_queue = cfg_u32_no_checks(&line); + break; + case CASE_SERVICE_BATCH_MAX_REQUESTS: + c->batch_max_requests = cfg_u32_no_checks(&line); + break; + case CASE_SERVICE_BATCH_MAX_UNUSED_BUFFERS: + c->batch_max_unused_buffers = cfg_u32_no_checks(&line); + break; + case CASE_SERVICE_BATCH_PRIORITY: + c->batch_priority = cfg_u32_no_checks(&line); + break; + case CASE_SERVICE_BATCH_INDEX_THREADS: + c->n_batch_index_threads = cfg_u32(&line, 1, MAX_BATCH_THREADS); + break; + case CASE_SERVICE_CLUSTER_NAME: + cfg_set_cluster_name(line.val_tok_1); + break; + case CASE_SERVICE_ENABLE_BENCHMARKS_FABRIC: + c->fabric_benchmarks_enabled = cfg_bool(&line); + break; + case CASE_SERVICE_ENABLE_BENCHMARKS_SVC: + c->svc_benchmarks_enabled = cfg_bool(&line); + break; + case CASE_SERVICE_ENABLE_HIST_INFO: + c->info_hist_enabled = cfg_bool(&line); + break; + case CASE_SERVICE_FEATURE_KEY_FILE: + c->feature_key_file = cfg_strdup(&line, true); + break; + case CASE_SERVICE_HIST_TRACK_BACK: + c->hist_track_back = cfg_u32_no_checks(&line); + break; + case CASE_SERVICE_HIST_TRACK_SLICE: + c->hist_track_slice = cfg_u32_no_checks(&line); + break; + case CASE_SERVICE_HIST_TRACK_THRESHOLDS: + 
c->hist_track_thresholds = cfg_strdup_no_checks(&line); + // TODO - if config key present but no value (not even space) failure mode is bad... + break; + case CASE_SERVICE_INFO_THREADS: + c->n_info_threads = cfg_int_no_checks(&line); + break; + case CASE_SERVICE_LOG_LOCAL_TIME: + cf_fault_use_local_time(cfg_bool(&line)); + break; + case CASE_SERVICE_LOG_MILLIS: + cf_fault_log_millis(cfg_bool(&line)); + break; + case CASE_SERVICE_MIGRATE_MAX_NUM_INCOMING: + c->migrate_max_num_incoming = cfg_u32(&line, 0, AS_MIGRATE_LIMIT_MAX_NUM_INCOMING); + break; + case CASE_SERVICE_MIGRATE_THREADS: + c->n_migrate_threads = cfg_u32(&line, 0, MAX_NUM_MIGRATE_XMIT_THREADS); + break; + case CASE_SERVICE_MIN_CLUSTER_SIZE: + c->clustering_config.cluster_size_min = cfg_u32(&line, 0, AS_CLUSTER_SZ); + break; + case CASE_SERVICE_NODE_ID: + c->self_node = cfg_x64(&line, 1, UINT64_MAX); + break; + case CASE_SERVICE_NODE_ID_INTERFACE: + c->node_id_interface = cfg_strdup_no_checks(&line); + break; + case CASE_SERVICE_NSUP_DELETE_SLEEP: + c->nsup_delete_sleep = cfg_u32_no_checks(&line); + break; + case CASE_SERVICE_NSUP_PERIOD: + c->nsup_period = cfg_u32_no_checks(&line); + break; + case CASE_SERVICE_NSUP_STARTUP_EVICT: + c->nsup_startup_evict = cfg_bool(&line); + break; + case CASE_SERVICE_PROTO_FD_IDLE_MS: + c->proto_fd_idle_ms = cfg_int_no_checks(&line); + break; + case CASE_SERVICE_QUERY_BATCH_SIZE: + c->query_bsize = cfg_int_no_checks(&line); + break; + case CASE_SERVICE_QUERY_BUFPOOL_SIZE: + c->query_bufpool_size = cfg_u32(&line, 1, UINT32_MAX); + break; + case CASE_SERVICE_QUERY_IN_TRANSACTION_THREAD: + c->query_in_transaction_thr = cfg_bool(&line); + break; + case CASE_SERVICE_QUERY_LONG_Q_MAX_SIZE: + c->query_long_q_max_size = cfg_u32(&line, 1, UINT32_MAX); + break; + case CASE_SERVICE_QUERY_PRE_RESERVE_PARTITIONS: + c->partitions_pre_reserved = cfg_bool(&line); + break; + case CASE_SERVICE_QUERY_PRIORITY: + c->query_priority = cfg_int_no_checks(&line); + break; + case 
CASE_SERVICE_QUERY_PRIORITY_SLEEP_US: + c->query_sleep_us = cfg_u64_no_checks(&line); + break; + case CASE_SERVICE_QUERY_REC_COUNT_BOUND: + c->query_rec_count_bound = cfg_u64(&line, 1, UINT64_MAX); + break; + case CASE_SERVICE_QUERY_REQ_IN_QUERY_THREAD: + c->query_req_in_query_thread = cfg_bool(&line); + break; + case CASE_SERVICE_QUERY_REQ_MAX_INFLIGHT: + c->query_req_max_inflight = cfg_u32(&line, 1, UINT32_MAX); + break; + case CASE_SERVICE_QUERY_SHORT_Q_MAX_SIZE: + c->query_short_q_max_size = cfg_u32(&line, 1, UINT32_MAX); + break; + case CASE_SERVICE_QUERY_THREADS: + c->query_threads = cfg_u32(&line, 1, AS_QUERY_MAX_THREADS); + break; + case CASE_SERVICE_QUERY_THRESHOLD: + c->query_threshold = cfg_int_no_checks(&line); + break; + case CASE_SERVICE_QUERY_UNTRACKED_TIME_MS: + c->query_untracked_time_ms = cfg_u64_no_checks(&line); + break; + case CASE_SERVICE_QUERY_WORKER_THREADS: + c->query_worker_threads = cfg_u32(&line, 1, AS_QUERY_MAX_WORKER_THREADS); + break; + case CASE_SERVICE_RUN_AS_DAEMON: + c->run_as_daemon = cfg_bool_no_value_is_true(&line); + break; + case CASE_SERVICE_SCAN_MAX_ACTIVE: + c->scan_max_active = cfg_u32(&line, 0, 200); + break; + case CASE_SERVICE_SCAN_MAX_DONE: + c->scan_max_done = cfg_u32(&line, 0, 1000); + break; + case CASE_SERVICE_SCAN_MAX_UDF_TRANSACTIONS: + c->scan_max_udf_transactions = cfg_u32_no_checks(&line); + break; + case CASE_SERVICE_SCAN_THREADS: + c->scan_threads = cfg_u32(&line, 0, 128); + break; + case CASE_SERVICE_SERVICE_THREADS: + c->n_service_threads = cfg_u32(&line, 1, MAX_DEMARSHAL_THREADS); + break; + case CASE_SERVICE_SINDEX_BUILDER_THREADS: + c->sindex_builder_threads = cfg_u32(&line, 1, MAX_SINDEX_BUILDER_THREADS); + break; + case CASE_SERVICE_SINDEX_GC_MAX_RATE: + c->sindex_gc_max_rate = cfg_u32_no_checks(&line); + break; + case CASE_SERVICE_SINDEX_GC_PERIOD: + c->sindex_gc_period = cfg_u32_no_checks(&line); + break; + case CASE_SERVICE_TICKER_INTERVAL: + c->ticker_interval = cfg_u32_no_checks(&line); + break; 
+ case CASE_SERVICE_TRANSACTION_MAX_MS: + c->transaction_max_ns = cfg_u64_no_checks(&line) * 1000000; + break; + case CASE_SERVICE_TRANSACTION_PENDING_LIMIT: + c->transaction_pending_limit = cfg_u32_no_checks(&line); + break; + case CASE_SERVICE_TRANSACTION_QUEUES: + c->n_transaction_queues = cfg_u32(&line, 1, MAX_TRANSACTION_QUEUES); + break; + case CASE_SERVICE_TRANSACTION_RETRY_MS: + c->transaction_retry_ms = cfg_u32_no_checks(&line); + break; + case CASE_SERVICE_TRANSACTION_THREADS_PER_QUEUE: + c->n_transaction_threads_per_queue = cfg_u32(&line, 1, MAX_TRANSACTION_THREADS_PER_QUEUE); + break; + case CASE_SERVICE_WORK_DIRECTORY: + c->work_directory = cfg_strdup_no_checks(&line); + break; + case CASE_SERVICE_DEBUG_ALLOCATIONS: + switch (cfg_find_tok(line.val_tok_1, SERVICE_DEBUG_ALLOCATIONS_OPTS, NUM_SERVICE_DEBUG_ALLOCATIONS_OPTS)) { + case CASE_SERVICE_DEBUG_ALLOCATIONS_NONE: + c->debug_allocations = CF_ALLOC_DEBUG_NONE; + break; + case CASE_SERVICE_DEBUG_ALLOCATIONS_TRANSIENT: + c->debug_allocations = CF_ALLOC_DEBUG_TRANSIENT; + break; + case CASE_SERVICE_DEBUG_ALLOCATIONS_PERSISTENT: + c->debug_allocations = CF_ALLOC_DEBUG_PERSISTENT; + break; + case CASE_SERVICE_DEBUG_ALLOCATIONS_ALL: + c->debug_allocations = CF_ALLOC_DEBUG_ALL; + break; + case CASE_NOT_FOUND: + default: + cfg_unknown_val_tok_1(&line); + break; + } + break; + case CASE_SERVICE_FABRIC_DUMP_MSGS: + c->fabric_dump_msgs = cfg_bool(&line); + break; + case CASE_SERVICE_PROLE_EXTRA_TTL: + c->prole_extra_ttl = cfg_u32_no_checks(&line); + break; + case CASE_SERVICE_ALLOW_INLINE_TRANSACTIONS: + cfg_obsolete(&line, "please configure 'service-threads' carefully"); + break; + case CASE_SERVICE_RESPOND_CLIENT_ON_MASTER_COMPLETION: + cfg_obsolete(&line, "please use namespace-context 'write-commit-level-override' and/or write transaction policy"); + break; + case CASE_SERVICE_TRANSACTION_REPEATABLE_READ: + cfg_obsolete(&line, "please use namespace-context 'read-consistency-level-override' and/or read 
transaction policy"); + break; + case CASE_SERVICE_AUTO_DUN: + case CASE_SERVICE_AUTO_UNDUN: + case CASE_SERVICE_BATCH_RETRANSMIT: + case CASE_SERVICE_CLIB_LIBRARY: + case CASE_SERVICE_DEFRAG_QUEUE_ESCAPE: + case CASE_SERVICE_DEFRAG_QUEUE_HWM: + case CASE_SERVICE_DEFRAG_QUEUE_LWM: + case CASE_SERVICE_DEFRAG_QUEUE_PRIORITY: + case CASE_SERVICE_DUMP_MESSAGE_ABOVE_SIZE: + case CASE_SERVICE_FABRIC_WORKERS: + case CASE_SERVICE_FB_HEALTH_BAD_PCT: + case CASE_SERVICE_FB_HEALTH_GOOD_PCT: + case CASE_SERVICE_FB_HEALTH_MSG_PER_BURST: + case CASE_SERVICE_FB_HEALTH_MSG_TIMEOUT: + case CASE_SERVICE_GENERATION_DISABLE: + case CASE_SERVICE_MAX_MSGS_PER_TYPE: + case CASE_SERVICE_MIGRATE_READ_PRIORITY: + case CASE_SERVICE_MIGRATE_READ_SLEEP: + case CASE_SERVICE_MIGRATE_RX_LIFETIME_MS: + case CASE_SERVICE_MIGRATE_XMIT_HWM: + case CASE_SERVICE_MIGRATE_XMIT_LWM: + case CASE_SERVICE_MIGRATE_PRIORITY: + case CASE_SERVICE_MIGRATE_XMIT_PRIORITY: + case CASE_SERVICE_MIGRATE_XMIT_SLEEP: + case CASE_SERVICE_NSUP_AUTO_HWM: + case CASE_SERVICE_NSUP_AUTO_HWM_PCT: + case CASE_SERVICE_NSUP_MAX_DELETES: + case CASE_SERVICE_NSUP_QUEUE_ESCAPE: + case CASE_SERVICE_NSUP_QUEUE_HWM: + case CASE_SERVICE_NSUP_QUEUE_LWM: + case CASE_SERVICE_NSUP_REDUCE_PRIORITY: + case CASE_SERVICE_NSUP_REDUCE_SLEEP: + case CASE_SERVICE_NSUP_THREADS: + case CASE_SERVICE_PAXOS_MAX_CLUSTER_SIZE: + case CASE_SERVICE_PAXOS_PROTOCOL: + case CASE_SERVICE_PAXOS_RECOVERY_POLICY: + case CASE_SERVICE_PAXOS_RETRANSMIT_PERIOD: + case CASE_SERVICE_REPLICATION_FIRE_AND_FORGET: + case CASE_SERVICE_SCAN_MEMORY: + case CASE_SERVICE_SCAN_PRIORITY: + case CASE_SERVICE_SCAN_RETRANSMIT: + case CASE_SERVICE_SCHEDULER_PRIORITY: + case CASE_SERVICE_SCHEDULER_TYPE: + case CASE_SERVICE_TRANSACTION_DUPLICATE_THREADS: + case CASE_SERVICE_TRIAL_ACCOUNT_KEY: + case CASE_SERVICE_UDF_RUNTIME_MAX_GMEMORY: + case CASE_SERVICE_UDF_RUNTIME_MAX_MEMORY: + case CASE_SERVICE_USE_QUEUE_PER_DEVICE: + case CASE_SERVICE_WRITE_DUPLICATE_RESOLUTION_DISABLE: + 
cfg_deprecated_name_tok(&line); + break; + case CASE_CONTEXT_END: + cfg_end_context(&state); + break; + case CASE_NOT_FOUND: + default: + cfg_unknown_name_tok(&line); + break; + } + break; + + //================================================== + // Parse logging context items. + // + case LOGGING: + switch (cfg_find_tok(line.name_tok, LOGGING_OPTS, NUM_LOGGING_OPTS)) { + case CASE_LOG_FILE_BEGIN: + if ((sink = cf_fault_sink_hold(line.val_tok_1)) == NULL) { + cf_crash_nostack(AS_CFG, "line %d :: can't add file %s as log sink", line_num, line.val_tok_1); + } + cfg_begin_context(&state, LOGGING_FILE); + break; + case CASE_LOG_CONSOLE_BEGIN: + if ((sink = cf_fault_sink_hold("stderr")) == NULL) { + cf_crash_nostack(AS_CFG, "line %d :: can't add stderr as log sink", line_num); + } + cfg_begin_context(&state, LOGGING_CONSOLE); + break; + case CASE_CONTEXT_END: + cfg_end_context(&state); + break; + case CASE_NOT_FOUND: + default: + cfg_unknown_name_tok(&line); + break; + } + break; + + //---------------------------------------- + // Parse logging::file context items. + // + case LOGGING_FILE: + switch (cfg_find_tok(line.name_tok, LOGGING_FILE_OPTS, NUM_LOGGING_FILE_OPTS)) { + case CASE_LOG_FILE_CONTEXT: + if (0 != cf_fault_sink_addcontext(sink, line.val_tok_1, line.val_tok_2)) { + cf_crash_nostack(AS_CFG, "line %d :: can't add logging file context %s %s", line_num, line.val_tok_1, line.val_tok_2); + } + break; + case CASE_CONTEXT_END: + sink = NULL; + cfg_end_context(&state); + break; + case CASE_NOT_FOUND: + default: + cfg_unknown_name_tok(&line); + break; + } + break; + + //---------------------------------------- + // Parse logging::console context items. 
+ // + case LOGGING_CONSOLE: + switch (cfg_find_tok(line.name_tok, LOGGING_CONSOLE_OPTS, NUM_LOGGING_CONSOLE_OPTS)) { + case CASE_LOG_CONSOLE_CONTEXT: + if (0 != cf_fault_sink_addcontext(sink, line.val_tok_1, line.val_tok_2)) { + cf_crash_nostack(AS_CFG, "line %d :: can't add logging console context %s %s", line_num, line.val_tok_1, line.val_tok_2); + } + break; + case CASE_CONTEXT_END: + sink = NULL; + cfg_end_context(&state); + break; + case CASE_NOT_FOUND: + default: + cfg_unknown_name_tok(&line); + break; + } + break; + + //================================================== + // Parse network context items. + // + case NETWORK: + switch (cfg_find_tok(line.name_tok, NETWORK_OPTS, NUM_NETWORK_OPTS)) { + case CASE_NETWORK_SERVICE_BEGIN: + cfg_begin_context(&state, NETWORK_SERVICE); + break; + case CASE_NETWORK_HEARTBEAT_BEGIN: + cfg_begin_context(&state, NETWORK_HEARTBEAT); + break; + case CASE_NETWORK_FABRIC_BEGIN: + cfg_begin_context(&state, NETWORK_FABRIC); + break; + case CASE_NETWORK_INFO_BEGIN: + cfg_begin_context(&state, NETWORK_INFO); + break; + case CASE_NETWORK_TLS_BEGIN: + tls_spec = cfg_create_tls_spec(c, line.val_tok_1); + cfg_begin_context(&state, NETWORK_TLS); + break; + case CASE_CONTEXT_END: + cfg_end_context(&state); + break; + case CASE_NOT_FOUND: + default: + cfg_unknown_name_tok(&line); + break; + } + break; + + //---------------------------------------- + // Parse network::service context items. + // + case NETWORK_SERVICE: + switch (cfg_find_tok(line.name_tok, NETWORK_SERVICE_OPTS, NUM_NETWORK_SERVICE_OPTS)) { + case CASE_NETWORK_SERVICE_ADDRESS: + cfg_add_addr_bind(line.val_tok_1, &c->service); + break; + case CASE_NETWORK_SERVICE_PORT: + c->service.bind_port = cfg_port(&line); + break; + case CASE_NETWORK_SERVICE_EXTERNAL_ADDRESS: + cfg_renamed_name_tok(&line, "access-address"); + // No break. 
+ case CASE_NETWORK_SERVICE_ACCESS_ADDRESS: + cfg_add_addr_std(line.val_tok_1, &c->service); + break; + case CASE_NETWORK_SERVICE_ACCESS_PORT: + c->service.std_port = cfg_port(&line); + break; + case CASE_NETWORK_SERVICE_ALTERNATE_ACCESS_ADDRESS: + cfg_add_addr_alt(line.val_tok_1, &c->service); + break; + case CASE_NETWORK_SERVICE_ALTERNATE_ACCESS_PORT: + c->service.alt_port = cfg_port(&line); + break; + case CASE_NETWORK_SERVICE_TLS_ACCESS_ADDRESS: + cfg_enterprise_only(&line); + cfg_add_addr_std(line.val_tok_1, &c->tls_service); + break; + case CASE_NETWORK_SERVICE_TLS_ACCESS_PORT: + cfg_enterprise_only(&line); + c->tls_service.std_port = cfg_port(&line); + break; + case CASE_NETWORK_SERVICE_TLS_ADDRESS: + cfg_enterprise_only(&line); + cfg_add_addr_bind(line.val_tok_1, &c->tls_service); + break; + case CASE_NETWORK_SERVICE_TLS_ALTERNATE_ACCESS_ADDRESS: + cfg_enterprise_only(&line); + cfg_add_addr_alt(line.val_tok_1, &c->tls_service); + break; + case CASE_NETWORK_SERVICE_TLS_ALTERNATE_ACCESS_PORT: + cfg_enterprise_only(&line); + c->tls_service.alt_port = cfg_port(&line); + break; + case CASE_NETWORK_SERVICE_TLS_AUTHENTICATE_CLIENT: + cfg_enterprise_only(&line); + add_tls_peer_name(line.val_tok_1, &c->tls_service); + break; + case CASE_NETWORK_SERVICE_TLS_NAME: + cfg_enterprise_only(&line); + c->tls_service.tls_our_name = cfg_strdup_no_checks(&line); + break; + case CASE_NETWORK_SERVICE_TLS_PORT: + cfg_enterprise_only(&line); + c->tls_service.bind_port = cfg_port(&line); + break; + case CASE_NETWORK_SERVICE_ALTERNATE_ADDRESS: + cfg_obsolete(&line, "see Aerospike documentation http://www.aerospike.com/docs/operations/upgrade/network_to_3_10"); + break; + case CASE_NETWORK_SERVICE_NETWORK_INTERFACE_NAME: + cfg_obsolete(&line, "see Aerospike documentation http://www.aerospike.com/docs/operations/upgrade/network_to_3_10"); + break; + case CASE_NETWORK_SERVICE_REUSE_ADDRESS: + cfg_deprecated_name_tok(&line); + break; + case CASE_CONTEXT_END: + cfg_end_context(&state); + 
break; + case CASE_NOT_FOUND: + default: + cfg_unknown_name_tok(&line); + break; + } + break; + + //---------------------------------------- + // Parse network::heartbeat context items. + // + case NETWORK_HEARTBEAT: + switch (cfg_find_tok(line.name_tok, NETWORK_HEARTBEAT_OPTS, NUM_NETWORK_HEARTBEAT_OPTS)) { + case CASE_NETWORK_HEARTBEAT_MODE: + switch (cfg_find_tok(line.val_tok_1, NETWORK_HEARTBEAT_MODE_OPTS, NUM_NETWORK_HEARTBEAT_MODE_OPTS)) { + case CASE_NETWORK_HEARTBEAT_MODE_MULTICAST: + c->hb_config.mode = AS_HB_MODE_MULTICAST; + break; + case CASE_NETWORK_HEARTBEAT_MODE_MESH: + c->hb_config.mode = AS_HB_MODE_MESH; + break; + case CASE_NOT_FOUND: + default: + cfg_unknown_val_tok_1(&line); + break; + } + break; + case CASE_NETWORK_HEARTBEAT_ADDRESS: + cfg_add_addr_bind(line.val_tok_1, &c->hb_serv_spec); + break; + case CASE_NETWORK_HEARTBEAT_MULTICAST_GROUP: + add_addr(line.val_tok_1, &c->hb_multicast_groups); + break; + case CASE_NETWORK_HEARTBEAT_PORT: + c->hb_serv_spec.bind_port = cfg_port(&line); + break; + case CASE_NETWORK_HEARTBEAT_MESH_SEED_ADDRESS_PORT: + cfg_add_mesh_seed_addr_port(cfg_strdup_no_checks(&line), cfg_port_val2(&line), false); + break; + case CASE_NETWORK_HEARTBEAT_INTERVAL: + c->hb_config.tx_interval = cfg_u32(&line, AS_HB_TX_INTERVAL_MS_MIN, AS_HB_TX_INTERVAL_MS_MAX); + break; + case CASE_NETWORK_HEARTBEAT_TIMEOUT: + c->hb_config.max_intervals_missed = cfg_u32(&line, AS_HB_MAX_INTERVALS_MISSED_MIN, UINT_MAX); + break; + case CASE_NETWORK_HEARTBEAT_MTU: + c->hb_config.override_mtu = cfg_u32_no_checks(&line); + break; + case CASE_NETWORK_HEARTBEAT_MCAST_TTL: + cfg_renamed_name_tok(&line, "multicast-ttl"); + // No break. 
+ case CASE_NETWORK_HEARTBEAT_MULTICAST_TTL: + c->hb_config.multicast_ttl = cfg_u8_no_checks(&line); + break; + case CASE_NETWORK_HEARTBEAT_PROTOCOL: + switch (cfg_find_tok(line.val_tok_1, NETWORK_HEARTBEAT_PROTOCOL_OPTS, NUM_NETWORK_HEARTBEAT_PROTOCOL_OPTS)) { + case CASE_NETWORK_HEARTBEAT_PROTOCOL_NONE: + c->hb_config.protocol = AS_HB_PROTOCOL_NONE; + break; + case CASE_NETWORK_HEARTBEAT_PROTOCOL_V3: + c->hb_config.protocol = AS_HB_PROTOCOL_V3; + break; + case CASE_NOT_FOUND: + default: + cfg_unknown_val_tok_1(&line); + break; + } + break; + case CASE_NETWORK_HEARTBEAT_TLS_ADDRESS: + cfg_enterprise_only(&line); + cfg_add_addr_bind(line.val_tok_1, &c->hb_tls_serv_spec); + break; + case CASE_NETWORK_HEARTBEAT_TLS_MESH_SEED_ADDRESS_PORT: + cfg_add_mesh_seed_addr_port(cfg_strdup_no_checks(&line), cfg_port_val2(&line), true); + break; + case CASE_NETWORK_HEARTBEAT_TLS_NAME: + cfg_enterprise_only(&line); + c->hb_tls_serv_spec.tls_our_name = cfg_strdup_no_checks(&line); + break; + case CASE_NETWORK_HEARTBEAT_TLS_PORT: + cfg_enterprise_only(&line); + c->hb_tls_serv_spec.bind_port = cfg_port(&line); + break; + case CASE_NETWORK_HEARTBEAT_INTERFACE_ADDRESS: + cfg_obsolete(&line, "see Aerospike documentation http://www.aerospike.com/docs/operations/upgrade/network_to_3_10"); + break; + case CASE_CONTEXT_END: + cfg_end_context(&state); + break; + case CASE_NOT_FOUND: + default: + cfg_unknown_name_tok(&line); + break; + } + break; + + //---------------------------------------- + // Parse network::fabric context items. 
+ // + case NETWORK_FABRIC: + switch (cfg_find_tok(line.name_tok, NETWORK_FABRIC_OPTS, NUM_NETWORK_FABRIC_OPTS)) { + case CASE_NETWORK_FABRIC_ADDRESS: + cfg_add_addr_bind(line.val_tok_1, &c->fabric); + break; + case CASE_NETWORK_FABRIC_PORT: + c->fabric.bind_port = cfg_port(&line); + break; + case CASE_NETWORK_FABRIC_CHANNEL_BULK_FDS: + c->n_fabric_channel_fds[AS_FABRIC_CHANNEL_BULK] = cfg_u32(&line, 1, MAX_FABRIC_CHANNEL_SOCKETS); + break; + case CASE_NETWORK_FABRIC_CHANNEL_BULK_RECV_THREADS: + c->n_fabric_channel_recv_threads[AS_FABRIC_CHANNEL_BULK] = cfg_u32(&line, 1, MAX_FABRIC_CHANNEL_THREADS); + break; + case CASE_NETWORK_FABRIC_CHANNEL_CTRL_FDS: + c->n_fabric_channel_fds[AS_FABRIC_CHANNEL_CTRL] = cfg_u32(&line, 1, MAX_FABRIC_CHANNEL_SOCKETS); + break; + case CASE_NETWORK_FABRIC_CHANNEL_CTRL_RECV_THREADS: + c->n_fabric_channel_recv_threads[AS_FABRIC_CHANNEL_CTRL] = cfg_u32(&line, 1, MAX_FABRIC_CHANNEL_THREADS); + break; + case CASE_NETWORK_FABRIC_CHANNEL_META_FDS: + c->n_fabric_channel_fds[AS_FABRIC_CHANNEL_META] = cfg_u32(&line, 1, MAX_FABRIC_CHANNEL_SOCKETS); + break; + case CASE_NETWORK_FABRIC_CHANNEL_META_RECV_THREADS: + c->n_fabric_channel_recv_threads[AS_FABRIC_CHANNEL_META] = cfg_u32(&line, 1, MAX_FABRIC_CHANNEL_THREADS); + break; + case CASE_NETWORK_FABRIC_CHANNEL_RW_FDS: + c->n_fabric_channel_fds[AS_FABRIC_CHANNEL_RW] = cfg_u32(&line, 1, MAX_FABRIC_CHANNEL_SOCKETS); + break; + case CASE_NETWORK_FABRIC_CHANNEL_RW_RECV_THREADS: + c->n_fabric_channel_recv_threads[AS_FABRIC_CHANNEL_RW] = cfg_u32(&line, 1, MAX_FABRIC_CHANNEL_THREADS); + break; + case CASE_NETWORK_FABRIC_KEEPALIVE_ENABLED: + c->fabric_keepalive_enabled = cfg_bool(&line); + break; + case CASE_NETWORK_FABRIC_KEEPALIVE_INTVL: + c->fabric_keepalive_intvl = cfg_int_no_checks(&line); + break; + case CASE_NETWORK_FABRIC_KEEPALIVE_PROBES: + c->fabric_keepalive_probes = cfg_int_no_checks(&line); + break; + case CASE_NETWORK_FABRIC_KEEPALIVE_TIME: + c->fabric_keepalive_time = 
cfg_int_no_checks(&line); + break; + case CASE_NETWORK_FABRIC_LATENCY_MAX_MS: + c->fabric_latency_max_ms = cfg_int(&line, 0, 1000); + break; + case CASE_NETWORK_FABRIC_RECV_REARM_THRESHOLD: + c->fabric_recv_rearm_threshold = cfg_u32(&line, 0, 1024 * 1024); + break; + case CASE_NETWORK_FABRIC_SEND_THREADS: + c->n_fabric_send_threads = cfg_u32(&line, 1, MAX_FABRIC_CHANNEL_THREADS); + break; + case CASE_NETWORK_FABRIC_TLS_ADDRESS: + cfg_enterprise_only(&line); + cfg_add_addr_bind(line.val_tok_1, &c->tls_fabric); + break; + case CASE_NETWORK_FABRIC_TLS_NAME: + cfg_enterprise_only(&line); + c->tls_fabric.tls_our_name = cfg_strdup_no_checks(&line); + break; + case CASE_NETWORK_FABRIC_TLS_PORT: + cfg_enterprise_only(&line); + c->tls_fabric.bind_port = cfg_port(&line); + break; + case CASE_CONTEXT_END: + cfg_end_context(&state); + break; + case CASE_NOT_FOUND: + default: + cfg_unknown_name_tok(&line); + break; + } + break; + + //---------------------------------------- + // Parse network::info context items. + // + case NETWORK_INFO: + switch (cfg_find_tok(line.name_tok, NETWORK_INFO_OPTS, NUM_NETWORK_INFO_OPTS)) { + case CASE_NETWORK_INFO_ADDRESS: + cfg_add_addr_bind(line.val_tok_1, &c->info); + break; + case CASE_NETWORK_INFO_PORT: + c->info.bind_port = cfg_port(&line); + break; + case CASE_NETWORK_INFO_ENABLE_FASTPATH: + cfg_deprecated_name_tok(&line); + break; + case CASE_CONTEXT_END: + cfg_end_context(&state); + break; + case CASE_NOT_FOUND: + default: + cfg_unknown_name_tok(&line); + break; + } + break; + + //---------------------------------------- + // Parse network::tls context items. 
+ // + case NETWORK_TLS: + switch (cfg_find_tok(line.name_tok, NETWORK_TLS_OPTS, NUM_NETWORK_TLS_OPTS)) { + case CASE_NETWORK_TLS_CA_FILE: + cfg_enterprise_only(&line); + tls_spec->ca_file = cfg_strdup_no_checks(&line); + break; + case CASE_NETWORK_TLS_CA_PATH: + cfg_enterprise_only(&line); + tls_spec->ca_path = cfg_strdup_no_checks(&line); + break; + case CASE_NETWORK_TLS_CERT_BLACKLIST: + cfg_enterprise_only(&line); + tls_spec->cert_blacklist = cfg_strdup_no_checks(&line); + break; + case CASE_NETWORK_TLS_CERT_FILE: + cfg_enterprise_only(&line); + tls_spec->cert_file = cfg_strdup_no_checks(&line); + break; + case CASE_NETWORK_TLS_CIPHER_SUITE: + cfg_enterprise_only(&line); + tls_spec->cipher_suite = cfg_strdup_no_checks(&line); + break; + case CASE_NETWORK_TLS_KEY_FILE: + cfg_enterprise_only(&line); + tls_spec->key_file = cfg_strdup_no_checks(&line); + break; + case CASE_NETWORK_TLS_PROTOCOLS: + cfg_enterprise_only(&line); + tls_spec->protocols = cfg_strdup_no_checks(&line); + break; + case CASE_CONTEXT_END: + tls_spec = NULL; + cfg_end_context(&state); + break; + case CASE_NOT_FOUND: + default: + cfg_unknown_name_tok(&line); + break; + } + break; + + //================================================== + // Parse namespace items. + // + case NAMESPACE: + switch (cfg_find_tok(line.name_tok, NAMESPACE_OPTS, NUM_NAMESPACE_OPTS)) { + case CASE_NAMESPACE_REPLICATION_FACTOR: + ns->cfg_replication_factor = cfg_u32(&line, 1, AS_CLUSTER_SZ); + break; + case CASE_NAMESPACE_LIMIT_SIZE: + cfg_renamed_name_tok(&line, "memory-size"); + // No break. 
+ case CASE_NAMESPACE_MEMORY_SIZE: + ns->memory_size = cfg_u64_no_checks(&line); + break; + case CASE_NAMESPACE_DEFAULT_TTL: + ns->default_ttl = cfg_seconds_no_checks(&line); + break; + case CASE_NAMESPACE_STORAGE_ENGINE_BEGIN: + switch (cfg_find_tok(line.val_tok_1, NAMESPACE_STORAGE_OPTS, NUM_NAMESPACE_STORAGE_OPTS)) { + case CASE_NAMESPACE_STORAGE_MEMORY: + ns->storage_type = AS_STORAGE_ENGINE_MEMORY; + ns->storage_data_in_memory = true; + break; + case CASE_NAMESPACE_STORAGE_SSD: + cfg_renamed_val_tok_1(&line, "device"); + // No break. + case CASE_NAMESPACE_STORAGE_DEVICE: + ns->storage_type = AS_STORAGE_ENGINE_SSD; + ns->storage_data_in_memory = false; + cfg_begin_context(&state, NAMESPACE_STORAGE_DEVICE); + break; + case CASE_NOT_FOUND: + default: + cfg_unknown_val_tok_1(&line); + break; + } + break; + case CASE_NAMESPACE_ENABLE_XDR: + cfg_enterprise_only(&line); + ns->enable_xdr = cfg_bool(&line); + break; + case CASE_NAMESPACE_SETS_ENABLE_XDR: + ns->sets_enable_xdr = cfg_bool(&line); + break; + case CASE_NAMESPACE_FORWARD_XDR_WRITES: + ns->ns_forward_xdr_writes = cfg_bool(&line); + break; + case CASE_NAMESPACE_XDR_REMOTE_DATACENTER: + xdr_cfg_add_datacenter(cfg_strdup(&line, true), ns->id); + break; + case CASE_NAMESPACE_ALLOW_NONXDR_WRITES: + ns->ns_allow_nonxdr_writes = cfg_bool(&line); + break; + case CASE_NAMESPACE_ALLOW_XDR_WRITES: + ns->ns_allow_xdr_writes = cfg_bool(&line); + break; + case CASE_NAMESPACE_COLD_START_EVICT_TTL: + ns->cold_start_evict_ttl = cfg_u32_no_checks(&line); + break; + case CASE_NAMESPACE_CONFLICT_RESOLUTION_POLICY: + switch (cfg_find_tok(line.val_tok_1, NAMESPACE_CONFLICT_RESOLUTION_OPTS, NUM_NAMESPACE_CONFLICT_RESOLUTION_OPTS)) { + case CASE_NAMESPACE_CONFLICT_RESOLUTION_GENERATION: + ns->conflict_resolution_policy = AS_NAMESPACE_CONFLICT_RESOLUTION_POLICY_GENERATION; + break; + case CASE_NAMESPACE_CONFLICT_RESOLUTION_LAST_UPDATE_TIME: + ns->conflict_resolution_policy = AS_NAMESPACE_CONFLICT_RESOLUTION_POLICY_LAST_UPDATE_TIME; 
+ break; + case CASE_NOT_FOUND: + default: + cfg_unknown_val_tok_1(&line); + break; + } + break; + case CASE_NAMESPACE_DATA_IN_INDEX: + ns->data_in_index = cfg_bool(&line); + break; + case CASE_NAMESPACE_DISABLE_WRITE_DUP_RES: + ns->write_dup_res_disabled = cfg_bool(&line); + break; + case CASE_NAMESPACE_DISALLOW_NULL_SETNAME: + ns->disallow_null_setname = cfg_bool(&line); + break; + case CASE_NAMESPACE_ENABLE_BENCHMARKS_BATCH_SUB: + ns->batch_sub_benchmarks_enabled = true; + break; + case CASE_NAMESPACE_ENABLE_BENCHMARKS_READ: + ns->read_benchmarks_enabled = true; + break; + case CASE_NAMESPACE_ENABLE_BENCHMARKS_UDF: + ns->udf_benchmarks_enabled = true; + break; + case CASE_NAMESPACE_ENABLE_BENCHMARKS_UDF_SUB: + ns->udf_sub_benchmarks_enabled = true; + break; + case CASE_NAMESPACE_ENABLE_BENCHMARKS_WRITE: + ns->write_benchmarks_enabled = true; + break; + case CASE_NAMESPACE_ENABLE_HIST_PROXY: + ns->proxy_hist_enabled = cfg_bool(&line); + break; + case CASE_NAMESPACE_EVICT_HIST_BUCKETS: + ns->evict_hist_buckets = cfg_u32(&line, 100, 10000000); + break; + case CASE_NAMESPACE_EVICT_TENTHS_PCT: + ns->evict_tenths_pct = cfg_u32_no_checks(&line); + break; + case CASE_NAMESPACE_HIGH_WATER_DISK_PCT: + ns->hwm_disk_pct = cfg_u32(&line, 0, 100); + break; + case CASE_NAMESPACE_HIGH_WATER_MEMORY_PCT: + ns->hwm_memory_pct = cfg_u32(&line, 0, 100); + break; + case CASE_NAMESPACE_MAX_TTL: + ns->max_ttl = cfg_seconds(&line, 1, MAX_ALLOWED_TTL); + break; + case CASE_NAMESPACE_MIGRATE_ORDER: + ns->migrate_order = cfg_u32(&line, 1, 10); + break; + case CASE_NAMESPACE_MIGRATE_RETRANSMIT_MS: + ns->migrate_retransmit_ms = cfg_u32_no_checks(&line); + break; + case CASE_NAMESPACE_MIGRATE_SLEEP: + ns->migrate_sleep = cfg_u32_no_checks(&line); + break; + case CASE_NAMESPACE_OBJ_SIZE_HIST_MAX: + ns->obj_size_hist_max = cfg_obj_size_hist_max(cfg_u32_no_checks(&line)); + break; + case CASE_NAMESPACE_PARTITION_TREE_LOCKS: + ns->tree_shared.n_lock_pairs = cfg_u32_power_of_2(&line, 1, 256); + 
break; + case CASE_NAMESPACE_PARTITION_TREE_SPRIGS: + ns->tree_shared.n_sprigs = cfg_u32_power_of_2(&line, 16, 4096); + break; + case CASE_NAMESPACE_RACK_ID: + cfg_enterprise_only(&line); + ns->rack_id = cfg_u32(&line, 0, MAX_RACK_ID); + break; + case CASE_NAMESPACE_READ_CONSISTENCY_LEVEL_OVERRIDE: + switch (cfg_find_tok(line.val_tok_1, NAMESPACE_READ_CONSISTENCY_OPTS, NUM_NAMESPACE_READ_CONSISTENCY_OPTS)) { + case CASE_NAMESPACE_READ_CONSISTENCY_ALL: + ns->read_consistency_level = AS_READ_CONSISTENCY_LEVEL_ALL; + break; + case CASE_NAMESPACE_READ_CONSISTENCY_OFF: + ns->read_consistency_level = AS_READ_CONSISTENCY_LEVEL_PROTO; + break; + case CASE_NAMESPACE_READ_CONSISTENCY_ONE: + ns->read_consistency_level = AS_READ_CONSISTENCY_LEVEL_ONE; + break; + case CASE_NOT_FOUND: + default: + cfg_unknown_val_tok_1(&line); + break; + } + break; + case CASE_NAMESPACE_SET_BEGIN: + p_set = cfg_add_set(ns); + cfg_strcpy(&line, p_set->name, AS_SET_NAME_MAX_SIZE); + cfg_begin_context(&state, NAMESPACE_SET); + break; + case CASE_NAMESPACE_SINDEX_BEGIN: + cfg_begin_context(&state, NAMESPACE_SINDEX); + break; + case CASE_NAMESPACE_GEO2DSPHERE_WITHIN_BEGIN: + cfg_begin_context(&state, NAMESPACE_GEO2DSPHERE_WITHIN); + break; + case CASE_NAMESPACE_SINGLE_BIN: + ns->single_bin = cfg_bool(&line); + break; + case CASE_NAMESPACE_STOP_WRITES_PCT: + ns->stop_writes_pct = cfg_u32(&line, 0, 100); + break; + case CASE_NAMESPACE_STRONG_CONSISTENCY: + cfg_enterprise_only(&line); + ns->cp = cfg_bool(&line); + break; + case CASE_NAMESPACE_STRONG_CONSISTENCY_ALLOW_EXPUNGE: + cfg_enterprise_only(&line); + ns->cp_allow_drops = cfg_bool(&line); + break; + case CASE_NAMESPACE_TOMB_RAIDER_ELIGIBLE_AGE: + cfg_enterprise_only(&line); + ns->tomb_raider_eligible_age = cfg_seconds_no_checks(&line); + break; + case CASE_NAMESPACE_TOMB_RAIDER_PERIOD: + cfg_enterprise_only(&line); + ns->tomb_raider_period = cfg_seconds_no_checks(&line); + break; + case CASE_NAMESPACE_WRITE_COMMIT_LEVEL_OVERRIDE: + switch 
(cfg_find_tok(line.val_tok_1, NAMESPACE_WRITE_COMMIT_OPTS, NUM_NAMESPACE_WRITE_COMMIT_OPTS)) { + case CASE_NAMESPACE_WRITE_COMMIT_ALL: + ns->write_commit_level = AS_WRITE_COMMIT_LEVEL_ALL; + break; + case CASE_NAMESPACE_WRITE_COMMIT_MASTER: + ns->write_commit_level = AS_WRITE_COMMIT_LEVEL_MASTER; + break; + case CASE_NAMESPACE_WRITE_COMMIT_OFF: + ns->write_commit_level = AS_WRITE_COMMIT_LEVEL_PROTO; + break; + case CASE_NOT_FOUND: + default: + cfg_unknown_val_tok_1(&line); + break; + } + break; + case CASE_NAMESPACE_ALLOW_VERSIONS: + case CASE_NAMESPACE_DEMO_READ_MULTIPLIER: + case CASE_NAMESPACE_DEMO_WRITE_MULTIPLIER: + case CASE_NAMESPACE_HIGH_WATER_PCT: + case CASE_NAMESPACE_LOW_WATER_PCT: + cfg_deprecated_name_tok(&line); + break; + case CASE_NAMESPACE_SI_BEGIN: + cfg_deprecated_name_tok(&line); + // Entire section is deprecated but needs to begin and end the + // context to avoid crash. + cfg_begin_context(&state, NAMESPACE_SI); + break; + case CASE_CONTEXT_END: + if (ns->data_in_index && ! (ns->single_bin && ns->storage_data_in_memory && ns->storage_type == AS_STORAGE_ENGINE_SSD)) { + cf_crash_nostack(AS_CFG, "ns %s data-in-index can't be true unless storage-engine is device and both single-bin and data-in-memory are true", ns->name); + } + if (ns->default_ttl > ns->max_ttl) { + cf_crash_nostack(AS_CFG, "ns %s default-ttl can't be > max-ttl", ns->name); + } + if (ns->tree_shared.n_lock_pairs > ns->tree_shared.n_sprigs) { + cf_crash_nostack(AS_CFG, "ns %s partition-tree-locks can't be > partition-tree-sprigs", ns->name); + } + if (ns->storage_data_in_memory) { + ns->storage_post_write_queue = 0; // override default (or configuration mistake) + } + if (ns->storage_data_in_memory && + ! 
ns->storage_commit_to_device) { + c->n_namespaces_inlined++; + } + else { + c->n_namespaces_not_inlined++; + } + ns = NULL; + cfg_end_context(&state); + break; + case CASE_NOT_FOUND: + default: + cfg_unknown_name_tok(&line); + break; + } + break; + + //---------------------------------------- + // Parse namespace::storage-engine device context items. + // + case NAMESPACE_STORAGE_DEVICE: + switch (cfg_find_tok(line.name_tok, NAMESPACE_STORAGE_DEVICE_OPTS, NUM_NAMESPACE_STORAGE_DEVICE_OPTS)) { + case CASE_NAMESPACE_STORAGE_DEVICE_DEVICE: + cfg_add_storage_device(ns, cfg_strdup(&line, true), cfg_strdup_val2(&line, false)); + break; + case CASE_NAMESPACE_STORAGE_DEVICE_FILE: + cfg_add_storage_file(ns, cfg_strdup(&line, true)); + break; + case CASE_NAMESPACE_STORAGE_DEVICE_FILESIZE: + ns->storage_filesize = cfg_u64(&line, 1024 * 1024, AS_STORAGE_MAX_DEVICE_SIZE); + break; + case CASE_NAMESPACE_STORAGE_DEVICE_SCHEDULER_MODE: + ns->storage_scheduler_mode = cfg_strdup_one_of(&line, DEVICE_SCHEDULER_MODES, NUM_DEVICE_SCHEDULER_MODES); + break; + case CASE_NAMESPACE_STORAGE_DEVICE_WRITE_BLOCK_SIZE: + ns->storage_write_block_size = cfg_u32_power_of_2(&line, MIN_WRITE_BLOCK_SIZE, MAX_WRITE_BLOCK_SIZE); + break; + case CASE_NAMESPACE_STORAGE_DEVICE_MEMORY_ALL: + cfg_renamed_name_tok(&line, "data-in-memory"); + // No break. 
+ case CASE_NAMESPACE_STORAGE_DEVICE_DATA_IN_MEMORY: + ns->storage_data_in_memory = cfg_bool(&line); + break; + case CASE_NAMESPACE_STORAGE_DEVICE_COLD_START_EMPTY: + ns->storage_cold_start_empty = cfg_bool(&line); + break; + case CASE_NAMESPACE_STORAGE_DEVICE_COMMIT_TO_DEVICE: + cfg_enterprise_only(&line); + ns->storage_commit_to_device = cfg_bool(&line); + break; + case CASE_NAMESPACE_STORAGE_DEVICE_COMMIT_MIN_SIZE: + cfg_enterprise_only(&line); + ns->storage_commit_min_size = cfg_u32_power_of_2(&line, 0, MAX_WRITE_BLOCK_SIZE); + break; + case CASE_NAMESPACE_STORAGE_DEVICE_DEFRAG_LWM_PCT: + ns->storage_defrag_lwm_pct = cfg_u32_no_checks(&line); + break; + case CASE_NAMESPACE_STORAGE_DEVICE_DEFRAG_QUEUE_MIN: + ns->storage_defrag_queue_min = cfg_u32_no_checks(&line); + break; + case CASE_NAMESPACE_STORAGE_DEVICE_DEFRAG_SLEEP: + ns->storage_defrag_sleep = cfg_u32_no_checks(&line); + break; + case CASE_NAMESPACE_STORAGE_DEVICE_DEFRAG_STARTUP_MINIMUM: + ns->storage_defrag_startup_minimum = cfg_int(&line, 1, 99); + break; + case CASE_NAMESPACE_STORAGE_DEVICE_DISABLE_ODIRECT: + ns->storage_disable_odirect = cfg_bool(&line); + break; + case CASE_NAMESPACE_STORAGE_DEVICE_ENABLE_BENCHMARKS_STORAGE: + ns->storage_benchmarks_enabled = true; + break; + case CASE_NAMESPACE_STORAGE_DEVICE_ENABLE_OSYNC: + ns->storage_enable_osync = cfg_bool(&line); + break; + case CASE_NAMESPACE_STORAGE_DEVICE_ENCRYPTION_KEY_FILE: + cfg_enterprise_only(&line); + ns->storage_encryption_key_file = cfg_strdup(&line, true); + break; + case CASE_NAMESPACE_STORAGE_DEVICE_FLUSH_MAX_MS: + ns->storage_flush_max_us = cfg_u64_no_checks(&line) * 1000; + break; + case CASE_NAMESPACE_STORAGE_DEVICE_FSYNC_MAX_SEC: + ns->storage_fsync_max_us = cfg_u64_no_checks(&line) * 1000000; + break; + case CASE_NAMESPACE_STORAGE_DEVICE_MAX_WRITE_CACHE: + ns->storage_max_write_cache = cfg_u64_no_checks(&line); + break; + case CASE_NAMESPACE_STORAGE_DEVICE_MIN_AVAIL_PCT: + ns->storage_min_avail_pct = cfg_u32(&line, 0, 100); 
+ break; + case CASE_NAMESPACE_STORAGE_DEVICE_POST_WRITE_QUEUE: + ns->storage_post_write_queue = cfg_u32(&line, 0, 4 * 1024); + break; + case CASE_NAMESPACE_STORAGE_DEVICE_TOMB_RAIDER_SLEEP: + cfg_enterprise_only(&line); + ns->storage_tomb_raider_sleep = cfg_u32_no_checks(&line); + break; + case CASE_NAMESPACE_STORAGE_DEVICE_WRITE_THREADS: + ns->storage_write_threads = cfg_u32_no_checks(&line); + break; + case CASE_NAMESPACE_STORAGE_DEVICE_DEFRAG_MAX_BLOCKS: + case CASE_NAMESPACE_STORAGE_DEVICE_DEFRAG_PERIOD: + case CASE_NAMESPACE_STORAGE_DEVICE_LOAD_AT_STARTUP: + case CASE_NAMESPACE_STORAGE_DEVICE_PERSIST: + case CASE_NAMESPACE_STORAGE_DEVICE_READONLY: + case CASE_NAMESPACE_STORAGE_DEVICE_SIGNATURE: + case CASE_NAMESPACE_STORAGE_DEVICE_WRITE_SMOOTHING_PERIOD: + cfg_deprecated_name_tok(&line); + break; + case CASE_CONTEXT_END: + cfg_end_context(&state); + break; + case CASE_NOT_FOUND: + default: + cfg_unknown_name_tok(&line); + break; + } + break; + + //---------------------------------------- + // Parse namespace::set context items. 
+		//
+		case NAMESPACE_SET:
+			// Items in this context configure the set referenced by p_set.
+			switch (cfg_find_tok(line.name_tok, NAMESPACE_SET_OPTS, NUM_NAMESPACE_SET_OPTS)) {
+			case CASE_NAMESPACE_SET_DISABLE_EVICTION:
+				DISABLE_SET_EVICTION(p_set, cfg_bool(&line));
+				break;
+			case CASE_NAMESPACE_SET_ENABLE_XDR:
+				// The value token selects one of three explicit states.
+				switch (cfg_find_tok(line.val_tok_1, NAMESPACE_SET_ENABLE_XDR_OPTS, NUM_NAMESPACE_SET_ENABLE_XDR_OPTS)) {
+				case CASE_NAMESPACE_SET_ENABLE_XDR_USE_DEFAULT:
+					p_set->enable_xdr = AS_SET_ENABLE_XDR_DEFAULT;
+					break;
+				case CASE_NAMESPACE_SET_ENABLE_XDR_FALSE:
+					p_set->enable_xdr = AS_SET_ENABLE_XDR_FALSE;
+					break;
+				case CASE_NAMESPACE_SET_ENABLE_XDR_TRUE:
+					p_set->enable_xdr = AS_SET_ENABLE_XDR_TRUE;
+					break;
+				case CASE_NOT_FOUND:
+				default:
+					// Lookup was on the value token, so report the value.
+					cfg_unknown_val_tok_1(&line);
+					break;
+				}
+				break;
+			case CASE_NAMESPACE_SET_STOP_WRITES_COUNT:
+				p_set->stop_writes_count = cfg_u64_no_checks(&line);
+				break;
+			// Deprecated set items - handed to cfg_deprecated_name_tok(),
+			// their values are not read here.
+			case CASE_NAMESPACE_SET_EVICT_HWM_COUNT:
+			case CASE_NAMESPACE_SET_EVICT_HWM_PCT:
+			case CASE_NAMESPACE_SET_STOP_WRITE_COUNT:
+			case CASE_NAMESPACE_SET_STOP_WRITE_PCT:
+				cfg_deprecated_name_tok(&line);
+				break;
+			case CASE_CONTEXT_END:
+				cfg_end_context(&state);
+				break;
+			case CASE_NOT_FOUND:
+			default:
+				cfg_unknown_name_tok(&line);
+				break;
+			}
+			break;
+
+		//----------------------------------------
+		// Parse namespace::si context items.
+		//
+		// Every recognized item in this context is deprecated - each one is
+		// handed to cfg_deprecated_name_tok() and its value is not read here.
+		//
+		case NAMESPACE_SI:
+			switch (cfg_find_tok(line.name_tok, NAMESPACE_SI_OPTS, NUM_NAMESPACE_SI_OPTS)) {
+			case CASE_NAMESPACE_SI_GC_PERIOD:
+				cfg_deprecated_name_tok(&line);
+				break;
+			case CASE_NAMESPACE_SI_GC_MAX_UNITS:
+				cfg_deprecated_name_tok(&line);
+				break;
+			case CASE_NAMESPACE_SI_HISTOGRAM:
+				cfg_deprecated_name_tok(&line);
+				break;
+			case CASE_NAMESPACE_SI_IGNORE_NOT_SYNC:
+				cfg_deprecated_name_tok(&line);
+				break;
+			case CASE_CONTEXT_END:
+				cfg_end_context(&state);
+				break;
+			case CASE_NOT_FOUND:
+			default:
+				// Lookup was on the name token, so report an unknown name (as
+				// every other context does) rather than an unknown value.
+				cfg_unknown_name_tok(&line);
+				break;
+			}
+			break;
+
+		//----------------------------------------
+		// Parse namespace::sindex context items.
+ // + case NAMESPACE_SINDEX: + switch (cfg_find_tok(line.name_tok, NAMESPACE_SINDEX_OPTS, NUM_NAMESPACE_SINDEX_OPTS)) { + case CASE_NAMESPACE_SINDEX_NUM_PARTITIONS: + // FIXME - minimum should be 1, but currently crashes. + ns->sindex_num_partitions = cfg_u32(&line, MIN_PARTITIONS_PER_INDEX, MAX_PARTITIONS_PER_INDEX); + break; + case CASE_CONTEXT_END: + cfg_end_context(&state); + break; + case CASE_NOT_FOUND: + default: + cfg_unknown_name_tok(&line); + break; + } + break; + + //---------------------------------------- + // Parse namespace::2dsphere-within context items. + // + case NAMESPACE_GEO2DSPHERE_WITHIN: + switch (cfg_find_tok(line.name_tok, NAMESPACE_GEO2DSPHERE_WITHIN_OPTS, NUM_NAMESPACE_GEO2DSPHERE_WITHIN_OPTS)) { + case CASE_NAMESPACE_GEO2DSPHERE_WITHIN_STRICT: + ns->geo2dsphere_within_strict = cfg_bool(&line); + break; + case CASE_NAMESPACE_GEO2DSPHERE_WITHIN_MIN_LEVEL: + ns->geo2dsphere_within_min_level = cfg_u16(&line, 0, 30); + break; + case CASE_NAMESPACE_GEO2DSPHERE_WITHIN_MAX_LEVEL: + ns->geo2dsphere_within_max_level = cfg_u16(&line, 0, 30); + break; + case CASE_NAMESPACE_GEO2DSPHERE_WITHIN_MAX_CELLS: + ns->geo2dsphere_within_max_cells = cfg_u16(&line, 1, MAX_REGION_CELLS); + break; + case CASE_NAMESPACE_GEO2DSPHERE_WITHIN_LEVEL_MOD: + ns->geo2dsphere_within_level_mod = cfg_u16(&line, 1, 3); + break; + case CASE_NAMESPACE_GEO2DSPHERE_WITHIN_EARTH_RADIUS_METERS: + ns->geo2dsphere_within_earth_radius_meters = cfg_u32_no_checks(&line); + break; + case CASE_CONTEXT_END: + cfg_end_context(&state); + break; + case CASE_NOT_FOUND: + default: + cfg_unknown_name_tok(&line); + break; + } + break; + + //================================================== + // Parse mod-lua context items. 
+ // + case MOD_LUA: + switch (cfg_find_tok(line.name_tok, MOD_LUA_OPTS, NUM_MOD_LUA_OPTS)) { + case CASE_MOD_LUA_CACHE_ENABLED: + c->mod_lua.cache_enabled = cfg_bool(&line); + break; + case CASE_MOD_LUA_SYSTEM_PATH: + cfg_strcpy(&line, c->mod_lua.system_path, sizeof(c->mod_lua.system_path)); + break; + case CASE_MOD_LUA_USER_PATH: + cfg_strcpy(&line, c->mod_lua.user_path, sizeof(c->mod_lua.user_path)); + break; + case CASE_CONTEXT_END: + cfg_end_context(&state); + break; + case CASE_NOT_FOUND: + default: + cfg_unknown_name_tok(&line); + break; + } + break; + + //================================================== + // Parse security context items. + // + case SECURITY: + switch (cfg_find_tok(line.name_tok, SECURITY_OPTS, NUM_SECURITY_OPTS)) { + case CASE_SECURITY_ENABLE_SECURITY: + c->sec_cfg.security_enabled = cfg_bool(&line); + break; + case CASE_SECURITY_PRIVILEGE_REFRESH_PERIOD: + c->sec_cfg.privilege_refresh_period = cfg_u32(&line, 10, 60 * 60 * 24); + break; + case CASE_SECURITY_LOG_BEGIN: + cfg_begin_context(&state, SECURITY_LOG); + break; + case CASE_SECURITY_SYSLOG_BEGIN: + cfg_begin_context(&state, SECURITY_SYSLOG); + break; + case CASE_CONTEXT_END: + cfg_end_context(&state); + break; + case CASE_NOT_FOUND: + default: + cfg_unknown_name_tok(&line); + break; + } + break; + + //---------------------------------------- + // Parse security::log context items. + // + case SECURITY_LOG: + switch (cfg_find_tok(line.name_tok, SECURITY_LOG_OPTS, NUM_SECURITY_LOG_OPTS)) { + case CASE_SECURITY_LOG_REPORT_AUTHENTICATION: + c->sec_cfg.report.authentication |= cfg_bool(&line) ? AS_SEC_SINK_LOG : 0; + break; + case CASE_SECURITY_LOG_REPORT_DATA_OP: + as_security_config_log_scope(AS_SEC_SINK_LOG, line.val_tok_1, line.val_tok_2); + break; + case CASE_SECURITY_LOG_REPORT_SYS_ADMIN: + c->sec_cfg.report.sys_admin |= cfg_bool(&line) ? AS_SEC_SINK_LOG : 0; + break; + case CASE_SECURITY_LOG_REPORT_USER_ADMIN: + c->sec_cfg.report.user_admin |= cfg_bool(&line) ? 
AS_SEC_SINK_LOG : 0; + break; + case CASE_SECURITY_LOG_REPORT_VIOLATION: + c->sec_cfg.report.violation |= cfg_bool(&line) ? AS_SEC_SINK_LOG : 0; + break; + case CASE_CONTEXT_END: + cfg_end_context(&state); + break; + case CASE_NOT_FOUND: + default: + cfg_unknown_name_tok(&line); + break; + } + break; + + //---------------------------------------- + // Parse security::syslog context items. + // + case SECURITY_SYSLOG: + switch (cfg_find_tok(line.name_tok, SECURITY_SYSLOG_OPTS, NUM_SECURITY_SYSLOG_OPTS)) { + case CASE_SECURITY_SYSLOG_LOCAL: + c->sec_cfg.syslog_local = (as_sec_syslog_local)cfg_int(&line, AS_SYSLOG_MIN, AS_SYSLOG_MAX); + break; + case CASE_SECURITY_SYSLOG_REPORT_AUTHENTICATION: + c->sec_cfg.report.authentication |= cfg_bool(&line) ? AS_SEC_SINK_SYSLOG : 0; + break; + case CASE_SECURITY_SYSLOG_REPORT_DATA_OP: + as_security_config_log_scope(AS_SEC_SINK_SYSLOG, line.val_tok_1, line.val_tok_2); + break; + case CASE_SECURITY_SYSLOG_REPORT_SYS_ADMIN: + c->sec_cfg.report.sys_admin |= cfg_bool(&line) ? AS_SEC_SINK_SYSLOG : 0; + break; + case CASE_SECURITY_SYSLOG_REPORT_USER_ADMIN: + c->sec_cfg.report.user_admin |= cfg_bool(&line) ? AS_SEC_SINK_SYSLOG : 0; + break; + case CASE_SECURITY_SYSLOG_REPORT_VIOLATION: + c->sec_cfg.report.violation |= cfg_bool(&line) ? AS_SEC_SINK_SYSLOG : 0; + break; + case CASE_CONTEXT_END: + cfg_end_context(&state); + break; + case CASE_NOT_FOUND: + default: + cfg_unknown_name_tok(&line); + break; + } + break; + + //================================================== + // Parse xdr context items. + // + case XDR: + switch (cfg_find_tok(line.name_tok, XDR_OPTS, NUM_XDR_OPTS)) { + case CASE_CONTEXT_BEGIN: + // Allow open brace on its own line to begin this context. 
+ break; + case CASE_XDR_ENABLE_XDR: + g_xcfg.xdr_global_enabled = cfg_bool(&line); + break; + case CASE_XDR_DIGESTLOG_PATH: + g_xcfg.xdr_digestlog_path = cfg_strdup(&line, true); + g_xcfg.xdr_digestlog_file_size = cfg_u64_val2_no_checks(&line); + break; + case CASE_XDR_DATACENTER_BEGIN: + if (g_dc_count == DC_MAX_NUM) { + cf_crash_nostack(AS_CFG, "Cannot have more than %d datacenters", DC_MAX_NUM); + } + + cur_dc_cfg = &g_dc_xcfg_opt[g_dc_count]; + cur_dc_cfg->dc_name = cfg_strdup(&line, true); + cur_dc_cfg->dc_id = g_dc_count; + cf_vector_pointer_init(&cur_dc_cfg->dc_node_v, 10, 0); + cf_vector_pointer_init(&cur_dc_cfg->dc_addr_map_v, 10, 0); + cfg_begin_context(&state, XDR_DATACENTER); + break; + case CASE_XDR_CLIENT_THREADS: + g_xcfg.xdr_client_threads = cfg_u32_no_checks(&line); + break; + case CASE_XDR_COMPRESSION_THRESHOLD: + g_xcfg.xdr_compression_threshold = cfg_u32_no_checks(&line); + break; + case CASE_XDR_DELETE_SHIPPING_ENABLED: + g_xcfg.xdr_delete_shipping_enabled = cfg_bool(&line); + break; + case CASE_XDR_DIGESTLOG_IOWAIT_MS: + g_xcfg.xdr_digestlog_iowait_ms = cfg_u32_no_checks(&line); + break; + case CASE_XDR_FORWARD_XDR_WRITES: + g_xcfg.xdr_forward_xdrwrites = cfg_bool(&line); + break; + case CASE_XDR_HOTKEY_TIME_MS: + g_xcfg.xdr_hotkey_time_ms = cfg_u32_no_checks(&line); + break; + case CASE_XDR_INFO_PORT: + g_xcfg.xdr_info_port = cfg_port(&line); + break; + case CASE_XDR_INFO_TIMEOUT: + g_xcfg.xdr_info_request_timeout_ms = cfg_u32_no_checks(&line); + break; + case CASE_XDR_MAX_SHIP_BANDWIDTH: + g_xcfg.xdr_max_ship_bandwidth = cfg_u32_no_checks(&line); + break; + case CASE_XDR_MAX_SHIP_THROUGHPUT: + g_xcfg.xdr_max_ship_throughput = cfg_u32_no_checks(&line); + break; + case CASE_XDR_MIN_DIGESTLOG_FREE_PCT: + g_xcfg.xdr_min_dlog_free_pct = cfg_u32(&line, 0, 100); + break; + case CASE_XDR_NSUP_DELETES_ENABLED: + g_xcfg.xdr_nsup_deletes_enabled = cfg_bool(&line); + break; + case CASE_XDR_READ_THREADS: + g_xcfg.xdr_read_threads = 
cfg_u32_no_checks(&line); + break; + case CASE_XDR_SHIP_BINS: + g_xcfg.xdr_ship_bins = cfg_bool(&line); + break; + case CASE_XDR_SHIP_DELAY: + g_xcfg.xdr_internal_shipping_delay = cfg_u32_no_checks(&line); + break; + case CASE_XDR_SHIPPING_ENABLED: + g_xcfg.xdr_shipping_enabled = cfg_bool(&line); + break; + case CASE_XDR_WRITE_TIMEOUT: + g_xcfg.xdr_write_timeout = cfg_u32_no_checks(&line); + break; + case CASE_CONTEXT_END: + cfg_end_context(&state); + break; + case CASE_NOT_FOUND: + default: + cfg_unknown_name_tok(&line); + break; + } + break; + + //---------------------------------------- + // Parse xdr::datacenter context items. + // + case XDR_DATACENTER: + switch (cfg_find_tok(line.name_tok, XDR_DATACENTER_OPTS, NUM_XDR_DATACENTER_OPTS)) { + case CASE_CONTEXT_BEGIN: + // Allow open brace on its own line to begin this context. + break; + case CASE_XDR_DATACENTER_DC_NODE_ADDRESS_PORT: + xdr_cfg_add_node_addr_port(cur_dc_cfg, cfg_strdup(&line, true), cfg_port_val2(&line)); + break; + case CASE_XDR_DATACENTER_DC_CONNECTIONS: + cur_dc_cfg->dc_connections = cfg_u32_no_checks(&line); + break; + case CASE_XDR_DATACENTER_DC_CONNECTIONS_IDLE_MS: + cur_dc_cfg->dc_connections_idle_ms = cfg_u32_no_checks(&line); + break; + case CASE_XDR_DATACENTER_DC_INT_EXT_IPMAP: + xdr_cfg_add_int_ext_mapping(cur_dc_cfg, cfg_strdup(&line, true), cfg_strdup_val2(&line, true)); + break; + case CASE_XDR_DATACENTER_DC_SECURITY_CONFIG_FILE: + cur_dc_cfg->dc_security_cfg.sec_config_file = cfg_strdup(&line, true); + break; + case CASE_XDR_DATACENTER_DC_USE_ALTERNATE_SERVICES: + cur_dc_cfg->dc_use_alternate_services = cfg_bool(&line); + break; + case CASE_XDR_DATACENTER_TLS_NAME: + cur_dc_cfg->tls_our_name = cfg_strdup_no_checks(&line); + break; + case CASE_XDR_DATACENTER_TLS_NODE: + xdr_cfg_add_tls_node(cur_dc_cfg, cfg_strdup(&line, true), cfg_strdup_val2(&line, true), cfg_port_val3(&line)); + break; + case CASE_CONTEXT_END: + g_dc_count++; + cfg_end_context(&state); + break; + case 
CASE_NOT_FOUND: + default: + cfg_unknown_name_tok(&line); + break; + } + break; + + //================================================== + // Parser state is corrupt. + // + default: + cf_crash_nostack(AS_CFG, "line %d :: invalid parser top-level state %d", line_num, state.current); + break; + } + } + + fclose(FD); + + //-------------------------------------------- + // Checks that must wait until everything is parsed. Alternatively, such + // checks can be done in as_config_post_process() - doing them here means + // failure logs show in the console, doing them in as_config_post_process() + // means failure logs show in the log file. + // + + as_security_config_check(); + + return &g_config; +} + + +//========================================================== +// Public API - configuration-related tasks after parsing. +// + +void +as_config_post_process(as_config* c, const char* config_file) +{ + //-------------------------------------------- + // Re-read the configuration file and print it to the logs, line by line. + // This will be the first thing to appear in the log file(s). + // + + FILE* FD; + + if (NULL == (FD = fopen(config_file, "r"))) { + cf_crash_nostack(AS_CFG, "couldn't re-open configuration file %s: %s", config_file, cf_strerror(errno)); + } + + char iobuf[256]; + + while (fgets(iobuf, sizeof(iobuf), FD)) { + char* p = iobuf; + char* p_last = p + (strlen(p) - 1); + + if ('\n' == *p_last) { + *p_last-- = '\0'; + } + + if (p_last >= p && '\r' == *p_last) { + *p_last = '\0'; + } + + cf_info(AS_CFG, "%s", p); + } + + fclose(FD); + + // + // Done echoing configuration file to log. + //-------------------------------------------- + + // Configuration checks and special defaults that differ between CE and EE. + cfg_post_process(); + + cf_alloc_set_debug(c->debug_allocations); + + // Check the configured file descriptor limit against the system limit. 
+ struct rlimit fd_limit; + + getrlimit(RLIMIT_NOFILE, &fd_limit); + + if (c->n_proto_fd_max < 0 || (rlim_t)c->n_proto_fd_max > fd_limit.rlim_cur) { + cf_crash_nostack(AS_CFG, "%lu system file descriptors not enough, config specified %d", fd_limit.rlim_cur, c->n_proto_fd_max); + } + + cf_info(AS_CFG, "system file descriptor limit: %lu, proto-fd-max: %d", fd_limit.rlim_cur, c->n_proto_fd_max); + + // Output NUMA topology information. + cf_topo_info(); + + if (c->auto_pin != CF_TOPO_AUTO_PIN_NONE) { + if (c->n_service_threads != 0) { + cf_crash_nostack(AS_CFG, "can't configure 'service-threads' and 'auto-pin' at the same time"); + } + + if (c->n_transaction_queues != 0) { + cf_crash_nostack(AS_CFG, "can't configure 'transaction-queues' and 'auto-pin' at the same time"); + } + } + + uint16_t n_cpus = cf_topo_count_cpus(); + + if (c->n_service_threads == 0) { + c->n_service_threads = n_cpus; + } + + if (c->n_transaction_queues == 0) { + // If there's at least one SSD namespace, use CPU count. Otherwise, be + // modest - only proxies, internal retries, and background scans & queries + // will use these queues & threads. + c->n_transaction_queues = g_config.n_namespaces_not_inlined != 0 ? n_cpus : 4; + } + + // Allocate and initialize the record locks (olocks). Maybe not the best + // place for this, unless we make number of locks configurable. + g_record_locks = olock_create(16 * 1024, true); + + // Setup performance metrics histograms. + cfg_create_all_histograms(); + + // If node-id was not configured, generate one. + if (c->self_node == 0) { + cf_ip_port id_port = c->fabric.bind_port != 0 ? 
c->fabric.bind_port : c->tls_fabric.bind_port; + + if (cf_node_id_get(id_port, c->node_id_interface, &c->self_node) < 0) { + cf_crash_nostack(AS_CFG, "could not get node id"); + } + } + else if (c->node_id_interface) { + cf_crash_nostack(AS_CFG, "may not configure both 'node-id' and ''node-id-interface"); + } + + cf_info(AS_CFG, "node-id %lx", c->self_node); + + // Resolve TLS names in all TLS configurations. + + for (uint32_t i = 0; i < g_config.n_tls_specs; ++i) { + if (g_config.tls_specs[i].name == NULL) { + cf_crash_nostack(AS_CFG, "nameless TLS configuration section"); + } + + g_config.tls_specs[i].name = + cfg_resolve_tls_name(g_config.tls_specs[i].name, g_config.cluster_name, NULL); + } + + // Populate access ports from configuration. + + g_access.service.port = g_config.service.std_port != 0 ? + g_config.service.std_port : g_config.service.bind_port; + + g_access.alt_service.port = g_config.service.alt_port != 0 ? + g_config.service.alt_port : g_access.service.port; + + g_access.tls_service.port = g_config.tls_service.std_port != 0 ? + g_config.tls_service.std_port : g_config.tls_service.bind_port; + + g_access.alt_tls_service.port = g_config.tls_service.alt_port != 0 ? + g_config.tls_service.alt_port : g_access.tls_service.port; + + // Populate access addresses from configuration. + + cfg_serv_spec_std_to_access(&g_config.service, &g_access.service.addrs); + cfg_serv_spec_alt_to_access(&g_config.service, &g_access.alt_service.addrs); + cfg_serv_spec_std_to_access(&g_config.tls_service, &g_access.tls_service.addrs); + cfg_serv_spec_alt_to_access(&g_config.tls_service, &g_access.alt_tls_service.addrs); + + // By default, use bind addresses also as access addresses. 
+ + if (g_access.service.addrs.n_addrs == 0) { + bind_to_access(&g_config.service, &g_access.service.addrs); + } + + if (g_access.tls_service.addrs.n_addrs == 0) { + bind_to_access(&g_config.tls_service, &g_access.tls_service.addrs); + } + + // By default, use non-TLS access addresses also for TLS - and vice versa. + + default_addrs(&g_access.service.addrs, &g_access.tls_service.addrs); + default_addrs(&g_access.alt_service.addrs, &g_access.alt_tls_service.addrs); + + cf_serv_cfg_init(&g_service_bind); + + // Client service bind addresses. + + if (g_config.service.bind_port != 0) { + cfg_serv_spec_to_bind(&g_config.service, &g_config.tls_service, &g_service_bind, + CF_SOCK_OWNER_SERVICE); + } + + // Client TLS service bind addresses. + + if (g_config.tls_service.bind_port != 0) { + cfg_serv_spec_to_bind(&g_config.tls_service, &g_config.service, &g_service_bind, + CF_SOCK_OWNER_SERVICE_TLS); + + cf_tls_spec* tls_spec = cfg_link_tls("service", &g_config.tls_service.tls_our_name); + + uint32_t n_peer_names = g_config.tls_service.n_tls_peer_names; + char **peer_names = g_config.tls_service.tls_peer_names; + + bool has_any = false; + bool has_false = false; + + for (uint32_t i = 0; i < n_peer_names; ++i) { + has_any = has_any || strcmp(peer_names[i], "any") == 0; + has_false = has_false || strcmp(peer_names[i], "false") == 0; + } + + if ((has_any || has_false) && n_peer_names > 1) { + cf_crash_nostack(AS_CFG, "\"any\" and \"false\" are incompatible with other tls-authenticate-client arguments"); + } + + bool auth_client; + + if (has_any || n_peer_names == 0) { + auth_client = true; + n_peer_names = 0; + peer_names = NULL; + } + else if (has_false) { + auth_client = false; + n_peer_names = 0; + peer_names = NULL; + } + else { + auth_client = true; + } + + g_service_tls = tls_config_server_context(tls_spec, auth_client, n_peer_names, peer_names); + } + + if (g_service_bind.n_cfgs == 0) { + cf_crash_nostack(AS_CFG, "no service ports configured"); + } + + // Heartbeat 
service bind addresses. + + cf_serv_cfg_init(&g_config.hb_config.bind_cfg); + + if (c->hb_serv_spec.bind_port != 0) { + cfg_serv_spec_to_bind(&c->hb_serv_spec, &c->hb_tls_serv_spec, &c->hb_config.bind_cfg, + CF_SOCK_OWNER_HEARTBEAT); + } + + // Heartbeat TLS service bind addresses. + + if (c->hb_tls_serv_spec.bind_port != 0) { + if (c->hb_config.mode != AS_HB_MODE_MESH) { + cf_crash_nostack(AS_CFG, "multicast heartbeats do not support TLS"); + } + + cfg_serv_spec_to_bind(&c->hb_tls_serv_spec, &c->hb_serv_spec, &c->hb_config.bind_cfg, + CF_SOCK_OWNER_HEARTBEAT_TLS); + + cf_tls_spec* tls_spec = cfg_link_tls("heartbeat", &c->hb_tls_serv_spec.tls_our_name); + c->hb_config.tls = tls_config_intra_context(tls_spec, "heartbeat"); + } + + if (g_config.hb_config.bind_cfg.n_cfgs == 0) { + cf_crash_nostack(AS_CFG, "no heartbeat ports configured"); + } + + // Heartbeat multicast groups. + + if (c->hb_multicast_groups.n_addrs > 0) { + cfg_mserv_config_from_addrs(&c->hb_multicast_groups, &c->hb_serv_spec.bind, + &g_config.hb_config.multicast_group_cfg, c->hb_serv_spec.bind_port, + CF_SOCK_OWNER_HEARTBEAT, g_config.hb_config.multicast_ttl); + } + + // Fabric service bind addresses. + + cf_serv_cfg_init(&g_fabric_bind); + + if (g_config.fabric.bind_port != 0) { + cfg_serv_spec_to_bind(&g_config.fabric, &g_config.tls_fabric, &g_fabric_bind, + CF_SOCK_OWNER_FABRIC); + } + + // Fabric TLS service bind addresses. + + if (g_config.tls_fabric.bind_port != 0) { + cfg_serv_spec_to_bind(&g_config.tls_fabric, &g_config.fabric, &g_fabric_bind, + CF_SOCK_OWNER_FABRIC_TLS); + + cf_tls_spec* tls_spec = cfg_link_tls("fabric", &g_config.tls_fabric.tls_our_name); + g_fabric_tls = tls_config_intra_context(tls_spec, "fabric"); + } + + if (g_fabric_bind.n_cfgs == 0) { + cf_crash_nostack(AS_CFG, "no fabric ports configured"); + } + + // Info service port. + + g_info_port = g_config.info.bind_port; + + // Info service bind addresses. 
+ + cf_serv_cfg_init(&g_info_bind); + cfg_serv_spec_to_bind(&g_config.info, NULL, &g_info_bind, CF_SOCK_OWNER_INFO); + + // Validate heartbeat configuration. + as_hb_config_validate(); + + //-------------------------------------------- + // Per-namespace config post-processing. + // + + for (int i = 0; i < g_config.n_namespaces; i++) { + as_namespace* ns = g_config.namespaces[i]; + + client_replica_maps_create(ns); + + ns->tree_shared.destructor = (as_index_value_destructor)&as_record_destroy; + ns->tree_shared.destructor_udata = (void*)ns; + ns->tree_shared.locks_shift = 12 - cf_msb(ns->tree_shared.n_lock_pairs); + ns->tree_shared.sprigs_shift = 12 - cf_msb(ns->tree_shared.n_sprigs); + ns->tree_shared.sprigs_offset = sizeof(as_lock_pair) * ns->tree_shared.n_lock_pairs; + + ssd_init_encryption_key(ns); + + char hist_name[HISTOGRAM_NAME_SIZE]; + + // One-way activated histograms (may be tracked histograms). + + sprintf(hist_name, "{%s}-read", ns->name); + create_and_check_hist_track(&ns->read_hist, hist_name, HIST_MILLISECONDS); + + sprintf(hist_name, "{%s}-write", ns->name); + create_and_check_hist_track(&ns->write_hist, hist_name, HIST_MILLISECONDS); + + sprintf(hist_name, "{%s}-udf", ns->name); + create_and_check_hist_track(&ns->udf_hist, hist_name, HIST_MILLISECONDS); + + sprintf(hist_name, "{%s}-query", ns->name); + create_and_check_hist_track(&ns->query_hist, hist_name, HIST_MILLISECONDS); + + sprintf(hist_name, "{%s}-query-rec-count", ns->name); + ns->query_rec_count_hist = histogram_create(hist_name, HIST_COUNT); + + sprintf(hist_name, "{%s}-re-repl", ns->name); + ns->re_repl_hist = histogram_create(hist_name, HIST_MILLISECONDS); + + // Activate-by-config histograms (can't be tracked histograms). 
+ + sprintf(hist_name, "{%s}-proxy", ns->name); + ns->proxy_hist = histogram_create(hist_name, HIST_MILLISECONDS); + + sprintf(hist_name, "{%s}-read-start", ns->name); + ns->read_start_hist = histogram_create(hist_name, HIST_MILLISECONDS); + sprintf(hist_name, "{%s}-read-restart", ns->name); + ns->read_restart_hist = histogram_create(hist_name, HIST_MILLISECONDS); + sprintf(hist_name, "{%s}-read-dup-res", ns->name); + ns->read_dup_res_hist = histogram_create(hist_name, HIST_MILLISECONDS); + sprintf(hist_name, "{%s}-read-repl-ping", ns->name); + ns->read_repl_ping_hist = histogram_create(hist_name, HIST_MILLISECONDS); + sprintf(hist_name, "{%s}-read-local", ns->name); + ns->read_local_hist = histogram_create(hist_name, HIST_MILLISECONDS); + sprintf(hist_name, "{%s}-read-response", ns->name); + ns->read_response_hist = histogram_create(hist_name, HIST_MILLISECONDS); + + sprintf(hist_name, "{%s}-write-start", ns->name); + ns->write_start_hist = histogram_create(hist_name, HIST_MILLISECONDS); + sprintf(hist_name, "{%s}-write-restart", ns->name); + ns->write_restart_hist = histogram_create(hist_name, HIST_MILLISECONDS); + sprintf(hist_name, "{%s}-write-dup-res", ns->name); + ns->write_dup_res_hist = histogram_create(hist_name, HIST_MILLISECONDS); + sprintf(hist_name, "{%s}-write-master", ns->name); + ns->write_master_hist = histogram_create(hist_name, HIST_MILLISECONDS); + sprintf(hist_name, "{%s}-write-repl-write", ns->name); + ns->write_repl_write_hist = histogram_create(hist_name, HIST_MILLISECONDS); + sprintf(hist_name, "{%s}-write-response", ns->name); + ns->write_response_hist = histogram_create(hist_name, HIST_MILLISECONDS); + + sprintf(hist_name, "{%s}-udf-start", ns->name); + ns->udf_start_hist = histogram_create(hist_name, HIST_MILLISECONDS); + sprintf(hist_name, "{%s}-udf-restart", ns->name); + ns->udf_restart_hist = histogram_create(hist_name, HIST_MILLISECONDS); + sprintf(hist_name, "{%s}-udf-dup-res", ns->name); + ns->udf_dup_res_hist = 
histogram_create(hist_name, HIST_MILLISECONDS); + sprintf(hist_name, "{%s}-udf-master", ns->name); + ns->udf_master_hist = histogram_create(hist_name, HIST_MILLISECONDS); + sprintf(hist_name, "{%s}-udf-repl-write", ns->name); + ns->udf_repl_write_hist = histogram_create(hist_name, HIST_MILLISECONDS); + sprintf(hist_name, "{%s}-udf-response", ns->name); + ns->udf_response_hist = histogram_create(hist_name, HIST_MILLISECONDS); + + sprintf(hist_name, "{%s}-batch-sub-start", ns->name); + ns->batch_sub_start_hist = histogram_create(hist_name, HIST_MILLISECONDS); + sprintf(hist_name, "{%s}-batch-sub-restart", ns->name); + ns->batch_sub_restart_hist = histogram_create(hist_name, HIST_MILLISECONDS); + sprintf(hist_name, "{%s}-batch-sub-dup-res", ns->name); + ns->batch_sub_dup_res_hist = histogram_create(hist_name, HIST_MILLISECONDS); + sprintf(hist_name, "{%s}-batch-sub-repl-ping", ns->name); + ns->batch_sub_repl_ping_hist = histogram_create(hist_name, HIST_MILLISECONDS); + sprintf(hist_name, "{%s}-batch-sub-read-local", ns->name); + ns->batch_sub_read_local_hist = histogram_create(hist_name, HIST_MILLISECONDS); + sprintf(hist_name, "{%s}-batch-sub-response", ns->name); + ns->batch_sub_response_hist = histogram_create(hist_name, HIST_MILLISECONDS); + + sprintf(hist_name, "{%s}-udf-sub-start", ns->name); + ns->udf_sub_start_hist = histogram_create(hist_name, HIST_MILLISECONDS); + sprintf(hist_name, "{%s}-udf-sub-restart", ns->name); + ns->udf_sub_restart_hist = histogram_create(hist_name, HIST_MILLISECONDS); + sprintf(hist_name, "{%s}-udf-sub-dup-res", ns->name); + ns->udf_sub_dup_res_hist = histogram_create(hist_name, HIST_MILLISECONDS); + sprintf(hist_name, "{%s}-udf-sub-master", ns->name); + ns->udf_sub_master_hist = histogram_create(hist_name, HIST_MILLISECONDS); + sprintf(hist_name, "{%s}-udf-sub-repl-write", ns->name); + ns->udf_sub_repl_write_hist = histogram_create(hist_name, HIST_MILLISECONDS); + sprintf(hist_name, "{%s}-udf-sub-response", ns->name); + 
ns->udf_sub_response_hist = histogram_create(hist_name, HIST_MILLISECONDS); + + // Linear 'nsup' histograms. + // Note - histograms' ranges MUST be set before use. + + sprintf(hist_name, "%s object size histogram", ns->name); + ns->obj_size_hist = linear_hist_create(hist_name, 0, 0, OBJ_SIZE_HIST_NUM_BUCKETS); + + sprintf(hist_name, "%s evict histogram", ns->name); + ns->evict_hist = linear_hist_create(hist_name, 0, 0, ns->evict_hist_buckets); + + sprintf(hist_name, "%s ttl histogram", ns->name); + ns->ttl_hist = linear_hist_create(hist_name, 0, 0, TTL_HIST_NUM_BUCKETS); + } +} + + +//========================================================== +// Public API - Cluster name. +// + +pthread_mutex_t g_config_lock = PTHREAD_MUTEX_INITIALIZER; + +void +as_config_cluster_name_get(char* cluster_name) +{ + pthread_mutex_lock(&g_config_lock); + strcpy(cluster_name, g_config.cluster_name); + pthread_mutex_unlock(&g_config_lock); +} + +bool +as_config_cluster_name_set(const char* cluster_name) +{ + if (cluster_name[0] == '\0') { + cf_warning(AS_CFG, "cluster name '%s' is not allowed. Ignoring.", cluster_name); + return false; + } + + if (strlen(cluster_name) >= AS_CLUSTER_NAME_SZ) { + cf_warning(AS_CFG, "size of cluster name should not be greater than %d characters. Ignoring cluster name '%s'.", + AS_CLUSTER_NAME_SZ - 1, cluster_name); + return false; + } + + pthread_mutex_lock(&g_config_lock); + if (strcmp(cluster_name,"null") == 0){ + // 'null' is a special value representing an unset cluster-name. + strcpy(g_config.cluster_name, ""); + } else { + strcpy(g_config.cluster_name, cluster_name); + } + pthread_mutex_unlock(&g_config_lock); + + return true; +} + +bool +as_config_cluster_name_matches(const char* cluster_name) +{ + pthread_mutex_lock(&g_config_lock); + bool matches = strcmp(cluster_name, g_config.cluster_name) == 0; + pthread_mutex_unlock(&g_config_lock); + return matches; +} + + +//========================================================== +// Public API - XDR. 
+// + +bool +xdr_read_security_configfile(xdr_security_config* sc) +{ + FILE* FD; + char iobuf[256]; + int line_num = 0; + cfg_parser_state state; + + cfg_parser_state_init(&state); + + // Initialize the XDR config values to the defaults. + sc->username = NULL; + sc->password = NULL; + iobuf[0] = 0; + + // Open the configuration file for reading. Dont crash if it fails as this + // function can be called during runtime (when credentials file change) + if (NULL == (FD = fopen(sc->sec_config_file, "r"))) { + cf_warning(AS_XDR, "Couldn't open configuration file %s: %s", + sc->sec_config_file, cf_strerror(errno)); + return false; + } + + // Parse the configuration file, line by line. + while (fgets(iobuf, sizeof(iobuf), FD)) { + line_num++; + + // First chop the comment off, if there is one. + + char* p_comment = strchr(iobuf, '#'); + + if (p_comment) { + *p_comment = '\0'; + } + + // Find (and null-terminate) up to three whitespace-delimited tokens in + // the line, a 'name' token and up to two 'value' tokens. + + cfg_line line = { line_num, NULL, NULL, NULL, NULL }; + + line.name_tok = strtok(iobuf, CFG_WHITESPACE); + + // If there are no tokens, ignore this line, get the next line. + if (! line.name_tok) { + continue; + } + + line.val_tok_1 = strtok(NULL, CFG_WHITESPACE); + + if (! line.val_tok_1) { + line.val_tok_1 = ""; // in case it's used where NULL can't be used + } + else { + line.val_tok_2 = strtok(NULL, CFG_WHITESPACE); + } + + if (! line.val_tok_2) { + line.val_tok_2 = ""; // in case it's used where NULL can't be used + } + else { + line.val_tok_3 = strtok(NULL, CFG_WHITESPACE); + } + + if (! line.val_tok_3) { + line.val_tok_3 = ""; // in case it's used where NULL can't be used + } + + // Note that we can't see this output until a logging sink is specified. + cf_detail(AS_CFG, "line %d :: %s %s %s %s", line_num, line.name_tok, + line.val_tok_1, line.val_tok_2, line.val_tok_3); + + // Parse the directive. 
+ switch (state.current) { + + // Parse top-level items. + case GLOBAL: + switch (cfg_find_tok(line.name_tok, XDR_SEC_GLOBAL_OPTS, NUM_XDR_SEC_GLOBAL_OPTS)) { + case XDR_SEC_CASE_CREDENTIALS_BEGIN: + cfg_begin_context(&state, XDR_SEC_CREDENTIALS); + break; + case CASE_NOT_FOUND: + default: + cfg_unknown_name_tok(&line); + break; + } + break; + + // Parse xdr context items. + case XDR_SEC_CREDENTIALS: + switch (cfg_find_tok(line.name_tok, XDR_SEC_CREDENTIALS_OPTS, NUM_XDR_SEC_CREDENTIALS_OPTS)) { + case CASE_CONTEXT_BEGIN: + // Allow open brace on its own line to begin this context. + break; + case XDR_SEC_CASE_CREDENTIALS_USERNAME: + sc->username = cfg_strdup(&line, true); + break; + case XDR_SEC_CASE_CREDENTIALS_PASSWORD: + sc->password = cfg_strdup(&line, true); + break; + case CASE_CONTEXT_END: + cfg_end_context(&state); + break; + case CASE_NOT_FOUND: + default: + cfg_unknown_name_tok(&line); + break; + } + break; + + // Parser state is corrupt. + default: + cf_warning(AS_XDR, "line %d :: invalid parser top-level state %d", + line_num, state.current); + break; + } + } + + // Close the file. + fclose(FD); + return true; +} + + +//========================================================== +// Item-specific parsing utilities. 
+// + +void +init_addr_list(cf_addr_list* addrs) +{ + addrs->n_addrs = 0; + memset(&addrs->addrs, '\0', sizeof(addrs->addrs)); +} + +void +add_addr(const char* name, cf_addr_list* addrs) +{ + uint32_t n = addrs->n_addrs; + + if (n >= CF_SOCK_CFG_MAX) { + cf_crash_nostack(CF_SOCKET, "Too many addresses: %s", name); + } + + addrs->addrs[n] = cf_strdup(name); + ++addrs->n_addrs; +} + +void +add_tls_peer_name(const char* name, cf_serv_spec* spec) +{ + uint32_t n = spec->n_tls_peer_names; + + if (n >= CF_SOCK_CFG_MAX) { + cf_crash_nostack(CF_SOCKET, "Too many TLS peer names: %s", name); + } + + spec->tls_peer_names[n] = cf_strdup(name); + ++spec->n_tls_peer_names; +} + +void +copy_addrs(const cf_addr_list* from, cf_addr_list* to) +{ + for (uint32_t i = 0; i < from->n_addrs; ++i) { + to->addrs[i] = from->addrs[i]; + } + + to->n_addrs = from->n_addrs; +} + +void +default_addrs(cf_addr_list* one, cf_addr_list* two) +{ + if (one->n_addrs == 0) { + copy_addrs(two, one); + } + + if (two->n_addrs == 0) { + copy_addrs(one, two); + } +} + +void +bind_to_access(const cf_serv_spec* from, cf_addr_list* to) +{ + cf_serv_spec spec; + spec.bind_port = 0; + init_addr_list(&spec.bind); + spec.std_port = 0; + init_addr_list(&spec.std); + spec.alt_port = 0; + init_addr_list(&spec.alt); + + for (uint32_t i = 0; i < from->bind.n_addrs; ++i) { + cf_ip_addr resol[CF_SOCK_CFG_MAX]; + uint32_t n_resol = CF_SOCK_CFG_MAX; + + if (cf_ip_addr_from_string_multi(from->bind.addrs[i], resol, &n_resol) < 0) { + cf_crash_nostack(AS_CFG, "Invalid default access address: %s", from->bind.addrs[i]); + } + + bool valid = true; + + for (uint32_t k = 0; k < n_resol; ++k) { + if (cf_ip_addr_is_any(&resol[k]) || cf_ip_addr_is_local(&resol[k])) { + cf_debug(AS_CFG, "Skipping invalid default access address: %s", + from->bind.addrs[i]); + valid = false; + break; + } + } + + if (valid) { + uint32_t n = spec.std.n_addrs; + spec.std.addrs[n] = from->bind.addrs[i]; + ++spec.std.n_addrs; + } + } + + 
cfg_serv_spec_std_to_access(&spec, to); +} + +void +cfg_add_addr_bind(const char* name, cf_serv_spec* spec) +{ + add_addr(name, &spec->bind); +} + +void +cfg_add_addr_std(const char* name, cf_serv_spec* spec) +{ + add_addr(name, &spec->std); +} + +void +cfg_add_addr_alt(const char* name, cf_serv_spec* spec) +{ + add_addr(name, &spec->alt); +} + +void +cfg_mserv_config_from_addrs(cf_addr_list* addrs, cf_addr_list* bind_addrs, + cf_mserv_cfg* serv_cfg, cf_ip_port port, cf_sock_owner owner, + uint8_t ttl) +{ + static cf_addr_list def_addrs = { + .n_addrs = 1, .addrs = { "any" } + }; + + if (bind_addrs->n_addrs == 0) { + bind_addrs = &def_addrs; + } + + for (uint32_t i = 0; i < addrs->n_addrs; ++i) { + + cf_ip_addr resol[CF_SOCK_CFG_MAX]; + uint32_t n_resol = CF_SOCK_CFG_MAX; + + if (cf_ip_addr_from_string_multi(addrs->addrs[i], resol, + &n_resol) < 0) { + cf_crash_nostack(AS_CFG, "Invalid multicast group: %s", + addrs->addrs[i]); + } + + for (uint32_t j = 0; j < bind_addrs->n_addrs; j++) { + + cf_ip_addr bind_resol[CF_SOCK_CFG_MAX]; + uint32_t n_bind_resol = CF_SOCK_CFG_MAX; + + if (cf_ip_addr_from_string_multi(bind_addrs->addrs[j], + bind_resol, + &n_bind_resol) < 0) { + cf_crash_nostack(AS_CFG, "Invalid address: %s", + bind_addrs->addrs[j]); + } + + for (int32_t k = 0; k < n_resol; ++k) { + for (int32_t l = 0; l < n_bind_resol; ++l) { + if (cf_mserv_cfg_add_combo(serv_cfg, owner, port, + &resol[k], &bind_resol[l], ttl) < 0) { + cf_crash_nostack(AS_CFG, "Too many IP addresses"); + } + } + } + } + } +} + +void +cfg_serv_spec_to_bind(const cf_serv_spec* spec, const cf_serv_spec* def_spec, cf_serv_cfg* bind, + cf_sock_owner owner) +{ + static cf_addr_list def_addrs = { + .n_addrs = 1, .addrs = { "any" } + }; + + cf_sock_cfg cfg; + cf_sock_cfg_init(&cfg, owner); + cfg.port = spec->bind_port; + + const cf_addr_list* addrs; + + if (spec->bind.n_addrs != 0) { + addrs = &spec->bind; + } + else if (def_spec != NULL && def_spec->bind.n_addrs != 0) { + addrs = &def_spec->bind; 
+ } + else { + addrs = &def_addrs; + } + + for (uint32_t i = 0; i < addrs->n_addrs; ++i) { + cf_ip_addr resol[CF_SOCK_CFG_MAX]; + uint32_t n_resol = CF_SOCK_CFG_MAX; + + if (cf_ip_addr_from_string_multi(addrs->addrs[i], resol, &n_resol) < 0) { + cf_crash_nostack(AS_CFG, "Invalid address: %s", addrs->addrs[i]); + } + + for (uint32_t k = 0; k < n_resol; ++k) { + cf_ip_addr_copy(&resol[k], &cfg.addr); + + if (cf_serv_cfg_add_sock_cfg(bind, &cfg) < 0) { + cf_crash_nostack(AS_CFG, "Too many IP addresses: %s", addrs->addrs[i]); + } + } + } +} + +static void +addrs_to_access(const cf_addr_list* addrs, cf_addr_list* access) +{ + for (uint32_t i = 0; i < addrs->n_addrs; ++i) { + cf_ip_addr resol[CF_SOCK_CFG_MAX]; + uint32_t n_resol = CF_SOCK_CFG_MAX; + + if (cf_ip_addr_from_string_multi(addrs->addrs[i], resol, &n_resol) < 0) { + cf_crash_nostack(AS_CFG, "Invalid access address: %s", addrs->addrs[i]); + } + + for (uint32_t k = 0; k < n_resol; ++k) { + if (cf_ip_addr_is_any(&resol[k])) { + cf_crash_nostack(AS_CFG, "Invalid access address: %s", addrs->addrs[i]); + } + } + + if (cf_ip_addr_is_dns_name(addrs->addrs[i])) { + add_addr(addrs->addrs[i], access); + } + else { + for (uint32_t k = 0; k < n_resol; ++k) { + char tmp[250]; + cf_ip_addr_to_string_safe(&resol[k], tmp, sizeof(tmp)); + add_addr(tmp, access); + } + } + } +} + +void +cfg_serv_spec_std_to_access(const cf_serv_spec* spec, cf_addr_list* access) +{ + addrs_to_access(&spec->std, access); +} + +void +cfg_serv_spec_alt_to_access(const cf_serv_spec* spec, cf_addr_list* access) +{ + addrs_to_access(&spec->alt, access); +} + +void +cfg_add_mesh_seed_addr_port(char* addr, cf_ip_port port, bool tls) +{ + int32_t i; + + for (i = 0; i < AS_CLUSTER_SZ; i++) { + if (g_config.hb_config.mesh_seed_addrs[i] == NULL) { + g_config.hb_config.mesh_seed_addrs[i] = addr; + g_config.hb_config.mesh_seed_ports[i] = port; + g_config.hb_config.mesh_seed_tls[i] = tls; + break; + } + } + + if (i == AS_CLUSTER_SZ) { + cf_crash_nostack(AS_CFG, 
"can't configure more than %d mesh-seed-address-port entries", AS_CLUSTER_SZ); + } +} + +as_set* +cfg_add_set(as_namespace* ns) +{ + if (ns->sets_cfg_count >= AS_SET_MAX_COUNT) { + cf_crash_nostack(AS_CFG, "namespace %s - too many sets", ns->name); + } + + // Lazily allocate temporary sets config array. + if (! ns->sets_cfg_array) { + size_t array_size = AS_SET_MAX_COUNT * sizeof(as_set); + + ns->sets_cfg_array = (as_set*)cf_malloc(array_size); + memset(ns->sets_cfg_array, 0, array_size); + } + + return &ns->sets_cfg_array[ns->sets_cfg_count++]; +} + +void +cfg_add_storage_file(as_namespace* ns, char* file_name) +{ + int i; + + for (i = 0; i < AS_STORAGE_MAX_FILES; i++) { + if (! ns->storage_files[i]) { + ns->storage_files[i] = file_name; + break; + } + } + + if (i == AS_STORAGE_MAX_FILES) { + cf_crash_nostack(AS_CFG, "namespace %s - too many storage files", ns->name); + } +} + +void +cfg_add_storage_device(as_namespace* ns, char* device_name, char* shadow_name) +{ + int i; + + for (i = 0; i < AS_STORAGE_MAX_DEVICES; i++) { + if (! ns->storage_devices[i]) { + ns->storage_devices[i] = device_name; + ns->storage_shadows[i] = shadow_name; + break; + } + } + + if (i == AS_STORAGE_MAX_DEVICES) { + cf_crash_nostack(AS_CFG, "namespace %s - too many storage devices", ns->name); + } +} + +uint32_t +cfg_obj_size_hist_max(uint32_t hist_max) +{ + uint32_t round_to = OBJ_SIZE_HIST_NUM_BUCKETS; + uint32_t round_max = hist_max != 0 ? + ((hist_max + round_to - 1) / round_to) * round_to : round_to; + + if (round_max != hist_max) { + cf_info(AS_CFG, "rounding obj-size-hist-max %u up to %u", hist_max, round_max); + } + + return round_max; // in 128-byte blocks +} + +void +cfg_set_cluster_name(char* cluster_name){ + if(!as_config_cluster_name_set(cluster_name)){ + cf_crash_nostack(AS_CFG, "cluster name '%s' is not allowed", cluster_name); + } +} + + +//========================================================== +// Other (non-item-specific) utilities. 
+// + +void +create_and_check_hist_track(cf_hist_track** h, const char* name, + histogram_scale scale) +{ + *h = cf_hist_track_create(name, scale); + + as_config* c = &g_config; + + if (c->hist_track_back != 0 && + ! cf_hist_track_start(*h, c->hist_track_back, c->hist_track_slice, c->hist_track_thresholds)) { + cf_crash_nostack(AS_AS, "couldn't enable histogram tracking: %s", name); + } +} + +// TODO - not really a config method any more, reorg needed. +void +cfg_create_all_histograms() +{ + g_stats.batch_index_hist = histogram_create("batch-index", HIST_MILLISECONDS); + g_stats.info_hist = histogram_create("info", HIST_MILLISECONDS); + g_stats.svc_demarshal_hist = histogram_create("svc-demarshal", HIST_MILLISECONDS); + g_stats.svc_queue_hist = histogram_create("svc-queue", HIST_MILLISECONDS); + + g_stats.fabric_send_init_hists[AS_FABRIC_CHANNEL_BULK] = histogram_create("fabric-bulk-send-init", HIST_MILLISECONDS); + g_stats.fabric_send_fragment_hists[AS_FABRIC_CHANNEL_BULK] = histogram_create("fabric-bulk-send-fragment", HIST_MILLISECONDS); + g_stats.fabric_recv_fragment_hists[AS_FABRIC_CHANNEL_BULK] = histogram_create("fabric-bulk-recv-fragment", HIST_MILLISECONDS); + g_stats.fabric_recv_cb_hists[AS_FABRIC_CHANNEL_BULK] = histogram_create("fabric-bulk-recv-cb", HIST_MILLISECONDS); + g_stats.fabric_send_init_hists[AS_FABRIC_CHANNEL_CTRL] = histogram_create("fabric-ctrl-send-init", HIST_MILLISECONDS); + g_stats.fabric_send_fragment_hists[AS_FABRIC_CHANNEL_CTRL] = histogram_create("fabric-ctrl-send-fragment", HIST_MILLISECONDS); + g_stats.fabric_recv_fragment_hists[AS_FABRIC_CHANNEL_CTRL] = histogram_create("fabric-ctrl-recv-fragment", HIST_MILLISECONDS); + g_stats.fabric_recv_cb_hists[AS_FABRIC_CHANNEL_CTRL] = histogram_create("fabric-ctrl-recv-cb", HIST_MILLISECONDS); + g_stats.fabric_send_init_hists[AS_FABRIC_CHANNEL_META] = histogram_create("fabric-meta-send-init", HIST_MILLISECONDS); + g_stats.fabric_send_fragment_hists[AS_FABRIC_CHANNEL_META] = 
histogram_create("fabric-meta-send-fragment", HIST_MILLISECONDS); + g_stats.fabric_recv_fragment_hists[AS_FABRIC_CHANNEL_META] = histogram_create("fabric-meta-recv-fragment", HIST_MILLISECONDS); + g_stats.fabric_recv_cb_hists[AS_FABRIC_CHANNEL_META] = histogram_create("fabric-meta-recv-cb", HIST_MILLISECONDS); + g_stats.fabric_send_init_hists[AS_FABRIC_CHANNEL_RW] = histogram_create("fabric-rw-send-init", HIST_MILLISECONDS); + g_stats.fabric_send_fragment_hists[AS_FABRIC_CHANNEL_RW] = histogram_create("fabric-rw-send-fragment", HIST_MILLISECONDS); + g_stats.fabric_recv_fragment_hists[AS_FABRIC_CHANNEL_RW] = histogram_create("fabric-rw-recv-fragment", HIST_MILLISECONDS); + g_stats.fabric_recv_cb_hists[AS_FABRIC_CHANNEL_RW] = histogram_create("fabric-rw-recv-cb", HIST_MILLISECONDS); +} + +void +cfg_init_serv_spec(cf_serv_spec* spec_p) +{ + spec_p->bind_port = 0; + init_addr_list(&spec_p->bind); + spec_p->std_port = 0; + init_addr_list(&spec_p->std); + spec_p->alt_port = 0; + init_addr_list(&spec_p->alt); + spec_p->tls_our_name = NULL; + spec_p->n_tls_peer_names = 0; + memset(spec_p->tls_peer_names, 0, sizeof(spec_p->tls_peer_names)); +} + +cf_tls_spec* +cfg_create_tls_spec(as_config* cfg, char* name) +{ + uint32_t ind = cfg->n_tls_specs++; + + if (ind >= MAX_TLS_SPECS) { + cf_crash_nostack(AS_CFG, "too many TLS configuration sections"); + } + + cf_tls_spec* tls_spec = cfg->tls_specs + ind; + tls_spec->name = cf_strdup(name); + return tls_spec; +} + +char* +cfg_resolve_tls_name(char* tls_name, const char* cluster_name, const char* which) +{ + bool expanded = false; + + if (strcmp(tls_name, "") == 0) { + char hostname[1024]; + int rv = gethostname(hostname, sizeof(hostname)); + if (rv != 0) { + cf_crash_nostack(AS_CFG, + "trouble resolving hostname for tls-name: %s", cf_strerror(errno)); + } + hostname[sizeof(hostname)-1] = '\0'; // POSIX.1-2001 + cf_free(tls_name); + tls_name = cf_strdup(hostname); + expanded = true; + } + else if (strcmp(tls_name, "") == 0) { + if 
(strlen(cluster_name) == 0) { + cf_crash_nostack + (AS_CFG, "can't resolve tls-name to non-existent cluster-name"); + } + cf_free(tls_name); + tls_name = cf_strdup(cluster_name); + expanded = true; + } + + if (expanded && which != NULL) { + cf_info(AS_CFG, "%s tls-name %s", which, tls_name); + } + + return tls_name; +} + +cf_tls_spec* +cfg_link_tls(const char* which, char** our_name) +{ + if (*our_name == NULL) { + cf_crash_nostack(AS_CFG, "%s TLS configuration requires tls-name", which); + } + + *our_name = cfg_resolve_tls_name(*our_name, g_config.cluster_name, which); + cf_tls_spec* tls_spec = NULL; + + for (uint32_t i = 0; i < g_config.n_tls_specs; ++i) { + if (strcmp(*our_name, g_config.tls_specs[i].name) == 0) { + tls_spec = g_config.tls_specs + i; + break; + } + } + + if (tls_spec == NULL) { + cf_crash_nostack(AS_CFG, "invalid tls-name in TLS configuration: %s", + *our_name); + } + + return tls_spec; +} + +//========================================================== +// XDR utilities. +// + +void +xdr_cfg_add_datacenter(char* dc, uint32_t nsid) +{ + cf_vector *v = &g_config.namespaces[nsid-1]->xdr_dclist_v; + + // Crash if datacenter with same name already exists. + for (uint32_t index = 0; index < cf_vector_size(v); index++) { + if (strcmp((char *)cf_vector_pointer_get(v, index), dc) == 0) { + cf_crash_nostack(AS_XDR, "datacenter %s already exists for namespace %s - please remove duplicate entries from config file", + dc, g_config.namespaces[nsid-1]->name); + } + } + + // Add the string pointer (of the datacenter name) to the vector. + cf_vector_pointer_append(v, dc); +} + +void +xdr_cfg_add_node_addr_port(dc_config_opt *dc_cfg, char* addr, int port) +{ + xdr_cfg_add_tls_node(dc_cfg, addr, NULL, port); +} + +void +xdr_cfg_add_tls_node(dc_config_opt *dc_cfg, char* addr, char *tls_name, int port) +{ + // Add the element to the vector. 
// Community-edition implementation - unconditionally returns true.
// NOTE(review): presumably this tells callers that enterprise-only
// configuration is an error in this build - confirm against the callers.
bool
as_config_error_enterprise_only(void)
{
	return true;
}
+//
+
+// Community edition handler for an enterprise-only configuration item - always
+// crashes (no stack), reporting the offending line number and name token.
+void
+cfg_enterprise_only(const cfg_line* p_line)
+{
+	cf_crash_nostack(AS_CFG,
+			"line %d :: '%s' is enterprise-only",
+			p_line->num,
+			p_line->name_tok);
+}
+
+
+// Post-process the parsed configuration. Only the namespace context needs any
+// work here - each configured namespace is visited in index order.
+void
+cfg_post_process()
+{
+	uint32_t i = 0;
+
+	while (i < g_config.n_namespaces) {
+		post_process_namespace(g_config.namespaces[i++]);
+	}
+}
+
+
+//==========================================================
+// Local helpers.
+//
+
+// If no conflict resolution policy was configured for ns, fall back to the
+// generation-based policy. An explicitly configured policy is left alone.
+void
+post_process_namespace(as_namespace* ns)
+{
+	if (ns->conflict_resolution_policy !=
+			AS_NAMESPACE_CONFLICT_RESOLUTION_POLICY_UNDEF) {
+		return;
+	}
+
+	ns->conflict_resolution_policy =
+			AS_NAMESPACE_CONFLICT_RESOLUTION_POLICY_GENERATION;
+}
diff --git a/as/src/base/features_ce.c b/as/src/base/features_ce.c
new file mode 100644
index 00000000..b07c8ffb
--- /dev/null
+++ b/as/src/base/features_ce.c
@@ -0,0 +1,38 @@
+/*
+ * features_ce.c
+ *
+ * Copyright (C) 2018 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+//==========================================================
+// Includes.
+//
+
+#include "base/features.h"
+
+
+//==========================================================
+// Public API.
+//
+
+// Community edition has no feature information - always returns the constant
+// string "null".
+const char *
+as_features_info()
+{
+	return "null";
+}
diff --git a/as/src/base/index.c b/as/src/base/index.c
new file mode 100644
index 00000000..6d3de8df
--- /dev/null
+++ b/as/src/base/index.c
@@ -0,0 +1,1254 @@
+/*
+ * index.c
+ *
+ * Copyright (C) 2012-2016 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+//==========================================================
+// Includes.
+//
+
+#include "base/index.h"
+
+#include <pthread.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+#include <xmmintrin.h>
+
+#include "citrusleaf/alloc.h"
+#include "citrusleaf/cf_atomic.h"
+#include "citrusleaf/cf_clock.h"
+#include "citrusleaf/cf_digest.h"
+#include "citrusleaf/cf_queue.h"
+
+#include "arenax.h"
+#include "cf_mutex.h"
+#include "fault.h"
+#include "olock.h"
+
+#include "base/cfg.h"
+#include "base/datamodel.h"
+#include "base/stats.h"
+
+
+//==========================================================
+// Constants and typedefs.
+//
+
+// Red-black tree node colors.
+typedef enum {
+	AS_BLACK = 0,
+	AS_RED = 1
+} as_index_color;
+
+// An index element collected during a reduce - pointer plus arena handle.
+typedef struct as_index_ph_s {
+	as_index *r;
+	cf_arenax_handle r_h;
+} as_index_ph;
+
+// Array of elements collected during a reduce: alloc_sz is the capacity of
+// indexes[], pos is the number of entries filled so far.
+typedef struct as_index_ph_array_s {
+	uint64_t alloc_sz;
+	uint64_t pos;
+	as_index_ph indexes[];
+} as_index_ph_array;
+
+// One entry of the explicit parent stack used while searching, inserting,
+// deleting and rebalancing.
+typedef struct as_index_ele_s {
+	struct as_index_ele_s *parent;
+	cf_arenax_handle me_h;
+	as_index *me;
+} as_index_ele;
+
+// A reduce's element array lives on the stack unless it needs more than this.
+const size_t MAX_STACK_ARRAY_BYTES = 128 * 1024;
+
+
+//==========================================================
+// Globals.
+//
+
+// Trees whose ref-count hit zero, waiting for the garbage collection thread.
+static cf_queue g_gc_queue;
+
+
+//==========================================================
+// Forward declarations.
+//
+
+void *run_index_tree_gc(void *unused);
+void as_index_tree_destroy(as_index_tree *tree);
+void as_index_sprig_done(as_index_sprig *isprig, as_index *r, cf_arenax_handle r_h);
+bool as_index_sprig_invalid_record_done(as_index_sprig *isprig, as_index_ref *index_ref);
+
+uint64_t as_index_sprig_reduce_partial(as_index_sprig *isprig, uint64_t sample_count, as_index_reduce_fn cb, void *udata);
+void as_index_sprig_traverse(as_index_sprig *isprig, cf_arenax_handle r_h, as_index_ph_array *v_a);
+void as_index_sprig_traverse_purge(as_index_sprig *isprig, cf_arenax_handle r_h);
+
+int as_index_sprig_exists(as_index_sprig *isprig, cf_digest *keyd);
+int as_index_sprig_get_vlock(as_index_sprig *isprig, cf_digest *keyd, as_index_ref *index_ref);
+int as_index_sprig_get_insert_vlock(as_index_sprig *isprig, cf_digest *keyd, as_index_ref *index_ref);
+int as_index_sprig_delete(as_index_sprig *isprig, cf_digest *keyd);
+
+int as_index_sprig_search_lockless(as_index_sprig *isprig, cf_digest *keyd, as_index **ret, cf_arenax_handle *ret_h);
+void as_index_sprig_insert_rebalance(as_index_sprig *isprig, as_index *root_parent, as_index_ele *ele);
+void as_index_sprig_delete_rebalance(as_index_sprig *isprig, as_index *root_parent, as_index_ele *ele);
+void as_index_rotate_left(as_index_ele *a, as_index_ele *b);
+void as_index_rotate_right(as_index_ele *a, as_index_ele *b);
+
+// Fill isprig with everything needed to operate on sprig number sprig_i of
+// tree: the shared destructor and its udata, the arena, the sprig itself, and
+// the lock pair covering it.
+static inline void
+as_index_sprig_from_i(as_index_tree *tree, as_index_sprig *isprig,
+		uint32_t sprig_i)
+{
+	// Derive the lock pair index from the sprig index via the difference of
+	// the two shifts.
+	uint32_t lock_i = sprig_i >>
+			(tree->shared->locks_shift - tree->shared->sprigs_shift);
+
+	isprig->destructor = tree->shared->destructor;
+	isprig->destructor_udata = tree->shared->destructor_udata;
+	isprig->arena = tree->arena;
+	isprig->pair = tree_locks(tree) + lock_i;
+	isprig->sprig = tree_sprigs(tree) + sprig_i;
+}
+
+// Same as as_index_sprig_from_i(), but the sprig (and lock pair) is selected
+// by bits of the digest keyd.
+static inline void
+as_index_sprig_from_keyd(as_index_tree *tree, as_index_sprig *isprig,
+		const cf_digest *keyd)
+{
+	// Get the 12 most significant non-pid bits in the digest. Note - this is
+	// hardwired around the way we currently extract the (12 bit) partition-ID
+	// from the digest.
+	uint32_t bits = (((uint32_t)keyd->digest[1] & 0xF0) << 4) |
+			(uint32_t)keyd->digest[2];
+
+	uint32_t lock_i = bits >> tree->shared->locks_shift;
+	uint32_t sprig_i = bits >> tree->shared->sprigs_shift;
+
+	isprig->destructor = tree->shared->destructor;
+	isprig->destructor_udata = tree->shared->destructor_udata;
+	isprig->arena = tree->arena;
+	isprig->pair = tree_locks(tree) + lock_i;
+	isprig->sprig = tree_sprigs(tree) + sprig_i;
+}
+
+
+//==========================================================
+// Public API - initialize garbage collection system.
+//
+
+// Initialize the queue of trees awaiting destruction and start the detached
+// thread (run_index_tree_gc) that services it. Crashes if the thread cannot be
+// created.
+void
+as_index_tree_gc_init()
+{
+	cf_queue_init(&g_gc_queue, sizeof(as_index_tree*), 4096, true);
+
+	pthread_t thread;
+	pthread_attr_t attrs;
+
+	pthread_attr_init(&attrs);
+	pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED);
+
+	int rv = pthread_create(&thread, &attrs, run_index_tree_gc, NULL);
+
+	// The attributes object is not needed once pthread_create() has returned,
+	// whether or not it succeeded - release it to balance pthread_attr_init().
+	pthread_attr_destroy(&attrs);
+
+	if (rv != 0) {
+		cf_crash(AS_INDEX, "failed to create garbage collection thread");
+	}
+}
+
+
+// Number of trees currently queued for destruction.
+int
+as_index_tree_gc_queue_size()
+{
+	return cf_queue_sz(&g_gc_queue);
+}
+
+
+//==========================================================
+// Public API - create/destroy/size a tree.
+//
+
+// Create a new red-black tree.
+as_index_tree *
+as_index_tree_create(as_index_tree_shared *shared, cf_arenax *arena)
+{
+	// One ref-counted allocation, sized for the tree header plus all the lock
+	// pairs (two mutexes each) and all the sprigs.
+	size_t sprigs_size = sizeof(as_sprig) * shared->n_sprigs;
+	size_t locks_size = sizeof(cf_mutex) * shared->n_lock_pairs * 2;
+
+	as_index_tree *tree =
+			cf_rc_alloc(sizeof(as_index_tree) + locks_size + sprigs_size);
+
+	tree->shared = shared;
+	tree->arena = arena;
+
+	as_lock_pair *first_pair = tree_locks(tree);
+	as_lock_pair *end_pair = first_pair + shared->n_lock_pairs;
+
+	for (as_lock_pair *p = first_pair; p < end_pair; p++) {
+		cf_mutex_init(&p->lock);
+		cf_mutex_init(&p->reduce_lock);
+	}
+
+	// All sprigs zeroed - the tree starts empty.
+	memset(tree_sprigs(tree), 0, sprigs_size);
+
+	return tree;
+}
+
+
+// Drop one reference to the tree. When the last reference goes, the tree is
+// handed to the garbage collection queue rather than destroyed here.
+//
+// Returns 0 if this was the last reference (tree queued for destruction), or 1
+// if other references remain.
+// TODO - nobody cares about the return value, make it void?
+int
+as_index_tree_release(as_index_tree *tree)
+{
+	int ref_count = cf_rc_release(tree);
+
+	if (ref_count <= 0) {
+		cf_assert(ref_count == 0, AS_INDEX, "tree ref-count %d", ref_count);
+
+		// TODO - call as_index_tree_destroy() directly if tree is empty?
+
+		cf_queue_push(&g_gc_queue, &tree);
+
+		return 0;
+	}
+
+	return 1;
+}
+
+
+// Total element count of the tree - the sum of every sprig's n_elements.
+uint64_t
+as_index_tree_size(as_index_tree *tree)
+{
+	as_sprig* first_sprig = tree_sprigs(tree);
+	as_sprig* end_sprig = first_sprig + tree->shared->n_sprigs;
+	uint64_t total = 0;
+
+	for (as_sprig* s = first_sprig; s < end_sprig; s++) {
+		total += s->n_elements;
+	}
+
+	return total;
+}
+
+
+//==========================================================
+// Public API - reduce a tree.
+//
+
+// Invoke cb on every element in the tree, with the callbacks made outside the
+// tree lock. Shorthand for a partial reduce with AS_REDUCE_ALL.
+void
+as_index_reduce(as_index_tree *tree, as_index_reduce_fn cb, void *udata)
+{
+	as_index_reduce_partial(tree, AS_REDUCE_ALL, cb, udata);
+}
+
+
+// Make a callback for a specified number of elements in the tree, from outside
+// the tree lock.
+void
+as_index_reduce_partial(as_index_tree *tree, uint64_t sample_count,
+		as_index_reduce_fn cb, void *udata)
+{
+	// Walk the sprigs from the highest index down to 0, so largest digests
+	// come first and the order holds for the whole tree. (Rapid rebalance
+	// requires exact order.)
+
+	int sprig_ix = (int)tree->shared->n_sprigs;
+
+	while (--sprig_ix >= 0) {
+		as_index_sprig target;
+
+		as_index_sprig_from_i(tree, &target, (uint32_t)sprig_ix);
+
+		uint64_t n_done = as_index_sprig_reduce_partial(&target, sample_count,
+				cb, udata);
+
+		sample_count -= n_done;
+
+		// Quota used up - nothing left to sample.
+		if (sample_count == 0) {
+			break;
+		}
+	}
+}
+
+
+//==========================================================
+// Public API - get/insert/delete an element in a tree.
+//
+
+// Check whether the tree holds an element with the specified digest.
+//
+// Returns:
+//		 0 - found (yes)
+//		-1 - not found (no)
+int
+as_index_exists(as_index_tree *tree, cf_digest *keyd)
+{
+	as_index_sprig target;
+
+	as_index_sprig_from_keyd(tree, &target, keyd);
+
+	return as_index_sprig_exists(&target, keyd);
+}
+
+
+// Look up the element with the specified digest and, if present, hand back a
+// locked and reserved reference to it in index_ref.
+//
+// Returns:
+//		 0 - found (reference returned in index_ref)
+//		-1 - not found (index_ref untouched)
+int
+as_index_get_vlock(as_index_tree *tree, cf_digest *keyd,
+		as_index_ref *index_ref)
+{
+	as_index_sprig target;
+
+	as_index_sprig_from_keyd(tree, &target, keyd);
+
+	return as_index_sprig_get_vlock(&target, keyd, index_ref);
+}
+
+
+// If there's an element with specified digest in the tree, return a locked
+// and reserved reference to it in index_ref. If not, create an element with
+// this digest, insert it into the tree, and return a locked and reserved
+// reference to it in index_ref.
+//
+// Returns:
+//		 1 - created and inserted (reference returned in index_ref)
+//		 0 - found already existing (reference returned in index_ref)
+//		-1 - error - could not allocate arena stage
+//		-2 - error - found "half created" or deleted record
+int
+as_index_get_insert_vlock(as_index_tree *tree, cf_digest *keyd,
+		as_index_ref *index_ref)
+{
+	// Pick the sprig (and lock pair) for this digest, then do the work there.
+	as_index_sprig isprig;
+	as_index_sprig_from_keyd(tree, &isprig, keyd);
+
+	return as_index_sprig_get_insert_vlock(&isprig, keyd, index_ref);
+}
+
+
+// If there's an element with specified digest in the tree, delete it.
+//
+// Returns:
+//		 0 - found and deleted
+//		-1 - not found
+// TODO - nobody cares about the return value, make it void?
+int
+as_index_delete(as_index_tree *tree, cf_digest *keyd)
+{
+	as_index_sprig isprig;
+	as_index_sprig_from_keyd(tree, &isprig, keyd);
+
+	return as_index_sprig_delete(&isprig, keyd);
+}
+
+
+//==========================================================
+// Local helpers - garbage collection, generic.
+//
+
+// Garbage collection thread body - blocks on g_gc_queue and destroys each
+// tree popped from it. Only returns (NULL) if a pop does not return
+// CF_QUEUE_OK.
+void *
+run_index_tree_gc(void *unused)
+{
+	as_index_tree *tree;
+
+	while (cf_queue_pop(&g_gc_queue, &tree, CF_QUEUE_FOREVER) == CF_QUEUE_OK) {
+		as_index_tree_destroy(tree);
+	}
+
+	return NULL;
+}
+
+
+// Release every element in every sprig of the tree, destroy all the tree's
+// mutexes, and free the tree itself.
+void
+as_index_tree_destroy(as_index_tree *tree)
+{
+	as_sprig* sprig = tree_sprigs(tree);
+	as_sprig* sprig_end = sprig + tree->shared->n_sprigs;
+
+	while (sprig < sprig_end) {
+		as_index_sprig isprig;
+
+		// Note - isprig.pair is not set here; the purge below does no locking.
+		isprig.destructor = tree->shared->destructor;
+		isprig.destructor_udata = tree->shared->destructor_udata;
+		isprig.arena = tree->arena;
+		isprig.sprig = sprig;
+
+		as_index_sprig_traverse_purge(&isprig, isprig.sprig->root_h);
+		sprig++;
+	}
+
+	as_lock_pair *pair = tree_locks(tree);
+	as_lock_pair *pair_end = pair + tree->shared->n_lock_pairs;
+
+	while (pair < pair_end) {
+		cf_mutex_destroy(&pair->lock);
+		cf_mutex_destroy(&pair->reduce_lock);
+		pair++;
+	}
+
+	cf_rc_free(tree);
+}
+
+
+// Drop one reference to element r. If that was the last reference, call the
+// sprig's destructor (if any) on r and return its handle r_h to the arena.
+void
+as_index_sprig_done(as_index_sprig *isprig, as_index *r, cf_arenax_handle r_h)
+{
+	int rc = as_index_release(r);
+
+	if (rc > 0) {
+		return;
+	}
+
+	cf_assert(rc == 0, AS_INDEX, "index ref-count %d", rc);
+
+	if (isprig->destructor) {
+		isprig->destructor(r, isprig->destructor_udata);
+	}
+
+	cf_arenax_free(isprig->arena, r_h);
+}
+
+
+// If the record referenced by index_ref is not valid, undo the caller's lock
+// and reservation - unlock the record lock (unless skip_lock is set) and
+// release the reference - and return true. If the record is valid, do nothing
+// and return false.
+bool
+as_index_sprig_invalid_record_done(as_index_sprig *isprig,
+		as_index_ref *index_ref)
+{
+	if (as_index_is_valid_record(index_ref->r)) {
+		return false;
+	}
+
+	if (! index_ref->skip_lock) {
+		cf_mutex_unlock(index_ref->olock);
+	}
+
+	as_index_sprig_done(isprig, index_ref->r, index_ref->r_h);
+
+	return true;
+}
+
+
+//==========================================================
+// Local helpers - reduce a sprig.
+//
+
+// Make a callback for a specified number of elements in the tree, from outside
+// the tree lock.
+//
+// Elements are collected (and reserved) under the sprig's reduce lock, then
+// each is record-locked and passed to cb after the reduce lock is dropped.
+//
+// Returns the number of elements collected, or 0 when sample_count was
+// AS_REDUCE_ALL or the sprig was empty.
+uint64_t
+as_index_sprig_reduce_partial(as_index_sprig *isprig, uint64_t sample_count,
+		as_index_reduce_fn cb, void *udata)
+{
+	bool reduce_all = sample_count == AS_REDUCE_ALL;
+
+	cf_mutex_lock(&isprig->pair->reduce_lock);
+
+	// Never collect more than the sprig holds.
+	if (reduce_all || sample_count > isprig->sprig->n_elements) {
+		sample_count = isprig->sprig->n_elements;
+	}
+
+	// Common to encounter empty sprigs.
+	if (sample_count == 0) {
+		cf_mutex_unlock(&isprig->pair->reduce_lock);
+		return 0;
+	}
+
+	size_t sz = sizeof(as_index_ph_array) +
+			(sizeof(as_index_ph) * sample_count);
+	as_index_ph_array *v_a;
+	// NOTE(review): buf is a byte array used as an as_index_ph_array - this
+	// relies on the compiler aligning it suitably; confirm.
+	uint8_t buf[MAX_STACK_ARRAY_BYTES];
+
+	// Use the stack buffer when the array fits, otherwise the heap.
+	v_a = sz > MAX_STACK_ARRAY_BYTES ? cf_malloc(sz) : (as_index_ph_array*)buf;
+
+	v_a->alloc_sz = sample_count;
+	v_a->pos = 0;
+
+	uint64_t start_ms = cf_getms();
+
+	// Recursively, fetch all the value pointers into this array, so we can make
+	// all the callbacks outside the big lock.
+	as_index_sprig_traverse(isprig, isprig->sprig->root_h, v_a);
+
+	cf_detail(AS_INDEX, "sprig reduce took %lu ms", cf_getms() - start_ms);
+
+	cf_mutex_unlock(&isprig->pair->reduce_lock);
+
+	uint64_t i;
+
+	for (i = 0; i < v_a->pos; i++) {
+		as_index_ref r_ref;
+
+		r_ref.skip_lock = false;
+		r_ref.r = v_a->indexes[i].r;
+		r_ref.r_h = v_a->indexes[i].r_h;
+
+		olock_vlock(g_record_locks, &r_ref.r->keyd, &r_ref.olock);
+
+		// Ignore this record if it's "half created" or deleted.
+		if (as_index_sprig_invalid_record_done(isprig, &r_ref)) {
+			continue;
+		}
+
+		// Callback MUST call as_record_done() to unlock and release record.
+		cb(&r_ref, udata);
+	}
+
+	// Only free the array if it came from the heap.
+	if (v_a != (as_index_ph_array*)buf) {
+		cf_free(v_a);
+	}
+
+	// In reduce-all mode, return 0 so outside loop continues to pass
+	// sample_count = AS_REDUCE_ALL.
+	return reduce_all ? 0 : i;
+}
+
+
+// In-order walk (left subtree, node, right subtree) of the subtree at r_h,
+// reserving each visited element and appending it to v_a. Stops adding once
+// v_a is full.
+void
+as_index_sprig_traverse(as_index_sprig *isprig, cf_arenax_handle r_h,
+		as_index_ph_array *v_a)
+{
+	if (r_h == SENTINEL_H) {
+		return;
+	}
+
+	as_index *r = RESOLVE_H(r_h);
+
+	as_index_sprig_traverse(isprig, r->left_h, v_a);
+
+	// Array is full - skip this node and everything to its right.
+	if (v_a->pos >= v_a->alloc_sz) {
+		return;
+	}
+
+	as_index_reserve(r);
+
+	v_a->indexes[v_a->pos].r = r;
+	v_a->indexes[v_a->pos].r_h = r_h;
+	v_a->pos++;
+
+	as_index_sprig_traverse(isprig, r->right_h, v_a);
+}
+
+
+// Post-order walk of the subtree at r_h, releasing each element via
+// as_index_sprig_done() after both of its children have been handled.
+void
+as_index_sprig_traverse_purge(as_index_sprig *isprig, cf_arenax_handle r_h)
+{
+	if (r_h == SENTINEL_H) {
+		return;
+	}
+
+	as_index *r = RESOLVE_H(r_h);
+
+	as_index_sprig_traverse_purge(isprig, r->left_h);
+	as_index_sprig_traverse_purge(isprig, r->right_h);
+
+	as_index_sprig_done(isprig, r, r_h);
+}
+
+
+//==========================================================
+// Local helpers - get/insert/delete an element in a sprig.
+//
+
+// Search the sprig for keyd under the sprig lock.
+//
+// Returns 0 if found, -1 if not found.
+int
+as_index_sprig_exists(as_index_sprig *isprig, cf_digest *keyd)
+{
+	cf_mutex_lock(&isprig->pair->lock);
+
+	int rv = as_index_sprig_search_lockless(isprig, keyd, NULL, NULL);
+
+	cf_mutex_unlock(&isprig->pair->lock);
+
+	return rv;
+}
+
+
+// Search the sprig for keyd and, if found, reserve the element and take its
+// record lock (unless index_ref->skip_lock is set).
+//
+// Returns 0 if found (index_ref->r and r_h filled in), -1 if not found or if
+// the element found is not a valid record.
+int
+as_index_sprig_get_vlock(as_index_sprig *isprig, cf_digest *keyd,
+		as_index_ref *index_ref)
+{
+	cf_mutex_lock(&isprig->pair->lock);
+
+	int rv = as_index_sprig_search_lockless(isprig, keyd, &index_ref->r,
+			&index_ref->r_h);
+
+	if (rv != 0) {
+		cf_mutex_unlock(&isprig->pair->lock);
+		return rv;
+	}
+
+	// Reserve while still under the sprig lock.
+	as_index_reserve(index_ref->r);
+
+	cf_mutex_unlock(&isprig->pair->lock);
+
+	if (! index_ref->skip_lock) {
+		olock_vlock(g_record_locks, keyd, &index_ref->olock);
+	}
+
+	// Treat record as not found if it's "half created" or deleted.
+	if (as_index_sprig_invalid_record_done(isprig, index_ref)) {
+		return -1;
+	}
+
+	return 0;
+}
+
+
+// Find the element with digest keyd in the sprig, or create and insert one.
+// Either way the element is reserved and (unless index_ref->skip_lock is set)
+// record-locked on return.
+//
+// Returns:
+//		 1 - created and inserted
+//		 0 - found already existing
+//		-1 - arena allocation failed
+//		-2 - found an element that is not a valid record
+int
+as_index_sprig_get_insert_vlock(as_index_sprig *isprig, cf_digest *keyd,
+		as_index_ref *index_ref)
+{
+	int cmp = 0;
+	bool retry;
+
+	// Use a stack as_index object for the root's parent, for convenience.
+	as_index root_parent;
+
+	// Save parents as we search for the specified element's insertion point.
+	// NOTE(review): the stack holds the root parent, one entry per level
+	// descended, and the new element - nothing here bounds the depth to 64.
+	as_index_ele eles[64]; // FIXME - increase this appropriately
+	as_index_ele *ele;
+
+	do {
+		ele = eles;
+
+		cf_mutex_lock(&isprig->pair->lock);
+
+		// Search for the specified element, or a parent to insert it under.
+
+		root_parent.left_h = isprig->sprig->root_h;
+		root_parent.color = AS_BLACK;
+
+		ele->parent = NULL; // we'll never look this far up
+		ele->me_h = 0; // root parent has no handle, never used
+		ele->me = &root_parent;
+
+		cf_arenax_handle t_h = isprig->sprig->root_h;
+		as_index *t = RESOLVE_H(t_h);
+
+		while (t_h != SENTINEL_H) {
+			ele++;
+			ele->parent = ele - 1;
+			ele->me_h = t_h;
+			ele->me = t;
+
+			_mm_prefetch(t, _MM_HINT_NTA);
+
+			if ((cmp = cf_digest_compare(keyd, &t->keyd)) == 0) {
+				// The element already exists, simply return it.
+
+				as_index_reserve(t);
+
+				cf_mutex_unlock(&isprig->pair->lock);
+
+				if (! index_ref->skip_lock) {
+					olock_vlock(g_record_locks, keyd, &index_ref->olock);
+				}
+
+				index_ref->r = t;
+				index_ref->r_h = t_h;
+
+				// Fail if the record is "half created" or deleted.
+				if (as_index_sprig_invalid_record_done(isprig, index_ref)) {
+					return -2;
+				}
+
+				return 0;
+			}
+
+			// Positive compare results descend left, others descend right.
+			t_h = cmp > 0 ? t->left_h : t->right_h;
+			t = RESOLVE_H(t_h);
+		}
+
+		// We didn't find the tree element, so we'll be inserting it.
+
+		retry = false;
+
+		if (! cf_mutex_trylock(&isprig->pair->reduce_lock)) {
+			// The tree is being reduced - could take long, unlock so reads and
+			// overwrites aren't blocked.
+			cf_mutex_unlock(&isprig->pair->lock);
+
+			// Wait until the tree reduce is done...
+			cf_mutex_lock(&isprig->pair->reduce_lock);
+			cf_mutex_unlock(&isprig->pair->reduce_lock);
+
+			// ... and start over - we unlocked, so the tree may have changed.
+			retry = true;
+		}
+	} while (retry);
+
+	// Both the sprig lock and the reduce lock are held from here on.
+
+	// Create a new element and insert it.
+
+	// Save the root so we can detect whether it changes.
+	cf_arenax_handle old_root = isprig->sprig->root_h;
+
+	// Make the new element.
+	cf_arenax_handle n_h = cf_arenax_alloc(isprig->arena);
+
+	if (n_h == 0) {
+		cf_warning(AS_INDEX, "arenax alloc failed");
+		cf_mutex_unlock(&isprig->pair->reduce_lock);
+		cf_mutex_unlock(&isprig->pair->lock);
+		return -1;
+	}
+
+	as_index *n = RESOLVE_H(n_h);
+
+	n->rc = 2; // one for create (eventually balanced by delete), one for caller
+
+	n->keyd = *keyd;
+
+	n->left_h = n->right_h = SENTINEL_H; // n starts as a leaf element
+	n->color = AS_RED; // n's color starts as red
+
+	// Make sure we can detect that the record isn't initialized.
+	as_index_clear_record_info(n);
+
+	// Insert the new element n under parent ele.
+	if (ele->me == &root_parent || 0 < cmp) {
+		ele->me->left_h = n_h;
+	}
+	else {
+		ele->me->right_h = n_h;
+	}
+
+	ele++;
+	ele->parent = ele - 1;
+	ele->me_h = n_h;
+	ele->me = n;
+
+	// Rebalance the sprig as needed.
+	as_index_sprig_insert_rebalance(isprig, &root_parent, ele);
+
+	// If insertion caused the root to change, save the new root.
+	if (root_parent.left_h != old_root) {
+		isprig->sprig->root_h = root_parent.left_h;
+	}
+
+	isprig->sprig->n_elements++;
+
+	cf_mutex_unlock(&isprig->pair->reduce_lock);
+	cf_mutex_unlock(&isprig->pair->lock);
+
+	if (! index_ref->skip_lock) {
+		olock_vlock(g_record_locks, keyd, &index_ref->olock);
+	}
+
+	index_ref->r = n;
+	index_ref->r_h = n_h;
+
+	return 1;
+}
+
+
+// Find the element with digest keyd in the sprig, unlink it from the tree,
+// flag it as deleted, and release the tree's reference to it.
+//
+// Returns 0 if found and deleted, -1 if not found.
+int
+as_index_sprig_delete(as_index_sprig *isprig, cf_digest *keyd)
+{
+	as_index *r;
+	cf_arenax_handle r_h;
+	bool retry;
+
+	// Use a stack as_index object for the root's parent, for convenience.
+	as_index root_parent;
+
+	// Save parents as we search for the specified element (or its successor).
+	// NOTE(review): as for insert, nothing here bounds the depth to the size
+	// of this stack.
+	as_index_ele eles[(64 * 2) + 3]; // FIXME - increase this appropriately
+	as_index_ele *ele;
+
+	do {
+		ele = eles;
+
+		cf_mutex_lock(&isprig->pair->lock);
+
+		root_parent.left_h = isprig->sprig->root_h;
+		root_parent.color = AS_BLACK;
+
+		ele->parent = NULL; // we'll never look this far up
+		ele->me_h = 0; // root parent has no handle, never used
+		ele->me = &root_parent;
+
+		r_h = isprig->sprig->root_h;
+		r = RESOLVE_H(r_h);
+
+		while (r_h != SENTINEL_H) {
+			ele++;
+			ele->parent = ele - 1;
+			ele->me_h = r_h;
+			ele->me = r;
+
+			_mm_prefetch(r, _MM_HINT_NTA);
+
+			int cmp = cf_digest_compare(keyd, &r->keyd);
+
+			if (cmp == 0) {
+				break; // found, we'll be deleting it
+			}
+
+			r_h = cmp > 0 ? r->left_h : r->right_h;
+			r = RESOLVE_H(r_h);
+		}
+
+		if (r_h == SENTINEL_H) {
+			cf_mutex_unlock(&isprig->pair->lock);
+			return -1; // not found, nothing to delete
+		}
+
+		// We found the tree element, so we'll be deleting it.
+
+		retry = false;
+
+		if (! cf_mutex_trylock(&isprig->pair->reduce_lock)) {
+			// The tree is being reduced - could take long, unlock so reads and
+			// overwrites aren't blocked.
+			cf_mutex_unlock(&isprig->pair->lock);
+
+			// Wait until the tree reduce is done...
+			cf_mutex_lock(&isprig->pair->reduce_lock);
+			cf_mutex_unlock(&isprig->pair->reduce_lock);
+
+			// ... and start over - we unlocked, so the tree may have changed.
+			retry = true;
+		}
+	} while (retry);
+
+	// Both the sprig lock and the reduce lock are held from here on.
+
+	// Delete the element.
+
+	// Save the root so we can detect whether it changes.
+	cf_arenax_handle old_root = isprig->sprig->root_h;
+
+	// Snapshot the element to delete, r. (Already have r_h and r shortcuts.)
+	as_index_ele *r_e = ele;
+
+	if (r->left_h != SENTINEL_H && r->right_h != SENTINEL_H) {
+		// Search down for a "successor"...
+
+		// ... one step right, then left as far as possible.
+		ele++;
+		ele->parent = ele - 1;
+		ele->me_h = r->right_h;
+		ele->me = RESOLVE_H(ele->me_h);
+
+		while (ele->me->left_h != SENTINEL_H) {
+			ele++;
+			ele->parent = ele - 1;
+			ele->me_h = ele->parent->me->left_h;
+			ele->me = RESOLVE_H(ele->me_h);
+		}
+	}
+	// else ele is left at r, i.e. s == r
+
+	// Snapshot the successor, s. (Note - s could be r.)
+	as_index_ele *s_e = ele;
+	cf_arenax_handle s_h = s_e->me_h;
+	as_index *s = s_e->me;
+
+	// Get the appropriate child of s. (Note - child could be sentinel.)
+	ele++;
+
+	if (s->left_h == SENTINEL_H) {
+		ele->me_h = s->right_h;
+	}
+	else {
+		ele->me_h = s->left_h;
+	}
+
+	ele->me = RESOLVE_H(ele->me_h);
+
+	// Cut s (remember, it could be r) out of the tree.
+	ele->parent = s_e->parent;
+
+	if (s_h == s_e->parent->me->left_h) {
+		s_e->parent->me->left_h = ele->me_h;
+	}
+	else {
+		s_e->parent->me->right_h = ele->me_h;
+	}
+
+	// Rebalance at ele if necessary. (Note - if r != s, r is in the tree, and
+	// its parent may change during rebalancing.)
+	if (s->color == AS_BLACK) {
+		as_index_sprig_delete_rebalance(isprig, &root_parent, ele);
+	}
+
+	if (s != r) {
+		// s was a successor distinct from r, put it in r's place in the tree.
+		s->left_h = r->left_h;
+		s->right_h = r->right_h;
+		s->color = r->color;
+
+		if (r_h == r_e->parent->me->left_h) {
+			r_e->parent->me->left_h = s_h;
+		}
+		else {
+			r_e->parent->me->right_h = s_h;
+		}
+	}
+
+	// If delete caused the root to change, save the new root.
+	if (root_parent.left_h != old_root) {
+		isprig->sprig->root_h = root_parent.left_h;
+	}
+
+	// Flag record as deleted.
+	as_index_invalidate_record(r);
+
+	// We may now destroy r, which is no longer in the sprig.
+	as_index_sprig_done(isprig, r, r_h);
+
+	isprig->sprig->n_elements--;
+
+	cf_mutex_unlock(&isprig->pair->reduce_lock);
+	cf_mutex_unlock(&isprig->pair->lock);
+
+	return 0;
+}
+
+
+//==========================================================
+// Local helpers - search/rebalance a sprig.
+//
+
+// Search the sprig for keyd without taking any lock - callers in this file
+// hold the sprig lock. On a match, the element pointer and handle are written
+// to *ret and *ret_h, each only if non-NULL.
+//
+// Returns 0 if found, -1 if not found.
+int
+as_index_sprig_search_lockless(as_index_sprig *isprig, cf_digest *keyd,
+		as_index **ret, cf_arenax_handle *ret_h)
+{
+	cf_arenax_handle r_h = isprig->sprig->root_h;
+	as_index *r = RESOLVE_H(r_h);
+
+	while (r_h != SENTINEL_H) {
+		_mm_prefetch(r, _MM_HINT_NTA);
+
+		int cmp = cf_digest_compare(keyd, &r->keyd);
+
+		if (cmp == 0) {
+			if (ret_h) {
+				*ret_h = r_h;
+			}
+
+			if (ret) {
+				*ret = r;
+			}
+
+			return 0; // found
+		}
+
+		r_h = cmp > 0 ? r->left_h : r->right_h;
+		r = RESOLVE_H(r_h);
+	}
+
+	return -1; // not found
+}
+
+
+// Restore the red-black properties after inserting the (red) element at the
+// top of the parent stack, ele. Works back up the stack while the parent is
+// red, recoloring and rotating, and finishes by coloring the root black.
+// root_parent is the caller's stand-in for the root's parent.
+void
+as_index_sprig_insert_rebalance(as_index_sprig *isprig, as_index *root_parent,
+		as_index_ele *ele)
+{
+	// Entering here, ele is the last element on the stack. It turns out during
+	// insert rebalancing we won't ever need new elements on the stack, but make
+	// this resemble delete rebalance - define r_e to go back up the tree.
+	as_index_ele *r_e = ele;
+	as_index_ele *parent_e = r_e->parent;
+
+	while (parent_e->me->color == AS_RED) {
+		as_index_ele *grandparent_e = parent_e->parent;
+
+		if (r_e->parent->me_h == grandparent_e->me->left_h) {
+			// Element u is r's 'uncle'.
+			cf_arenax_handle u_h = grandparent_e->me->right_h;
+			as_index *u = RESOLVE_H(u_h);
+
+			if (u->color == AS_RED) {
+				u->color = AS_BLACK;
+				parent_e->me->color = AS_BLACK;
+				grandparent_e->me->color = AS_RED;
+
+				// Move up two layers - r becomes old r's grandparent.
+				r_e = parent_e->parent;
+				parent_e = r_e->parent;
+			}
+			else {
+				if (r_e->me_h == parent_e->me->right_h) {
+					// Save original r, which will become new r's parent.
+					as_index_ele *r0_e = r_e;
+
+					// Move up one layer - r becomes old r's parent.
+					r_e = parent_e;
+
+					// Then rotate r back down a layer.
+					as_index_rotate_left(r_e, r0_e);
+
+					parent_e = r_e->parent;
+					// Note - grandparent_e is unchanged.
+				}
+
+				parent_e->me->color = AS_BLACK;
+				grandparent_e->me->color = AS_RED;
+
+				// r and parent move up a layer as grandparent rotates down.
+				as_index_rotate_right(grandparent_e, parent_e);
+			}
+		}
+		else {
+			// Mirror image of the case above - parent is a right child.
+
+			// Element u is r's 'uncle'.
+			cf_arenax_handle u_h = grandparent_e->me->left_h;
+			as_index *u = RESOLVE_H(u_h);
+
+			if (u->color == AS_RED) {
+				u->color = AS_BLACK;
+				parent_e->me->color = AS_BLACK;
+				grandparent_e->me->color = AS_RED;
+
+				// Move up two layers - r becomes old r's grandparent.
+				r_e = parent_e->parent;
+				parent_e = r_e->parent;
+			}
+			else {
+				if (r_e->me_h == parent_e->me->left_h) {
+					// Save original r, which will become new r's parent.
+					as_index_ele *r0_e = r_e;
+
+					// Move up one layer - r becomes old r's parent.
+					r_e = parent_e;
+
+					// Then rotate r back down a layer.
+					as_index_rotate_right(r_e, r0_e);
+
+					parent_e = r_e->parent;
+					// Note - grandparent_e is unchanged.
+				}
+
+				parent_e->me->color = AS_BLACK;
+				grandparent_e->me->color = AS_RED;
+
+				// r and parent move up a layer as grandparent rotates down.
+				as_index_rotate_left(grandparent_e, parent_e);
+			}
+		}
+	}
+
+	RESOLVE_H(root_parent->left_h)->color = AS_BLACK;
+}
+
+
+// Restore the red-black properties after a black element was cut out of the
+// tree. ele is the top of the parent stack - the child that took the removed
+// element's place (possibly the sentinel). root_parent is the caller's
+// stand-in for the root's parent.
+void
+as_index_sprig_delete_rebalance(as_index_sprig *isprig, as_index *root_parent,
+		as_index_ele *ele)
+{
+	// Entering here, ele is the last element on the stack. It's possible as r_e
+	// crawls up the tree, we'll need new elements on the stack, in which case
+	// ele keeps building the stack down while r_e goes up.
+	as_index_ele *r_e = ele;
+
+	while (r_e->me->color == AS_BLACK && r_e->me_h != root_parent->left_h) {
+		as_index *r_parent = r_e->parent->me;
+
+		if (r_e->me_h == r_parent->left_h) {
+			// Element s is r's sibling.
+			cf_arenax_handle s_h = r_parent->right_h;
+			as_index *s = RESOLVE_H(s_h);
+
+			if (s->color == AS_RED) {
+				s->color = AS_BLACK;
+				r_parent->color = AS_RED;
+
+				ele++;
+				// ele->parent will be set by rotation.
+				ele->me_h = s_h;
+				ele->me = s;
+
+				as_index_rotate_left(r_e->parent, ele);
+
+				s_h = r_parent->right_h;
+				s = RESOLVE_H(s_h);
+			}
+
+			as_index *s_left = RESOLVE_H(s->left_h);
+			as_index *s_right = RESOLVE_H(s->right_h);
+
+			if (s_left->color == AS_BLACK && s_right->color == AS_BLACK) {
+				s->color = AS_RED;
+
+				// Move up one layer and go around again.
+				r_e = r_e->parent;
+			}
+			else {
+				if (s_right->color == AS_BLACK) {
+					s_left->color = AS_BLACK;
+					s->color = AS_RED;
+
+					ele++;
+					ele->parent = r_e->parent;
+					ele->me_h = s_h;
+					ele->me = s;
+
+					as_index_ele *s_e = ele;
+
+					ele++;
+					// ele->parent will be set by rotation.
+					ele->me_h = s->left_h;
+					ele->me = s_left;
+
+					as_index_rotate_right(s_e, ele);
+
+					s_h = r_parent->right_h;
+					s = s_left; // same as RESOLVE_H(s_h)
+				}
+
+				s->color = r_parent->color;
+				r_parent->color = AS_BLACK;
+				RESOLVE_H(s->right_h)->color = AS_BLACK;
+
+				ele++;
+				// ele->parent will be set by rotation.
+				ele->me_h = s_h;
+				ele->me = s;
+
+				as_index_rotate_left(r_e->parent, ele);
+
+				RESOLVE_H(root_parent->left_h)->color = AS_BLACK;
+
+				return;
+			}
+		}
+		else {
+			// Mirror image of the case above - r is a right child.
+			cf_arenax_handle s_h = r_parent->left_h;
+			as_index *s = RESOLVE_H(s_h);
+
+			if (s->color == AS_RED) {
+				s->color = AS_BLACK;
+				r_parent->color = AS_RED;
+
+				ele++;
+				// ele->parent will be set by rotation.
+				ele->me_h = s_h;
+				ele->me = s;
+
+				as_index_rotate_right(r_e->parent, ele);
+
+				s_h = r_parent->left_h;
+				s = RESOLVE_H(s_h);
+			}
+
+			as_index *s_left = RESOLVE_H(s->left_h);
+			as_index *s_right = RESOLVE_H(s->right_h);
+
+			if (s_left->color == AS_BLACK && s_right->color == AS_BLACK) {
+				s->color = AS_RED;
+
+				r_e = r_e->parent;
+			}
+			else {
+				if (s_left->color == AS_BLACK) {
+					s_right->color = AS_BLACK;
+					s->color = AS_RED;
+
+					ele++;
+					ele->parent = r_e->parent;
+					ele->me_h = s_h;
+					ele->me = s;
+
+					as_index_ele *s_e = ele;
+
+					ele++;
+					// ele->parent will be set by rotation.
+					ele->me_h = s->right_h;
+					ele->me = s_right;
+
+					as_index_rotate_left(s_e, ele);
+
+					s_h = r_parent->left_h;
+					s = s_right; // same as RESOLVE_H(s_h)
+				}
+
+				s->color = r_parent->color;
+				r_parent->color = AS_BLACK;
+				RESOLVE_H(s->left_h)->color = AS_BLACK;
+
+				ele++;
+				// ele->parent will be set by rotation.
+				ele->me_h = s_h;
+				ele->me = s;
+
+				as_index_rotate_right(r_e->parent, ele);
+
+				RESOLVE_H(root_parent->left_h)->color = AS_BLACK;
+
+				return;
+			}
+		}
+	}
+
+	r_e->me->color = AS_BLACK;
+}
+
+
+// Rotate left around a: b (a's right child) takes a's place under a's parent,
+// and a becomes b's left child. Updates both the tree links and the parent
+// pointers of the two stack entries.
+void
+as_index_rotate_left(as_index_ele *a, as_index_ele *b)
+{
+	// Element b is element a's right child - a will become b's left child.
+
+	/*        p      -->      p
+	 *        |               |
+	 *        a               b
+	 *       / \             / \
+	 *     [x]  b           a  [y]
+	 *         / \         / \
+	 *        c  [y]     [x]  c
+	 */
+
+	// Set a's right child to c, b's former left child.
+	a->me->right_h = b->me->left_h;
+
+	// Set p's left or right child (whichever a was) to b.
+	if (a->me_h == a->parent->me->left_h) {
+		a->parent->me->left_h = b->me_h;
+	}
+	else {
+		a->parent->me->right_h = b->me_h;
+	}
+
+	// Set b's parent to p, a's old parent.
+	b->parent = a->parent;
+
+	// Set b's left child to a, and a's parent to b.
+	b->me->left_h = a->me_h;
+	a->parent = b;
+}
+
+
+// Rotate right around a: b (a's left child) takes a's place under a's parent,
+// and a becomes b's right child. Updates both the tree links and the parent
+// pointers of the two stack entries.
+void
+as_index_rotate_right(as_index_ele *a, as_index_ele *b)
+{
+	// Element b is element a's left child - a will become b's right child.
+
+	/*        p      -->      p
+	 *        |               |
+	 *        a               b
+	 *       / \             / \
+	 *      b  [x]         [y]  a
+	 *     / \                 / \
+	 *   [y]  c               c  [x]
+	 */
+
+	// Set a's left child to c, b's former right child.
+	a->me->left_h = b->me->right_h;
+
+	// Set p's left or right child (whichever a was) to b.
+	if (a->me_h == a->parent->me->left_h) {
+		a->parent->me->left_h = b->me_h;
+	}
+	else {
+		a->parent->me->right_h = b->me_h;
+	}
+
+	// Set b's parent to p, a's old parent.
+	b->parent = a->parent;
+
+	// Set b's right child to a, and a's parent to b.
+	b->me->right_h = a->me_h;
+	a->parent = b;
+}
diff --git a/as/src/base/index_ce.c b/as/src/base/index_ce.c
new file mode 100644
index 00000000..faf94500
--- /dev/null
+++ b/as/src/base/index_ce.c
@@ -0,0 +1,67 @@
+/*
+ * index_ce.c
+ *
+ * Copyright (C) 2016 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program.
+ * If not, see http://www.gnu.org/licenses/
+ */
+
+//==========================================================
+// Includes.
+//
+
+#include "base/index.h"
+
+#include "arenax.h"
+#include "fault.h"
+
+#include "base/datamodel.h"
+
+
+//==========================================================
+// Public API.
+//
+
+// Community-edition stand-in: reaching this function is treated as a fatal
+// error via cf_crash(). None of the parameters are used.
+// NOTE(review): cf_crash() presumably does not return, in which case the
+// trailing 'return NULL' only satisfies the function's return type - confirm.
+as_index_tree *
+as_index_tree_resume(as_index_tree_shared *shared, cf_arenax *arena,
+		as_treex *treex)
+{
+	cf_crash(AS_INDEX, "CE code called as_index_tree_resume()");
+	return NULL;
+}
+
+
+// Community-edition stand-in: deliberately does nothing with 'tree' or 'treex'.
+void
+as_index_tree_shutdown(as_index_tree *tree, as_treex *treex)
+{
+	// For enterprise version only.
+}
+
+
+// Community-edition version: forwards all arguments unchanged to
+// as_index_reduce().
+void
+as_index_reduce_live(as_index_tree *tree, as_index_reduce_fn cb, void *udata)
+{
+	as_index_reduce(tree, cb, udata);
+}
+
+
+// Community-edition version: forwards all arguments unchanged to
+// as_index_reduce_partial().
+void
+as_index_reduce_partial_live(as_index_tree *tree, uint64_t sample_count,
+		as_index_reduce_fn cb, void *udata)
+{
+	as_index_reduce_partial(tree, sample_count, cb, udata);
+}
diff --git a/as/src/base/job_manager.c b/as/src/base/job_manager.c
new file mode 100644
index 00000000..87fe0a94
--- /dev/null
+++ b/as/src/base/job_manager.c
@@ -0,0 +1,806 @@
+/*
+ * job_manager.c
+ *
+ * Copyright (C) 2015 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program.
If not, see http://www.gnu.org/licenses/ + */ + +//============================================================================== +// Includes. +// + +#include "base/job_manager.h" + +#include +#include +#include +#include +#include +#include + +#include "aerospike/as_string.h" +#include "citrusleaf/alloc.h" +#include "citrusleaf/cf_atomic.h" +#include "citrusleaf/cf_clock.h" +#include "citrusleaf/cf_queue.h" +#include "citrusleaf/cf_queue_priority.h" + +#include "fault.h" + +#include "base/cfg.h" +#include "base/datamodel.h" +#include "base/monitor.h" +#include "fabric/partition.h" + + +//============================================================================== +// Globals. +// + +static cf_atomic32 g_job_trid = 0; + + + +//============================================================================== +// Non-class-specific utilities. +// + +static inline uint64_t +job_trid(uint64_t trid) +{ + return trid != 0 ? trid : (uint64_t)cf_atomic32_incr(&g_job_trid); +} + +static inline const char* +job_result_str(int result_code) +{ + switch (result_code) { + case 0: + return "ok"; + case AS_JOB_FAIL_UNKNOWN: + return "abandoned-unknown"; + case AS_JOB_FAIL_CLUSTER_KEY: + return "abandoned-cluster-key"; + case AS_JOB_FAIL_USER_ABORT: + return "user-aborted"; + case AS_JOB_FAIL_RESPONSE_ERROR: + return "abandoned-response-error"; + case AS_JOB_FAIL_RESPONSE_TIMEOUT: + return "abandoned-response-timeout"; + default: + return "abandoned-?"; + } +} + +static inline int +safe_priority(int priority) { + // Handles priority 0, the 'auto' priority. + return priority < AS_JOB_PRIORITY_LOW || priority > AS_JOB_PRIORITY_HIGH ? + AS_JOB_PRIORITY_MEDIUM : priority; +} + + + +//============================================================================== +// as_priority_thread_pool class implementation. +// TODO - move to common. +// + +//---------------------------------------------------------- +// as_priority_thread_pool typedefs and forward declarations. 
+//
+
+// One entry on the dispatch queue. An entry whose task_fn is NULL is the
+// terminator that tells a worker thread to exit.
+typedef struct queue_task_s {
+	as_priority_thread_pool_task_fn task_fn;
+	void* task;
+} queue_task;
+
+uint32_t create_threads(as_priority_thread_pool* pool, uint32_t count);
+void shutdown_threads(as_priority_thread_pool* pool, uint32_t count);
+void* run_pool_thread(void* udata);
+int compare_cb(void* buf, void* task);
+
+//----------------------------------------------------------
+// as_priority_thread_pool public API.
+//
+
+// Initialize the pool's lock and queues and start n_threads detached workers.
+// Returns true only if every requested thread was created; pool->n_threads
+// records how many actually were.
+bool
+as_priority_thread_pool_init(as_priority_thread_pool* pool, uint32_t n_threads)
+{
+	pthread_mutex_init(&pool->lock, NULL);
+
+	// Initialize queues.
+	pool->dispatch_queue = cf_queue_priority_create(sizeof(queue_task), true);
+	pool->complete_queue = cf_queue_create(sizeof(uint32_t), true);
+
+	// Start detached threads.
+	pool->n_threads = create_threads(pool, n_threads);
+
+	return pool->n_threads == n_threads;
+}
+
+// Stop every worker (blocking until each has reported completion), then
+// destroy the queues and the lock.
+void
+as_priority_thread_pool_shutdown(as_priority_thread_pool* pool)
+{
+	shutdown_threads(pool, pool->n_threads);
+	cf_queue_priority_destroy(pool->dispatch_queue);
+	cf_queue_destroy(pool->complete_queue);
+	pthread_mutex_destroy(&pool->lock);
+}
+
+// Grow or shrink the pool to n_threads workers, under the pool lock. Shrinking
+// blocks until the excess workers have exited. Returns false only when growing
+// and fewer threads than requested could be created.
+bool
+as_priority_thread_pool_resize(as_priority_thread_pool* pool,
+		uint32_t n_threads)
+{
+	pthread_mutex_lock(&pool->lock);
+
+	bool result = true;
+
+	if (n_threads != pool->n_threads) {
+		if (n_threads < pool->n_threads) {
+			// Shutdown excess threads.
+			shutdown_threads(pool, pool->n_threads - n_threads);
+			pool->n_threads = n_threads;
+		}
+		else {
+			// Start new detached threads.
+			pool->n_threads += create_threads(pool,
+					n_threads - pool->n_threads);
+			result = pool->n_threads == n_threads;
+		}
+	}
+
+	pthread_mutex_unlock(&pool->lock);
+
+	return result;
+}
+
+// Queue task_fn(task) for execution at the given priority. Returns true if the
+// push onto the dispatch queue succeeded.
+bool
+as_priority_thread_pool_queue_task(as_priority_thread_pool* pool,
+		as_priority_thread_pool_task_fn task_fn, void* task, int priority)
+{
+	queue_task qtask = { task_fn, task };
+
+	return cf_queue_priority_push(pool->dispatch_queue, &qtask, priority) ==
+			CF_QUEUE_OK;
+}
+
+// Try to pull a not-yet-started entry for 'task' off the dispatch queue.
+// Returns true if one was found (detected by qtask being filled in with a
+// non-NULL task pointer).
+bool
+as_priority_thread_pool_remove_task(as_priority_thread_pool* pool, void* task)
+{
+	queue_task qtask = { NULL, NULL };
+
+	cf_queue_priority_reduce_pop(pool->dispatch_queue, &qtask, compare_cb,
+			task);
+
+	return qtask.task != NULL;
+}
+
+// Move a queued entry for 'task', if any, to new_priority.
+void
+as_priority_thread_pool_change_task_priority(as_priority_thread_pool* pool,
+		void* task, int new_priority)
+{
+	cf_queue_priority_reduce_change(pool->dispatch_queue, new_priority,
+			compare_cb, task);
+}
+
+//----------------------------------------------------------
+// as_priority_thread_pool utilities.
+//
+
+// Start up to 'count' detached worker threads running run_pool_thread().
+// Returns the number of threads actually created.
+uint32_t
+create_threads(as_priority_thread_pool* pool, uint32_t count)
+{
+	pthread_attr_t attrs;
+
+	pthread_attr_init(&attrs);
+	pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED);
+
+	uint32_t n_threads_created = 0;
+	pthread_t thread;
+
+	for (uint32_t i = 0; i < count; i++) {
+		if (pthread_create(&thread, &attrs, run_pool_thread, pool) == 0) {
+			n_threads_created++;
+		}
+	}
+
+	// Every pthread_attr_init() must be paired with a pthread_attr_destroy().
+	// Threads already created are unaffected by destroying the attributes.
+	pthread_attr_destroy(&attrs);
+
+	return n_threads_created;
+}
+
+// Stop 'count' worker threads and block until each has acknowledged.
+void
+shutdown_threads(as_priority_thread_pool* pool, uint32_t count)
+{
+	// Send terminator tasks to kill 'count' threads.
+	queue_task task = { NULL, NULL };
+
+	for (uint32_t i = 0; i < count; i++) {
+		cf_queue_priority_push(pool->dispatch_queue, &task,
+				CF_QUEUE_PRIORITY_HIGH);
+	}
+
+	// Wait till threads finish.
+	uint32_t complete;
+
+	for (uint32_t i = 0; i < count; i++) {
+		cf_queue_pop(pool->complete_queue, &complete, CF_QUEUE_FOREVER);
+	}
+}
+
+// Worker thread body: run queued tasks until a terminator entry arrives (or the
+// pop fails), then post one completion event for shutdown_threads() to consume.
+void*
+run_pool_thread(void* udata)
+{
+	as_priority_thread_pool* pool = (as_priority_thread_pool*)udata;
+	queue_task qtask;
+
+	// Retrieve tasks from queue and execute.
+	while (cf_queue_priority_pop(pool->dispatch_queue, &qtask,
+			CF_QUEUE_FOREVER) == CF_QUEUE_OK) {
+		// A null task indicates thread should be shut down.
+		if (! qtask.task_fn) {
+			break;
+		}
+
+		// Run task.
+		qtask.task_fn(qtask.task);
+	}
+
+	// Send thread completion event back to caller.
+	uint32_t complete = 1;
+
+	cf_queue_push(pool->complete_queue, &complete);
+
+	return NULL;
+}
+
+// Queue-reduce callback: yields -1 for the entry whose task pointer equals
+// 'task', 0 for every other entry.
+// NOTE(review): -1 is presumably the reduce convention for "match, stop here" -
+// confirm against the cf_queue_priority reduce API.
+int
+compare_cb(void* buf, void* task)
+{
+	return ((queue_task*)buf)->task == task ? -1 : 0;
+}
+
+
+
+//==============================================================================
+// as_job base class implementation.
+//
+
+//----------------------------------------------------------
+// as_job typedefs and forward declarations.
+//
+
+static inline const char* as_job_safe_set_name(as_job* _job);
+static inline float as_job_progress(as_job* _job);
+int as_job_partition_reserve(as_job* _job, int pid, as_partition_reservation* rsv);
+
+//----------------------------------------------------------
+// as_job public API.
+//
+
+// Zero the whole as_job, then fill in the base-class fields. The vtable is
+// copied by value, a trid of 0 is replaced by a generated one (job_trid), and
+// the priority is normalized by safe_priority().
+void
+as_job_init(as_job* _job, const as_job_vtable* vtable,
+		as_job_manager* mgr, as_job_rsv_type rsv_type, uint64_t trid,
+		as_namespace* ns, uint16_t set_id, int priority)
+{
+	memset(_job, 0, sizeof(as_job));
+
+	_job->vtable = *vtable;
+	_job->mgr = mgr;
+	_job->rsv_type = rsv_type;
+	_job->trid = job_trid(trid);
+	_job->ns = ns;
+	_job->set_id = set_id;
+	_job->priority = safe_priority(priority);
+
+	pthread_mutex_init(&_job->requeue_lock, NULL);
+}
+
+// Thread pool task: process one partition of the job.
+//
+// Every exit path drops exactly one active reference via
+// as_job_active_release(). When more partitions remain, an extra reference is
+// taken and the job is requeued *before* this slice's work runs, so the next
+// slice may start on another thread while this one is still in slice_fn.
+void
+as_job_slice(void* task)
+{
+	as_job* _job = (as_job*)task;
+
+	int pid = _job->next_pid;
+	as_partition_reservation rsv;
+
+	// No partition could be reserved from next_pid onward - the job is done.
+	if ((pid = as_job_partition_reserve(_job, pid, &rsv)) == AS_PARTITIONS) {
+		_job->next_pid = AS_PARTITIONS;
+		as_job_active_release(_job);
+		return;
+	}
+
+	// requeue_lock makes "check abandoned, then requeue" atomic with respect to
+	// the abandon/abort paths, which set 'abandoned' and pull the queued task
+	// under the same lock.
+	pthread_mutex_lock(&_job->requeue_lock);
+
+	if (_job->abandoned != 0) {
+		pthread_mutex_unlock(&_job->requeue_lock);
+		as_partition_release(&rsv);
+		as_job_active_release(_job);
+		return;
+	}
+
+	if ((_job->next_pid = pid + 1) < AS_PARTITIONS) {
+		// The queued task holds its own active reference.
+		as_job_active_reserve(_job);
+		as_job_manager_requeue_job(_job->mgr, _job);
+	}
+
+	pthread_mutex_unlock(&_job->requeue_lock);
+
+	_job->vtable.slice_fn(_job, &rsv);
+
+	as_partition_release(&rsv);
+	as_job_active_release(_job);
+}
+
+// Run the derived class's finish hook, then hand the job to the manager's
+// finish bookkeeping.
+void
+as_job_finish(as_job* _job)
+{
+	_job->vtable.finish_fn(_job);
+	as_job_manager_finish_job(_job->mgr, _job);
+}
+
+// Run the derived class's destroy hook, destroy the requeue lock, and free the
+// job object itself.
+void
+as_job_destroy(as_job* _job)
+{
+	_job->vtable.destroy_fn(_job);
+
+	pthread_mutex_destroy(&_job->requeue_lock);
+	cf_free(_job);
+}
+
+// Fill 'stat' with the base-class view of the job, then let the derived class
+// add its own fields through info_mon_fn.
+//
+// A non-zero finish_ms marks the job as done. run_time is start-to-finish for
+// a done job and start-to-now otherwise; time_since_done is 0 while active.
+void
+as_job_info(as_job* _job, as_mon_jobstat* stat)
+{
+	uint64_t now = cf_getms();
+	bool done = _job->finish_ms != 0;
+	uint64_t since_start_ms = now - _job->start_ms;
+	uint64_t since_finish_ms = done ? now - _job->finish_ms : 0;
+	uint64_t active_ms = done ?
+			_job->finish_ms - _job->start_ms : since_start_ms;
+
+	stat->trid = _job->trid;
+	stat->priority = (uint32_t)_job->priority;
+	stat->progress_pct = as_job_progress(_job);
+	stat->run_time = active_ms;
+	stat->time_since_done = since_finish_ms;
+	stat->recs_read = cf_atomic64_get(_job->n_records_read);
+
+	// NOTE(review): unbounded copies - assumes stat->ns and stat->set are at
+	// least as large as any namespace / set name; confirm against monitor.h.
+	strcpy(stat->ns, _job->ns->name);
+	strcpy(stat->set, as_job_safe_set_name(_job));
+
+	// The longest text produced here is
+	// "active(abandoned-response-timeout)", well within 64 bytes.
+	char status[64];
+	sprintf(status, "%s(%s)", done ? "done" : "active",
+			job_result_str(_job->abandoned));
+	as_strncpy(stat->status, status, sizeof(stat->status));
+
+	_job->vtable.info_mon_fn(_job, stat);
+}
+
+// Take one active reference on the job.
+void
+as_job_active_reserve(as_job* _job)
+{
+	cf_atomic32_incr(&_job->active_rc);
+}
+
+// Drop one active reference; the caller that brings the count to zero finishes
+// the job.
+void
+as_job_active_release(as_job* _job)
+{
+	if (cf_atomic32_decr(&_job->active_rc) == 0) {
+		as_job_finish(_job);
+	}
+}
+
+//----------------------------------------------------------
+// as_job utilities.
+//
+
+// Set name for the job's set_id, or "" when the lookup returns NULL.
+static inline const char*
+as_job_safe_set_name(as_job* _job)
+{
+	const char* set_name = as_namespace_get_set_name(_job->ns, _job->set_id);
+
+	return set_name ? set_name : ""; // empty string means no set name displayed
+}
+
+// Progress as a percentage: next_pid out of AS_PARTITIONS.
+static inline float
+as_job_progress(as_job* _job)
+{
+	return ((float)(_job->next_pid * 100)) / (float)AS_PARTITIONS;
+}
+
+// Reserve a partition for the job, starting at 'pid', and return the partition
+// id that was reserved.
+//
+// RSV_WRITE: advance pid past every partition whose write reservation fails;
+// returns AS_PARTITIONS if none from 'pid' onward could be reserved.
+// RSV_MIGRATE: reserve 'pid' itself and return it unchanged.
+// Any other reservation type is treated as fatal.
+int
+as_job_partition_reserve(as_job* _job, int pid, as_partition_reservation* rsv)
+{
+	if (_job->rsv_type == RSV_WRITE) {
+		while (pid < AS_PARTITIONS && as_partition_reserve_write(_job->ns, pid,
+				rsv, NULL) != 0) {
+			pid++;
+		}
+	}
+	else if (_job->rsv_type == RSV_MIGRATE) {
+		as_partition_reserve(_job->ns, pid, rsv);
+	}
+	else {
+		cf_crash(AS_JOB, "bad job rsv type %d", _job->rsv_type);
+	}
+
+	return pid;
+}
+
+
+
+//==============================================================================
+// as_job_manager class implementation.
+//
+
+//----------------------------------------------------------
+// as_job_manager typedefs and forward declarations.
+//
+
+// udata for as_job_manager_find_cb(): the trid to look for, the matching job
+// (output), and whether the match should also be removed from the queue.
+typedef struct find_item_s {
+	uint64_t trid;
+	as_job* _job;
+	bool remove;
+} find_item;
+
+// udata for as_job_manager_info_cb(): cursor into the caller's job pointer
+// array, advanced once per queue element.
+typedef struct info_item_s {
+	as_job** p_job;
+} info_item;
+
+void as_job_manager_evict_finished_jobs(as_job_manager* mgr);
+int as_job_manager_find_cb(void* buf, void* udata);
+as_job* as_job_manager_find_job(cf_queue* jobs, uint64_t trid, bool remove);
+static inline as_job* as_job_manager_find_any(as_job_manager* mgr, uint64_t trid);
+static inline as_job* as_job_manager_find_active(as_job_manager* mgr, uint64_t trid);
+static inline as_job* as_job_manager_remove_active(as_job_manager* mgr, uint64_t trid);
+int as_job_manager_info_cb(void* buf, void* udata);
+
+//----------------------------------------------------------
+// as_job_manager public API.
+//
+
+// Set the limits, create the manager lock and the active / finished job queues,
+// and start the thread pool. A mutex or thread pool failure is fatal.
+// The two job queues are created with 'false' as the second argument and are
+// only accessed under mgr->lock in this file.
+void
+as_job_manager_init(as_job_manager* mgr, uint32_t max_active, uint32_t max_done,
+		uint32_t n_threads)
+{
+	mgr->max_active = max_active;
+	mgr->max_done = max_done;
+
+	if (pthread_mutex_init(&mgr->lock, NULL) != 0) {
+		cf_crash(AS_JOB, "job manager failed mutex init");
+	}
+
+	mgr->active_jobs = cf_queue_create(sizeof(as_job*), false);
+	mgr->finished_jobs = cf_queue_create(sizeof(as_job*), false);
+
+	if (! as_priority_thread_pool_init(&mgr->thread_pool, n_threads)) {
+		cf_crash(AS_JOB, "job manager failed thread pool init");
+	}
+}
+
+// Register a job as active and queue its first slice.
+//
+// Returns 0 on success, AS_JOB_FAIL_FORBIDDEN when max_active jobs are already
+// active, or AS_JOB_FAIL_PARAMETER when the trid is already in use. On failure
+// the job is not queued and no reference is taken.
+int
+as_job_manager_start_job(as_job_manager* mgr, as_job* _job)
+{
+	pthread_mutex_lock(&mgr->lock);
+
+	if (cf_queue_sz(mgr->active_jobs) >= mgr->max_active) {
+		cf_warning(AS_JOB, "max of %u jobs currently active", mgr->max_active);
+		pthread_mutex_unlock(&mgr->lock);
+		return AS_JOB_FAIL_FORBIDDEN;
+	}
+
+	// Make sure trid is unique.
+	// Note - the lookup covers finished jobs as well as active ones, although
+	// the warning text only says "active".
+	if (as_job_manager_find_any(mgr, _job->trid)) {
+		cf_warning(AS_JOB, "job with trid %lu already active", _job->trid);
+		pthread_mutex_unlock(&mgr->lock);
+		return AS_JOB_FAIL_PARAMETER;
+	}
+
+	_job->start_ms = cf_getms();
+	// This reference belongs to the queued first slice.
+	as_job_active_reserve(_job);
+	cf_queue_push(mgr->active_jobs, &_job);
+	as_priority_thread_pool_queue_task(&mgr->thread_pool, as_job_slice, _job,
+			_job->priority);
+
+	pthread_mutex_unlock(&mgr->lock);
+	return 0;
+}
+
+// Queue another slice of the job at its current priority. Takes no locks
+// itself - as_job_slice() calls it while holding the job's requeue_lock.
+void
+as_job_manager_requeue_job(as_job_manager* mgr, as_job* _job)
+{
+	as_priority_thread_pool_queue_task(&mgr->thread_pool, as_job_slice, _job,
+			_job->priority);
+}
+
+// Move a job from the active list to the finished list, stamp its finish time,
+// and evict (destroy) the oldest finished jobs beyond max_done.
+void
+as_job_manager_finish_job(as_job_manager* mgr, as_job* _job)
+{
+	pthread_mutex_lock(&mgr->lock);
+
+	as_job_manager_remove_active(mgr, _job->trid);
+	_job->finish_ms = cf_getms();
+	cf_queue_push(mgr->finished_jobs, &_job);
+	as_job_manager_evict_finished_jobs(mgr);
+
+	pthread_mutex_unlock(&mgr->lock);
+}
+
+// Mark the job abandoned with the given reason and pull its queued slice, if
+// any, off the thread pool. If a queued slice was removed, the active reference
+// it held is released here, after requeue_lock has been dropped.
+void
+as_job_manager_abandon_job(as_job_manager* mgr, as_job* _job, int reason)
+{
+	pthread_mutex_lock(&_job->requeue_lock);
+	_job->abandoned = reason;
+	bool found = as_priority_thread_pool_remove_task(&mgr->thread_pool, _job);
+	pthread_mutex_unlock(&_job->requeue_lock);
+
+	if (found) {
+		as_job_active_release(_job);
+	}
+}
+
+// Abort the active job with the given trid (reason AS_JOB_FAIL_USER_ABORT).
+// Returns false if no active job has that trid.
+//
+// Lock order is mgr->lock, then the job's requeue_lock. The release of a
+// removed slice's reference happens only after both locks are dropped, since
+// it may finish the job, and as_job_manager_finish_job() takes mgr->lock.
+bool
+as_job_manager_abort_job(as_job_manager* mgr, uint64_t trid)
+{
+	pthread_mutex_lock(&mgr->lock);
+
+	as_job* _job = as_job_manager_find_active(mgr, trid);
+
+	if (! _job) {
+		pthread_mutex_unlock(&mgr->lock);
+		return false;
+	}
+
+	pthread_mutex_lock(&_job->requeue_lock);
+	_job->abandoned = AS_JOB_FAIL_USER_ABORT;
+	bool found = as_priority_thread_pool_remove_task(&mgr->thread_pool, _job);
+	pthread_mutex_unlock(&_job->requeue_lock);
+
+	pthread_mutex_unlock(&mgr->lock);
+
+	if (found) {
+		as_job_active_release(_job);
+	}
+
+	return true;
+}
+
+// Abort every currently active job (reason AS_JOB_FAIL_USER_ABORT). Returns the
+// number of jobs that were active when the call was made.
+//
+// Jobs are first snapshotted and marked under mgr->lock; references held by
+// removed slices are released in a second pass after the lock is dropped, for
+// the same reason as in as_job_manager_abort_job().
+int
+as_job_manager_abort_all_jobs(as_job_manager* mgr)
+{
+	pthread_mutex_lock(&mgr->lock);
+
+	int n_jobs = cf_queue_sz(mgr->active_jobs);
+
+	if (n_jobs == 0) {
+		pthread_mutex_unlock(&mgr->lock);
+		return 0;
+	}
+
+	// Stack arrays sized by the active job count.
+	as_job* _jobs[n_jobs];
+	info_item item = { _jobs };
+
+	cf_queue_reduce(mgr->active_jobs, as_job_manager_info_cb, &item);
+
+	bool found[n_jobs];
+
+	for (int i = 0; i < n_jobs; i++) {
+		as_job* _job = _jobs[i];
+
+		pthread_mutex_lock(&_job->requeue_lock);
+		_job->abandoned = AS_JOB_FAIL_USER_ABORT;
+		found[i] = as_priority_thread_pool_remove_task(&mgr->thread_pool, _job);
+		pthread_mutex_unlock(&_job->requeue_lock);
+	}
+
+	pthread_mutex_unlock(&mgr->lock);
+
+	for (int i = 0; i < n_jobs; i++) {
+		if (found[i]) {
+			as_job_active_release(_jobs[i]);
+		}
+	}
+
+	return n_jobs;
+}
+
+// Change the priority of the active job with the given trid, both on the job
+// (normalized by safe_priority(), used for future requeues) and on any slice
+// already sitting in the thread pool queue. Returns false if no active job has
+// that trid.
+bool
+as_job_manager_change_job_priority(as_job_manager* mgr, uint64_t trid,
+		int priority)
+{
+	pthread_mutex_lock(&mgr->lock);
+
+	as_job* _job = as_job_manager_find_active(mgr, trid);
+
+	if (! _job) {
+		pthread_mutex_unlock(&mgr->lock);
+		return false;
+	}
+
+	pthread_mutex_lock(&_job->requeue_lock);
+	_job->priority = safe_priority(priority);
+	as_priority_thread_pool_change_task_priority(&mgr->thread_pool, _job,
+			_job->priority);
+	pthread_mutex_unlock(&_job->requeue_lock);
+
+	pthread_mutex_unlock(&mgr->lock);
+	return true;
+}
+
+// Set the cap on concurrently active jobs. Only affects later calls to
+// as_job_manager_start_job(); jobs already active are left alone.
+// NOTE(review): written without mgr->lock, while as_job_manager_start_job()
+// reads max_active under that lock - confirm this is intentional.
+void
+as_job_manager_limit_active_jobs(as_job_manager* mgr, uint32_t max_active)
+{
+	mgr->max_active = max_active;
+}
+
+// Set the cap on retained finished jobs and immediately evict any excess.
+void
+as_job_manager_limit_finished_jobs(as_job_manager* mgr, uint32_t max_done)
+{
+	pthread_mutex_lock(&mgr->lock);
+	mgr->max_done = max_done;
+	as_job_manager_evict_finished_jobs(mgr);
+	pthread_mutex_unlock(&mgr->lock);
+}
+
+// Resize the manager's thread pool. The pool's success/failure result is not
+// reported to the caller.
+void
+as_job_manager_resize_thread_pool(as_job_manager* mgr, uint32_t n_threads)
+{
+	as_priority_thread_pool_resize(&mgr->thread_pool, n_threads);
+}
+
+// Return a newly allocated, zero-initialized-then-filled as_mon_jobstat for the
+// active or finished job with the given trid, or NULL if there is none. The
+// caller owns (and must free) the result.
+as_mon_jobstat*
+as_job_manager_get_job_info(as_job_manager* mgr, uint64_t trid)
+{
+	pthread_mutex_lock(&mgr->lock);
+
+	as_job* _job = as_job_manager_find_any(mgr, trid);
+
+	if (! _job) {
+		pthread_mutex_unlock(&mgr->lock);
+		return NULL;
+	}
+
+	as_mon_jobstat* stat = cf_malloc(sizeof(as_mon_jobstat));
+
+	memset(stat, 0, sizeof(as_mon_jobstat));
+	as_job_info(_job, stat);
+
+	pthread_mutex_unlock(&mgr->lock);
+	return stat; // caller must free this
+}
+
+// Return a newly allocated array with one as_mon_jobstat per job - active jobs
+// first, then finished jobs - and store the element count in *size. Returns
+// NULL with *size == 0 when there are no jobs. The caller owns (and must free)
+// the array.
+as_mon_jobstat*
+as_job_manager_get_info(as_job_manager* mgr, int* size)
+{
+	*size = 0;
+
+	pthread_mutex_lock(&mgr->lock);
+
+	int n_jobs = cf_queue_sz(mgr->active_jobs) +
+			cf_queue_sz(mgr->finished_jobs);
+
+	if (n_jobs == 0) {
+		pthread_mutex_unlock(&mgr->lock);
+		return NULL;
+	}
+
+	// Stack array sized by the total job count; 'item' is a cursor into it
+	// that both reduce calls advance.
+	as_job* _jobs[n_jobs];
+	info_item item = { _jobs };
+
+	cf_queue_reduce_reverse(mgr->active_jobs, as_job_manager_info_cb, &item);
+	cf_queue_reduce_reverse(mgr->finished_jobs, as_job_manager_info_cb, &item);
+
+	size_t stats_size = sizeof(as_mon_jobstat) * n_jobs;
+	as_mon_jobstat* stats = cf_malloc(stats_size);
+
+	memset(stats, 0, stats_size);
+
+	for (int i = 0; i < n_jobs; i++) {
+		as_job_info(_jobs[i], &stats[i]);
+	}
+
+	pthread_mutex_unlock(&mgr->lock);
+
+	*size = n_jobs;
+	return stats; // caller must free this
+}
+
+// Number of jobs currently on the active list.
+int
+as_job_manager_get_active_job_count(as_job_manager* mgr)
+{
+	pthread_mutex_lock(&mgr->lock);
+	int n_jobs = cf_queue_sz(mgr->active_jobs);
+	pthread_mutex_unlock(&mgr->lock);
+
+	return n_jobs;
+}
+
+//----------------------------------------------------------
+// as_job_manager utilities.
+//
+
+// Pop and destroy finished jobs until at most max_done remain. Every caller in
+// this file holds mgr->lock.
+void
+as_job_manager_evict_finished_jobs(as_job_manager* mgr)
+{
+	int max_allowed = (int)mgr->max_done;
+
+	while (cf_queue_sz(mgr->finished_jobs) > max_allowed) {
+		as_job* _job;
+
+		// NOTE(review): the wait argument 0 is presumably "no wait"; the pop
+		// result is not checked - the size test above implies an element
+		// exists.
+		cf_queue_pop(mgr->finished_jobs, &_job, 0);
+		as_job_destroy(_job);
+	}
+}
+
+// Queue-reduce callback: on a trid match, record the job in the find_item and
+// return -2 if removal was requested, -1 otherwise; return 0 on no match.
+// NOTE(review): -1 / -2 are presumably the cf_queue_reduce() codes for "stop"
+// and "delete this element and stop" - confirm against cf_queue.h.
+int
+as_job_manager_find_cb(void* buf, void* udata)
+{
+	as_job* _job = *(as_job**)buf;
+	find_item* match = (find_item*)udata;
+
+	if (match->trid == _job->trid) {
+		match->_job = _job;
+		return match->remove ? -2 : -1;
+	}
+
+	return 0;
+}
+
+// Search 'jobs' for a job with the given trid, optionally removing it from the
+// queue. Returns the job, or NULL if not found.
+as_job*
+as_job_manager_find_job(cf_queue* jobs, uint64_t trid, bool remove)
+{
+	find_item item = { trid, NULL, remove };
+
+	cf_queue_reduce(jobs, as_job_manager_find_cb, &item);
+
+	return item._job;
+}
+
+// Look for the trid among active jobs first, then among finished jobs.
+static inline as_job*
+as_job_manager_find_any(as_job_manager* mgr, uint64_t trid)
+{
+	as_job* _job = as_job_manager_find_job(mgr->active_jobs, trid, false);
+
+	if (! _job) {
+		_job = as_job_manager_find_job(mgr->finished_jobs, trid, false);
+	}
+
+	return _job;
+}
+
+// Look for the trid among active jobs only.
+static inline as_job*
+as_job_manager_find_active(as_job_manager* mgr, uint64_t trid)
+{
+	return as_job_manager_find_job(mgr->active_jobs, trid, false);
+}
+
+// Find the trid among active jobs and take the job off the active list.
+static inline as_job*
+as_job_manager_remove_active(as_job_manager* mgr, uint64_t trid)
+{
+	return as_job_manager_find_job(mgr->active_jobs, trid, true);
+}
+
+// Queue-reduce callback: append the element's job pointer to the caller's array
+// and advance the cursor. Always returns 0, so the reduce visits every element.
+// The caller must have sized the array for the whole queue.
+int
+as_job_manager_info_cb(void* buf, void* udata)
+{
+	as_job* _job = *(as_job**)buf;
+	info_item* item = (info_item*)udata;
+
+	*item->p_job++ = _job;
+
+	return 0;
+}
diff --git a/as/src/base/json_init.c b/as/src/base/json_init.c
new file mode 100644
index 00000000..a7c93f9e
--- /dev/null
+++ b/as/src/base/json_init.c
@@ -0,0 +1,62 @@
+/*
+ * json_init.c
+ *
+ * Copyright (C) 2015 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program.
+ * If not, see http://www.gnu.org/licenses/
+ */
+
+#include "jansson.h"
+#include "citrusleaf/alloc.h"
+#include "base/json_init.h"
+
+/* SYNOPSIS
+ * This module handles initialization of the Jansson JSON API by
+ * setting the memory allocation functions to be used internally
+ * by Jansson to the CF allocation-related functions.
+ */
+
+/*
+ * Note that actual wrapper functions are needed instead of simply
+ * using the names of the CF malloc() and free() functions, since the
+ * memory allocation instrumentation infrastructure uses macroexpansion
+ * of the CF allocation-related function names to track all allocations.
+ */
+
+/*
+ * Wrapper function to call the CF malloc() function.
+ * Returns whatever cf_malloc(size) returns.
+ */
+static void *as_json_malloc(size_t size)
+{
+	return cf_malloc(size);
+}
+
+/*
+ * Wrapper function to call the CF free() function.
+ * Passes ptr straight through to cf_free().
+ */
+static void as_json_free(void *ptr)
+{
+	cf_free(ptr);
+}
+
+/*
+ * Initialize the JSON module by setting the memory allocation functions.
+ * After this call Jansson allocates through as_json_malloc() and frees
+ * through as_json_free().
+ */
+void as_json_init()
+{
+	json_set_alloc_funcs(as_json_malloc, as_json_free);
+}
diff --git a/as/src/base/monitor.c b/as/src/base/monitor.c
new file mode 100644
index 00000000..e0b22540
--- /dev/null
+++ b/as/src/base/monitor.c
@@ -0,0 +1,474 @@
+/*
+ * monitor.c
+ *
+ * Copyright (C) 2013-2016 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +/* + * Aerospike Long Running Job Monitoring interface + * + * This file implements the generic interface for the long running jobs + * in Aerospike like query / scan / batch etc. The idea is to able to see + * what is going on in the system. + * + * Each module which needs to show up in the monitoring needs to register + * and implement the interfaces. + */ + +#include +#include + +#include "base/secondary_index.h" +#include "base/monitor.h" +#include "base/scan.h" +#include "base/thr_sindex.h" + + +#define AS_MON_MAX_MODULE 10 + +// Indexed by as_mon_module_slot - keep in sync. +const char * AS_MON_MODULES[] = { + "query", + "scan", + "sindex-builder" +}; + +// functional declaration +int as_mon_populate_jobstat(as_mon_jobstat * stat, cf_dyn_buf *db); +static as_mon * g_as_mon_module[AS_MON_MAX_MODULE]; +static uint32_t g_as_mon_curr_mod_count; +int as_mon_register(const char *module); + +/* + * This is called to init the mon subsystem. + */ +int +as_mon_init() +{ + g_as_mon_curr_mod_count = 0; + as_mon_register(AS_MON_MODULES[QUERY_MOD]); + as_mon_register(AS_MON_MODULES[SCAN_MOD]); + as_mon_register(AS_MON_MODULES[SBLD_MOD]); + + // TODO: Add more stuff if there is any locks needs some stats needed etc etc ... + return AS_MON_OK; +} + +as_mon * +as_mon_get_module(const char * module) +{ + as_mon_module_slot mod; + if (strcmp(module, AS_MON_MODULES[QUERY_MOD]) == 0) { + mod = QUERY_MOD; + } + else if (strcmp(module, AS_MON_MODULES[SCAN_MOD]) == 0) { + mod = SCAN_MOD; + } + else if (strcmp(module, AS_MON_MODULES[SBLD_MOD]) == 0) { + mod = SBLD_MOD; + } + else { + return NULL; + } + + return g_as_mon_module[mod]; +} + +/* + * The call to register a module to be tracked under as mon interface + * Returns - + * AS_MON_OK - On successful registartion. 
+ * AS_MON_ERROR - failure + */ +int +as_mon_register(const char *module) +{ + if (!module) return AS_MON_ERR; + as_mon *mon_obj = (as_mon *) cf_rc_alloc(sizeof(as_mon)); + as_mon_cb *cb = cf_malloc(sizeof(as_mon_cb)); + as_mon_module_slot mod; + + if(!strcmp(module, AS_MON_MODULES[QUERY_MOD])) { + cb->get_jobstat = as_query_get_jobstat; + cb->get_jobstat_all = as_query_get_jobstat_all; + + cb->set_priority = as_query_set_priority; + cb->kill = as_query_kill; + cb->suspend = NULL; + cb->set_pendingmax = NULL; + cb->set_maxinflight = NULL; + cb->set_maxpriority = NULL; + mod = QUERY_MOD; + } + else if (!strcmp(module, AS_MON_MODULES[SCAN_MOD])) + { + cb->get_jobstat = as_scan_get_jobstat; + cb->get_jobstat_all = as_scan_get_jobstat_all; + + cb->set_priority = as_scan_change_job_priority; + cb->kill = as_scan_abort; + cb->suspend = NULL; + cb->set_pendingmax = NULL; + cb->set_maxinflight = NULL; + cb->set_maxpriority = NULL; + mod = SCAN_MOD; + } + else if (!strcmp(module, AS_MON_MODULES[SBLD_MOD])) + { + cb->get_jobstat = as_sbld_get_jobstat; + cb->get_jobstat_all = as_sbld_get_jobstat_all; + + cb->set_priority = NULL; + cb->kill = as_sbld_abort; + cb->suspend = NULL; + cb->set_pendingmax = NULL; + cb->set_maxinflight = NULL; + cb->set_maxpriority = NULL; + mod = SBLD_MOD; + } + else { + cf_warning(AS_MON, "wrong module parameter."); + return AS_MON_ERR; + } + // Setup mon object + mon_obj->type = cf_strdup(module); + memcpy(&mon_obj->cb, cb, sizeof(as_mon_cb)); + + g_as_mon_curr_mod_count++; + g_as_mon_module[mod] = mon_obj; + return AS_MON_OK; +} + +/* + * Calls the callback function to kill a job. + * + * Returns + * AS_MON_OK - On success. + * AS_MON_ERR - on failure. 
+ *
+ * Writes "OK" or an "ERROR:<code>:<text>" message into db.
+ */
+int
+as_mon_killjob(const char *module, uint64_t id, cf_dyn_buf *db)
+{
+	as_mon *mon = as_mon_get_module(module);
+
+	// Unknown module name.
+	if (mon == NULL) {
+		cf_warning(AS_MON, "Failed to find module %s", module);
+		cf_dyn_buf_append_string(db, "ERROR:");
+		cf_dyn_buf_append_int(db, AS_PROTO_RESULT_FAIL_NOT_FOUND);
+		cf_dyn_buf_append_string(db, ":module \"");
+		cf_dyn_buf_append_string(db, module);
+		cf_dyn_buf_append_string(db, "\" not found");
+		return AS_MON_ERR;
+	}
+
+	// Module registered without a kill callback.
+	if (mon->cb.kill == NULL) {
+		cf_dyn_buf_append_string(db, "ERROR:");
+		cf_dyn_buf_append_int(db, AS_PROTO_RESULT_FAIL_PARAMETER);
+		cf_dyn_buf_append_string(db, ":kill-job not supported for module \"");
+		cf_dyn_buf_append_string(db, module);
+		cf_dyn_buf_append_string(db, "\"");
+		return AS_MON_ERR;
+	}
+
+	// The callback's result is returned to the caller as is.
+	int kill_result = mon->cb.kill(id);
+
+	if (kill_result != AS_MON_OK) {
+		cf_dyn_buf_append_string(db, "ERROR:");
+		cf_dyn_buf_append_int(db, AS_PROTO_RESULT_FAIL_NOT_FOUND);
+		cf_dyn_buf_append_string(db, ":job not active");
+		return kill_result;
+	}
+
+	cf_dyn_buf_append_string(db, "OK");
+	return kill_result;
+}
+
+/*
+ * Calls the callback function to set priority of a job.
+ *
+ * Returns
+ * 	AS_MON_OK - On success.
+ * 	AS_MON_ERR - on failure.
+ *
+ * A priority of 0 is rejected before the module is even looked up.
+ * Writes "OK" or an "ERROR:<code>:<text>" message into db.
+ */
+int
+as_mon_set_priority(const char *module, uint64_t id, uint32_t priority, cf_dyn_buf *db)
+{
+	if (priority == 0) {
+		cf_dyn_buf_append_string(db, "ERROR:");
+		cf_dyn_buf_append_int(db, AS_PROTO_RESULT_FAIL_PARAMETER);
+		cf_dyn_buf_append_string(db, ":priority value must be greater than zero");
+		return AS_MON_ERR;
+	}
+
+	as_mon *mon = as_mon_get_module(module);
+
+	// Unknown module name.
+	if (mon == NULL) {
+		cf_warning(AS_MON, "Failed to find module %s", module);
+		cf_dyn_buf_append_string(db, "ERROR:");
+		cf_dyn_buf_append_int(db, AS_PROTO_RESULT_FAIL_NOT_FOUND);
+		cf_dyn_buf_append_string(db, ":module \"");
+		cf_dyn_buf_append_string(db, module);
+		cf_dyn_buf_append_string(db, "\" not found");
+		return AS_MON_ERR;
+	}
+
+	// Module registered without a set-priority callback.
+	if (mon->cb.set_priority == NULL) {
+		cf_dyn_buf_append_string(db, "ERROR:");
+		cf_dyn_buf_append_int(db, AS_PROTO_RESULT_FAIL_PARAMETER);
+		cf_dyn_buf_append_string(db, ":set-priority not supported for module \"");
+		cf_dyn_buf_append_string(db, module);
+		cf_dyn_buf_append_string(db, "\"");
+		return AS_MON_ERR;
+	}
+
+	// The callback's result is returned to the caller as is.
+	int set_result = mon->cb.set_priority(id, priority);
+
+	if (set_result != AS_MON_OK) {
+		cf_dyn_buf_append_string(db, "ERROR:");
+		cf_dyn_buf_append_int(db, AS_PROTO_RESULT_FAIL_NOT_FOUND);
+		cf_dyn_buf_append_string(db, ":job not active");
+		return set_result;
+	}
+
+	cf_dyn_buf_append_string(db, "OK");
+	return set_result;
+}
+
+/*
+ * Calls the callback function to populate the stat of a particular job.
+ *
+ * Returns
+ * 	AS_MON_OK - On success.
+ * 	AS_MON_ERR - on failure.
+ * + */ +int +as_mon_populate_jobstat(as_mon_jobstat * job_stat, cf_dyn_buf *db) +{ + cf_dyn_buf_append_string(db, "trid="); + cf_dyn_buf_append_uint64(db, job_stat->trid); + + if (job_stat->job_type[0]) { + cf_dyn_buf_append_string(db, ":job-type="); + cf_dyn_buf_append_string(db, job_stat->job_type); + } + + cf_dyn_buf_append_string(db, ":ns="); + cf_dyn_buf_append_string(db, job_stat->ns); + + if (job_stat->set[0]) { + cf_dyn_buf_append_string(db, ":set="); + cf_dyn_buf_append_string(db, job_stat->set); + } + + cf_dyn_buf_append_string(db, ":priority="); + cf_dyn_buf_append_uint32(db, job_stat->priority); + + if (job_stat->status[0]) { + cf_dyn_buf_append_string(db, ":status="); + cf_dyn_buf_append_string(db, job_stat->status); + } + + char progress_pct[8]; + sprintf(progress_pct, "%.2f", job_stat->progress_pct); + + cf_dyn_buf_append_string(db, ":job-progress="); + cf_dyn_buf_append_string(db, progress_pct); + + cf_dyn_buf_append_string(db, ":run-time="); + cf_dyn_buf_append_uint64(db, job_stat->run_time); + + cf_dyn_buf_append_string(db, ":time-since-done="); + cf_dyn_buf_append_uint64(db, job_stat->time_since_done); + + cf_dyn_buf_append_string(db, ":recs-read="); + cf_dyn_buf_append_uint64(db, job_stat->recs_read); + + cf_dyn_buf_append_string(db, ":net-io-bytes="); + cf_dyn_buf_append_uint64(db, job_stat->net_io_bytes); + + // char cpu_data[100]; + // sprintf(cpu_data, "%f", job_stat->cpu); + // cf_dyn_buf_append_string(db, cpu_data); + + if (job_stat->jdata[0]) { + cf_dyn_buf_append_string(db, job_stat->jdata); + } + + return AS_MON_OK; +} + +static int +as_mon_get_jobstat_reduce_fn(as_mon *mon_object, cf_dyn_buf *db) +{ + int size = 0; + as_mon_jobstat * job_stats = NULL; + if (mon_object->cb.get_jobstat_all) { + job_stats = mon_object->cb.get_jobstat_all(&size); + } + + // return OK to go to next module + if (!job_stats) return AS_MON_OK; + + as_mon_jobstat * job; + job = job_stats; + + for (int i = 0; i < size; i++) { + cf_dyn_buf_append_string(db, 
"module="); + cf_dyn_buf_append_string(db, mon_object->type); + cf_dyn_buf_append_string(db, ":"); + as_mon_populate_jobstat(job, db); + cf_dyn_buf_append_string(db, ";"); + job++; + } + cf_free(job_stats); + return AS_MON_OK; +} + +/* + * This is called when the info call is triggered to get the info + * about all the jobs. + * + * parameter: + * @db: in/out which gets populated. Each module stats is colon separated + * key:value and each module info is semicolon separated. + * e.g module:query:cpu::mem:;module:query:cpu::mem:; + * + * returns: 0 in case of success + * negative value in case of failure + */ +int +as_mon_get_jobstat_all(const char *module, cf_dyn_buf *db) +{ + bool found_module = false; + for (int i = 0; i < g_as_mon_curr_mod_count; i++) { + if ((module && !strcmp(g_as_mon_module[i]->type, module)) + || (!module)) { + as_mon_get_jobstat_reduce_fn(g_as_mon_module[i], db); + if (module) { + found_module = true; + } + } + } + + if (module && !found_module) { + cf_dyn_buf_append_string(db, "ERROR:"); + cf_dyn_buf_append_int(db, AS_PROTO_RESULT_FAIL_NOT_FOUND); + cf_dyn_buf_append_string(db, ":module \""); + cf_dyn_buf_append_string(db, module); + cf_dyn_buf_append_string(db, "\" not found"); + } + else { + cf_dyn_buf_chomp(db); + } + return 0; +} + +/* + * This is called when the info call is triggered to get the info + * about a particular job in particular module. + * + * parameter: + * @db: in/out which gets populated. Each module stats is colon separated + * key:value and each module info is semicolon separated. 
+ * e.g module:query:cpu::mem:;module:query:cpu::mem:; + * + * returns: 0 in case of success + * negative value in case of failure + */ +int +as_mon_get_jobstat(const char *module, uint64_t id, cf_dyn_buf *db) +{ + int retval = AS_MON_ERR; + as_mon * mon_object = as_mon_get_module(module);; + + if (!mon_object) { + cf_warning(AS_MON, "Failed to find module %s", module); + cf_dyn_buf_append_string(db, "ERROR:"); + cf_dyn_buf_append_int(db, AS_PROTO_RESULT_FAIL_NOT_FOUND); + cf_dyn_buf_append_string(db, ":module \""); + cf_dyn_buf_append_string(db, module); + cf_dyn_buf_append_string(db, "\" not found"); + return retval; + } + + as_mon_jobstat * job_stat = NULL; + if (mon_object->cb.get_jobstat) { + job_stat = mon_object->cb.get_jobstat(id); + } + else { + cf_dyn_buf_append_string(db, "ERROR:"); + cf_dyn_buf_append_int(db, AS_PROTO_RESULT_FAIL_PARAMETER); + cf_dyn_buf_append_string(db, ":get-job not supported for module \""); + cf_dyn_buf_append_string(db, module); + cf_dyn_buf_append_string(db, "\""); + return retval; + } + + if (job_stat) { + retval = as_mon_populate_jobstat(job_stat, db); + cf_free(job_stat); + } + else { + cf_dyn_buf_append_string(db, "ERROR:"); + cf_dyn_buf_append_int(db, AS_PROTO_RESULT_FAIL_NOT_FOUND); + cf_dyn_buf_append_string(db, ":job not found"); + } + return retval; +} + +/* + * Manipulates the monitor system. + * Add, delete, reinit the modules. 
+ * + */ + +void +as_mon_info_cmd(const char *module, char *cmd, uint64_t trid, uint32_t value, cf_dyn_buf *db) +{ + if (module == NULL) { + as_mon_get_jobstat_all(NULL, db); + return; + } + + if (cmd == NULL) { + as_mon_get_jobstat_all(module, db); + return; + } + + if (!strcmp(cmd, "get-job")) { + as_mon_get_jobstat(module, trid, db); + } + else if (!strcmp(cmd, "kill-job")) { + as_mon_killjob(module, trid, db); + } + else if (!strcmp(cmd, "set-priority")) { + as_mon_set_priority(module, trid, value, db); + } + else { + cf_dyn_buf_append_string(db, "ERROR:"); + cf_dyn_buf_append_int(db, AS_PROTO_RESULT_FAIL_PARAMETER); + cf_dyn_buf_append_string(db, ":unrecognized command \""); + cf_dyn_buf_append_string(db, cmd); + cf_dyn_buf_append_string(db, "\""); + } +} diff --git a/as/src/base/namespace.c b/as/src/base/namespace.c new file mode 100644 index 00000000..d721f3d2 --- /dev/null +++ b/as/src/base/namespace.c @@ -0,0 +1,746 @@ +/* + * namespace.c + * + * Copyright (C) 2012-2014 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +//========================================================== +// Includes. 
+// + +#include +#include +#include +#include + +#include "citrusleaf/alloc.h" +#include "citrusleaf/cf_atomic.h" +#include "citrusleaf/cf_hash_math.h" + +#include "dynbuf.h" +#include "fault.h" +#include "hist.h" +#include "linear_hist.h" +#include "vmapx.h" + +#include "base/cfg.h" +#include "base/datamodel.h" +#include "base/index.h" +#include "base/proto.h" +#include "base/secondary_index.h" +#include "base/truncate.h" +#include "fabric/partition.h" +#include "fabric/roster.h" +#include "storage/storage.h" + + +//========================================================== +// Typedefs & constants. +// + + +//========================================================== +// Globals. +// + + +//========================================================== +// Forward declarations. +// + +static void append_set_props(as_set *p_set, cf_dyn_buf *db); + + +//========================================================== +// Inlines & macros. +// + +static inline uint32_t +ns_name_hash(char *name) +{ + uint32_t hv = cf_hash_fnv32((const uint8_t *)name, strlen(name)); + + // Don't collide with a ns-id. + if (hv <= AS_NAMESPACE_SZ) { + hv += AS_NAMESPACE_SZ; + } + + return hv; +} + + +//========================================================== +// Public API. +// + +as_namespace * +as_namespace_create(char *name) +{ + cf_assert_nostack(strlen(name) < AS_ID_NAMESPACE_SZ, + AS_NAMESPACE, "{%s} namespace name too long (max length is %u)", + name, AS_ID_NAMESPACE_SZ - 1); + + cf_assert_nostack(g_config.n_namespaces < AS_NAMESPACE_SZ, + AS_NAMESPACE, "too many namespaces (max is %u)", AS_NAMESPACE_SZ); + + uint32_t namehash = ns_name_hash(name); + + for (uint32_t ns_ix = 0; ns_ix < g_config.n_namespaces; ns_ix++) { + as_namespace *ns = g_config.namespaces[ns_ix]; + + if (strcmp(ns->name, name) == 0) { + cf_crash_nostack(AS_NAMESPACE, "{%s} duplicate namespace", name); + } + + // Check for CE also, in case deployment later becomes EE with XDR. 
+ if (ns->namehash == namehash) { + cf_crash_nostack(AS_XDR, "{%s} {%s} namespace name hashes collide", + ns->name, name); + } + } + + as_namespace *ns = cf_malloc(sizeof(as_namespace)); + + g_config.namespaces[g_config.n_namespaces++] = ns; + + // Set all members 0/NULL/false to start with. + memset(ns, 0, sizeof(as_namespace)); + + strcpy(ns->name, name); + ns->id = g_config.n_namespaces; // note that id is 1-based + ns->namehash = namehash; + + ns->jem_arena = cf_alloc_create_arena(); + cf_info(AS_NAMESPACE, "{%s} uses JEMalloc arena %d", name, ns->jem_arena); + + ns->cold_start = false; // try warm or cool restart unless told not to + ns->arena = NULL; // can't create the arena until the configuration has been done + + //-------------------------------------------- + // Non-0/NULL/false configuration defaults. + // + + ns->cfg_replication_factor = 2; + ns->replication_factor = 0; // gets set on rebalance + ns->memory_size = 1024LL * 1024LL * 1024LL * 4LL; // default memory limit is 4G per namespace + + ns->sets_enable_xdr = true; // ship all the sets by default + ns->ns_allow_nonxdr_writes = true; // allow nonxdr writes by default + ns->ns_allow_xdr_writes = true; // allow xdr writes by default + cf_vector_pointer_init(&ns->xdr_dclist_v, 3, 0); + + ns->cold_start_evict_ttl = 0xFFFFffff; // unless this is specified via config file, use evict void-time saved in device header + ns->conflict_resolution_policy = AS_NAMESPACE_CONFLICT_RESOLUTION_POLICY_UNDEF; + ns->evict_hist_buckets = 10000; // for 30 day TTL, bucket width is 4 minutes 20 seconds + ns->evict_tenths_pct = 5; // default eviction amount is 0.5% + ns->hwm_disk_pct = 50; // evict when device usage exceeds 50% + ns->hwm_memory_pct = 60; // evict when memory usage exceeds 50% of namespace memory-size + ns->max_ttl = MAX_ALLOWED_TTL; // 10 years + ns->migrate_order = 5; + ns->migrate_retransmit_ms = 1000 * 5; // 5 seconds + ns->migrate_sleep = 1; + ns->obj_size_hist_max = OBJ_SIZE_HIST_NUM_BUCKETS; + 
ns->read_consistency_level = AS_READ_CONSISTENCY_LEVEL_PROTO; + ns->stop_writes_pct = 90; // stop writes when 90% of either memory or disk is used + ns->tomb_raider_eligible_age = 60 * 60 * 24; // 1 day + ns->tomb_raider_period = 60 * 60 * 24; // 1 day + ns->tree_shared.n_lock_pairs = 8; + ns->tree_shared.n_sprigs = 64; + ns->write_commit_level = AS_WRITE_COMMIT_LEVEL_PROTO; + + ns->storage_type = AS_STORAGE_ENGINE_MEMORY; + ns->storage_data_in_memory = true; + // Note - default true is consistent with AS_STORAGE_ENGINE_MEMORY, but + // cfg.c will set default false for AS_STORAGE_ENGINE_SSD. + + ns->storage_filesize = 1024UL * 1024UL * 1024UL * 16UL; // default file size is 16G per file + ns->storage_scheduler_mode = NULL; // null indicates default is to not change scheduler mode + ns->storage_write_block_size = 1024 * 1024; + ns->storage_defrag_lwm_pct = 50; // defrag if occupancy of block is < 50% + ns->storage_defrag_sleep = 1000; // sleep this many microseconds between each wblock + ns->storage_defrag_startup_minimum = 10; // defrag until >= 10% disk is writable before joining cluster + ns->storage_flush_max_us = 1000 * 1000; // wait this many microseconds before flushing inactive current write buffer (0 = never) + ns->storage_max_write_cache = 1024 * 1024 * 64; + ns->storage_min_avail_pct = 5; // stop writes when < 5% disk is writable + ns->storage_post_write_queue = 256; // number of wblocks per device used as post-write cache + ns->storage_tomb_raider_sleep = 1000; // sleep this many microseconds between each device read + ns->storage_write_threads = 1; + + ns->sindex_num_partitions = DEFAULT_PARTITIONS_PER_INDEX; + + ns->geo2dsphere_within_strict = true; + ns->geo2dsphere_within_min_level = 1; + ns->geo2dsphere_within_max_level = 30; + ns->geo2dsphere_within_max_cells = 12; + ns->geo2dsphere_within_level_mod = 1; + ns->geo2dsphere_within_earth_radius_meters = 6371000; // Wikipedia, mean + + return ns; +} + + +void +as_namespaces_init(bool cold_start_cmd, 
uint32_t instance) +{ + uint32_t stage_capacity = as_mem_check(); + + as_namespaces_setup(cold_start_cmd, instance, stage_capacity); + + for (uint32_t ns_ix = 0; ns_ix < g_config.n_namespaces; ns_ix++) { + as_namespace *ns = g_config.namespaces[ns_ix]; + + // Done with temporary sets configuration array. + if (ns->sets_cfg_array) { + cf_free(ns->sets_cfg_array); + } + + for (uint32_t pid = 0; pid < AS_PARTITIONS; pid++) { + as_partition_init(ns, pid); + } + + as_truncate_init(ns); + as_sindex_init(ns); + } + + as_roster_init_smd(); + as_truncate_init_smd(); + as_sindex_init_smd(); // before as_storage_init() populates the indexes +} + + +bool +as_namespace_configure_sets(as_namespace *ns) +{ + for (uint32_t i = 0; i < ns->sets_cfg_count; i++) { + uint32_t idx; + cf_vmapx_err result = cf_vmapx_put_unique(ns->p_sets_vmap, + ns->sets_cfg_array[i].name, &idx); + + if (result == CF_VMAPX_OK || result == CF_VMAPX_ERR_NAME_EXISTS) { + as_set* p_set = NULL; + + if ((result = cf_vmapx_get_by_index(ns->p_sets_vmap, idx, + (void**)&p_set)) != CF_VMAPX_OK) { + // Should be impossible - just verified idx. + cf_crash(AS_NAMESPACE, "vmap error %d", result); + } + + // Transfer configurable metadata. + p_set->stop_writes_count = ns->sets_cfg_array[i].stop_writes_count; + p_set->disable_eviction = ns->sets_cfg_array[i].disable_eviction; + p_set->enable_xdr = ns->sets_cfg_array[i].enable_xdr; + } + else { + // Maybe exceeded max sets allowed, but try failing gracefully. 
+ cf_warning(AS_NAMESPACE, "vmap error %d", result); + return false; + } + } + + return true; +} + + +as_namespace * +as_namespace_get_byname(char *name) +{ + for (uint32_t ns_ix = 0; ns_ix < g_config.n_namespaces; ns_ix++) { + as_namespace *ns = g_config.namespaces[ns_ix]; + + if (strcmp(ns->name, name) == 0) { + return ns; + } + } + + return NULL; +} + + +as_namespace * +as_namespace_get_byid(uint32_t id) +{ + for (uint32_t ns_ix = 0; ns_ix < g_config.n_namespaces; ns_ix++) { + as_namespace *ns = g_config.namespaces[ns_ix]; + + if (id == ns->id) { + return ns; + } + } + + return NULL; +} + + +as_namespace * +as_namespace_get_bybuf(uint8_t *buf, size_t len) +{ + if (len >= AS_ID_NAMESPACE_SZ) { + return NULL; + } + + for (uint32_t ns_ix = 0; ns_ix < g_config.n_namespaces; ns_ix++) { + as_namespace *ns = g_config.namespaces[ns_ix]; + + if (memcmp(buf, ns->name, len) == 0 && ns->name[len] == 0) { + return ns; + } + } + + return NULL; +} + + +as_namespace * +as_namespace_get_bymsgfield(as_msg_field *fp) +{ + return as_namespace_get_bybuf(fp->data, as_msg_field_get_value_sz(fp)); +} + + +const char * +as_namespace_get_set_name(as_namespace *ns, uint16_t set_id) +{ + // Note that set_id is 1-based, but cf_vmap index is 0-based. + // (This is because 0 in the index structure means 'no set'.) + + if (set_id == INVALID_SET_ID) { + return NULL; + } + + as_set *p_set; + + return cf_vmapx_get_by_index(ns->p_sets_vmap, (uint32_t)set_id - 1, + (void**)&p_set) == CF_VMAPX_OK ? p_set->name : NULL; +} + + +uint16_t +as_namespace_get_set_id(as_namespace *ns, const char *set_name) +{ + uint32_t idx; + + return cf_vmapx_get_index(ns->p_sets_vmap, set_name, &idx) == CF_VMAPX_OK ? + (uint16_t)(idx + 1) : INVALID_SET_ID; +} + + +// At the moment this is only used by the enterprise build security feature. +uint16_t +as_namespace_get_create_set_id(as_namespace *ns, const char *set_name) +{ + if (! set_name) { + // Should be impossible. 
+ cf_warning(AS_NAMESPACE, "null set name"); + return INVALID_SET_ID; + } + + uint32_t idx; + cf_vmapx_err result = cf_vmapx_get_index(ns->p_sets_vmap, set_name, &idx); + + if (result == CF_VMAPX_OK) { + return (uint16_t)(idx + 1); + } + + if (result == CF_VMAPX_ERR_NAME_NOT_FOUND) { + result = cf_vmapx_put_unique(ns->p_sets_vmap, set_name, &idx); + + if (result == CF_VMAPX_ERR_NAME_EXISTS) { + return (uint16_t)(idx + 1); + } + + if (result == CF_VMAPX_ERR_BAD_PARAM) { + cf_warning(AS_NAMESPACE, "set name %s too long", set_name); + return INVALID_SET_ID; + } + + if (result == CF_VMAPX_ERR_FULL) { + cf_warning(AS_NAMESPACE, "can't add %s (at sets limit)", set_name); + return INVALID_SET_ID; + } + + if (result != CF_VMAPX_OK) { + // Currently, remaining errors are all some form of out-of-memory. + cf_warning(AS_NAMESPACE, "can't add %s (%d)", set_name, result); + return INVALID_SET_ID; + } + + return (uint16_t)(idx + 1); + } + + // Should be impossible. + cf_warning(AS_NAMESPACE, "unexpected error %d", result); + return INVALID_SET_ID; +} + + +int +as_namespace_set_set_w_len(as_namespace *ns, const char *set_name, size_t len, + uint16_t *p_set_id, bool apply_restrictions) +{ + as_set *p_set; + + if (as_namespace_get_create_set_w_len(ns, set_name, len, &p_set, + p_set_id) != 0) { + return -1; + } + + if (apply_restrictions && as_set_stop_writes(p_set)) { + return -2; + } + + cf_atomic64_incr(&p_set->n_objects); + + return 0; +} + + +int +as_namespace_get_create_set_w_len(as_namespace *ns, const char *set_name, + size_t len, as_set **pp_set, uint16_t *p_set_id) +{ + cf_assert(set_name, AS_NAMESPACE, "null set name"); + cf_assert(len != 0, AS_NAMESPACE, "empty set name"); + + uint32_t idx; + cf_vmapx_err result = cf_vmapx_get_index_w_len(ns->p_sets_vmap, set_name, + len, &idx); + + if (result == CF_VMAPX_ERR_NAME_NOT_FOUND) { + // Special case handling for name too long. 
+ if (len >= AS_SET_NAME_MAX_SIZE) { + char bad_name[AS_SET_NAME_MAX_SIZE]; + + memcpy(bad_name, set_name, AS_SET_NAME_MAX_SIZE - 1); + bad_name[AS_SET_NAME_MAX_SIZE - 1] = 0; + + cf_warning(AS_NAMESPACE, "set name %s... too long", bad_name); + return -1; + } + + result = cf_vmapx_put_unique_w_len(ns->p_sets_vmap, set_name, len, + &idx); + + // Since this function can be called via many functions simultaneously. + // Need to handle race, So handle CF_VMAPX_ERR_NAME_EXISTS. + if (result == CF_VMAPX_ERR_FULL) { + cf_warning(AS_NAMESPACE, "at set names limit, can't add set"); + return -1; + } + + if (result != CF_VMAPX_OK && result != CF_VMAPX_ERR_NAME_EXISTS) { + cf_warning(AS_NAMESPACE, "error %d, can't add set", result); + return -1; + } + } + else if (result != CF_VMAPX_OK) { + // Should be impossible. + cf_warning(AS_NAMESPACE, "unexpected error %d", result); + return -1; + } + + if (pp_set) { + if ((result = cf_vmapx_get_by_index(ns->p_sets_vmap, idx, + (void**)pp_set)) != CF_VMAPX_OK) { + // Should be impossible - just verified idx. + cf_warning(AS_NAMESPACE, "unexpected error %d", result); + return -1; + } + } + + if (p_set_id) { + *p_set_id = (uint16_t)(idx + 1); + } + + return 0; +} + + +as_set * +as_namespace_get_set_by_name(as_namespace *ns, const char *set_name) +{ + uint32_t idx; + + if (cf_vmapx_get_index(ns->p_sets_vmap, set_name, &idx) != CF_VMAPX_OK) { + return NULL; + } + + as_set *p_set; + + if (cf_vmapx_get_by_index(ns->p_sets_vmap, idx, (void**)&p_set) != + CF_VMAPX_OK) { + // Should be impossible - just verified idx. + cf_crash(AS_NAMESPACE, "unexpected vmap error"); + } + + return p_set; +} + + +as_set * +as_namespace_get_set_by_id(as_namespace *ns, uint16_t set_id) +{ + if (set_id == INVALID_SET_ID) { + return NULL; + } + + as_set *p_set; + + if (cf_vmapx_get_by_index(ns->p_sets_vmap, set_id - 1, (void**)&p_set) != + CF_VMAPX_OK) { + // Should be impossible. 
+ cf_warning(AS_NAMESPACE, "unexpected - record with set-id not in vmap"); + return NULL; + } + + return p_set; +} + + +as_set * +as_namespace_get_record_set(as_namespace *ns, const as_record *r) +{ + return as_namespace_get_set_by_id(ns, as_index_get_set_id(r)); +} + + +void +as_namespace_get_set_info(as_namespace *ns, const char *set_name, + cf_dyn_buf *db) +{ + as_set *p_set; + + if (set_name) { + if (cf_vmapx_get_by_name(ns->p_sets_vmap, set_name, (void**)&p_set) == + CF_VMAPX_OK) { + append_set_props(p_set, db); + } + + return; + } + + for (uint32_t idx = 0; idx < cf_vmapx_count(ns->p_sets_vmap); idx++) { + if (cf_vmapx_get_by_index(ns->p_sets_vmap, idx, (void**)&p_set) == + CF_VMAPX_OK) { + cf_dyn_buf_append_string(db, "ns="); + cf_dyn_buf_append_string(db, ns->name); + cf_dyn_buf_append_char(db, ':'); + cf_dyn_buf_append_string(db, "set="); + cf_dyn_buf_append_string(db, p_set->name); + cf_dyn_buf_append_char(db, ':'); + append_set_props(p_set, db); + } + } +} + + +void +as_namespace_adjust_set_memory(as_namespace *ns, uint16_t set_id, + int64_t delta_bytes) +{ + if (set_id == INVALID_SET_ID) { + return; + } + + as_set *p_set; + + if (cf_vmapx_get_by_index(ns->p_sets_vmap, set_id - 1, (void**)&p_set) != + CF_VMAPX_OK) { + cf_warning(AS_NAMESPACE, "set-id %u - failed vmap get", set_id); + return; + } + + if (cf_atomic64_add(&p_set->n_bytes_memory, delta_bytes) < 0) { + cf_warning(AS_NAMESPACE, "set-id %u - negative memory!", set_id); + } +} + + +void +as_namespace_release_set_id(as_namespace *ns, uint16_t set_id) +{ + if (set_id == INVALID_SET_ID) { + return; + } + + as_set *p_set; + + if (cf_vmapx_get_by_index(ns->p_sets_vmap, set_id - 1, (void**)&p_set) != + CF_VMAPX_OK) { + return; + } + + if (cf_atomic64_decr(&p_set->n_objects) < 0) { + cf_warning(AS_NAMESPACE, "set-id %u - negative objects!", set_id); + } +} + + +void +as_namespace_get_bins_info(as_namespace *ns, cf_dyn_buf *db, bool show_ns) +{ + if (show_ns) { + cf_dyn_buf_append_string(db, ns->name); 
+ cf_dyn_buf_append_char(db, ':'); + } + + if (ns->single_bin) { + cf_dyn_buf_append_string(db, "[single-bin]"); + } + else { + uint32_t bin_count = cf_vmapx_count(ns->p_bin_name_vmap); + + cf_dyn_buf_append_string(db, "bin_names="); + cf_dyn_buf_append_uint32(db, bin_count); + cf_dyn_buf_append_string(db, ",bin_names_quota="); + cf_dyn_buf_append_uint32(db, BIN_NAMES_QUOTA); + + for (uint16_t i = 0; i < (uint16_t)bin_count; i++) { + cf_dyn_buf_append_char(db, ','); + cf_dyn_buf_append_string(db, as_bin_get_name_from_id(ns, i)); + } + } + + if (show_ns) { + cf_dyn_buf_append_char(db, ';'); + } +} + + +void +as_namespace_get_hist_info(as_namespace *ns, char *set_name, char *hist_name, + cf_dyn_buf *db, bool show_ns) +{ + if (show_ns) { + cf_dyn_buf_append_string(db, ns->name); + cf_dyn_buf_append_char(db, ':'); + } + + if (set_name == NULL || set_name[0] == 0) { + if (strcmp(hist_name, "ttl") == 0) { + cf_dyn_buf_append_string(db, "ttl="); + linear_hist_get_info(ns->ttl_hist, db); + cf_dyn_buf_append_char(db, ';'); + } + else if (strcmp(hist_name, "objsz") == 0) { + if (ns->storage_type == AS_STORAGE_ENGINE_SSD) { + cf_dyn_buf_append_string(db, "objsz="); + linear_hist_get_info(ns->obj_size_hist, db); + cf_dyn_buf_append_char(db, ';'); + } + else { + cf_dyn_buf_append_string(db, "hist-not-applicable"); + } + } + else { + cf_dyn_buf_append_string(db, "error-unknown-hist-name"); + } + + return; + } + + uint16_t set_id = as_namespace_get_set_id(ns, set_name); + + if (set_id != INVALID_SET_ID) { + if (strcmp(hist_name, "ttl") == 0) { + if (ns->set_ttl_hists[set_id]) { + cf_dyn_buf_append_string(db, "ttl="); + linear_hist_get_info(ns->set_ttl_hists[set_id], db); + cf_dyn_buf_append_char(db, ';'); + } + else { + cf_dyn_buf_append_string(db, "hist-unavailable"); + } + } + else if (strcmp(hist_name, "objsz") == 0) { + if (ns->storage_type == AS_STORAGE_ENGINE_SSD) { + if (ns->set_obj_size_hists[set_id]) { + cf_dyn_buf_append_string(db, "objsz="); + 
linear_hist_get_info(ns->set_obj_size_hists[set_id], db); + cf_dyn_buf_append_char(db, ';'); + } + else { + cf_dyn_buf_append_string(db, "hist-unavailable"); + } + } + else { + cf_dyn_buf_append_string(db, "hist-not-applicable"); + } + } + else { + cf_dyn_buf_append_string(db, "error-unknown-hist-name"); + } + } + else { + cf_dyn_buf_append_string(db, "error-unknown-set-name"); + } +} + + +//========================================================== +// Local helpers. +// + +static void +append_set_props(as_set *p_set, cf_dyn_buf *db) +{ + // Statistics: + + cf_dyn_buf_append_string(db, "objects="); + cf_dyn_buf_append_uint64(db, cf_atomic64_get(p_set->n_objects)); + cf_dyn_buf_append_char(db, ':'); + + cf_dyn_buf_append_string(db, "tombstones="); + cf_dyn_buf_append_uint64(db, cf_atomic64_get(p_set->n_tombstones)); + cf_dyn_buf_append_char(db, ':'); + + cf_dyn_buf_append_string(db, "memory_data_bytes="); + cf_dyn_buf_append_uint64(db, cf_atomic64_get(p_set->n_bytes_memory)); + cf_dyn_buf_append_char(db, ':'); + + cf_dyn_buf_append_string(db, "truncate_lut="); + cf_dyn_buf_append_uint64(db, p_set->truncate_lut); + cf_dyn_buf_append_char(db, ':'); + + // Configuration: + + cf_dyn_buf_append_string(db, "stop-writes-count="); + cf_dyn_buf_append_uint64(db, cf_atomic64_get(p_set->stop_writes_count)); + cf_dyn_buf_append_char(db, ':'); + + cf_dyn_buf_append_string(db, "set-enable-xdr="); + + if (cf_atomic32_get(p_set->enable_xdr) == AS_SET_ENABLE_XDR_TRUE) { + cf_dyn_buf_append_string(db, "true"); + } + else if (cf_atomic32_get(p_set->enable_xdr) == AS_SET_ENABLE_XDR_FALSE) { + cf_dyn_buf_append_string(db, "false"); + } + else if (cf_atomic32_get(p_set->enable_xdr) == AS_SET_ENABLE_XDR_DEFAULT) { + cf_dyn_buf_append_string(db, "use-default"); + } + else { + cf_dyn_buf_append_uint32(db, cf_atomic32_get(p_set->enable_xdr)); + } + + cf_dyn_buf_append_char(db, ':'); + + cf_dyn_buf_append_string(db, "disable-eviction="); + cf_dyn_buf_append_bool(db, 
IS_SET_EVICTION_DISABLED(p_set)); + cf_dyn_buf_append_char(db, ';'); +} diff --git a/as/src/base/namespace_ce.c b/as/src/base/namespace_ce.c new file mode 100644 index 00000000..2400b30b --- /dev/null +++ b/as/src/base/namespace_ce.c @@ -0,0 +1,142 @@ +/* + * namespace_cold.c + * + * Copyright (C) 2014 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#include +#include + +#include "citrusleaf/alloc.h" + +#include "arenax.h" +#include "fault.h" +#include "vmapx.h" + +#include "base/cfg.h" +#include "base/datamodel.h" +#include "base/index.h" + +static bool +check_capacity(uint32_t capacity) +{ + uint8_t* test_index_stages[g_config.n_namespaces]; + uint8_t* test_data_blocks[g_config.n_namespaces]; + uint32_t i; + + for (i = 0; i < g_config.n_namespaces; i++) { + as_namespace *ns = g_config.namespaces[i]; + uint64_t stage_size = (uint64_t)as_index_size_get(ns) * capacity; + + if ((test_index_stages[i] = cf_try_malloc(stage_size)) == NULL) { + break; + } + + // Memory for overhead and data, proportional to (= to) stage size. 
+ if ((test_data_blocks[i] = cf_try_malloc(stage_size)) == NULL) { + cf_free(test_index_stages[i]); + break; + } + } + + for (uint32_t j = 0; j < i; j++) { + cf_free(test_index_stages[j]); + cf_free(test_data_blocks[j]); + } + + return i == g_config.n_namespaces; +} + +#define MIN_STAGE_CAPACITY (MAX_STAGE_CAPACITY / 8) +#define NS_MIN_MB (((sizeof(as_index) * MIN_STAGE_CAPACITY) * 2) / (1024 * 1024)) + +uint32_t +as_mem_check() +{ + uint32_t capacity; + + for (capacity = MAX_STAGE_CAPACITY; capacity >= MIN_STAGE_CAPACITY; capacity /= 2) { + if (check_capacity(capacity)) { + break; + } + } + + if (capacity < MIN_STAGE_CAPACITY) { + cf_crash_nostack(AS_NAMESPACE, "server requires at least %luMb of memory per namespace", NS_MIN_MB); + } + + if (capacity < MAX_STAGE_CAPACITY) { + cf_info(AS_NAMESPACE, "detected small memory profile - will size arena stages 1/%u max", MAX_STAGE_CAPACITY / capacity); + } + + return capacity; +} + +static void +setup_namespace(as_namespace* ns, uint32_t stage_capacity) +{ + ns->cold_start = true; + + cf_info(AS_NAMESPACE, "{%s} beginning cold start", ns->name); + + //-------------------------------------------- + // Set up the set name vmap. + // + + ns->p_sets_vmap = (cf_vmapx*)cf_malloc(cf_vmapx_sizeof(sizeof(as_set), AS_SET_MAX_COUNT)); + + cf_vmapx_init(ns->p_sets_vmap, sizeof(as_set), AS_SET_MAX_COUNT, 1024, AS_SET_NAME_MAX_SIZE); + + // Transfer configuration file information about sets. + if (! as_namespace_configure_sets(ns)) { + cf_crash(AS_NAMESPACE, "{%s} can't configure sets", ns->name); + } + + //-------------------------------------------- + // Set up the bin name vmap. + // + + if (! ns->single_bin) { + ns->p_bin_name_vmap = (cf_vmapx*)cf_malloc(cf_vmapx_sizeof(VMAP_BIN_NAME_MAX_SZ, MAX_BIN_NAMES)); + + cf_vmapx_init(ns->p_bin_name_vmap, VMAP_BIN_NAME_MAX_SZ, MAX_BIN_NAMES, 4096, VMAP_BIN_NAME_MAX_SZ); + } + + //-------------------------------------------- + // Set up the index arena. 
+ // + + ns->arena = (cf_arenax*)cf_malloc(cf_arenax_sizeof()); + + cf_arenax_init(ns->arena, 0, as_index_size_get(ns), stage_capacity, 0, CF_ARENAX_BIGLOCK); +} + +void +as_namespaces_setup(bool cold_start_cmd, uint32_t instance, uint32_t stage_capacity) +{ + for (uint32_t i = 0; i < g_config.n_namespaces; i++) { + setup_namespace(g_config.namespaces[i], stage_capacity); + } +} + +void +as_namespace_xmem_trusted(as_namespace *ns) +{ + // For enterprise version only. +} diff --git a/as/src/base/packet_compression.c b/as/src/base/packet_compression.c new file mode 100644 index 00000000..2eeb178f --- /dev/null +++ b/as/src/base/packet_compression.c @@ -0,0 +1,234 @@ +/* + * packet_compression.c + * + * Copyright (C) 2012-2014 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. 
If not, see http://www.gnu.org/licenses/ + */ + +#include +#include +#include + +#include "citrusleaf/alloc.h" + +#include "fault.h" + +#include "base/packet_compression.h" +#include "base/proto.h" + +#define STACK_BUF_SZ (1024 * 16) + +/** + * Function to decompress the given data + * Expected arguments + * @param type Type of compression + * @param length Length of buffer to be decompressed + * @param buf Pointer to buffer to be decompressed + * @param out_buf_len Length of buffer to hold decompressed data + * @param out_buf Pointer to buffer to hold decompressed data + * @return 0 if successful + */ +int +as_decompress(compression_type type, size_t buf_len, const uint8_t *buf, size_t *out_buf_len, uint8_t *out_buf) +{ + int ret_value = -1; + cf_debug(AS_COMPRESSION, "In as_decompress"); + switch (type) { + case COMPRESSION_ZLIB: { + // manual convert to match types just in case + uLongf converted_out_buf_len = *out_buf_len; + // zlib api to decompress the data + ret_value = uncompress(out_buf, &converted_out_buf_len, buf, (uLongf) buf_len); + *out_buf_len = converted_out_buf_len; + break; + } + default: + cf_warning(AS_COMPRESSION, "Unknown as_proto compression type: %d", type); + break; + } + cf_debug(AS_COMPRESSION, "Returned as_decompress : %d", ret_value); + return ret_value; +} + +/** + * Function to get back decompressed packet from PROTO_TYPE_AS_MSG_COMPRESSED packet + * Packet : Header - Original size of message - Compressed message + * @param buf Pointer to PROTO_TYPE_AS_MSG_COMPRESSED packet. - Input + * @param output_packet Pointer holding address of decompressed packet. 
- Output + */ +int +as_packet_decompression(uint8_t *buf, uint8_t **output_packet, size_t *output_packet_size) +{ + int ret_value = -1; + as_comp_proto *as_comp_protop = (as_comp_proto *) buf; + + cf_debug(AS_COMPRESSION, "In as_packet_decompression"); + + if (as_comp_protop->proto.type != PROTO_TYPE_AS_MSG_COMPRESSED) { + cf_warning(AS_COMPRESSION, "as_packet_decompression : Invalid input data : type received %d != PROTO_TYPE_AS_MSG_COMPRESSED (%d)", + as_comp_protop->proto.type, PROTO_TYPE_AS_MSG_COMPRESSED); + cf_warning(AS_COMPRESSION, "Returned as_packet_decompression : %d", ret_value); + return ret_value; + } + +#if 0 // enable this when byte swap also fixed on client side + as_comp_protop->org_sz = cf_swap_from_be64(as_comp_protop->org_sz); +#endif + size_t decompressed_as_packet_sz = as_comp_protop->org_sz; + // sanity check for client supplied size + if (decompressed_as_packet_sz > PROTO_SIZE_MAX) { + // the closest error for this case is "input data was corrupted or incomplete" + return Z_DATA_ERROR; + } + + size_t buf_sz = as_comp_protop->proto.sz - 8; + buf += sizeof(as_comp_proto); + uint8_t *decompressed_packet = cf_malloc(decompressed_as_packet_sz); + ret_value = as_decompress(COMPRESSION_ZLIB, buf_sz, buf, &decompressed_as_packet_sz, decompressed_packet); + if (ret_value) { + cf_free(decompressed_packet); + } else { + *output_packet = decompressed_packet; + if (output_packet_size) { + *output_packet_size = decompressed_as_packet_sz; + } + } + cf_debug(AS_COMPRESSION, "Returned as_packet_decompression : %d", ret_value); + return (ret_value); +} + +/* + * Function to compress the given data + * Expected arguments + * 1. Type of compression + * 1 for zlib + * 2. Length of buffer to be compressed - mandatory + * 3. Pointer to buffer to be compressed - mandatory + * 4. Length of buffer to hold compressed data - mandatory + * 5. Pointer to buffer to hold compressed data - mandatory + * 6. 
Compression level - Optional, default Z_DEFAULT_COMPRESSION + */ +int +as_compress(int argc, uint8_t *argv[]) +{ +#define MANDATORY_NO_ARGUMENTS 5 + int compression_type; + uint8_t *buf; + size_t *buf_len; + uint8_t *out_buf; + size_t *out_buf_len; + int compression_level; + int ret_value = 0; + + cf_debug(AS_COMPRESSION, "In as_compress"); + + if (argc < MANDATORY_NO_ARGUMENTS) + { + // Insufficient arguments + cf_debug(AS_COMPRESSION, "as_compress : In sufficient arguments\n"); + cf_debug(AS_COMPRESSION, "Returned as_compress : -1"); + return -1; + } + + compression_type = *argv[0]; + buf_len = (size_t *) argv[1]; + buf = argv[2]; + out_buf_len = (size_t *) argv[3]; + out_buf = argv[4]; + + compression_level = (argc > MANDATORY_NO_ARGUMENTS) ? (*argv[MANDATORY_NO_ARGUMENTS + 1]) : Z_DEFAULT_COMPRESSION; + + switch (compression_type) + { + case COMPRESSION_ZLIB: + // zlib api to compress the data + ret_value = compress2(out_buf, out_buf_len, buf, *buf_len, compression_level); + break; + } + cf_debug(AS_COMPRESSION, "Returned as_compress : %d", ret_value); + return ret_value; +} + +/* + * Function to create packet to send compressed data. + * Packet : Header - Original size of message - Compressed message. + * Input : buf - Pointer to data to be compressed. - Input + * buf_sz - Size of the data to be compressed. - Input + * compressed_packet : Pointer holding address of compressed packet. - Output + * compressed_as_packet_sz : Size of the compressed packet. - Output + */ +int +as_packet_compression(uint8_t *buf, size_t buf_sz, uint8_t **compressed_packet, size_t *compressed_as_packet_sz) +{ + uint8_t *tmp_buf; + uint8_t wr_stack_buf[STACK_BUF_SZ]; + uint8_t *wr_buf = wr_stack_buf; + size_t wr_buf_sz = sizeof(wr_stack_buf); + cf_debug(AS_COMPRESSION, "In as_packet_compression"); + + /* Compress the data using client API for compression. + * Expected arguments + * 1. Type of compression + * 1 for zlib + * 2. Length of buffer to be compressed - mandatory + * 3. 
Pointer to buffer to be compressed - mandatory + * 4. Length of buffer to hold compressed data - mandatory + * 5. Pointer to buffer to hold compressed data - mandatory + * 6. Compression level - Optional, default Z_DEFAULT_COMPRESSION + */ + uint8_t *argv[5]; + int argc = 5; + int compression_type = COMPRESSION_ZLIB; + argv[0] = (uint8_t *)&compression_type; + argv[1] = (uint8_t *)&buf_sz; + argv[2] = buf; + argv[3] = (uint8_t *)&wr_buf_sz; + argv[4] = wr_buf; + + if (as_compress(argc, argv)) + { + compressed_packet = NULL; + compressed_as_packet_sz = 0; + cf_debug(AS_COMPRESSION, "Returned as_packet_compression : -1"); + return -1; + } + + // Allocate buffer to hold new packet + *compressed_as_packet_sz = sizeof(as_comp_proto) + wr_buf_sz; + *compressed_packet = (uint8_t *) cf_calloc(*compressed_as_packet_sz, 1); + if(!*compressed_packet) + { + cf_debug(AS_COMPRESSION, "as_packet_compression : failed to allocte memory"); + cf_debug(AS_COMPRESSION, "Returned as_packet_compression : -1"); + return -1; + } + // Construct the packet for compressed data. + as_comp_proto *as_comp_protop = (as_comp_proto *) *compressed_packet; + as_comp_protop->proto.version = PROTO_VERSION; + as_comp_protop->proto.type = PROTO_TYPE_AS_MSG_COMPRESSED; + as_comp_protop->proto.sz = *compressed_as_packet_sz - 8; + as_proto *proto = (as_proto *) *compressed_packet; + as_proto_swap(proto); + as_comp_protop->org_sz = buf_sz; + + tmp_buf = *compressed_packet + sizeof(as_comp_proto); + memcpy(tmp_buf, wr_buf, wr_buf_sz); + + cf_debug(AS_COMPRESSION, "Returned as_packet_compression : 0"); + return 0; +} diff --git a/as/src/base/particle.c b/as/src/base/particle.c new file mode 100644 index 00000000..1411c836 --- /dev/null +++ b/as/src/base/particle.c @@ -0,0 +1,1016 @@ +/* + * particle.c + * + * Copyright (C) 2008-2015 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. 
+ * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + + +#include "base/particle.h" + +#include +#include +#include +#include + +#include "aerospike/as_buffer.h" +#include "aerospike/as_msgpack.h" +#include "aerospike/as_serializer.h" +#include "aerospike/as_val.h" +#include "citrusleaf/alloc.h" +#include "citrusleaf/cf_byte_order.h" + +#include "dynbuf.h" +#include "fault.h" + +#include "base/datamodel.h" +#include "base/proto.h" +#include "fabric/partition.h" +#include "storage/storage.h" + + +//========================================================== +// Typedefs & constants. +// + +extern const as_particle_vtable integer_vtable; +extern const as_particle_vtable float_vtable; +extern const as_particle_vtable string_vtable; +extern const as_particle_vtable blob_vtable; +extern const as_particle_vtable map_vtable; +extern const as_particle_vtable list_vtable; +extern const as_particle_vtable geojson_vtable; + +// Array of particle vtable pointers. 
+const as_particle_vtable *particle_vtable[] = { + [AS_PARTICLE_TYPE_NULL] = NULL, + [AS_PARTICLE_TYPE_INTEGER] = &integer_vtable, + [AS_PARTICLE_TYPE_FLOAT] = &float_vtable, + [AS_PARTICLE_TYPE_STRING] = &string_vtable, + [AS_PARTICLE_TYPE_BLOB] = &blob_vtable, + [AS_PARTICLE_TYPE_TIMESTAMP] = &integer_vtable, + [AS_PARTICLE_TYPE_JAVA_BLOB] = &blob_vtable, + [AS_PARTICLE_TYPE_CSHARP_BLOB] = &blob_vtable, + [AS_PARTICLE_TYPE_PYTHON_BLOB] = &blob_vtable, + [AS_PARTICLE_TYPE_RUBY_BLOB] = &blob_vtable, + [AS_PARTICLE_TYPE_PHP_BLOB] = &blob_vtable, + [AS_PARTICLE_TYPE_ERLANG_BLOB] = &blob_vtable, + [AS_PARTICLE_TYPE_MAP] = &map_vtable, + [AS_PARTICLE_TYPE_LIST] = &list_vtable, + [AS_PARTICLE_TYPE_GEOJSON] = &geojson_vtable +}; + + +//========================================================== +// Local utilities. +// + +// Particle type check. +static inline as_particle_type +safe_particle_type(uint8_t type) +{ + switch ((as_particle_type)type) { + case AS_PARTICLE_TYPE_INTEGER: + case AS_PARTICLE_TYPE_FLOAT: + case AS_PARTICLE_TYPE_STRING: + case AS_PARTICLE_TYPE_BLOB: + case AS_PARTICLE_TYPE_TIMESTAMP: + case AS_PARTICLE_TYPE_JAVA_BLOB: + case AS_PARTICLE_TYPE_CSHARP_BLOB: + case AS_PARTICLE_TYPE_PYTHON_BLOB: + case AS_PARTICLE_TYPE_RUBY_BLOB: + case AS_PARTICLE_TYPE_PHP_BLOB: + case AS_PARTICLE_TYPE_ERLANG_BLOB: + case AS_PARTICLE_TYPE_MAP: + case AS_PARTICLE_TYPE_LIST: + case AS_PARTICLE_TYPE_GEOJSON: + return (as_particle_type)type; + // Note - AS_PARTICLE_TYPE_NULL is considered bad here. + default: + cf_warning(AS_PARTICLE, "encountered bad particle type %u", type); + return AS_PARTICLE_TYPE_BAD; + } +} + + +//========================================================== +// Particle "class static" functions. 
+// + +as_particle_type +as_particle_type_from_asval(const as_val *val) +{ + as_val_t vtype = as_val_type(val); + + switch (vtype) { + case AS_UNDEF: // if val was null - handle quietly + case AS_NIL: + return AS_PARTICLE_TYPE_NULL; + case AS_BOOLEAN: + case AS_INTEGER: + return AS_PARTICLE_TYPE_INTEGER; + case AS_DOUBLE: + return AS_PARTICLE_TYPE_FLOAT; + case AS_STRING: + return AS_PARTICLE_TYPE_STRING; + case AS_BYTES: + return AS_PARTICLE_TYPE_BLOB; + case AS_GEOJSON: + return AS_PARTICLE_TYPE_GEOJSON; + case AS_LIST: + return AS_PARTICLE_TYPE_LIST; + case AS_MAP: + return AS_PARTICLE_TYPE_MAP; + case AS_REC: + case AS_PAIR: + default: + cf_warning(AS_PARTICLE, "no particle type for as_val_t %d", vtype); + return AS_PARTICLE_TYPE_NULL; + } +} + +as_particle_type +as_particle_type_from_msgpack(const uint8_t *packed, uint32_t packed_size) +{ + as_val_t vtype = as_unpack_buf_peek_type(packed, packed_size); + + switch (vtype) { + case AS_NIL: + return AS_PARTICLE_TYPE_NULL; + case AS_BOOLEAN: + case AS_INTEGER: + return AS_PARTICLE_TYPE_INTEGER; + case AS_DOUBLE: + return AS_PARTICLE_TYPE_FLOAT; + case AS_STRING: + return AS_PARTICLE_TYPE_STRING; + case AS_BYTES: + return AS_PARTICLE_TYPE_BLOB; + case AS_GEOJSON: + return AS_PARTICLE_TYPE_GEOJSON; + case AS_LIST: + return AS_PARTICLE_TYPE_LIST; + case AS_MAP: + return AS_PARTICLE_TYPE_MAP; + case AS_UNDEF: + case AS_REC: + case AS_PAIR: + default: + cf_warning(AS_PARTICLE, "encountered bad as_val_t %d", vtype); + return AS_PARTICLE_TYPE_BAD; + } +} + +uint32_t +as_particle_size_from_asval(const as_val *val) +{ + as_particle_type type = as_particle_type_from_asval(val); + + if (type == AS_PARTICLE_TYPE_NULL) { + // Currently UDF code just skips unmanageable as_val types. 
+ return 0; + } + + return particle_vtable[type]->size_from_asval_fn(val); +} + +uint32_t +as_particle_asval_client_value_size(const as_val *val) +{ + as_particle_type type = as_particle_type_from_asval(val); + + if (type == AS_PARTICLE_TYPE_NULL) { + // Currently UDF code just sends bin-op with NULL particle to client. + return 0; + } + + return particle_vtable[type]->asval_wire_size_fn(val); +} + +uint32_t +as_particle_asval_to_client(const as_val *val, as_msg_op *op) +{ + as_particle_type type = as_particle_type_from_asval(val); + + op->particle_type = type; + + if (type == AS_PARTICLE_TYPE_NULL) { + // Currently UDF code just sends bin-op with NULL particle to client. + return 0; + } + + uint8_t *value = (uint8_t *)op + sizeof(as_msg_op) + op->name_sz; + uint32_t added_size = particle_vtable[type]->asval_to_wire_fn(val, value); + + op->op_sz += added_size; + + return added_size; +} + + +//========================================================== +// as_bin particle functions. +// + +//------------------------------------------------ +// Destructor, etc. +// + +void +as_bin_particle_destroy(as_bin *b, bool free_particle) +{ + if (free_particle && as_bin_is_external_particle(b) && b->particle) { + particle_vtable[as_bin_get_particle_type(b)]->destructor_fn(b->particle); + } + + b->particle = NULL; +} + +uint32_t +as_bin_particle_size(as_bin *b) +{ + if (! as_bin_inuse(b)) { + // Single-bin will get here. + // TODO - clean up code paths so this doesn't happen? + return 0; + } + + return particle_vtable[as_bin_get_particle_type(b)]->size_fn(b->particle); +} + +//------------------------------------------------ +// Handle "wire" format. +// + +int +as_bin_particle_alloc_modify_from_client(as_bin *b, const as_msg_op *op) +{ + // This method does not destroy the existing particle, if any. We assume + // there is a copy of this bin (and particle reference) elsewhere, and that + // the copy will be responsible for the existing particle. 
Therefore it's + // important on failure to leave the existing particle intact. + + uint8_t operation = op->op; + as_particle_type op_type = safe_particle_type(op->particle_type); + + if (op_type == AS_PARTICLE_TYPE_BAD) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + uint32_t op_value_size = as_msg_op_get_value_sz(op); + uint8_t *op_value = as_msg_op_get_value_p((as_msg_op *)op); + + // Currently all operations become creates if there's no existing particle. + if (! as_bin_inuse(b)) { + // Memcache increment is weird - manipulate to create integer. + if (operation == AS_MSG_OP_MC_INCR) { + if (op_value_size != 2 * sizeof(uint64_t) || op_type != AS_PARTICLE_TYPE_BLOB) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + op_type = AS_PARTICLE_TYPE_INTEGER; + op_value_size = sizeof(uint64_t); + op_value += sizeof(uint64_t); + } + + int32_t mem_size = particle_vtable[op_type]->size_from_wire_fn(op_value, op_value_size); + + if (mem_size < 0) { + return (int)mem_size; + } + + as_particle *old_particle = b->particle; + + if (mem_size != 0) { + b->particle = cf_malloc_ns((size_t)mem_size); + } + + // Load the new particle into the bin. + int result = particle_vtable[op_type]->from_wire_fn(op_type, op_value, op_value_size, &b->particle); + + // Set the bin's iparticle metadata. + if (result == 0) { + as_bin_state_set_from_type(b, op_type); + } + else { + if (mem_size != 0) { + cf_free(b->particle); + } + + b->particle = old_particle; + } + + return result; + } + + // There is an existing particle, which we will modify. 
+ uint8_t existing_type = as_bin_get_particle_type(b); + int32_t new_mem_size = 0; + as_particle *new_particle = NULL; + + as_particle *old_particle = b->particle; + int result = 0; + + switch (operation) { + case AS_MSG_OP_MC_INCR: + if (op_value_size != 2 * sizeof(uint64_t) || op_type != AS_PARTICLE_TYPE_BLOB) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + op_type = AS_PARTICLE_TYPE_INTEGER; + // op_value_size of 16 will flag operation as memcache increment... + // no break + case AS_MSG_OP_INCR: + result = particle_vtable[existing_type]->incr_from_wire_fn(op_type, op_value, op_value_size, &b->particle); + break; + case AS_MSG_OP_MC_APPEND: + if (existing_type != AS_PARTICLE_TYPE_STRING) { + return -AS_PROTO_RESULT_FAIL_INCOMPATIBLE_TYPE; + } + // no break + case AS_MSG_OP_APPEND: + new_mem_size = particle_vtable[existing_type]->concat_size_from_wire_fn(op_type, op_value, op_value_size, &b->particle); + if (new_mem_size < 0) { + return new_mem_size; + } + new_particle = cf_malloc_ns((size_t)new_mem_size); + memcpy(new_particle, b->particle, particle_vtable[existing_type]->size_fn(b->particle)); + b->particle = new_particle; + result = particle_vtable[existing_type]->append_from_wire_fn(op_type, op_value, op_value_size, &b->particle); + break; + case AS_MSG_OP_MC_PREPEND: + if (existing_type != AS_PARTICLE_TYPE_STRING) { + return -AS_PROTO_RESULT_FAIL_INCOMPATIBLE_TYPE; + } + // no break + case AS_MSG_OP_PREPEND: + new_mem_size = particle_vtable[existing_type]->concat_size_from_wire_fn(op_type, op_value, op_value_size, &b->particle); + if (new_mem_size < 0) { + return new_mem_size; + } + new_particle = cf_malloc_ns((size_t)new_mem_size); + memcpy(new_particle, b->particle, particle_vtable[existing_type]->size_fn(b->particle)); + b->particle = new_particle; + result = particle_vtable[existing_type]->prepend_from_wire_fn(op_type, op_value, op_value_size, &b->particle); + break; + default: + // TODO - just crash? 
+ return -AS_PROTO_RESULT_FAIL_UNKNOWN; + } + + if (result < 0) { + if (new_mem_size != 0) { + cf_free(b->particle); + } + + b->particle = old_particle; + } + + return result; +} + +int +as_bin_particle_stack_modify_from_client(as_bin *b, cf_ll_buf *particles_llb, const as_msg_op *op) +{ + uint8_t operation = op->op; + as_particle_type op_type = safe_particle_type(op->particle_type); + + if (op_type == AS_PARTICLE_TYPE_BAD) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + uint32_t op_value_size = as_msg_op_get_value_sz(op); + uint8_t *op_value = as_msg_op_get_value_p((as_msg_op *)op); + + // Currently all operations become creates if there's no existing particle. + if (! as_bin_inuse(b)) { + // Memcache increment is weird - manipulate to create integer. + if (operation == AS_MSG_OP_MC_INCR) { + if (op_value_size != 2 * sizeof(uint64_t) || op_type != AS_PARTICLE_TYPE_BLOB) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + op_type = AS_PARTICLE_TYPE_INTEGER; + op_value_size = sizeof(uint64_t); + op_value += sizeof(uint64_t); + } + + int32_t mem_size = particle_vtable[op_type]->size_from_wire_fn(op_value, op_value_size); + + if (mem_size < 0) { + return (int)mem_size; + } + + as_particle *old_particle = b->particle; + + // Instead of allocating, we use the stack buffer provided. (Note that + // embedded types like integer will overwrite this with the value.) + cf_ll_buf_reserve(particles_llb, (size_t)mem_size, (uint8_t **)&b->particle); + + // Load the new particle into the bin. + int result = particle_vtable[op_type]->from_wire_fn(op_type, op_value, op_value_size, &b->particle); + + // Set the bin's iparticle metadata. + if (result == 0) { + as_bin_state_set_from_type(b, op_type); + } + else { + b->particle = old_particle; + } + + return result; + } + + // There is an existing particle, which we will modify. 
+ uint8_t existing_type = as_bin_get_particle_type(b); + int32_t new_mem_size = 0; + + as_particle *old_particle = b->particle; + int result = 0; + + switch (operation) { + case AS_MSG_OP_MC_INCR: + if (op_value_size != 2 * sizeof(uint64_t) || op_type != AS_PARTICLE_TYPE_BLOB) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + op_type = AS_PARTICLE_TYPE_INTEGER; + // op_value_size of 16 will flag operation as memcache increment... + // no break + case AS_MSG_OP_INCR: + result = particle_vtable[existing_type]->incr_from_wire_fn(op_type, op_value, op_value_size, &b->particle); + break; + case AS_MSG_OP_MC_APPEND: + if (existing_type != AS_PARTICLE_TYPE_STRING) { + return -AS_PROTO_RESULT_FAIL_INCOMPATIBLE_TYPE; + } + // no break + case AS_MSG_OP_APPEND: + new_mem_size = particle_vtable[existing_type]->concat_size_from_wire_fn(op_type, op_value, op_value_size, &b->particle); + if (new_mem_size < 0) { + return (int)new_mem_size; + } + cf_ll_buf_reserve(particles_llb, (size_t)new_mem_size, (uint8_t **)&b->particle); + memcpy(b->particle, old_particle, particle_vtable[existing_type]->size_fn(old_particle)); + result = particle_vtable[existing_type]->append_from_wire_fn(op_type, op_value, op_value_size, &b->particle); + break; + case AS_MSG_OP_MC_PREPEND: + if (existing_type != AS_PARTICLE_TYPE_STRING) { + return -AS_PROTO_RESULT_FAIL_INCOMPATIBLE_TYPE; + } + // no break + case AS_MSG_OP_PREPEND: + new_mem_size = particle_vtable[existing_type]->concat_size_from_wire_fn(op_type, op_value, op_value_size, &b->particle); + if (new_mem_size < 0) { + return (int)new_mem_size; + } + cf_ll_buf_reserve(particles_llb, (size_t)new_mem_size, (uint8_t **)&b->particle); + memcpy(b->particle, old_particle, particle_vtable[existing_type]->size_fn(old_particle)); + result = particle_vtable[existing_type]->prepend_from_wire_fn(op_type, op_value, op_value_size, &b->particle); + break; + default: + // TODO - just crash? 
+ return -AS_PROTO_RESULT_FAIL_UNKNOWN; + } + + if (result < 0) { + b->particle = old_particle; + } + + return result; +} + +int +as_bin_particle_alloc_from_client(as_bin *b, const as_msg_op *op) +{ + // This method does not destroy the existing particle, if any. We assume + // there is a copy of this bin (and particle reference) elsewhere, and that + // the copy will be responsible for the existing particle. Therefore it's + // important on failure to leave the existing particle intact. + + as_particle_type type = safe_particle_type(op->particle_type); + + if (type == AS_PARTICLE_TYPE_BAD) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + uint32_t value_size = as_msg_op_get_value_sz(op); + uint8_t *value = as_msg_op_get_value_p((as_msg_op *)op); + int32_t mem_size = particle_vtable[type]->size_from_wire_fn(value, value_size); + + if (mem_size < 0) { + return (int)mem_size; + } + + as_particle *old_particle = b->particle; + + if (mem_size != 0) { + b->particle = cf_malloc_ns((size_t)mem_size); + } + + // Load the new particle into the bin. + int result = particle_vtable[type]->from_wire_fn(type, value, value_size, &b->particle); + + // Set the bin's iparticle metadata. + if (result == 0) { + as_bin_state_set_from_type(b, type); + } + else { + if (mem_size != 0) { + cf_free(b->particle); + } + + b->particle = old_particle; + } + + return result; +} + +int +as_bin_particle_stack_from_client(as_bin *b, cf_ll_buf *particles_llb, const as_msg_op *op) +{ + // We assume that if we're using stack particles, the old particle is either + // nonexistent or also a stack particle - either way, don't destroy. 
+ + as_particle_type type = safe_particle_type(op->particle_type); + + if (type == AS_PARTICLE_TYPE_BAD) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + uint32_t value_size = as_msg_op_get_value_sz(op); + uint8_t *value = as_msg_op_get_value_p((as_msg_op *)op); + int32_t mem_size = particle_vtable[type]->size_from_wire_fn(value, value_size); + + if (mem_size < 0) { + return (int)mem_size; + } + + as_particle *old_particle = b->particle; + + // Instead of allocating, we use the stack buffer provided. (Note that + // embedded types like integer will overwrite this with the value.) + cf_ll_buf_reserve(particles_llb, (size_t)mem_size, (uint8_t **)&b->particle); + + // Load the new particle into the bin. + int result = particle_vtable[type]->from_wire_fn(type, value, value_size, &b->particle); + + // Set the bin's iparticle metadata. + if (result == 0) { + as_bin_state_set_from_type(b, type); + } + else { + b->particle = old_particle; + } + + return result; +} + +int +as_bin_particle_alloc_from_pickled(as_bin *b, const uint8_t **p_pickled, const uint8_t *end) +{ + // This method does not destroy the existing particle, if any. We assume + // there is a copy of this bin (and particle reference) elsewhere, and that + // the copy will be responsible for the existing particle. Therefore it's + // important on failure to leave the existing particle intact. + + const uint8_t *pickled = (const uint8_t *)*p_pickled; + + if (pickled + 1 + 4 > end) { + cf_warning(AS_PARTICLE, "incomplete pickled particle"); + return -AS_PROTO_RESULT_FAIL_UNKNOWN; + } + + as_particle_type type = safe_particle_type(*pickled++); + + if (type == AS_PARTICLE_TYPE_BAD) { + return -AS_PROTO_RESULT_FAIL_UNKNOWN; + } + + const uint32_t *p32 = (const uint32_t *)pickled; + uint32_t value_size = cf_swap_from_be32(*p32++); + const uint8_t *value = (const uint8_t *)p32; + + *p_pickled = value + value_size; + + // TODO - does this serve as a value_size sanity check? 
+ if (*p_pickled > end) { + cf_warning(AS_PARTICLE, "incomplete pickled particle"); + return -AS_PROTO_RESULT_FAIL_UNKNOWN; + } + + int32_t mem_size = particle_vtable[type]->size_from_wire_fn(value, value_size); + + if (mem_size < 0) { + return (int)mem_size; + } + + as_particle *old_particle = b->particle; + + if (mem_size != 0) { + b->particle = cf_malloc_ns((size_t)mem_size); + } + + // Load the new particle into the bin. + int result = particle_vtable[type]->from_wire_fn(type, value, value_size, &b->particle); + + if (result < 0) { + if (mem_size != 0) { + cf_free(b->particle); + } + + b->particle = old_particle; + return result; + } + + // Set the bin's iparticle metadata. + as_bin_state_set_from_type(b, type); + + return 0; +} + +int +as_bin_particle_stack_from_pickled(as_bin *b, cf_ll_buf *particles_llb, const uint8_t **p_pickled, const uint8_t *end) +{ + // We assume that if we're using stack particles, the old particle is either + // nonexistent or also a stack particle - either way, don't destroy. + + const uint8_t *pickled = (const uint8_t *)*p_pickled; + + if (pickled + 1 + 4 > end) { + cf_warning(AS_PARTICLE, "incomplete pickled particle"); + return -AS_PROTO_RESULT_FAIL_UNKNOWN; + } + + as_particle_type type = safe_particle_type(*pickled++); + + if (type == AS_PARTICLE_TYPE_BAD) { + return -AS_PROTO_RESULT_FAIL_UNKNOWN; + } + + const uint32_t *p32 = (const uint32_t *)pickled; + uint32_t value_size = cf_swap_from_be32(*p32++); + const uint8_t *value = (const uint8_t *)p32; + + *p_pickled = value + value_size; + + // TODO - does this serve as a value_size sanity check? + if (*p_pickled > end) { + cf_warning(AS_PARTICLE, "incomplete pickled particle"); + return -AS_PROTO_RESULT_FAIL_UNKNOWN; + } + + int32_t mem_size = particle_vtable[type]->size_from_wire_fn(value, value_size); + + if (mem_size < 0) { + // Leave existing particle intact. 
+ return (int)mem_size; + } + + as_particle *old_particle = b->particle; + + // Instead of allocating, we use the stack buffer provided. (Note that + // embedded types like integer will overwrite this with the value.) + cf_ll_buf_reserve(particles_llb, (size_t)mem_size, (uint8_t **)&b->particle); + + // Load the new particle into the bin. + int result = particle_vtable[type]->from_wire_fn(type, value, value_size, &b->particle); + + if (result < 0) { + b->particle = old_particle; + return result; + } + + // Set the bin's iparticle metadata. + as_bin_state_set_from_type(b, type); + + return 0; +} + +int +as_bin_particle_compare_from_pickled(const as_bin *b, uint8_t **p_pickled) +{ + if (! as_bin_inuse(b)) { + // TODO - just crash? + cf_warning(AS_PARTICLE, "comparing to unused bin"); + return -AS_PROTO_RESULT_FAIL_UNKNOWN; + } + + const uint8_t *pickled = (const uint8_t *)*p_pickled; + as_particle_type type = safe_particle_type(*pickled++); + const uint32_t *p32 = (const uint32_t *)pickled; + uint32_t value_size = cf_swap_from_be32(*p32++); + const uint8_t *value = (const uint8_t *)p32; + + *p_pickled = (uint8_t *)value + value_size; + + if (type == AS_PARTICLE_TYPE_BAD) { + return -AS_PROTO_RESULT_FAIL_UNKNOWN; + } + + return particle_vtable[as_bin_get_particle_type(b)]->compare_from_wire_fn(b->particle, type, value, value_size); +} + +uint32_t +as_bin_particle_client_value_size(const as_bin *b) +{ + if (! as_bin_inuse(b)) { + // UDF result bin (bin name "SUCCESS" or "FAILURE") will get here. + return 0; + } + + uint8_t type = as_bin_get_particle_type(b); + + return particle_vtable[type]->wire_size_fn(b->particle); +} + +uint32_t +as_bin_particle_to_client(const as_bin *b, as_msg_op *op) +{ + if (! (b && as_bin_inuse(b))) { + // UDF result bin (bin name "SUCCESS" or "FAILURE") will get here. + // Ordered ops that find no bin will get here. 
+ op->particle_type = AS_PARTICLE_TYPE_NULL; + return 0; + } + + uint8_t type = as_bin_get_particle_type(b); + + op->particle_type = type; + + uint8_t *value = (uint8_t *)op + sizeof(as_msg_op) + op->name_sz; + uint32_t added_size = particle_vtable[type]->to_wire_fn(b->particle, value); + + op->op_sz += added_size; + + return added_size; +} + +uint32_t +as_bin_particle_pickled_size(const as_bin *b) +{ + uint8_t type = as_bin_get_particle_type(b); + + // Always a type byte and a 32-bit size. + return 1 + 4 + particle_vtable[type]->wire_size_fn(b->particle); +} + +uint32_t +as_bin_particle_to_pickled(const as_bin *b, uint8_t *pickled) +{ + uint8_t type = as_bin_get_particle_type(b); + + *pickled++ = type; + + uint32_t *p_size = (uint32_t *)pickled; + uint8_t *value = (uint8_t *)(p_size + 1); + uint32_t size = particle_vtable[type]->to_wire_fn(b->particle, value); + + *p_size = cf_swap_to_be32(size); + + return 1 + 4 + size; +} + +//------------------------------------------------ +// Handle as_val translation. +// + +int +as_bin_particle_replace_from_asval(as_bin *b, const as_val *val) +{ + uint8_t old_type = as_bin_get_particle_type(b); + as_particle_type new_type = as_particle_type_from_asval(val); + + if (new_type == AS_PARTICLE_TYPE_NULL) { + // Currently UDF code just skips unmanageable as_val types. + return 0; + } + + uint32_t new_mem_size = particle_vtable[new_type]->size_from_asval_fn(val); + // TODO - could this ever fail? + + as_particle *old_particle = b->particle; + + if (new_mem_size != 0) { + b->particle = cf_malloc_ns(new_mem_size); + } + + // Load the new particle into the bin. + particle_vtable[new_type]->from_asval_fn(val, &b->particle); + // TODO - could this ever fail? + + if (as_bin_inuse(b)) { + // Destroy the old particle. + particle_vtable[old_type]->destructor_fn(old_particle); + } + + // Set the bin's iparticle metadata. 
+ as_bin_state_set_from_type(b, new_type); + + return 0; +} + +void +as_bin_particle_stack_from_asval(as_bin *b, uint8_t* stack, const as_val *val) +{ + // We assume that if we're using stack particles, the old particle is either + // nonexistent or also a stack particle - either way, don't destroy. + + as_particle_type type = as_particle_type_from_asval(val); + + if (type == AS_PARTICLE_TYPE_NULL) { + // Currently UDF code just skips unmanageable as_val types. + return; + } + + // Instead of allocating, we use the stack buffer provided. (Note that + // embedded types like integer will overwrite this with the value.) + b->particle = (as_particle *)stack; + + // Load the new particle into the bin. + particle_vtable[type]->from_asval_fn(val, &b->particle); + // TODO - could this ever fail? + + // Set the bin's iparticle metadata. + as_bin_state_set_from_type(b, type); + + // TODO - we don't bother returning size written, since nothing yet needs + // it and it's very expensive for CDTs to do an extra size_from_asval_fn() + // call. Perhaps we could have from_asval_fn() return the size if needed? +} + +as_val * +as_bin_particle_to_asval(const as_bin *b) +{ + uint8_t type = as_bin_get_particle_type(b); + + // Caller is responsible for freeing as_val returned here. + return particle_vtable[type]->to_asval_fn(b->particle); +} + +//------------------------------------------------ +// Handle msgpack translation. +// + +int +as_bin_particle_alloc_from_msgpack(as_bin *b, const uint8_t *packed, uint32_t packed_size) +{ + // We assume the bin is empty. 
+ + as_particle_type type = as_particle_type_from_msgpack(packed, packed_size); + + if (type == AS_PARTICLE_TYPE_BAD) { + return -AS_PROTO_RESULT_FAIL_UNKNOWN; + } + + if (type == AS_PARTICLE_TYPE_NULL) { + return AS_PROTO_RESULT_OK; + } + + uint32_t mem_size = particle_vtable[type]->size_from_msgpack_fn(packed, packed_size); + + if (mem_size != 0) { + b->particle = cf_malloc(mem_size); // response, so not cf_malloc_ns() + } + + particle_vtable[type]->from_msgpack_fn(packed, packed_size, &b->particle); + + // Set the bin's iparticle metadata. + as_bin_state_set_from_type(b, type); + + return AS_PROTO_RESULT_OK; +} + +//------------------------------------------------ +// Handle on-device "flat" format. +// + +// TODO - re-do to leave original intact on failure. +int +as_bin_particle_cast_from_flat(as_bin *b, uint8_t *flat, uint32_t flat_size) +{ + if (as_bin_inuse(b)) { + // TODO - just crash? + cf_warning(AS_PARTICLE, "cast from flat into used bin"); + return -1; + } + + as_particle_type type = safe_particle_type(*flat); + + if (type == AS_PARTICLE_TYPE_BAD) { + return -1; + } + + // Cast the new particle into the bin. + int result = particle_vtable[type]->cast_from_flat_fn(flat, flat_size, &b->particle); + + // Set the bin's iparticle metadata. + if (result == 0) { + as_bin_state_set_from_type(b, type); + } + else { + as_bin_set_empty(b); + } + + return result; +} + +// TODO - re-do to leave original intact on failure. +int +as_bin_particle_replace_from_flat(as_bin *b, const uint8_t *flat, uint32_t flat_size) +{ + uint8_t old_type = as_bin_get_particle_type(b); + as_particle_type new_type = safe_particle_type(*flat); + + if (new_type == AS_PARTICLE_TYPE_BAD) { + return -1; + } + + // Just destroy the old particle, if any - we're replacing it. + if (as_bin_inuse(b)) { + particle_vtable[old_type]->destructor_fn(b->particle); + } + + // Load the new particle into the bin. 
+ int result = particle_vtable[new_type]->from_flat_fn(flat, flat_size, &b->particle); + + // Set the bin's iparticle metadata. + if (result == 0) { + as_bin_state_set_from_type(b, new_type); + } + else { + as_bin_set_empty(b); + } + + return result; +} + +uint32_t +as_bin_particle_flat_size(as_bin *b) +{ + if (! as_bin_inuse(b)) { + // TODO - just crash? + cf_warning(AS_PARTICLE, "flat sizing unused bin"); + return 0; + } + + uint8_t type = as_bin_get_particle_type(b); + + return particle_vtable[type]->flat_size_fn(b->particle); +} + +uint32_t +as_bin_particle_to_flat(const as_bin *b, uint8_t *flat) +{ + if (! as_bin_inuse(b)) { + // TODO - just crash? + cf_warning(AS_PARTICLE, "flattening unused bin"); + return 0; + } + + uint8_t type = as_bin_get_particle_type(b); + + *flat = type; + + return particle_vtable[type]->to_flat_fn(b->particle, flat); +} + + +//========================================================== +// as_bin particle functions specific to CDTs. +// + +//------------------------------------------------ +// Handle "wire" format. +// + +int +as_bin_cdt_read_from_client(const as_bin *b, as_msg_op *op, as_bin *result) +{ + return as_bin_cdt_packed_read(b, op, result); +} + +int +as_bin_cdt_alloc_modify_from_client(as_bin *b, as_msg_op *op, as_bin *result) +{ + return as_bin_cdt_packed_modify(b, op, result, NULL); +} + +int +as_bin_cdt_stack_modify_from_client(as_bin *b, cf_ll_buf *particles_llb, as_msg_op *op, as_bin *result) +{ + return as_bin_cdt_packed_modify(b, op, result, particles_llb); +} diff --git a/as/src/base/particle_blob.c b/as/src/base/particle_blob.c new file mode 100644 index 00000000..0c8ec98f --- /dev/null +++ b/as/src/base/particle_blob.c @@ -0,0 +1,432 @@ +/* + * particle_blob.c + * + * Copyright (C) 2015 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. 
// Function table for BLOB particles.
//
// The initializers are positional, so their order has to follow the member
// order of as_particle_vtable (declared outside this file - presumably in
// base/particle.h; confirm before reordering anything here). The groups below
// mirror the section headings used for the function definitions in this file.
const as_particle_vtable blob_vtable = {
		// Destructor, etc.
		blob_destruct,
		blob_size,

		// Handle "wire" format.
		blob_concat_size_from_wire,
		blob_append_from_wire,
		blob_prepend_from_wire,
		blob_incr_from_wire,
		blob_size_from_wire,
		blob_from_wire,
		blob_compare_from_wire,
		blob_wire_size,
		blob_to_wire,

		// Handle as_val translation.
		blob_size_from_asval,
		blob_from_asval,
		blob_to_asval,
		blob_asval_wire_size,
		blob_asval_to_wire,

		// Handle msgpack translation.
		blob_size_from_msgpack,
		blob_from_msgpack,

		// Handle on-device "flat" format.
		blob_size_from_flat,
		blob_cast_from_flat,
		blob_from_flat,
		blob_flat_size,
		blob_to_flat
};
+// + +typedef struct blob_mem_s { + uint8_t type; + uint32_t sz; + uint8_t data[]; +} __attribute__ ((__packed__)) blob_mem; + +typedef struct blob_flat_s { + uint8_t type; + uint32_t size; // host order on device + uint8_t data[]; +} __attribute__ ((__packed__)) blob_flat; + + +//========================================================== +// Forward declarations. +// + +static inline as_particle_type blob_bytes_type_to_particle_type(as_bytes_type type); + + +//========================================================== +// BLOB particle interface - function definitions. +// + +//------------------------------------------------ +// Destructor, etc. +// + +void +blob_destruct(as_particle *p) +{ + cf_free(p); +} + +uint32_t +blob_size(const as_particle *p) +{ + return (uint32_t)(sizeof(blob_mem) + ((blob_mem *)p)->sz); +} + +//------------------------------------------------ +// Handle "wire" format. +// + +int32_t +blob_concat_size_from_wire(as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp) +{ + blob_mem *p_blob_mem = (blob_mem *)*pp; + + if (wire_type != p_blob_mem->type) { + cf_warning(AS_PARTICLE, "type mismatch concat sizing blob/string, %d:%d", p_blob_mem->type, wire_type); + return -AS_PROTO_RESULT_FAIL_INCOMPATIBLE_TYPE; + } + + return (int32_t)(sizeof(blob_mem) + p_blob_mem->sz + value_size); +} + +int +blob_append_from_wire(as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp) +{ + blob_mem *p_blob_mem = (blob_mem *)*pp; + + if (wire_type != p_blob_mem->type) { + cf_warning(AS_PARTICLE, "type mismatch appending to blob/string, %d:%d", p_blob_mem->type, wire_type); + return -AS_PROTO_RESULT_FAIL_INCOMPATIBLE_TYPE; + } + + memcpy(p_blob_mem->data + p_blob_mem->sz, wire_value, value_size); + p_blob_mem->sz += value_size; + + return 0; +} + +int +blob_prepend_from_wire(as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp) +{ + blob_mem 
*p_blob_mem = (blob_mem *)*pp; + + if (wire_type != p_blob_mem->type) { + cf_warning(AS_PARTICLE, "type mismatch prepending to blob/string, %d:%d", p_blob_mem->type, wire_type); + return -AS_PROTO_RESULT_FAIL_INCOMPATIBLE_TYPE; + } + + memmove(p_blob_mem->data + value_size, p_blob_mem->data, p_blob_mem->sz); + memcpy(p_blob_mem->data, wire_value, value_size); + p_blob_mem->sz += value_size; + + return 0; +} + +int +blob_incr_from_wire(as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp) +{ + cf_warning(AS_PARTICLE, "unexpected increment of blob/string"); + return -AS_PROTO_RESULT_FAIL_INCOMPATIBLE_TYPE; +} + +int32_t +blob_size_from_wire(const uint8_t *wire_value, uint32_t value_size) +{ + // Wire value is same as in-memory value. + return (int32_t)(sizeof(blob_mem) + value_size); +} + +int +blob_from_wire(as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp) +{ + blob_mem *p_blob_mem = (blob_mem *)*pp; + + p_blob_mem->type = wire_type; + p_blob_mem->sz = value_size; + memcpy(p_blob_mem->data, wire_value, p_blob_mem->sz); + + return 0; +} + +int +blob_compare_from_wire(const as_particle *p, as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size) +{ + blob_mem *p_blob_mem = (blob_mem *)p; + + return (wire_type == p_blob_mem->type && + value_size == p_blob_mem->sz && + memcmp(wire_value, p_blob_mem->data, value_size) == 0) ? 0 : 1; +} + +uint32_t +blob_wire_size(const as_particle *p) +{ + blob_mem *p_blob_mem = (blob_mem *)p; + + return p_blob_mem->sz; +} + +uint32_t +blob_to_wire(const as_particle *p, uint8_t *wire) +{ + blob_mem *p_blob_mem = (blob_mem *)p; + + memcpy(wire, p_blob_mem->data, p_blob_mem->sz); + + return p_blob_mem->sz; +} + +//------------------------------------------------ +// Handle as_val translation. 
+// + +uint32_t +blob_size_from_asval(const as_val *val) +{ + return (uint32_t)sizeof(blob_mem) + as_bytes_size(as_bytes_fromval(val)); +} + +void +blob_from_asval(const as_val *val, as_particle **pp) +{ + blob_mem *p_blob_mem = (blob_mem *)*pp; + + as_bytes *bytes = as_bytes_fromval(val); + + p_blob_mem->type = (uint8_t)blob_bytes_type_to_particle_type(bytes->type); + p_blob_mem->sz = as_bytes_size(bytes); + memcpy(p_blob_mem->data, as_bytes_get(bytes), p_blob_mem->sz); +} + +as_val * +blob_to_asval(const as_particle *p) +{ + blob_mem *p_blob_mem = (blob_mem *)p; + + uint8_t *value = cf_malloc(p_blob_mem->sz); + + memcpy(value, p_blob_mem->data, p_blob_mem->sz); + + return (as_val *)as_bytes_new_wrap(value, p_blob_mem->sz, true); +} + +uint32_t +blob_asval_wire_size(const as_val *val) +{ + return as_bytes_size(as_bytes_fromval(val)); +} + +uint32_t +blob_asval_to_wire(const as_val *val, uint8_t *wire) +{ + as_bytes *bytes = as_bytes_fromval(val); + uint32_t size = as_bytes_size(bytes); + + memcpy(wire, as_bytes_get(bytes), size); + + return size; +} + +//------------------------------------------------ +// Handle msgpack translation. +// + +uint32_t +blob_size_from_msgpack(const uint8_t *packed, uint32_t packed_size) +{ + // Ok to oversize by a few bytes - only used for allocation sizing. + // -1 for blob internal type and -1 for blob header. + return (uint32_t)sizeof(blob_mem) + packed_size - 2; +} + +void +blob_from_msgpack(const uint8_t *packed, uint32_t packed_size, as_particle **pp) +{ + as_unpacker pk = { + .buffer = packed, + .offset = 0, + .length = packed_size + }; + + int64_t blob_size = as_unpack_blob_size(&pk); + const uint8_t *ptr = pk.buffer + pk.offset; + + uint8_t type = *ptr; + + // Adjust for type (1 byte). 
+ ptr++; + blob_size--; + + blob_mem *p_blob_mem = (blob_mem *)*pp; + + p_blob_mem->type = (uint8_t)blob_bytes_type_to_particle_type((as_bytes_type)type); + p_blob_mem->sz = blob_size; + memcpy(p_blob_mem->data, ptr, p_blob_mem->sz); +} + +//------------------------------------------------ +// Handle on-device "flat" format. +// + +int32_t +blob_size_from_flat(const uint8_t *flat, uint32_t flat_size) +{ + blob_flat *p_blob_flat = (blob_flat *)flat; + // Assume type is correct, since we got here. + + // Sanity check length. + if (p_blob_flat->size != flat_size - sizeof(blob_flat)) { + cf_warning(AS_PARTICLE, "unexpected flat blob/string: flat size %u, len %u", + flat_size, p_blob_flat->size); + return -AS_PROTO_RESULT_FAIL_UNKNOWN; + } + + // Flat value is same as in-memory value. + return (int32_t)(sizeof(blob_mem) + p_blob_flat->size); +} + +int +blob_cast_from_flat(uint8_t *flat, uint32_t flat_size, as_particle **pp) +{ + // Sizing is only a sanity check. + int32_t mem_size = blob_size_from_flat(flat, flat_size); + + if (mem_size < 0) { + return mem_size; + } + + // We can do this only because the flat and in-memory formats are identical. 
+ *pp = (as_particle *)flat; + + return 0; +} + +int +blob_from_flat(const uint8_t *flat, uint32_t flat_size, as_particle **pp) +{ + int32_t mem_size = blob_size_from_flat(flat, flat_size); + + if (mem_size < 0) { + return mem_size; + } + + blob_mem *p_blob_mem = (blob_mem *)cf_malloc_ns((size_t)mem_size); + const blob_flat *p_blob_flat = (const blob_flat *)flat; + + p_blob_mem->type = p_blob_flat->type; + p_blob_mem->sz = p_blob_flat->size; + memcpy(p_blob_mem->data, p_blob_flat->data, p_blob_mem->sz); + + *pp = (as_particle *)p_blob_mem; + + return 0; +} + +uint32_t +blob_flat_size(const as_particle *p) +{ + return (uint32_t)(sizeof(blob_flat) + ((blob_mem *)p)->sz); +} + +uint32_t +blob_to_flat(const as_particle *p, uint8_t *flat) +{ + blob_mem *p_blob_mem = (blob_mem *)p; + blob_flat *p_blob_flat = (blob_flat *)flat; + + // Already wrote the type. + p_blob_flat->size = p_blob_mem->sz; + memcpy(p_blob_flat->data, p_blob_mem->data, p_blob_flat->size); + + return blob_flat_size(p); +} + + +//========================================================== +// Local helpers. +// + +static inline as_particle_type +blob_bytes_type_to_particle_type(as_bytes_type type) +{ + switch (type) { + case AS_BYTES_STRING: + return AS_PARTICLE_TYPE_STRING; + case AS_BYTES_BLOB: + return AS_PARTICLE_TYPE_BLOB; + case AS_BYTES_JAVA: + return AS_PARTICLE_TYPE_JAVA_BLOB; + case AS_BYTES_CSHARP: + return AS_PARTICLE_TYPE_CSHARP_BLOB; + case AS_BYTES_PYTHON: + return AS_PARTICLE_TYPE_PYTHON_BLOB; + case AS_BYTES_RUBY: + return AS_PARTICLE_TYPE_RUBY_BLOB; + case AS_BYTES_PHP: + return AS_PARTICLE_TYPE_PHP_BLOB; + case AS_BYTES_ERLANG: + return AS_PARTICLE_TYPE_ERLANG_BLOB; + case AS_BYTES_GEOJSON: + return AS_PARTICLE_TYPE_GEOJSON; + case AS_BYTES_INTEGER: + case AS_BYTES_DOUBLE: + case AS_BYTES_MAP: + case AS_BYTES_LIST: + case AS_BYTES_UNDEF: + default: + break; + } + + // Invalid blob types remain as blobs. 
+ return AS_PARTICLE_TYPE_BLOB; +} diff --git a/as/src/base/particle_float.c b/as/src/base/particle_float.c new file mode 100644 index 00000000..203bb3db --- /dev/null +++ b/as/src/base/particle_float.c @@ -0,0 +1,200 @@ +/* + * particle_float.c + * + * Copyright (C) 2015 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + + +#include +#include + +#include "aerospike/as_double.h" +#include "aerospike/as_msgpack.h" +#include "aerospike/as_val.h" +#include "citrusleaf/cf_byte_order.h" + +#include "fault.h" + +#include "base/datamodel.h" +#include "base/particle.h" +#include "base/particle_integer.h" +#include "base/proto.h" + + +//========================================================== +// FLOAT particle interface - function declarations. +// + +// Most FLOAT particle table functions just use the equivalent INTEGER particle +// functions. Here are the differences... + +// Handle "wire" format. 
// Function table for FLOAT particles.
//
// Most slots reuse the INTEGER particle functions; only the float_* entries
// below are specific to FLOAT. The initializers are positional, so their order
// has to follow the member order of as_particle_vtable (declared outside this
// file - confirm before reordering).
const as_particle_vtable float_vtable = {
		// Destructor, etc.
		integer_destruct,
		integer_size,

		// Handle "wire" format.
		integer_concat_size_from_wire,
		integer_append_from_wire,
		integer_prepend_from_wire,
		float_incr_from_wire,
		integer_size_from_wire,
		float_from_wire,
		float_compare_from_wire,
		integer_wire_size,
		integer_to_wire,

		// Handle as_val translation.
		integer_size_from_asval,
		float_from_asval,
		float_to_asval,
		integer_asval_wire_size,
		float_asval_to_wire,

		// Handle msgpack translation.
		integer_size_from_msgpack,
		float_from_msgpack,

		// Handle on-device "flat" format.
		integer_size_from_flat,
		integer_cast_from_flat,
		integer_from_flat,
		integer_flat_size,
		integer_to_flat
};
+ if (wire_type != AS_PARTICLE_TYPE_FLOAT) { + cf_warning(AS_PARTICLE, "increment with non float type %u", wire_type); + return -AS_PROTO_RESULT_FAIL_INCOMPATIBLE_TYPE; + } + + uint64_t i; + + switch (value_size) { + case 8: + i = cf_swap_from_be64(*(uint64_t *)wire_value); + break; + default: + cf_warning(AS_PARTICLE, "unexpected value size %u", value_size); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + (*(double *)pp) += *(double *)&i; + + return 0; +} + +int +float_from_wire(as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp) +{ + if (value_size != 8) { + cf_warning(AS_PARTICLE, "unexpected value size %u", value_size); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + return integer_from_wire(wire_type, wire_value, value_size, pp); +} + +int +float_compare_from_wire(const as_particle *p, as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size) +{ + if (wire_type != AS_PARTICLE_TYPE_FLOAT) { + return 1; + } + + if (value_size != 8) { + return -AS_PROTO_RESULT_FAIL_UNKNOWN; + } + + return integer_compare_from_wire(p, AS_PARTICLE_TYPE_INTEGER, wire_value, value_size); +} + +//------------------------------------------------ +// Handle as_val translation. +// + +void +float_from_asval(const as_val *val, as_particle **pp) +{ + *(double *)pp = as_double_get(as_double_fromval(val)); +} + +as_val * +float_to_asval(const as_particle *p) +{ + return (as_val *)as_double_new(*(double *)&p); +} + +uint32_t +float_asval_to_wire(const as_val *val, uint8_t *wire) +{ + double x = as_double_get(as_double_fromval(val)); + + *(uint64_t *)wire = cf_swap_to_be64(*(uint64_t *)&x); + + return (uint32_t)sizeof(uint64_t); +} + +//------------------------------------------------ +// Handle msgpack translation. 
+// + +void +float_from_msgpack(const uint8_t *packed, uint32_t packed_size, as_particle **pp) +{ + double x; + as_unpacker pk = { + .buffer = packed, + .offset = 0, + .length = packed_size + }; + + as_unpack_double(&pk, &x); + + *(double *)pp = x; +} diff --git a/as/src/base/particle_geojson.c b/as/src/base/particle_geojson.c new file mode 100644 index 00000000..72cd7b87 --- /dev/null +++ b/as/src/base/particle_geojson.c @@ -0,0 +1,600 @@ +/* + * particle_geojson.c + * + * Copyright (C) 2015 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + + +#include +#include +#include + +#include "aerospike/as_geojson.h" +#include "aerospike/as_msgpack.h" +#include "aerospike/as_val.h" +#include "citrusleaf/alloc.h" +#include "citrusleaf/cf_byte_order.h" + +#include "fault.h" + +#include "base/datamodel.h" +#include "base/particle.h" +#include "base/particle_blob.h" +#include "base/proto.h" +#include "geospatial/geospatial.h" + + +//========================================================== +// GEOJSON particle interface - function declarations. +// + +// Most GEOJSON particle table functions just use the equivalent BLOB particle +// functions. Here are the differences... + +// Handle "wire" format. 
+int32_t geojson_concat_size_from_wire(as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp); +int geojson_append_from_wire(as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp); +int geojson_prepend_from_wire(as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp); +int geojson_incr_from_wire(as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp); +int32_t geojson_size_from_wire(const uint8_t *wire_value, uint32_t value_size); +int geojson_from_wire(as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp); +uint32_t geojson_to_wire(const as_particle *p, uint8_t *wire); + +// Handle as_val translation. +uint32_t geojson_size_from_asval(const as_val *val); +void geojson_from_asval(const as_val *val, as_particle **pp); +as_val *geojson_to_asval(const as_particle *p); +uint32_t geojson_asval_wire_size(const as_val *val); +uint32_t geojson_asval_to_wire(const as_val *val, uint8_t *wire); + +// Handle msgpack translation. +uint32_t geojson_size_from_msgpack(const uint8_t *packed, uint32_t packed_size); +void geojson_from_msgpack(const uint8_t *packed, uint32_t packed_size, as_particle **pp); + + +//========================================================== +// GEOJSON particle interface - vtable. 
// In-memory GEOJSON particle.
//
// The first two members must line up with blob_mem, because the BLOB functions
// are applied directly to GEOJSON particles (see geojson_vtable).
//
// 'sz' counts everything after the blob header: flags (1 byte) + ncells
// (2 bytes) + the cell array + the JSON text. geojson_from_wire() sets it this
// way and as_geojson_mem_jsonstr() recovers the JSON length by subtracting the
// same terms.
//
// 'data' holds ncells 64-bit cell ids followed immediately by the JSON text.
// No terminating NUL is stored - geojson_to_asval() adds one to its own copy.
typedef struct geojson_mem_s {
	uint8_t		type;	// IMPORTANT: overlay blob_mem!
	uint32_t	sz;		// IMPORTANT: overlay blob_mem!
	uint8_t		flags;	// bit-field, see GEOJSON_ISREGION
	uint16_t	ncells;	// number of uint64_t cell ids at start of data
	uint8_t		data[];	// (ncells * uint64_t) + jsonstr
} __attribute__ ((__packed__)) geojson_mem;
+ +//------------------------------------------------ +// Handle "wire" format. +// + +int32_t +geojson_concat_size_from_wire(as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp) +{ + cf_warning(AS_PARTICLE, "invalid operation on geojson particle"); + return -1; +} + +int32_t +geojson_append_from_wire(as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp) +{ + cf_warning(AS_PARTICLE, "invalid operation on geojson particle"); + return -1; +} + +int32_t +geojson_prepend_from_wire(as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp) +{ + cf_warning(AS_PARTICLE, "invalid operation on geojson particle"); + return -1; +} + +int32_t +geojson_incr_from_wire(as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp) +{ + cf_warning(AS_PARTICLE, "invalid operation on geojson particle"); + return -1; +} + +int32_t +geojson_size_from_wire(const uint8_t *wire_value, uint32_t value_size) +{ + // NOTE - Unfortunately we would need to run the JSON parser and region + // coverer to find out exactly how many cells we need to allocate for this + // particle. + // + // For now we always allocate the maximum number of cells (MAX_REGION_CELLS) + // for the in-memory particle. + // + // For now also ignore any incoming cells entirely. 
+ + uint8_t const *incp = (uint8_t const *)wire_value + 1; + uint16_t incells = cf_swap_from_be16(*(uint16_t const *)incp); + size_t incellsz = incells * sizeof(uint64_t); + size_t injsonsz = value_size - sizeof(uint8_t) - sizeof(uint16_t) - incellsz; + + return (int32_t)(sizeof(geojson_mem) + (MAX_REGION_CELLS * sizeof(uint64_t)) + injsonsz); +} + +int +geojson_from_wire(as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp) +{ + uint8_t const *incp = (uint8_t const *)wire_value + 1; + uint16_t incells = cf_swap_from_be16(*(uint16_t const *)incp); + size_t incellsz = incells * sizeof(uint64_t); + char const *injsonptr = (char const *)incp + sizeof(uint16_t) + incellsz; + size_t injsonsz = value_size - sizeof(uint8_t) - sizeof(uint16_t) - incellsz; + + // We ignore any incoming cells entirely. + + uint64_t cellid = 0; + geo_region_t region = NULL; + + if (! geo_parse(NULL, injsonptr, injsonsz, &cellid, ®ion)) { + cf_warning(AS_PARTICLE, "geo_parse failed"); + return -AS_PROTO_RESULT_FAIL_GEO_INVALID_GEOJSON; + } + + if (cellid && region) { + geo_region_destroy(region); + cf_warning(AS_PARTICLE, "geo_parse found both point and region"); + return -AS_PROTO_RESULT_FAIL_GEO_INVALID_GEOJSON; + } + + if (! cellid && ! region) { + cf_warning(AS_PARTICLE, "geo_parse found neither point nor region"); + return -AS_PROTO_RESULT_FAIL_GEO_INVALID_GEOJSON; + } + + geojson_mem *p_geojson_mem = (geojson_mem *)*pp; + + p_geojson_mem->type = wire_type; + + // We'll come back and set the size at the end. + uint64_t *p_outcells = (uint64_t *)p_geojson_mem->data; + + p_geojson_mem->flags = 0; + + if (cellid) { + // POINT + p_geojson_mem->flags &= ~GEOJSON_ISREGION; + p_geojson_mem->ncells = 1; + p_outcells[0] = cellid; + } + else { + // REGION + p_geojson_mem->flags |= GEOJSON_ISREGION; + + int numcells; + + if (! 
geo_region_cover(NULL, region, MAX_REGION_CELLS, p_outcells, NULL, NULL, &numcells)) { + geo_region_destroy(region); + cf_warning(AS_PARTICLE, "geo_region_cover failed"); + return -AS_PROTO_RESULT_FAIL_GEO_INVALID_GEOJSON; + } + + p_geojson_mem->ncells = numcells; + } + + if (region) { + geo_region_destroy(region); + } + + // Copy the JSON into place. + char *p_outjson = (char *)&p_outcells[p_geojson_mem->ncells]; + + memcpy(p_outjson, injsonptr, injsonsz); + + // Set the actual size; we will waste some space at the end of the allocated + // particle. + p_geojson_mem->sz = sizeof(uint8_t) + sizeof(uint16_t) + (p_geojson_mem->ncells * sizeof(uint64_t)) + injsonsz; + + return AS_PROTO_RESULT_OK; +} + +uint32_t +geojson_to_wire(const as_particle *p, uint8_t *wire) +{ + // Use blob routine first. + uint32_t sz = blob_to_wire(p, wire); + + // Swap ncells. + uint16_t *p_ncells = (uint16_t *)(wire + sizeof(uint8_t)); + uint16_t ncells = *p_ncells; + + *p_ncells = cf_swap_to_be16(*p_ncells); + ++p_ncells; + + // Swap the cells. + uint64_t *p_cell_begin = (uint64_t *)p_ncells; + uint64_t *p_cell_end = p_cell_begin + ncells; + + for (uint64_t *p_cell = p_cell_begin; p_cell < p_cell_end; ++p_cell) { + *p_cell = cf_swap_to_be64(*p_cell); + } + + return sz; +} + +//------------------------------------------------ +// Handle as_val translation. +// + +uint32_t +geojson_size_from_asval(const as_val *val) +{ + as_geojson *pg = as_geojson_fromval(val); + size_t jsz = as_geojson_len(pg); + + // Compute the size; we won't be writing any cellids ... 
+ return geojson_size(0, jsz); +} + +void +geojson_from_asval(const as_val *val, as_particle **pp) +{ + geojson_mem *p_geojson_mem = (geojson_mem *)*pp; + + as_geojson *pg = as_geojson_fromval(val); + size_t jsz = as_geojson_len(pg); + + p_geojson_mem->type = AS_PARTICLE_TYPE_GEOJSON; + p_geojson_mem->sz = geojson_size(0, jsz); + p_geojson_mem->flags = 0; + p_geojson_mem->ncells = 0; + + uint8_t *p8 = (uint8_t *)p_geojson_mem->data; + memcpy(p8, as_geojson_get(pg), jsz); +} + +as_val * +geojson_to_asval(const as_particle *p) +{ + size_t jsonsz; + char const *jsonptr = as_geojson_mem_jsonstr(p, &jsonsz); + char *buf = cf_malloc(jsonsz + 1); + + memcpy(buf, jsonptr, jsonsz); + buf[jsonsz] = '\0'; + + return (as_val *)as_geojson_new_wlen(buf, jsonsz, true); +} + +uint32_t +geojson_asval_wire_size(const as_val *val) +{ + as_geojson *pg = as_geojson_fromval(val); + size_t jsz = as_geojson_len(pg); + + // We won't be writing any cellids ... + return geojson_size(0, jsz); +} + +uint32_t +geojson_asval_to_wire(const as_val *val, uint8_t *wire) +{ + as_geojson *pg = as_geojson_fromval(val); + size_t jsz = as_geojson_len(pg); + + uint8_t *p8 = wire; + + *p8++ = 0; // flags + + uint16_t *p16 = (uint16_t *)p8; + + *p16++ = cf_swap_to_be16(0); // no cells on output to client + p8 = (uint8_t *)p16; + memcpy(p8, as_geojson_get(pg), jsz); + + return geojson_size(0, jsz); +} + +//------------------------------------------------ +// Handle msgpack translation. +// + +uint32_t +geojson_size_from_msgpack(const uint8_t *packed, uint32_t packed_size) +{ + // Oversize by a few bytes doing the easy thing. + size_t jsz = (size_t)packed_size; + + // Compute the size; we won't be writing any cellids ... 
+ return geojson_size(0, jsz); +} + +void +geojson_from_msgpack(const uint8_t *packed, uint32_t packed_size, as_particle **pp) +{ + geojson_mem *p_geojson_mem = (geojson_mem *)*pp; + + as_unpacker pk = { + .buffer = packed, + .offset = 0, + .length = packed_size + }; + + int64_t blob_size = as_unpack_blob_size(&pk); + const uint8_t *ptr = pk.buffer + pk.offset; + + // *ptr should be AS_BYTES_GEOJSON at this point. + + // Adjust for type (1 byte). + ptr++; + blob_size--; + + size_t jsz = (size_t)blob_size; + + p_geojson_mem->type = AS_PARTICLE_TYPE_GEOJSON; + p_geojson_mem->sz = geojson_size(0, jsz); + p_geojson_mem->flags = 0; + p_geojson_mem->ncells = 0; + + uint8_t *p8 = (uint8_t *)p_geojson_mem->data; + memcpy(p8, ptr, jsz); +} + + +//========================================================== +// Particle functions specific to GEOJSON. +// + +size_t +as_bin_particle_geojson_cellids(const as_bin *b, uint64_t **ppcells) +{ + geojson_mem *gp = (geojson_mem *)b->particle; + + *ppcells = (uint64_t *)gp->data; + + return (size_t)gp->ncells; +} + +bool +as_particle_geojson_match(as_particle *particle, uint64_t query_cellid, geo_region_t query_region, bool is_strict) +{ + // Determine whether the candidate particle geometry is a match + // for the query geometry. + // + // If query_cellid is non-zero this is a regions-containing-point query. + // + // If query_region is non-null this is a points-in-region query. + // + // Candidate geometry can either be a point or a region. Regions + // will have the GEOJSON_ISREGION flag set. + + geojson_mem *gp = (geojson_mem *)particle; + + uint64_t *cells = (uint64_t *)gp->data; + + uint64_t candidate_cellid = cells[0]; + geo_region_t candidate_region = NULL; + + bool candidate_is_region = (gp->flags & GEOJSON_ISREGION) != 0; + + // If we are a strict RCP query on a region candidate we need to + // run the parser to obtain a candidate_region for the matcher. 
+ // + if (query_cellid != 0 && candidate_is_region && is_strict) { + size_t jsonsz; + char const *jsonptr = as_geojson_mem_jsonstr(particle, &jsonsz); + + if (! geo_parse(NULL, jsonptr, jsonsz, &candidate_cellid, + &candidate_region)) { + cf_warning(AS_PARTICLE, "geo_parse() failed - unexpected"); + geo_region_destroy(candidate_region); + return false; + } + } + + bool ismatch = geojson_match( + candidate_is_region, + candidate_cellid, + candidate_region, + query_cellid, + query_region, + is_strict); + + geo_region_destroy(candidate_region); + + return ismatch; +} + +bool +as_particle_geojson_match_asval(const as_val *val, uint64_t query_cellid, geo_region_t query_region, bool is_strict) +{ + as_geojson *pg = as_geojson_fromval(val); + size_t jsonsz = as_geojson_len(pg); + char * jsonptr = as_geojson_get(pg); + + uint64_t candidate_cellid = 0; + geo_region_t candidate_region = NULL; + + if (! geo_parse(NULL, jsonptr, jsonsz, &candidate_cellid, + &candidate_region)) { + cf_warning(AS_PARTICLE, "geo_parse() failed - unexpected"); + geo_region_destroy(candidate_region); + return false; + } + + bool candidate_is_region = candidate_cellid == 0; + + bool ismatch = geojson_match( + candidate_is_region, + candidate_cellid, + candidate_region, + query_cellid, + query_region, + is_strict); + + geo_region_destroy(candidate_region); + + return ismatch; +} + +char const * +as_geojson_mem_jsonstr(as_particle const *particle, size_t *p_jsonsz) +{ + geojson_mem *p_geojson_mem = (geojson_mem *)particle; + + size_t cellsz = p_geojson_mem->ncells * sizeof(uint64_t); + + *p_jsonsz = p_geojson_mem->sz - sizeof(uint8_t) - sizeof(uint16_t) - cellsz; + + return (char const *)p_geojson_mem->data + cellsz; +} + + +//========================================================== +// Local helpers. 
// Size of a GEOJSON payload: flags byte, ncells field, cell array and JSON
// text. This does not include the leading type and size fields of the
// particle header.
static inline uint32_t
geojson_size(uint32_t n_cells, size_t string_size)
{
	size_t total = sizeof(uint8_t);			// flags

	total += sizeof(uint16_t);				// ncells (always 0 here)
	total += n_cells * sizeof(uint64_t);	// cell array
	total += string_size;					// json string

	return (uint32_t)total;
}
under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + + +#include "base/particle_integer.h" + +#include +#include + +#include "aerospike/as_boolean.h" +#include "aerospike/as_integer.h" +#include "aerospike/as_msgpack.h" +#include "aerospike/as_val.h" +#include "citrusleaf/cf_byte_order.h" + +#include "fault.h" + +#include "base/datamodel.h" +#include "base/particle.h" +#include "base/proto.h" + + +// INTEGER particle interface function declarations are in particle_int.h since +// INTEGER functions are used by other particles derived from INTEGER. + + +//========================================================== +// INTEGER particle interface - vtable. 
//

// Dispatch table for INTEGER particles. The initializer is positional, so the
// entries must stay in the member order of as_particle_vtable.
const as_particle_vtable integer_vtable = {
		// Destructor, etc.
		integer_destruct,
		integer_size,

		// Handle "wire" format.
		integer_concat_size_from_wire,
		integer_append_from_wire,
		integer_prepend_from_wire,
		integer_incr_from_wire,
		integer_size_from_wire,
		integer_from_wire,
		integer_compare_from_wire,
		integer_wire_size,
		integer_to_wire,

		// Handle as_val translation.
		integer_size_from_asval,
		integer_from_asval,
		integer_to_asval,
		integer_asval_wire_size,
		integer_asval_to_wire,

		// Handle msgpack translation.
		integer_size_from_msgpack,
		integer_from_msgpack,

		// Handle on-device "flat" format.
		integer_size_from_flat,
		integer_cast_from_flat,
		integer_from_flat,
		integer_flat_size,
		integer_to_flat
};


//==========================================================
// Typedefs & constants.
//

// Packed layout of a leading byte followed by the 64-bit value.
// NOTE(review): no function in this part of the file uses this type - the
// INTEGER functions here keep the value in the particle pointer itself.
typedef struct integer_mem_s {
	uint8_t		do_not_use;		// already know it's an int type
	uint64_t	i;
} __attribute__ ((__packed__)) integer_mem;

// Packed "flat" layout: type byte, size byte, then the 64-bit value.
typedef struct integer_flat_s {
	uint8_t		type;	// not written by integer_to_flat() ("already wrote the type")
	uint8_t		size;	// always 8 - the flat readers reject anything else
	uint64_t	i;		// the value, host order (no byte swap)
} __attribute__ ((__packed__)) integer_flat;


//==========================================================
// INTEGER particle interface - function definitions.
//

//------------------------------------------------
// Destructor, etc.
//

// Destructor for INTEGER particles - deliberately empty.
void
integer_destruct(as_particle *p)
{
	// Nothing to do - integer values live in the as_bin.
}

// Size of an INTEGER particle - always 0.
uint32_t
integer_size(const as_particle *p)
{
	// Integer values live in the as_bin instead of a pointer.
	return 0;
}

//------------------------------------------------
// Handle "wire" format.
+// + +int32_t +integer_concat_size_from_wire(as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp) +{ + cf_warning(AS_PARTICLE, "concat size for integer/float"); + return -AS_PROTO_RESULT_FAIL_INCOMPATIBLE_TYPE; +} + +int +integer_append_from_wire(as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp) +{ + cf_warning(AS_PARTICLE, "append to integer/float"); + return -AS_PROTO_RESULT_FAIL_INCOMPATIBLE_TYPE; +} + +int +integer_prepend_from_wire(as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp) +{ + cf_warning(AS_PARTICLE, "prepend to integer/float"); + return -AS_PROTO_RESULT_FAIL_INCOMPATIBLE_TYPE; +} + +int +integer_incr_from_wire(as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp) +{ + if (wire_type != AS_PARTICLE_TYPE_INTEGER) { + cf_warning(AS_PARTICLE, "increment with non integer type %u", wire_type); + return -AS_PROTO_RESULT_FAIL_INCOMPATIBLE_TYPE; + } + + uint64_t i; + + switch (value_size) { + case 8: + i = cf_swap_from_be64(*(uint64_t *)wire_value); + break; + case 4: + i = (uint64_t)cf_swap_from_be32(*(uint32_t *)wire_value); + break; + case 2: + i = (uint64_t)cf_swap_from_be16(*(uint16_t *)wire_value); + break; + case 1: + i = (uint64_t)*wire_value; + break; + case 16: // memcache increment - it's special + i = cf_swap_from_be64(*(uint64_t *)wire_value); + // For memcache, decrements floor at 0. + if ((int64_t)i < 0 && *(uint64_t *)pp + i > *(uint64_t *)pp) { + *pp = 0; + return 0; + } + break; + default: + cf_warning(AS_PARTICLE, "unexpected value size %u", value_size); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + (*(uint64_t *)pp) += i; + + return 0; +} + +int32_t +integer_size_from_wire(const uint8_t *wire_value, uint32_t value_size) +{ + // Integer values live in the as_bin instead of a pointer. 
+ return 0; +} + +int +integer_from_wire(as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp) +{ + uint64_t i; + + switch (value_size) { + case 8: + i = cf_swap_from_be64(*(uint64_t *)wire_value); + break; + case 4: + i = (uint64_t)cf_swap_from_be32(*(uint32_t *)wire_value); + break; + case 2: + i = (uint64_t)cf_swap_from_be16(*(uint16_t *)wire_value); + break; + case 1: + i = (uint64_t)*wire_value; + break; + default: + cf_warning(AS_PARTICLE, "unexpected value size %u", value_size); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + *pp = (as_particle *)i; + + return 0; +} + +int +integer_compare_from_wire(const as_particle *p, as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size) +{ + if (wire_type != AS_PARTICLE_TYPE_INTEGER) { + return 1; + } + + uint64_t i; + + switch (value_size) { + case 8: + i = cf_swap_from_be64(*(uint64_t *)wire_value); + break; + case 4: + i = (uint64_t)cf_swap_from_be32(*(uint32_t *)wire_value); + break; + case 2: + i = (uint64_t)cf_swap_from_be16(*(uint16_t *)wire_value); + break; + case 1: + i = (uint64_t)*wire_value; + break; + default: + return -AS_PROTO_RESULT_FAIL_UNKNOWN; + } + + return (uint64_t)p == i ? 0 : 1; +} + +uint32_t +integer_wire_size(const as_particle *p) +{ + return (uint32_t)sizeof(uint64_t); +} + +uint32_t +integer_to_wire(const as_particle *p, uint8_t *wire) +{ + *(uint64_t *)wire = cf_swap_to_be64((uint64_t)p); + + return (uint32_t)sizeof(uint64_t); +} + +//------------------------------------------------ +// Handle as_val translation. +// + +uint32_t +integer_size_from_asval(const as_val *val) +{ + // Integer values live in the as_bin instead of a pointer. + return 0; +} + +void +integer_from_asval(const as_val *val, as_particle **pp) +{ + // Unfortunately AS_BOOLEANs (as well as AS_INTEGERs) become INTEGER + // particles, so we have to check the as_val type here. 
+ + as_val_t vtype = as_val_type(val); + int64_t i; + + switch (vtype) { + case AS_INTEGER: + i = as_integer_get(as_integer_fromval(val)); + break; + case AS_BOOLEAN: + i = as_boolean_get(as_boolean_fromval(val)) ? 1 : 0; + break; + default: + cf_crash(AS_PARTICLE, "unexpected as_val_t %d", vtype); + return; + } + + *pp = (as_particle *)i; +} + +as_val * +integer_to_asval(const as_particle *p) +{ + return (as_val *)as_integer_new((uint64_t)p); +} + +uint32_t +integer_asval_wire_size(const as_val *val) +{ + return (uint32_t)sizeof(uint64_t); +} + +uint32_t +integer_asval_to_wire(const as_val *val, uint8_t *wire) +{ + // Unfortunately AS_BOOLEANs (as well as AS_INTEGERs) become INTEGER + // particles, so we have to check the as_val type here. + + as_val_t vtype = as_val_type(val); + int64_t i; + + switch (vtype) { + case AS_INTEGER: + i = as_integer_get(as_integer_fromval(val)); + break; + case AS_BOOLEAN: + i = as_boolean_get(as_boolean_fromval(val)) ? 1 : 0; + break; + default: + cf_crash(AS_PARTICLE, "unexpected as_val_t %d", vtype); + return 0; + } + + *(uint64_t *)wire = cf_swap_to_be64((uint64_t)i); + + return (uint32_t)sizeof(uint64_t); +} + +//------------------------------------------------ +// Handle msgpack translation. +// + +uint32_t +integer_size_from_msgpack(const uint8_t *packed, uint32_t packed_size) +{ + // Integer values live in the as_bin instead of a pointer. + return 0; +} + +void +integer_from_msgpack(const uint8_t *packed, uint32_t packed_size, as_particle **pp) +{ + int64_t i; + as_unpacker pk = { + .buffer = packed, + .offset = 0, + .length = packed_size + }; + + as_unpack_int64(&pk, &i); + + *pp = (as_particle *)i; +} + +//------------------------------------------------ +// Handle on-device "flat" format. +// + +int32_t +integer_size_from_flat(const uint8_t *flat, uint32_t flat_size) +{ + // Integer values live in the as_bin instead of a pointer. 
+ return 0; +} + +int +integer_cast_from_flat(uint8_t *flat, uint32_t flat_size, as_particle **pp) +{ + integer_flat *p_int_flat = (integer_flat *)flat; + // Assume type is correct, since we got here. + + // Sanity check lengths. + if (p_int_flat->size != 8 || flat_size != sizeof(integer_flat)) { + cf_warning(AS_PARTICLE, "unexpected flat integer/float: flat_size %u, len %u", + flat_size, p_int_flat->size); + return -AS_PROTO_RESULT_FAIL_UNKNOWN; + } + + // Integer values live in an as_bin instead of a pointer. Also, flat + // integers are host order, so no byte swap. + *pp = (as_particle *)p_int_flat->i; + + return 0; +} + +int +integer_from_flat(const uint8_t *flat, uint32_t flat_size, as_particle **pp) +{ + const integer_flat *p_int_flat = (const integer_flat *)flat; + // Assume type is correct, since we got here. + + // Sanity check lengths. + if (p_int_flat->size != 8 || flat_size != sizeof(integer_flat)) { + cf_warning(AS_PARTICLE, "unexpected flat integer/float: flat_size %u, len %u", + flat_size, p_int_flat->size); + return -1; // TODO - AS_PROTO error code seems inappropriate? + } + + // Integer values live in an as_bin instead of a pointer. Also, flat + // integers are host order, so no byte swap. + *pp = (as_particle *)p_int_flat->i; + + return 0; +} + +uint32_t +integer_flat_size(const as_particle *p) +{ + return sizeof(integer_flat); +} + +uint32_t +integer_to_flat(const as_particle *p, uint8_t *flat) +{ + integer_flat *p_int_flat = (integer_flat *)flat; + + // Already wrote the type. + p_int_flat->size = 8; + p_int_flat->i = (uint64_t)p; + + return integer_flat_size(p); +} + + +//========================================================== +// as_bin particle functions specific to INTEGER. +// + +int64_t +as_bin_particle_integer_value(const as_bin *b) +{ + // Caller must ensure this is called only for INTEGER particles. 
+ return (int64_t)b->particle; +} + +void +as_bin_particle_integer_set(as_bin *b, int64_t i) +{ + b->particle = (as_particle *)i; +} diff --git a/as/src/base/particle_list.c b/as/src/base/particle_list.c new file mode 100644 index 00000000..54e04331 --- /dev/null +++ b/as/src/base/particle_list.c @@ -0,0 +1,4519 @@ +/* + * particle_list.c + * + * Copyright (C) 2015-2018 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#include +#include +#include +#include +#include + +#include "aerospike/as_buffer.h" +#include "aerospike/as_msgpack.h" +#include "aerospike/as_serializer.h" +#include "aerospike/as_val.h" +#include "citrusleaf/alloc.h" +#include "citrusleaf/cf_byte_order.h" + +#include "fault.h" + +#include "base/cdt.h" +#include "base/cfg.h" +#include "base/datamodel.h" +#include "base/particle.h" +#include "base/proto.h" + + +//========================================================== +// LIST particle interface - function declarations. +// + +// Destructor, etc. +void list_destruct(as_particle *p); +uint32_t list_size(const as_particle *p); + +// Handle "wire" format. 
+int32_t list_concat_size_from_wire(as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp); +int list_append_from_wire(as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp); +int list_prepend_from_wire(as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp); +int list_incr_from_wire(as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp); +int32_t list_size_from_wire(const uint8_t *wire_value, uint32_t value_size); +int list_from_wire(as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp); +int list_compare_from_wire(const as_particle *p, as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size); +uint32_t list_wire_size(const as_particle *p); +uint32_t list_to_wire(const as_particle *p, uint8_t *wire); + +// Handle as_val translation. +uint32_t list_size_from_asval(const as_val *val); +void list_from_asval(const as_val *val, as_particle **pp); +as_val *list_to_asval(const as_particle *p); +uint32_t list_asval_wire_size(const as_val *val); +uint32_t list_asval_to_wire(const as_val *val, uint8_t *wire); + +// Handle msgpack translation. +uint32_t list_size_from_msgpack(const uint8_t *packed, uint32_t packed_size); +void list_from_msgpack(const uint8_t *packed, uint32_t packed_size, as_particle **pp); + +// Handle on-device "flat" format. +int32_t list_size_from_flat(const uint8_t *flat, uint32_t flat_size); +int list_cast_from_flat(uint8_t *flat, uint32_t flat_size, as_particle **pp); +int list_from_flat(const uint8_t *flat, uint32_t flat_size, as_particle **pp); +uint32_t list_flat_size(const as_particle *p); +uint32_t list_to_flat(const as_particle *p, uint8_t *flat); + + +//========================================================== +// LIST particle interface - vtable. 
+// + +const as_particle_vtable list_vtable = { + list_destruct, + list_size, + + list_concat_size_from_wire, + list_append_from_wire, + list_prepend_from_wire, + list_incr_from_wire, + list_size_from_wire, + list_from_wire, + list_compare_from_wire, + list_wire_size, + list_to_wire, + + list_size_from_asval, + list_from_asval, + list_to_asval, + list_asval_wire_size, + list_asval_to_wire, + + list_size_from_msgpack, + list_from_msgpack, + + list_size_from_flat, + list_cast_from_flat, + list_from_flat, + list_flat_size, + list_to_flat +}; + + +//========================================================== +// Typedefs & constants. +// + +//#define LIST_DEBUG_VERIFY + +#define PACKED_LIST_INDEX_STEP 128 + +#define AS_PACKED_LIST_FLAG_NONE 0x00 +#define AS_PACKED_LIST_FLAG_ORDERED 0x01 + +#define PACKED_LIST_FLAG_OFF_IDX 0x10 +#define PACKED_LIST_FLAG_FULLOFF_IDX 0x20 + +typedef struct packed_list_s { + const uint8_t *packed; + uint32_t packed_sz; + + uint32_t ele_count; // excludes ext ele + // Mutable state member (is considered mutable in const objects). + offset_index offidx; // offset start at contents (excluding ext metadata ele) + // Mutable state member (is considered mutable in const objects). 
+ offset_index full_offidx; // index at every element + uint8_t ext_flags; + + const uint8_t *contents; // where elements start (excludes ext) + uint32_t content_sz; +} packed_list; + +typedef struct packed_list_op_s { + const packed_list *list; + + uint32_t new_ele_count; + uint32_t new_content_sz; + + uint32_t seg1_sz; + uint32_t seg2_offset; + uint32_t seg2_sz; + uint32_t nil_ele_sz; // number of nils we need to insert +} packed_list_op; + +typedef struct list_mem_s { + uint8_t type; + uint32_t sz; + uint8_t data[]; +} __attribute__ ((__packed__)) list_mem; + +typedef struct list_flat_s { + uint8_t type; + uint32_t sz; // host order on device and in memory + uint8_t data[]; +} __attribute__ ((__packed__)) list_flat; + +typedef struct msgpack_list_empty_flagged_s { + uint8_t list_hdr; + uint8_t ext_hdr; + uint8_t ext_sz; + uint8_t ext_flags; +} __attribute__ ((__packed__)) msgpack_list_empty_flagged; + +typedef struct list_mem_empty_flagged_s { + list_mem mem; + msgpack_list_empty_flagged list; +} list_mem_empty_flagged; + +static const list_mem_empty_flagged list_ordered_empty = { + .mem = { + .type = AS_PARTICLE_TYPE_LIST, + .sz = sizeof(msgpack_list_empty_flagged) + }, + .list = { + .list_hdr = 0x91, + .ext_hdr = 0xC7, + .ext_sz = 0, + .ext_flags = AS_PACKED_LIST_FLAG_ORDERED + } +}; +static const list_mem list_mem_empty = { + .type = AS_PARTICLE_TYPE_LIST, + .sz = 1, + .data = {0x90} +}; + +typedef struct { + const offset_index *offsets; + const order_index *order; + as_cdt_sort_flags flags; + bool error; +} list_order_index_sort_userdata; + +#define define_packed_list_op(__name, __list_p) \ + packed_list_op __name; \ + packed_list_op_init(&__name, __list_p) + +#define list_full_offidx_p(__list_p) \ + (offset_index *)(list_is_ordered(__list_p) ? 
&(__list_p)->offidx : &(__list_p)->full_offidx) + +#define vla_list_full_offidx_if_invalid(__name, __list_p) \ + union { \ + offset_index *offidx; \ + uint8_t mem_temp[sizeof(offset_index *) + (offset_index_is_valid(list_full_offidx_p(__list_p)) ? 0 : offset_index_size(list_full_offidx_p(__list_p)))]; \ + } __name; \ + __name.offidx = list_full_offidx_p(__list_p); \ + if (! __name.offidx->_.ptr) { \ + __name.offidx->_.ptr = __name.mem_temp + sizeof(offset_index *); \ + offset_index_set_filled(__name.offidx, 1); \ + } + +#define define_packed_list_particle(__name, __particle, __ret) \ + packed_list __name; \ + bool __ret = packed_list_init_from_particle(&__name, __particle) + + +//========================================================== +// Forward declarations. +// + +static inline bool is_list_type(uint8_t type); +static inline bool flags_is_ordered(uint8_t flags); +static inline bool list_is_ordered(const packed_list *list); +static inline uint8_t get_ext_flags(bool ordered); +static uint32_t list_calc_ext_content_sz(uint32_t ele_count, uint32_t content_sz, bool ordered); + +static uint32_t list_pack_header(uint8_t *buf, uint32_t ele_count); +static void list_pack_empty_index(as_packer *pk, uint32_t ele_count, const uint8_t *contents, uint32_t content_sz, bool is_ordered); + +// as_bin +static inline void as_bin_set_empty_list(as_bin *b, rollback_alloc *alloc_buf, bool is_ordered); +static void as_bin_set_ordered_empty_list(as_bin *b, rollback_alloc *alloc_buf); +static inline void as_bin_set_temp_list_if_notinuse(as_bin *b, uint64_t create_flags); + +// packed_list +static bool packed_list_init(packed_list *list, const uint8_t *buf, uint32_t sz); +static inline bool packed_list_init_from_particle(packed_list *list, const as_particle *p); +static bool packed_list_init_from_bin(packed_list *list, const as_bin *b); +static bool packed_list_unpack_hdridx(packed_list *list); +static void packed_list_partial_offidx_update(const packed_list *list); + +static bool 
packed_list_find_by_value_ordered(const packed_list *list, const cdt_payload *value, order_index_find *find); +static uint32_t packed_list_find_idx_offset(const packed_list *list, uint32_t index); +static bool packed_list_find_rank_range_by_value_interval_ordered(const packed_list *list, const cdt_payload *value_start, const cdt_payload *value_end, uint32_t *rank_r, uint32_t *count_r, bool is_multi); +static bool packed_list_find_rank_range_by_value_interval_unordered(const packed_list *list, const cdt_payload *value_start, const cdt_payload *value_end, uint32_t *rank, uint32_t *count, uint64_t *mask_val, bool inverted, bool is_multi); + +static uint32_t packed_list_mem_sz(const packed_list *list, bool has_ext, uint32_t *ext_content_sz_r); +static uint32_t packed_list_pack_buf(const packed_list *list, uint8_t *buf, uint32_t sz, uint32_t ext_content_sz, bool strip_flags); +static list_mem *packed_list_pack_mem(const packed_list *list, list_mem *p_list_mem); +static void packed_list_content_pack(const packed_list *list, as_packer *pk); +static int packed_list_remove_by_idx(const packed_list *list, as_bin *b, rollback_alloc *alloc_buf, const uint64_t rm_idx, uint32_t *rm_sz); +static int packed_list_remove_by_mask(const packed_list *list, as_bin *b, rollback_alloc *alloc_buf, const uint64_t *rm_mask, uint32_t rm_count, uint32_t *rm_sz); + +static int packed_list_trim(const packed_list *list, as_bin *b, rollback_alloc *alloc_buf, uint32_t index, uint32_t count, cdt_result_data *result); +static int packed_list_get_remove_by_index_range(const packed_list *list, as_bin *b, rollback_alloc *alloc_buf, int64_t index, uint64_t count, cdt_result_data *result); +static int packed_list_get_remove_by_value_interval(const packed_list *list, as_bin *b, rollback_alloc *alloc_buf, const cdt_payload *value_start, const cdt_payload *value_end, cdt_result_data *result); +static int packed_list_get_remove_by_rank_range(const packed_list *list, as_bin *b, rollback_alloc *alloc_buf, 
int64_t rank, uint64_t count, cdt_result_data *result); +static int packed_list_get_remove_all_by_value_list(const packed_list *list, as_bin *b, rollback_alloc *alloc_buf, const cdt_payload *value_list, cdt_result_data *result); + +static int packed_list_insert(const packed_list *list, as_bin *b, rollback_alloc *alloc_buf, int64_t index, const cdt_payload *payload, bool payload_is_list, uint64_t mod_flags, cdt_result_data *result); +static int packed_list_add_ordered(const packed_list *list, as_bin *b, rollback_alloc *alloc_buf, const cdt_payload *payload, bool unique, cdt_result_data *result); +static int packed_list_add_items_ordered(const packed_list *list, as_bin *b, rollback_alloc *alloc_buf, const cdt_payload *items, bool unique, cdt_result_data *result); +static int packed_list_replace_ordered(const packed_list *list, as_bin *b, rollback_alloc *alloc_buf, uint32_t index, const cdt_payload *value, uint64_t mod_flags); + +// packed_list_op +static void packed_list_op_init(packed_list_op *op, const packed_list *list); +static bool packed_list_op_insert(packed_list_op *op, uint32_t index, uint32_t count, uint32_t insert_sz); +static bool packed_list_op_remove(packed_list_op *op, uint32_t index, uint32_t count); + +static uint32_t packed_list_op_write_seg1(const packed_list_op *op, uint8_t *buf); +static uint32_t packed_list_op_write_seg2(const packed_list_op *op, uint8_t *buf); + +static bool packed_list_builder_add_ranks_by_range(const packed_list *list, cdt_container_builder *builder, as_unpacker *start, uint32_t count, bool reverse); + +// list +static list_mem *list_create(rollback_alloc *alloc_buf, uint32_t ele_count, uint32_t content_sz); +static as_particle *list_simple_create_from_buf(rollback_alloc *alloc_buf, uint32_t ele_count, const uint8_t *contents, uint32_t content_sz); +static as_particle *list_simple_create(rollback_alloc *alloc_buf, uint32_t ele_count, uint32_t content_sz, uint8_t **contents_r); + +static int list_set_flags(as_bin *b, 
rollback_alloc *alloc_buf, uint8_t flags, cdt_result_data *result); +static int list_append(as_bin *b, rollback_alloc *alloc_buf, const cdt_payload *payload, bool payload_is_list, uint64_t mod_flags, cdt_result_data *result); +static int list_insert(as_bin *b, rollback_alloc *alloc_buf, int64_t index, const cdt_payload *payload, bool payload_is_list, uint64_t mod_flags, cdt_result_data *result); +static int list_set(as_bin *b, rollback_alloc *alloc_buf, int64_t index, const cdt_payload *value, uint64_t mod_flags); +static int list_increment(as_bin *b, rollback_alloc *alloc_buf, int64_t index, cdt_payload *delta_value, uint64_t mod_flags, cdt_result_data *result); +static int list_sort(as_bin *b, rollback_alloc *alloc_buf, as_cdt_sort_flags sort_flags); + +static int list_remove_by_index_range(as_bin *b, rollback_alloc *alloc_buf, int64_t index, uint64_t count, cdt_result_data *result); +static int list_remove_by_value_interval(as_bin *b, rollback_alloc *alloc_buf, const cdt_payload *value_start, const cdt_payload *value_end, cdt_result_data *result); +static int list_remove_by_rank_range(as_bin *b, rollback_alloc *alloc_buf, int64_t rank, uint64_t count, cdt_result_data *result); +static int list_remove_all_by_value_list(as_bin *b, rollback_alloc *alloc_buf, const cdt_payload *value_list, cdt_result_data *result); + +static uint8_t *list_setup_bin(as_bin *b, rollback_alloc *alloc_buf, uint8_t flags, uint32_t content_sz, uint32_t ele_count, uint32_t idx_trunc, const offset_index *old_offidx, offset_index *new_offidx); + +// list_offset_index +static inline void list_offset_index_init(offset_index *offidx, uint8_t *idx_mem_ptr, uint32_t ele_count, const uint8_t *contents, uint32_t content_sz); +static void list_offset_index_rm_mask_cpy(offset_index *dst, const offset_index *full_src, const uint64_t *rm_mask, uint32_t rm_count); + +// list_full_offset_index +static inline void list_full_offset_index_init(offset_index *offidx, uint8_t *idx_mem_ptr, uint32_t ele_count, 
const uint8_t *contents, uint32_t content_sz); +static bool list_full_offset_index_fill_to(offset_index *offidx, uint32_t index); + +// list_order_index +static int list_order_index_sort_cmp_fn(const void *x, const void *y, void *p); +static uint8_t *list_order_index_pack(const order_index *ordidx, const offset_index *full_offidx, uint8_t *buf, offset_index *new_offidx); + +// list_order_heap +static msgpack_compare_t list_order_heap_cmp_fn(const void *udata, uint32_t idx1, uint32_t idx2); + +// list_result_data +static bool list_result_data_set_not_found(cdt_result_data *rd, int64_t index); +static void list_result_data_set_values_by_mask(cdt_result_data *rd, const uint64_t *mask, const offset_index *full_offidx, uint32_t count, uint32_t sz); +static void list_result_data_set_values_by_idxcount(cdt_result_data *rd, const order_index *idxcnt, const offset_index *full_offidx); +static bool list_result_data_set_values_by_ordidx(cdt_result_data *rd, const order_index *ordidx, const offset_index *full_offidx, uint32_t count, uint32_t sz); + +// Debugging support +static void list_print(const packed_list *list, const char *name); +static bool list_verify(const as_bin *b); + + +//========================================================== +// LIST particle interface - function definitions. +// + +//------------------------------------------------ +// Destructor, etc. +// + +void +list_destruct(as_particle *p) +{ + cf_free(p); +} + +uint32_t +list_size(const as_particle *p) +{ + const list_mem *p_list_mem = (const list_mem *)p; + return (uint32_t)sizeof(list_mem) + p_list_mem->sz; +} + +//------------------------------------------------ +// Handle "wire" format. 
//

// Concatenation is not defined for LIST - always fails.
int32_t
list_concat_size_from_wire(as_particle_type wire_type,
		const uint8_t *wire_value, uint32_t value_size, as_particle **pp)
{
	cf_warning(AS_PARTICLE, "concat size for list");
	return -AS_PROTO_RESULT_FAIL_INCOMPATIBLE_TYPE;
}

// Append via the particle interface is not defined for LIST - always fails.
int
list_append_from_wire(as_particle_type wire_type, const uint8_t *wire_value,
		uint32_t value_size, as_particle **pp)
{
	cf_warning(AS_PARTICLE, "append to list");
	return -AS_PROTO_RESULT_FAIL_INCOMPATIBLE_TYPE;
}

// Prepend via the particle interface is not defined for LIST - always fails.
int
list_prepend_from_wire(as_particle_type wire_type, const uint8_t *wire_value,
		uint32_t value_size, as_particle **pp)
{
	cf_warning(AS_PARTICLE, "prepend to list");
	return -AS_PROTO_RESULT_FAIL_INCOMPATIBLE_TYPE;
}

// Increment is not defined for LIST - always fails.
int
list_incr_from_wire(as_particle_type wire_type, const uint8_t *wire_value,
		uint32_t value_size, as_particle **pp)
{
	cf_warning(AS_PARTICLE, "increment of list");
	return -AS_PROTO_RESULT_FAIL_INCOMPATIBLE_TYPE;
}

// Bytes needed for the in-memory particle built from this wire value:
// the list_mem header plus packed_list_mem_sz() computed with has_ext true.
// Returns -AS_PROTO_RESULT_FAIL_UNKNOWN if the wire value does not parse.
int32_t
list_size_from_wire(const uint8_t *wire_value, uint32_t value_size)
{
	// TODO - CDT can't determine in memory or not.
	packed_list list;

	if (! packed_list_init(&list, wire_value, value_size)) {
		cf_warning(AS_PARTICLE, "list_size_from_wire() invalid packed list");
		return -AS_PROTO_RESULT_FAIL_UNKNOWN;
	}

	return (int32_t)(sizeof(list_mem) + packed_list_mem_sz(&list, true, NULL));
}

// Build the in-memory particle from a wire value, packing into the memory
// *pp already points at, then stamp it with wire_type.
// Returns AS_PROTO_RESULT_OK, or -AS_PROTO_RESULT_FAIL_UNKNOWN if the wire
// value does not parse.
// NOTE(review): the result of packed_list_pack_mem() is dereferenced without
// a NULL check here, unlike in list_from_flat() - presumably it cannot fail
// when given a destination; confirm.
int
list_from_wire(as_particle_type wire_type, const uint8_t *wire_value,
		uint32_t value_size, as_particle **pp)
{
	// TODO - CDT can't determine in memory or not.
	// It works for data-not-in-memory but we'll incur a memcpy that could be
	// eliminated.
	packed_list list;

	if (! packed_list_init(&list, wire_value, value_size)) {
		cf_warning(AS_PARTICLE, "list_from_wire() invalid packed list");
		return -AS_PROTO_RESULT_FAIL_UNKNOWN;
	}

	list_mem *p_list_mem = packed_list_pack_mem(&list, (list_mem *)*pp);

	p_list_mem->type = wire_type;

	return AS_PROTO_RESULT_OK;
}

// Comparison against a wire value is not implemented - always fails.
int
list_compare_from_wire(const as_particle *p, as_particle_type wire_type,
		const uint8_t *wire_value, uint32_t value_size)
{
	// TODO
	cf_warning(AS_PARTICLE, "list_compare_from_wire() not implemented");
	return -AS_PROTO_RESULT_FAIL_INCOMPATIBLE_TYPE;
}

// Wire size of the particle: packed_list_mem_sz() computed with has_ext false.
// Asserts that the particle parses as a packed list.
uint32_t
list_wire_size(const as_particle *p)
{
	define_packed_list_particle(list, p, success);
	cf_assert(success, AS_PARTICLE, "list_wire_size() invalid packed list");

	return packed_list_mem_sz(&list, false, NULL);
}

// Pack the particle into wire with strip_flags true and no ext content.
// INT_MAX is passed as the buffer size, i.e. no real bound is given here.
// Returns whatever packed_list_pack_buf() returns.
uint32_t
list_to_wire(const as_particle *p, uint8_t *wire)
{
	define_packed_list_particle(list, p, success);
	cf_assert(success, AS_PARTICLE, "list_to_wire() invalid packed list");

	return packed_list_pack_buf(&list, wire, INT_MAX, 0, true);
}

//------------------------------------------------
// Handle as_val translation.
//

// Bytes to allocate for the particle that list_from_asval() will build.
//
// The serialized size is split into list header and content, then re-summed
// with room for a list header counting one extra element (when the list is
// ordered or has ext content) plus the ext header and ext content. The ext
// header size is added on every path.
uint32_t
list_size_from_asval(const as_val *val)
{
	as_serializer s;
	as_msgpack_init(&s);

	uint32_t sz = as_serializer_serialize_getsize(&s, (as_val *)val);

	as_serializer_destroy(&s);

	const as_list *list = (const as_list *)val;

	uint32_t ele_count = as_list_size(list);
	uint32_t base_hdr_sz = as_pack_list_header_get_size(ele_count);
	uint32_t content_sz = sz - base_hdr_sz;
	bool is_ordered = flags_is_ordered((uint8_t)list->flags);
	uint32_t ext_content_sz = list_calc_ext_content_sz(ele_count, content_sz,
			is_ordered);
	uint32_t hdr_sz = (is_ordered || ext_content_sz != 0) ?
			as_pack_list_header_get_size(ele_count + 1) : base_hdr_sz;

	return (uint32_t)sizeof(list_mem) + hdr_sz +
			as_pack_ext_header_get_size(ext_content_sz) + ext_content_sz +
			content_sz;
}

// Serialize an as_list into the particle memory *pp already points at.
//
// The list is first serialized plainly into data[]. If it is ordered or needs
// ext content, the elements are then shifted up in place and a new list
// header (one extra element), ext header and empty index are written in the
// gap in front of them.
void
list_from_asval(const as_val *val, as_particle **pp)
{
	as_serializer s;
	as_msgpack_init(&s);

	list_mem *p_list_mem = (list_mem *)*pp;
	int32_t sz = as_serializer_serialize_presized(&s, val, p_list_mem->data);

	cf_assert(sz >= 0, AS_PARTICLE, "list_from_asval() failed to presize");
	as_serializer_destroy(&s);

	const as_list *list = (const as_list *)val;

	uint32_t ele_count = as_list_size(list);
	uint32_t base_hdr_sz = as_pack_list_header_get_size(ele_count);
	uint32_t content_sz = (uint32_t)sz - base_hdr_sz;
	bool is_ordered = flags_is_ordered((uint8_t)list->flags);
	uint32_t ext_content_sz = list_calc_ext_content_sz(ele_count, content_sz,
			is_ordered);

	if (is_ordered || ext_content_sz != 0) {
		uint32_t hdr_sz = as_pack_list_header_get_size(ele_count + 1);
		// Offset in data[] where the elements must end up.
		uint32_t ele_start = hdr_sz +
				as_pack_ext_header_get_size(ext_content_sz) + ext_content_sz;

		// Prefer memmove over 2x serialize.
		memmove(p_list_mem->data + ele_start, p_list_mem->data + base_hdr_sz,
				content_sz);

		// The packer is limited to the gap in front of the moved elements.
		as_packer pk = {
				.buffer = p_list_mem->data,
				.capacity = ele_start
		};

		as_pack_list_header(&pk, ele_count + 1);
		as_pack_ext_header(&pk, ext_content_sz, get_ext_flags(is_ordered));
		list_pack_empty_index(&pk, ele_count, NULL, content_sz, is_ordered);
		// The headers and index must fill the gap exactly.
		cf_assert(pk.offset == ele_start, AS_PARTICLE, "size mismatch pk.offset(%d) != ele_start(%u)", pk.offset, ele_start);
		p_list_mem->sz = ele_start + content_sz;
	}
	else {
		p_list_mem->sz = (uint32_t)sz;
	}

	p_list_mem->type = AS_PARTICLE_TYPE_LIST;
}

// Deserialize the particle's msgpack data into an as_val.
// If deserialization yields no value, a new empty arraylist is returned
// instead of NULL.
as_val *
list_to_asval(const as_particle *p)
{
	list_mem *p_list_mem = (list_mem *)p;

	as_buffer buf = {
			.capacity = p_list_mem->sz,
			.size = p_list_mem->sz,
			.data = p_list_mem->data
	};

	as_serializer s;
	as_msgpack_init(&s);

	as_val *val = NULL;

	as_serializer_deserialize(&s, &buf, &val);
	as_serializer_destroy(&s);

	if (! val) {
		return (as_val *)as_arraylist_new(0, 1);
	}

	return val;
}

// Wire size of an as_val list - its serialized msgpack size.
uint32_t
list_asval_wire_size(const as_val *val)
{
	as_serializer s;
	as_msgpack_init(&s);

	uint32_t sz = as_serializer_serialize_getsize(&s, (as_val *)val);

	as_serializer_destroy(&s);

	return sz;
}

// Serialize an as_val list straight into wire. Returns the bytes written;
// asserts that serialization produced a positive size.
uint32_t
list_asval_to_wire(const as_val *val, uint8_t *wire)
{
	as_serializer s;
	as_msgpack_init(&s);

	int32_t sz = as_serializer_serialize_presized(&s, val, wire);

	as_serializer_destroy(&s);
	cf_assert(sz > 0, AS_PARTICLE, "list_asval_to_wire() sz %d failed to serialize", sz);

	return (uint32_t)sz;
}

//------------------------------------------------
// Handle msgpack translation.
+//
+
+// Size needed for a list particle holding 'packed_size' bytes of msgpack: the
+// list_mem header plus the packed bytes.
+uint32_t
+list_size_from_msgpack(const uint8_t *packed, uint32_t packed_size)
+{
+	return (uint32_t)sizeof(list_mem) + packed_size;
+}
+
+// Fill the particle at *pp with the type, the size, and a byte-for-byte copy
+// of the packed list. The msgpack itself is not validated here.
+void
+list_from_msgpack(const uint8_t *packed, uint32_t packed_size, as_particle **pp)
+{
+	list_mem *p_list_mem = (list_mem *)*pp;
+
+	p_list_mem->type = AS_PARTICLE_TYPE_LIST;
+	p_list_mem->sz = packed_size;
+	memcpy(p_list_mem->data, packed, p_list_mem->sz);
+}
+
+//------------------------------------------------
+// Handle on-device "flat" format.
+//
+
+// Not implemented - ignores its arguments and always returns -1.
+int32_t
+list_size_from_flat(const uint8_t *flat, uint32_t flat_size)
+{
+	// TODO - maybe never used
+	return -1;
+}
+
+// Point *pp directly at the flat buffer - nothing is copied or validated.
+// Always returns 0.
+int
+list_cast_from_flat(uint8_t *flat, uint32_t flat_size, as_particle **pp)
+{
+	// Cast temp buffer from disk to data-not-in-memory.
+	list_flat *p_list_flat = (list_flat *)flat;
+
+	// This assumes list_flat is the same as list_mem.
+	*pp = (as_particle *)p_list_flat;
+
+	return 0;
+}
+
+// Build an in-memory particle from the flat buffer and store it in *pp.
+// Returns 0 on success, -1 if the flat data is not a valid packed list or the
+// particle could not be created. 'flat_size' is not used.
+int
+list_from_flat(const uint8_t *flat, uint32_t flat_size, as_particle **pp)
+{
+	// Convert temp buffer from disk to data-in-memory.
+	const list_flat *p_list_flat = (const list_flat *)flat;
+	packed_list list;
+
+	if (! packed_list_init(&list, p_list_flat->data, p_list_flat->sz)) {
+		cf_warning(AS_PARTICLE, "list_from_flat() invalid packed list");
+		return -1;
+	}
+
+	// Passing NULL makes packed_list_pack_mem() allocate the particle.
+	list_mem *p_list_mem = packed_list_pack_mem(&list, NULL);
+
+	if (! p_list_mem) {
+		cf_warning(AS_PARTICLE, "list_from_flat() failed to create particle");
+		return -1;
+	}
+
+	p_list_mem->type = p_list_flat->type;
+	*pp = (as_particle *)p_list_mem;
+
+	return 0;
+}
+
+// Size of the flat form of particle 'p': the list_flat header plus the packed
+// size computed by packed_list_mem_sz() with has_ext == false.
+// Asserts that 'p' holds a valid packed list.
+uint32_t
+list_flat_size(const as_particle *p)
+{
+	define_packed_list_particle(list, p, success);
+	// Name this function (not list_to_flat) so a failure is attributed correctly.
+	cf_assert(success, AS_PARTICLE, "list_flat_size() invalid packed list");
+
+	return sizeof(list_flat) + packed_list_mem_sz(&list, false, NULL);
+}
+
+// Write the flat form of particle 'p' into 'flat' and return the total number
+// of flat bytes (list_flat header plus packed data). Only the sz and data
+// fields are written here. Asserts that 'p' is a valid packed list and that
+// the packed size matches the computed size.
+uint32_t
+list_to_flat(const as_particle *p, uint8_t *flat)
+{
+	define_packed_list_particle(list, p, success);
+	list_flat *p_list_flat = (list_flat *)flat;
+
+	cf_assert(success, AS_PARTICLE, "list_to_flat() invalid packed list");
+	p_list_flat->sz = packed_list_mem_sz(&list, false, NULL);
+
+	// ext_content_sz == 0 and strip_flags == true: no index content is
+	// written, and only the ORDERED flag is kept in the ext header (if any).
+	uint32_t check = packed_list_pack_buf(&list, p_list_flat->data,
+			p_list_flat->sz, 0, true);
+
+	cf_assert(check == p_list_flat->sz, AS_PARTICLE, "size mismatch check(%u) != sz(%u), ele_count %u content_sz %u flags 0x%x", check, p_list_flat->sz, list.ele_count, list.content_sz, list.ext_flags);
+
+	// Already wrote the type.
+
+	return sizeof(list_flat) + p_list_flat->sz;
+}
+
+
+//==========================================================
+// as_bin particle functions specific to LIST.
+//
+
+// Expose the bin's packed list bytes through 'packed' - pointer and size only,
+// nothing is copied.
+void
+as_bin_particle_list_get_packed_val(const as_bin *b, cdt_payload *packed)
+{
+	const list_mem *p_list_mem = (const list_mem *)b->particle;
+
+	packed->ptr = (uint8_t *)p_list_mem->data;
+	packed->sz = p_list_mem->sz;
+}
+
+
+//==========================================================
+// Local helpers.
+//
+
+// True if 'type' is the LIST particle type.
+static inline bool
+is_list_type(uint8_t type)
+{
+	return type == AS_PARTICLE_TYPE_LIST;
+}
+
+// True if the ORDERED bit is set in the ext flags.
+static inline bool
+flags_is_ordered(uint8_t flags)
+{
+	return (flags & AS_PACKED_LIST_FLAG_ORDERED) != 0;
+}
+
+// True if the packed list's ext flags mark it as ordered.
+static inline bool
+list_is_ordered(const packed_list *list)
+{
+	return flags_is_ordered(list->ext_flags);
+}
+
+// True if the modify flags have AS_CDT_LIST_ADD_UNIQUE set.
+static inline bool
+mod_flags_is_unique(uint64_t flags)
+{
+	return (flags & AS_CDT_LIST_ADD_UNIQUE) != 0;
+}
+
+// True if the modify flags have AS_CDT_LIST_INSERT_BOUNDED set.
+static inline bool
+mod_flags_is_bounded(uint64_t flags)
+{
+	return (flags & AS_CDT_LIST_INSERT_BOUNDED) != 0;
+}
+
+// Result code to return when an element already exists. Currently ignores
+// 'flags' and always returns -AS_PROTO_RESULT_FAIL_ELEMENT_EXISTS.
+static inline int
+mod_flags_return_exists(uint64_t flags)
+{
+	// TODO - modify for NOFAIL flag later.
+	return -AS_PROTO_RESULT_FAIL_ELEMENT_EXISTS;
+}
+
+// Keep only the ORDERED bit of the ext flags.
+static inline uint8_t
+strip_ext_flags(uint8_t flags)
+{
+	return flags & AS_PACKED_LIST_FLAG_ORDERED;
+}
+
+// Ext flags used when packing: ORDERED plus the full-offset-index flag for
+// ordered lists, the (partial) offset-index flag otherwise.
+static inline uint8_t
+get_ext_flags(bool ordered)
+{
+	return ordered ?
+			(AS_PACKED_LIST_FLAG_ORDERED | PACKED_LIST_FLAG_FULLOFF_IDX) :
+			PACKED_LIST_FLAG_OFF_IDX;
+}
+
+// Size of the index content stored in the ext element for a list with the
+// given element count and content size. Computed by initializing a temporary
+// offset_index (full for ordered lists, partial otherwise) with no backing
+// buffer and asking for its size.
+static uint32_t
+list_calc_ext_content_sz(uint32_t ele_count, uint32_t content_sz, bool ordered)
+{
+	offset_index offidx;
+
+	if (! ordered) {
+		list_offset_index_init(&offidx, NULL, ele_count, NULL, content_sz);
+	}
+	else {
+		list_full_offset_index_init(&offidx, NULL, ele_count, NULL, content_sz);
+	}
+
+	return offset_index_size(&offidx);
+}
+
+// Pack a msgpack list header for 'ele_count' elements at 'buf' and return the
+// number of bytes written. Crashes if packing fails.
+// The packer capacity is INT_MAX, so 'buf' is not bounds-checked here.
+static uint32_t
+list_pack_header(uint8_t *buf, uint32_t ele_count)
+{
+	as_packer pk = {
+			.buffer = buf,
+			.capacity = INT_MAX,
+	};
+
+	if (as_pack_list_header(&pk, ele_count) != 0) {
+		cf_crash(AS_PARTICLE, "as_pack_list_header() unexpected failure");
+	}
+
+	return pk.offset;
+}
+
+// Lay out an offset index (full if is_ordered, partial otherwise) at the
+// packer's current position, set its filled count to 1, and advance the
+// packer past the index.
+static void
+list_pack_empty_index(as_packer *pk, uint32_t ele_count,
+		const uint8_t *contents, uint32_t content_sz, bool is_ordered)
+{
+	offset_index offidx;
+
+	if (is_ordered) {
+		list_full_offset_index_init(&offidx, pk->buffer + pk->offset, ele_count,
+				contents, content_sz);
+	}
+	else {
+		list_offset_index_init(&offidx, pk->buffer + pk->offset, ele_count,
+				contents, content_sz);
+	}
+
+	offset_index_set_filled(&offidx, 1);
+	pk->offset += offset_index_size(&offidx);
+}
+
+//------------------------------------------------
+// as_bin
+//
+
+// Set bin 'b' to an empty list, ordered or unordered as requested.
+static inline void
+as_bin_set_empty_list(as_bin *b, rollback_alloc *alloc_buf, bool is_ordered)
+{
+	if (is_ordered) {
+		as_bin_set_ordered_empty_list(b, alloc_buf);
+	}
+	else {
+		as_bin_set_unordered_empty_list(b, alloc_buf);
+	}
+}
+
+// Set bin 'b' to a newly created list particle with zero elements and no
+// content, and mark the bin as a LIST.
+void
+as_bin_set_unordered_empty_list(as_bin *b, rollback_alloc *alloc_buf)
+{
+	b->particle = list_simple_create_from_buf(alloc_buf, 0, NULL, 0);
+	as_bin_state_set_from_type(b, AS_PARTICLE_TYPE_LIST);
+}
+
+// Set bin 'b' to a newly created ordered empty list: a list with one element
+// whose content is copied from list_ordered_empty's ext header.
+// NOTE(review): the "- 1" presumably excludes the list header byte that
+// precedes ext_hdr in msgpack_list_empty_flagged - confirm against the struct.
+static void
+as_bin_set_ordered_empty_list(as_bin *b, rollback_alloc *alloc_buf)
+{
+	b->particle = list_simple_create_from_buf(alloc_buf, 1,
+			(const uint8_t *)&list_ordered_empty.list.ext_hdr,
+			sizeof(msgpack_list_empty_flagged) - 1);
+	as_bin_state_set_from_type(b, AS_PARTICLE_TYPE_LIST);
+}
+
+// If bin 'b' is not in use, point it at the shared list_ordered_empty or
+// list_mem_empty object (chosen by the ORDERED bit of create_flags) and mark
+// it as a LIST. Nothing is allocated here.
+static inline void
+as_bin_set_temp_list_if_notinuse(as_bin *b, uint64_t create_flags)
+{
+	if (! as_bin_inuse(b)) {
+		b->particle = (create_flags & AS_PACKED_LIST_FLAG_ORDERED) != 0 ?
+				(as_particle *)&list_ordered_empty :
+				(as_particle *)&list_mem_empty;
+		as_bin_state_set_from_type(b, AS_PARTICLE_TYPE_LIST);
+	}
+}
+
+//----------------------------------------------------------
+// packed_list
+//
+
+// Initialize 'list' as a view over the packed bytes buf[0..sz) and parse the
+// header. Returns false if the header cannot be parsed (see
+// packed_list_unpack_hdridx()).
+static bool
+packed_list_init(packed_list *list, const uint8_t *buf, uint32_t sz)
+{
+	list->packed = buf;
+	list->packed_sz = sz;
+
+	list->ele_count = 0;
+	list->ext_flags = 0;
+	list->contents = NULL;
+
+	return packed_list_unpack_hdridx(list);
+}
+
+// Initialize 'list' from the data and size of list particle 'p'.
+static inline bool
+packed_list_init_from_particle(packed_list *list, const as_particle *p)
+{
+	const list_mem *p_list_mem = (const list_mem *)p;
+	return packed_list_init(list, p_list_mem->data, p_list_mem->sz);
+}
+
+// Initialize 'list' from bin 'b'. Asserts that the bin holds a LIST particle.
+static bool
+packed_list_init_from_bin(packed_list *list, const as_bin *b)
+{
+	uint8_t type = as_bin_get_particle_type(b);
+	cf_assert(is_list_type(type), AS_PARTICLE, "packed_list_init_from_bin() invalid type %d", type);
+	return packed_list_init_from_particle(list, b->particle);
+}
+
+// Parse the msgpack list header - and the leading ext element, if present - of
+// list->packed. Fills in ele_count, ext_flags, contents and content_sz, and
+// initializes both offset indexes.
+// Returns false if the buffer is empty or the header/ext cannot be unpacked.
+static bool
+packed_list_unpack_hdridx(packed_list *list)
+{
+	if (list->packed_sz == 0) {
+		list->ext_flags = 0;
+		return false;
+	}
+
+	as_unpacker pk = {
+			.buffer = list->packed,
+			.length = list->packed_sz
+	};
+
+	int64_t ele_count = as_unpack_list_header_element_count(&pk);
+
+	if (ele_count < 0) {
+		return false;
+	}
+
+	list->ele_count = (uint32_t)ele_count;
+
+	if (ele_count != 0 && as_unpack_peek_is_ext(&pk)) {
+		as_msgpack_ext ext;
+
+		if (as_unpack_ext(&pk, &ext) != 0) {
+			return false;
+		}
+
+		// The ext element is counted by the msgpack header but is not a list
+		// element - its type carries the list flags.
+		list->ext_flags = ext.type;
+		list->ele_count--;
+		list->contents = list->packed + pk.offset;
+		list->content_sz = list->packed_sz - pk.offset;
+
+		if (list_is_ordered(list)) {
+			list_full_offset_index_init(&list->offidx, NULL, list->ele_count,
+					list->contents, list->content_sz);
+		}
+		else {
+			list_offset_index_init(&list->offidx, NULL, list->ele_count,
+					list->contents, list->content_sz);
+		}
+
+		list_full_offset_index_init(&list->full_offidx, NULL, list->ele_count,
+				list->contents, list->content_sz);
+
+		// Use the ext payload as storage for offidx only if it is big enough.
+		if (ext.size >= offset_index_size(&list->offidx)) {
+			offset_index_set_ptr(&list->offidx, (uint8_t *)ext.data,
+					list->packed + pk.offset);
+		}
+	}
+	else {
+		// No ext element - plain list, both indexes left without storage.
+		list->contents = list->packed + pk.offset;
+		list->content_sz = list->packed_sz - pk.offset;
+		list->ext_flags = 0;
+
+		list_offset_index_init(&list->offidx, NULL, list->ele_count,
+				list->contents, list->content_sz);
+		list_full_offset_index_init(&list->full_offidx, NULL, list->ele_count,
+				list->contents, list->content_sz);
+	}
+
+	return true;
+}
+
+// For an unordered list with both indexes valid, copy what the full offset
+// index has already filled into the partial index: one entry for every
+// PACKED_LIST_INDEX_STEP-th element. Does nothing for ordered lists, when
+// either index is invalid, or when the partial index is already caught up.
+// Writes through the const 'list' - the indexes are treated as mutable caches.
+static void
+packed_list_partial_offidx_update(const packed_list *list)
+{
+	if (list_is_ordered(list) || ! offset_index_is_valid(&list->full_offidx) ||
+			! offset_index_is_valid(&list->offidx)) {
+		return;
+	}
+
+	offset_index *full = (offset_index *)&list->full_offidx;
+	offset_index *part = (offset_index *)&list->offidx;
+	uint32_t filled = offset_index_get_filled(part);
+	uint32_t max = (offset_index_get_filled(full) / PACKED_LIST_INDEX_STEP) + 1;
+
+	if (filled >= max) {
+		return;
+	}
+
+	for (uint32_t j = filled; j < max; j++) {
+		uint32_t off = offset_index_get_const(full, j * PACKED_LIST_INDEX_STEP);
+		offset_index_set(part, j, off);
+	}
+
+	offset_index_set_filled(part, max);
+}
+
+// Locate 'value' in an ordered list, using and extending the full offset
+// index. The caller supplies find->start and find->target; find->found and
+// find->result are set here. An empty list yields found == false, result == 0.
+// Returns false if the list contents or the value cannot be unpacked/compared.
+// Asserts that the full offset index is valid.
+static bool
+packed_list_find_by_value_ordered(const packed_list *list,
+		const cdt_payload *value, order_index_find *find)
+{
+	if (list->ele_count == 0) {
+		find->found = false;
+		find->result = 0;
+		return true;
+	}
+
+	offset_index *offidx = list_full_offidx_p(list);
+	cf_assert(offset_index_is_valid(offidx), AS_PARTICLE, "invalid offidx");
+	uint32_t last = offset_index_get_filled(offidx);
+
+	// First search only the part of the list the offset index already covers.
+	find->count = last - find->start;
+
+	if (! order_index_find_rank_by_value(NULL, value, offidx, find)) {
+		return false;
+	}
+
+	// Done if the index covers the whole list, or the result lies inside the
+	// covered part and cannot be affected by the elements beyond it.
+	if (offset_index_is_full(offidx) || find->result < last - 1 ||
+			(! find->found && find->result < last) || (find->found &&
+			(find->target > list->ele_count ||
+					find->result >= find->target))) {
+		return true;
+	}
+
+	if (find->result == list->ele_count || find->result == last ||
+			find->result < find->target) {
+		as_unpacker pk_start = {
+				.buffer = value->ptr,
+				.length = value->sz
+		};
+
+		// Continue linearly from the last element the index knows about.
+		as_unpacker pk_buf = {
+				.buffer = list->contents,
+				.offset = offset_index_get_const(offidx, last - 1),
+				.length = list->content_sz
+		};
+
+		if (as_unpack_size(&pk_buf) <= 0) {
+			return false;
+		}
+
+		offset_index_set(offidx, last, pk_buf.offset);
+		find->result = list->ele_count;
+
+		for (uint32_t i = last; i < list->ele_count; i++) {
+			pk_start.offset = 0; // reset
+
+			msgpack_compare_t cmp = as_unpack_compare(&pk_start, &pk_buf);
+
+			// Record the offset of the next element as a side effect.
+			offset_index_set(offidx, i + 1, pk_buf.offset);
+
+			if (cmp == MSGPACK_COMPARE_EQUAL) {
+				find->found = true;
+
+				// Keep scanning through equal elements while below target.
+				if (i != list->ele_count - 1 && i < find->target &&
+						find->target <= list->ele_count) {
+					continue;
+				}
+
+				find->result = i;
+				offset_index_set_filled(offidx, MIN(i + 2, list->ele_count));
+				break;
+			}
+
+			if (cmp == MSGPACK_COMPARE_LESS) {
+				// value < element i. If equals were seen, the last one was at
+				// i - 1.
+				find->result = i - (find->found ? 1 : 0);
+				offset_index_set_filled(offidx, MIN(i + 2, list->ele_count));
+				break;
+			}
+
+			if (cmp == MSGPACK_COMPARE_END || cmp == MSGPACK_COMPARE_ERROR) {
+				return false;
+			}
+		}
+
+		// Scanned to the end without breaking - the whole list is now indexed.
+		if (find->result == list->ele_count) {
+			offset_index_set_filled(offidx, list->ele_count);
+		}
+	}
+
+	return true;
+}
+
+// Return the byte offset, within list->contents, of the element at 'index'.
+// Uses whichever offset index is available and fills it in while walking.
+// Returns 0 for index 0, and also returns 0 on failure (unpack error or index
+// fill failure) - so a 0 result with index != 0 indicates an error.
+static uint32_t
+packed_list_find_idx_offset(const packed_list *list, uint32_t index)
+{
+	if (index == 0) {
+		return 0;
+	}
+
+	if (list_is_ordered(list)) {
+		if (offset_index_is_valid(&list->offidx)) {
+			offset_index *offidx = (offset_index *)&list->offidx;
+
+			if (! list_full_offset_index_fill_to(offidx, index)) {
+				return 0;
+			}
+
+			return offset_index_get_const(offidx, index);
+		}
+
+		// No stored index - use a temporary one local to this call.
+		define_offset_index(offidx, list->contents, list->content_sz,
+				list->ele_count);
+
+		if (! list_full_offset_index_fill_to(&offidx, index)) {
+			return 0;
+		}
+
+		return offset_index_get_const(&offidx, index);
+	}
+	else if (offset_index_is_valid(&list->full_offidx) &&
+			index < offset_index_get_filled(&list->full_offidx)) {
+		return offset_index_get_const(&list->full_offidx, index);
+	}
+
+	as_unpacker pk = {
+			.buffer = list->contents,
+			.length = list->content_sz
+	};
+
+	uint32_t steps = index;
+
+	if (offset_index_is_valid(&list->offidx)) {
+		// Partial index: one entry per PACKED_LIST_INDEX_STEP elements.
+		uint32_t idx = index / PACKED_LIST_INDEX_STEP;
+		uint32_t filled = offset_index_get_filled(&list->offidx);
+
+		if (idx >= filled) {
+			cf_assert(filled != 0, AS_PARTICLE, "packed_list_op_find_idx_offset() filled is zero");
+			idx = filled - 1;
+		}
+
+		// Start from the closest filled entry at or before the target.
+		pk.offset = offset_index_get_const(&list->offidx, idx);
+		steps -= idx * PACKED_LIST_INDEX_STEP;
+
+		offset_index *offidx = (offset_index *)&list->offidx; // mutable struct variable
+		uint32_t blocks = steps / PACKED_LIST_INDEX_STEP;
+
+		steps %= PACKED_LIST_INDEX_STEP;
+
+		// Walk whole blocks, recording an index entry at each block boundary.
+		for (uint32_t i = 0; i < blocks; i++) {
+			for (uint32_t j = 0; j < PACKED_LIST_INDEX_STEP; j++) {
+				if (as_unpack_size(&pk) <= 0) {
+					return 0;
+				}
+			}
+
+			idx++;
+			offset_index_set_next(offidx, idx, pk.offset);
+		}
+	}
+
+	// Walk the remaining elements one at a time.
+	for (uint32_t i = 0; i < steps; i++) {
+		if (as_unpack_size(&pk) <= 0) {
+			return 0;
+		}
+	}
+
+	return pk.offset;
+}
+
+// Like packed_list_find_idx_offset(), but resumes from a known position:
+// element 'index0' is at byte offset 'offset0', and 'index' is the element
+// whose offset is wanted. The walk is index - index0 steps, so 'index' is
+// expected to be >= 'index0'. Returns 0 on unpack failure.
+static uint32_t
+packed_list_find_idx_offset_continue(const packed_list *list, uint32_t index,
+		uint32_t index0, uint32_t offset0)
+{
+	if (list_is_ordered(list)) {
+		return packed_list_find_idx_offset(list, index);
+	}
+	else if (offset_index_is_valid(&list->full_offidx) &&
+			index < offset_index_get_filled(&list->full_offidx)) {
+		return offset_index_get_const(&list->full_offidx, index);
+	}
+
+	as_unpacker pk = {
+			.buffer = list->contents,
+			.offset = offset0,
+			.length = list->content_sz
+	};
+
+	uint32_t steps = index - index0;
+
+	if (offset_index_is_valid(&list->offidx)) {
+		uint32_t idx0 = index0 / PACKED_LIST_INDEX_STEP;
+		uint32_t idx = index / PACKED_LIST_INDEX_STEP;
+		uint32_t filled = offset_index_get_filled(&list->offidx);
+
+		// Only touch the partial index if a block boundary is crossed.
+		if (idx0 != idx) {
+			// Index already has entries beyond the start block - restart from
+			// the index instead of continuing from offset0.
+			if (idx0 < filled - 1) {
+				return packed_list_find_idx_offset(list, index);
+			}
+
+			uint32_t mod0 = index0 % PACKED_LIST_INDEX_STEP;
+			offset_index *offidx = (offset_index *)&list->offidx;
+
+			// Finish the block that index0 sits in the middle of.
+			if (mod0 != 0) {
+				for (uint32_t i = mod0; i < PACKED_LIST_INDEX_STEP; i++) {
+					if (as_unpack_size(&pk) <= 0) {
+						return 0;
+					}
+
+					steps--;
+				}
+
+				idx0++;
+				offset_index_set_next(offidx, idx0, pk.offset);
+			}
+
+			uint32_t blocks = idx - idx0;
+
+			// Walk whole blocks, recording an entry at each block boundary.
+			for (uint32_t i = 0; i < blocks; i++) {
+				for (uint32_t j = 0; j < PACKED_LIST_INDEX_STEP; j++) {
+					if (as_unpack_size(&pk) <= 0) {
+						return 0;
+					}
+				}
+
+				idx0++;
+				offset_index_set_next(offidx, idx0, pk.offset);
+			}
+
+			steps -= blocks * PACKED_LIST_INDEX_STEP;
+		}
+	}
+
+	// Walk the remaining elements one at a time.
+	for (uint32_t i = 0; i < steps; i++) {
+		if (as_unpack_size(&pk) <= 0) {
+			return 0;
+		}
+	}
+
+	return pk.offset;
+}
+
+// Find the rank and count of the elements of an ordered list that fall in a
+// value interval. Outputs: *rank_r is the rank found for value_start, *count_r
+// the number of matching elements.
+// value_end itself must not be NULL (asserted below).
+// value_end->ptr == NULL means looking for: [value_start, largest possible
+// value].
+// value_start == value_end means looking for a single value:
+// [value_start, value_start]. In that case is_multi selects between counting
+// all equal elements and counting just one.
+// Otherwise the count is rank(value_end) - rank(value_start), and 0 when
+// value_start >= value_end.
+// Returns false if a value or the list cannot be unpacked/compared.
+static bool
+packed_list_find_rank_range_by_value_interval_ordered(const packed_list *list,
+		const cdt_payload *value_start, const cdt_payload *value_end,
+		uint32_t *rank_r, uint32_t *count_r, bool is_multi)
+{
+	cf_assert(offset_index_is_valid(list_full_offidx_p(list)), AS_PARTICLE, "packed_list_find_rank_range_by_value_interval_ordered() invalid full offset_index");
+	cf_assert(value_end, AS_PARTICLE, "value_end == NULL");
+
+	order_index_find find = {
+			.target = 0
+	};
+
+	if (! packed_list_find_by_value_ordered(list, value_start, &find)) {
+		return false;
+	}
+
+	*rank_r = find.result;
+
+	if (value_end == value_start) {
+		if (! find.found) {
+			*count_r = 0;
+		}
+		else if (is_multi) {
+			// Search again past the first match, with target at the end of the
+			// list, to find the last equal element.
+			find.start = find.result + 1;
+			find.target = list->ele_count;
+
+			if (! packed_list_find_by_value_ordered(list, value_start, &find)) {
+				return false;
+			}
+
+			if (find.found) {
+				*count_r = find.result - *rank_r + 1;
+			}
+			else {
+				*count_r = 1;
+			}
+		}
+		else {
+			*count_r = 1;
+		}
+
+		return true;
+	}
+
+	// No upper bound - everything from rank to the end of the list.
+	if (! value_end->ptr) {
+		*count_r = list->ele_count - *rank_r;
+		return true;
+	}
+
+	as_unpacker pk_start = {
+			.buffer = value_start->ptr,
+			.length = value_start->sz
+	};
+
+	as_unpacker pk_end = {
+			.buffer = value_end->ptr,
+			.length = value_end->sz
+	};
+
+	msgpack_compare_t cmp = as_unpack_compare(&pk_start, &pk_end);
+
+	// Empty interval.
+	if (cmp == MSGPACK_COMPARE_GREATER || cmp == MSGPACK_COMPARE_EQUAL) {
+		*count_r = 0;
+		return true;
+	}
+
+	// value_end cannot rank below value_start - search from there.
+	find.start = find.result;
+
+	if (! packed_list_find_by_value_ordered(list, value_end, &find)) {
+		return false;
+	}
+
+	*count_r = find.result - *rank_r;
+
+	return true;
+}
+
+// Scan every element of a list (no ordering assumed) against a value interval.
+// Outputs: *rank is the number of elements that compare LESS against
+// value_start, *count the number of selected elements. As a side effect, the
+// full offset index (if valid) is completely filled in.
+// value_end itself must not be NULL (asserted below).
+// value_end->ptr == NULL means looking for: [value_start, largest possible
+// value].
+// value_start == value_end means looking for a single value:
+// [value_start, value_start].
+// mask_val is a mask for is_multi case and a uint64_t[1] value for ! is_multi.
+// (For ! is_multi the index of the first match is stored in *mask_val.)
+// mask_val may be NULL, in which case only rank and count are produced.
+// 'inverted' selects the elements outside the interval instead.
+// Returns false if a value or the list contents cannot be unpacked.
+static bool
+packed_list_find_rank_range_by_value_interval_unordered(const packed_list *list,
+		const cdt_payload *value_start, const cdt_payload *value_end,
+		uint32_t *rank, uint32_t *count, uint64_t *mask_val, bool inverted,
+		bool is_multi)
+{
+	cf_assert(value_end, AS_PARTICLE, "value_end == NULL");
+
+	as_unpacker pk_start = {
+			.buffer = value_start->ptr,
+			.length = value_start->sz
+	};
+
+	as_unpacker pk_end = {
+			.buffer = value_end->ptr,
+			.length = value_end->sz
+	};
+
+	offset_index *full_offidx = list_full_offidx_p(list);
+
+	// NULL from here on means "do not record offsets".
+	if (! offset_index_is_valid(full_offidx)) {
+		full_offidx = NULL;
+	}
+
+	// Pre-check parameters.
+	if (as_unpack_size(&pk_start) <= 0) {
+		cf_warning(AS_PARTICLE, "packed_list_op_find_rank_range_by_value_interval_unordered() invalid start value");
+		return false;
+	}
+
+	if (value_end != value_start && value_end->ptr &&
+			as_unpack_size(&pk_end) <= 0) {
+		cf_warning(AS_PARTICLE, "packed_list_op_find_rank_range_by_value_interval_unordered() invalid end value");
+		return false;
+	}
+
+	*rank = 0;
+	*count = 0;
+
+	as_unpacker pk = {
+			.buffer = list->contents,
+			.length = list->content_sz
+	};
+
+	for (uint32_t i = 0; i < list->ele_count; i++) {
+		uint32_t value_offset = pk.offset; // save for pk_end
+
+		pk_start.offset = 0; // reset
+
+		// Compare element i against value_start. This advances pk.
+		msgpack_compare_t cmp_start = as_unpack_compare(&pk, &pk_start);
+
+		if (full_offidx) {
+			offset_index_set(full_offidx, i + 1, pk.offset);
+		}
+
+		if (cmp_start == MSGPACK_COMPARE_ERROR) {
+			cf_warning(AS_PARTICLE, "packed_list_op_find_rank_range_by_value_interval_unordered() invalid packed list at index %u", i);
+			return false;
+		}
+
+		if (cmp_start == MSGPACK_COMPARE_LESS) {
+			// Element is below the interval - it counts toward the rank.
+			(*rank)++;
+
+			if (inverted) {
+				if (mask_val) {
+					cdt_idx_mask_set(mask_val, i);
+				}
+
+				(*count)++;
+			}
+		}
+		else if (value_start != value_end) {
+			msgpack_compare_t cmp_end = MSGPACK_COMPARE_LESS;
+
+			// NULL value_end->ptr means largest possible value.
+			if (value_end->ptr) {
+				// Rewind to element i and compare it against value_end.
+				pk.offset = value_offset;
+				pk_end.offset = 0;
+				cmp_end = as_unpack_compare(&pk, &pk_end);
+			}
+
+			if ((cmp_end == MSGPACK_COMPARE_LESS && ! inverted) ||
+					((cmp_end == MSGPACK_COMPARE_GREATER ||
+							cmp_end == MSGPACK_COMPARE_EQUAL) && inverted)) {
+				if (mask_val) {
+					cdt_idx_mask_set(mask_val, i);
+				}
+
+				(*count)++;
+			}
+		}
+		// Single value case.
+		else if (cmp_start == MSGPACK_COMPARE_EQUAL) {
+			if (is_multi) {
+				if (! inverted) {
+					if (mask_val) {
+						cdt_idx_mask_set(mask_val, i);
+					}
+
+					(*count)++;
+				}
+			}
+			else if (*count == 0) {
+				// Single result - remember only the first matching index.
+				if (mask_val) {
+					*mask_val = i;
+				}
+
+				(*count)++;
+			}
+		}
+		else if (inverted && is_multi) {
+			// Single value case, element is greater than the value.
+			if (mask_val) {
+				cdt_idx_mask_set(mask_val, i);
+			}
+
+			(*count)++;
+		}
+	}
+
+	if (full_offidx) {
+		offset_index_set_filled(full_offidx, list->ele_count);
+	}
+
+	return true;
+}
+
+// Packed size of 'list' (list header + optional ext element + contents).
+// With has_ext, the index content size is computed and, if ext_content_sz_r is
+// non-NULL, returned through it. ext_content_sz_r is not written when has_ext
+// is false. An ext element is included in the size for ordered lists and
+// whenever the index content size is non-zero.
+static uint32_t
+packed_list_mem_sz(const packed_list *list, bool has_ext,
+		uint32_t *ext_content_sz_r)
+{
+	bool ordered = list_is_ordered(list);
+	uint32_t ext_cont_sz = 0;
+
+	if (has_ext) {
+		ext_cont_sz = list_calc_ext_content_sz(list->ele_count,
+				list->content_sz, ordered);
+
+		if (ext_content_sz_r) {
+			*ext_content_sz_r = ext_cont_sz;
+		}
+	}
+	else if (! ordered) {
+		return as_pack_list_header_get_size(list->ele_count) + list->content_sz;
+	}
+
+	if (! ordered && ext_cont_sz == 0) {
+		return as_pack_list_header_get_size(list->ele_count) + list->content_sz;
+	}
+
+	// The ext element is counted in the list header, hence ele_count + 1.
+	return as_pack_list_header_get_size(list->ele_count + 1) +
+			as_pack_ext_header_get_size(ext_cont_sz) + ext_cont_sz +
+			list->content_sz;
+}
+
+// Pack 'list' into buf (capacity sz): header, optional ext header with an
+// empty index of ext_content_sz bytes, then the list contents. With
+// strip_flags the ext flags are reduced to the ORDERED bit; otherwise they are
+// regenerated by get_ext_flags(). Returns the number of bytes written.
+// Return codes of the pack calls are not checked.
+static uint32_t
+packed_list_pack_buf(const packed_list *list, uint8_t *buf, uint32_t sz,
+		uint32_t ext_content_sz, bool strip_flags)
+{
+	as_packer pk = {
+			.buffer = buf,
+			.capacity = sz
+	};
+
+	bool ordered = list_is_ordered(list);
+
+	if (ordered || ext_content_sz != 0) {
+		as_pack_list_header(&pk, list->ele_count + 1);
+		as_pack_ext_header(&pk, ext_content_sz, strip_flags ?
+				strip_ext_flags(list->ext_flags) : get_ext_flags(ordered));
+
+		if (ext_content_sz != 0) {
+			list_pack_empty_index(&pk, list->ele_count, NULL, list->content_sz,
+					ordered);
+		}
+	}
+	else {
+		as_pack_list_header(&pk, list->ele_count);
+	}
+
+	packed_list_content_pack(list, &pk);
+
+	return pk.offset;
+}
+
+// Pack 'list' (with ext/index space) into a list_mem and return it. If
+// p_list_mem is NULL one is allocated with cf_malloc_ns(); otherwise the
+// supplied one is filled. Only sz and data are written - not the type.
+static list_mem *
+packed_list_pack_mem(const packed_list *list, list_mem *p_list_mem)
+{
+	uint32_t ext_content_sz = 0;
+	uint32_t sz = packed_list_mem_sz(list, true, &ext_content_sz);
+
+	if (! p_list_mem) {
+		p_list_mem = cf_malloc_ns(sizeof(list_mem) + sz);
+	}
+
+	p_list_mem->sz = sz;
+	packed_list_pack_buf(list, p_list_mem->data, sz, ext_content_sz, false);
+
+	return p_list_mem;
+}
+
+// Copy the list's element bytes to the packer's current position and advance
+// the packer. The packer's capacity is not checked here.
+static void
+packed_list_content_pack(const packed_list *list, as_packer *pk)
+{
+	uint8_t *ptr = pk->buffer + pk->offset;
+
+	memcpy(ptr, list->contents, list->content_sz);
+	pk->offset += list->content_sz;
+}
+
+// Write to bin 'b' a copy of 'list' with the element at rm_idx removed, and
+// return the removed element's size in *rm_sz.
+// Returns AS_PROTO_RESULT_OK, or -AS_PROTO_RESULT_FAIL_PARAMETER if the
+// removal could not be set up.
+static int
+packed_list_remove_by_idx(const packed_list *list, as_bin *b,
+		rollback_alloc *alloc_buf, const uint64_t rm_idx, uint32_t *rm_sz)
+{
+	define_packed_list_op(op, list);
+
+	if (! packed_list_op_remove(&op, rm_idx, 1)) {
+		cf_warning(AS_PARTICLE, "packed_list_remove_by_idx() as_packed_list_remove failed");
+		return -AS_PROTO_RESULT_FAIL_PARAMETER;
+	}
+
+	if (op.new_ele_count == 0) {
+		as_bin_set_empty_list(b, alloc_buf, list_is_ordered(list));
+	}
+	else {
+		uint8_t *ptr = list_setup_bin(b, alloc_buf, list->ext_flags,
+				op.new_content_sz, op.new_ele_count, rm_idx, &list->offidx,
+				NULL);
+
+		// New contents = the two segments the op describes, back to back.
+		ptr += packed_list_op_write_seg1(&op, ptr);
+		packed_list_op_write_seg2(&op, ptr);
+	}
+
+	*rm_sz = list->content_sz - op.new_content_sz;
+
+	return AS_PROTO_RESULT_OK;
+}
+
+// Write to bin 'b' a copy of 'list' without the rm_count elements selected by
+// rm_mask, and return the total size of the removed elements in *rm_sz.
+// Element sizes are taken from the list's full offset index.
+// NOTE(review): this presumably requires the full offset index to be filled
+// already - confirm against cdt_idx_mask_get_content_sz().
+// Always returns AS_PROTO_RESULT_OK. Asserts on a size mismatch.
+static int
+packed_list_remove_by_mask(const packed_list *list, as_bin *b,
+		rollback_alloc *alloc_buf, const uint64_t *rm_mask, uint32_t rm_count,
+		uint32_t *rm_sz)
+{
+	offset_index *full_offidx = list_full_offidx_p(list);
+
+	*rm_sz = cdt_idx_mask_get_content_sz(rm_mask, rm_count, full_offidx);
+
+	offset_index new_offidx;
+	uint8_t *ptr = list_setup_bin(b, alloc_buf, list->ext_flags,
+			list->content_sz - *rm_sz, list->ele_count - rm_count, 0, NULL,
+			&new_offidx);
+
+	ptr = cdt_idx_mask_write_eles(rm_mask, rm_count, full_offidx, ptr, true);
+	cf_assert(ptr == ((list_mem *)b->particle)->data + ((list_mem *)b->particle)->sz, AS_PARTICLE,
+			"packed_list_remove_idx_mask() pack mismatch ptr %p data %p sz %u [%p]", ptr, ((list_mem *)b->particle)->data, ((list_mem *)b->particle)->sz, ((list_mem *)b->particle)->data + ((list_mem *)b->particle)->sz);
+
+	// Carry over the offsets that are still usable for the new list.
+	if (offset_index_is_valid(&new_offidx)) {
+		list_offset_index_rm_mask_cpy(&new_offidx, full_offidx, rm_mask,
+				rm_count);
+	}
+
+	return AS_PROTO_RESULT_OK;
+}
+
+// Assumes index/count(non-zero) is surrounded by other elements.
+// Keep only the 'count' elements starting at 'index': when b is non-NULL the
+// bin is rewritten with just that range, and the elements outside the range
+// (before 'index' and from index + count on) are reported according to
+// result->type. Asserts that the op is a multi-result op.
+// Returns AS_PROTO_RESULT_OK, or -AS_PROTO_RESULT_FAIL_PARAMETER for an
+// unparsable list or an unsupported result type.
+static int
+packed_list_trim(const packed_list *list, as_bin *b, rollback_alloc *alloc_buf,
+		uint32_t index, uint32_t count, cdt_result_data *result)
+{
+	cf_assert(result->is_multi, AS_PARTICLE, "packed_list_trim() required to be a multi op");
+
+	uint32_t rm_count = list->ele_count - count;
+	uint32_t index1 = index + count;
+	uint32_t offset0 = packed_list_find_idx_offset(list, index);
+	uint32_t offset1 = packed_list_find_idx_offset_continue(list, index1,
+			index, offset0);
+	uint32_t content_sz = offset1 - offset0;
+
+	// The offset finders return 0 on failure. 0 is only legitimate for
+	// index 0.
+	if ((offset0 == 0 && index != 0) || offset1 == 0) {
+		cf_warning(AS_PARTICLE, "packed_list_trim() invalid list");
+		return -AS_PROTO_RESULT_FAIL_PARAMETER;
+	}
+
+	if (b) {
+		uint8_t *ptr = list_setup_bin(b, alloc_buf, list->ext_flags, content_sz,
+				count, 0, &list->offidx, NULL);
+
+		memcpy(ptr, list->contents + offset0, content_sz);
+	}
+
+	switch (result->type) {
+	case RESULT_TYPE_NONE:
+		break;
+	case RESULT_TYPE_COUNT:
+		as_bin_set_int(result->result, rm_count);
+		break;
+	case RESULT_TYPE_REVINDEX:
+	case RESULT_TYPE_INDEX: {
+		bool is_rev = (result->type == RESULT_TYPE_REVINDEX);
+		define_int_list_builder(builder, result->alloc, rm_count);
+
+		// Removed indexes: the head [0, index) and the tail from index1 on.
+		cdt_container_builder_add_int_range(&builder, 0, index,
+				list->ele_count, is_rev);
+		cdt_container_builder_add_int_range(&builder, index1,
+				list->ele_count - index1, list->ele_count, is_rev);
+		cdt_container_builder_set_result(&builder, result);
+		break;
+	}
+	case RESULT_TYPE_RANK:
+	case RESULT_TYPE_REVRANK: {
+		define_int_list_builder(builder, result->alloc, rm_count);
+
+		if (list_is_ordered(list)) {
+			// Ordered list - ranks are the same ranges as the indexes.
+			// rm_count - index is the number of tail elements.
+			cdt_container_builder_add_int_range(&builder, 0, index,
+					list->ele_count, result->type == RESULT_TYPE_REVRANK);
+			cdt_container_builder_add_int_range(&builder, index + count,
+					rm_count - index, list->ele_count,
+					result->type == RESULT_TYPE_REVRANK);
+			cdt_container_builder_set_result(&builder, result);
+			break;
+		}
+
+		as_unpacker pk = {
+				.buffer = list->contents,
+				.length = list->content_sz
+		};
+
+		// Ranks of the 'index' head elements ...
+		if (! packed_list_builder_add_ranks_by_range(list, &builder, &pk, index,
+				result->type == RESULT_TYPE_REVRANK)) {
+			cf_warning(AS_PARTICLE, "packed_list_trim() invalid list");
+			return -AS_PROTO_RESULT_FAIL_PARAMETER;
+		}
+
+		// ... then skip the kept range and do the tail elements.
+		pk.offset = offset1;
+
+		if (! packed_list_builder_add_ranks_by_range(list, &builder, &pk,
+				rm_count - index, result->type == RESULT_TYPE_REVRANK)) {
+			cf_warning(AS_PARTICLE, "packed_list_trim() invalid list");
+			return -AS_PROTO_RESULT_FAIL_PARAMETER;
+		}
+
+		cdt_container_builder_set_result(&builder, result);
+		break;
+	}
+	case RESULT_TYPE_VALUE: {
+		// Result is a list made of the head bytes followed by the tail bytes.
+		uint32_t tail_sz = list->content_sz - offset1;
+		list_mem *p_list_mem = list_create(result->alloc, rm_count,
+				offset0 + tail_sz);
+
+		cf_assert(p_list_mem, AS_PARTICLE, "NULL list");
+		result->result->particle = (as_particle *)p_list_mem;
+
+		uint8_t *ptr = p_list_mem->data;
+		uint32_t hdr_sz = list_pack_header(ptr, rm_count);
+
+		ptr += hdr_sz;
+		memcpy(ptr, list->contents, offset0);
+		ptr += offset0;
+		memcpy(ptr, list->contents + offset1, tail_sz);
+
+		as_bin_state_set_from_type(result->result, AS_PARTICLE_TYPE_LIST);
+
+		break;
+	}
+	default:
+		cf_warning(AS_PARTICLE, "packed_list_trim() result_type %d not supported", result->type);
+		return -AS_PROTO_RESULT_FAIL_PARAMETER;
+	}
+
+	return AS_PROTO_RESULT_OK;
+}
+
+// Get and/or remove 'count' elements starting at 'index'. The bin is only
+// rewritten when b is non-NULL. What is reported back is selected by
+// result->type.
+// With the INVERTED flag (multi-result ops only, and not with the index-range
+// or rank-range result types) the complement of the range is operated on.
+// The request is reduced to a plain remove-all / remove-tail / remove-head, or
+// handed to packed_list_trim() when the range lies in the middle of the list.
+// The INVERTED flag is cleared on 'result' in the process.
+// Returns AS_PROTO_RESULT_OK or a negative AS_PROTO_RESULT_FAIL_* code.
+static int
+packed_list_get_remove_by_index_range(const packed_list *list, as_bin *b,
+		rollback_alloc *alloc_buf, int64_t index, uint64_t count,
+		cdt_result_data *result)
+{
+	uint32_t uindex;
+	uint32_t count32;
+
+	// Normalize index/count to unsigned 32-bit values.
+	if (! calc_index_count(index, count, list->ele_count, &uindex, &count32,
+			result->is_multi)) {
+		cf_warning(AS_PARTICLE, "packed_list_get_remove_by_index_range() index %ld out of bounds for ele_count %u", index, list->ele_count);
+		return -AS_PROTO_RESULT_FAIL_PARAMETER;
+	}
+
+	if (result_data_is_inverted(result)) {
+		if (! result->is_multi) {
+			cf_warning(AS_PARTICLE, "packed_list_get_remove_by_index_range() INVERTED flag not supported for single result ops");
+			return -AS_PROTO_RESULT_FAIL_PARAMETER;
+		}
+
+		if (result_data_is_return_index_range(result) ||
+				result_data_is_return_rank_range(result)) {
+			cf_warning(AS_PARTICLE, "packed_list_get_remove_by_index_range() result_type %d not supported with INVERTED flag", result->type);
+			return -AS_PROTO_RESULT_FAIL_PARAMETER;
+		}
+
+		// From here on the request is expressed as a non-inverted one.
+		result->flags &= ~AS_CDT_OP_FLAG_INVERTED;
+
+		if (count32 == 0) {
+			// Reduce to remove all.
+			uindex = 0;
+			count32 = list->ele_count;
+		}
+		else if (uindex == 0) {
+			// Reduce to remove tail section.
+			uindex = count32;
+			count32 = list->ele_count - count32;
+		}
+		else if (uindex + count32 >= list->ele_count) {
+			// Reduce to remove head section.
+			count32 = uindex;
+			uindex = 0;
+		}
+		else {
+			// Range is in the middle - the complement is two pieces.
+			return packed_list_trim(list, b, alloc_buf, uindex, count32,
+					result);
+		}
+	}
+
+	if (count32 == 0) {
+		if (! list_result_data_set_not_found(result, uindex)) {
+			return -AS_PROTO_RESULT_FAIL_PARAMETER;
+		}
+
+		return AS_PROTO_RESULT_OK;
+	}
+
+	define_packed_list_op(op, list);
+
+	if (! packed_list_op_remove(&op, uindex, count32)) {
+		cf_warning(AS_PARTICLE, "packed_list_get_remove_by_index_range() as_packed_list_remove failed");
+		return -AS_PROTO_RESULT_FAIL_PARAMETER;
+	}
+
+	if (b) {
+		if (op.new_ele_count == 0) {
+			as_bin_set_empty_list(b, alloc_buf, list_is_ordered(list));
+		}
+		else {
+			uint8_t *ptr = list_setup_bin(b, alloc_buf, list->ext_flags,
+					op.new_content_sz, op.new_ele_count, uindex, &list->offidx,
+					NULL);
+
+			// New contents = the two segments the op describes, back to back.
+			ptr += packed_list_op_write_seg1(&op, ptr);
+			packed_list_op_write_seg2(&op, ptr);
+		}
+	}
+
+	switch (result->type) {
+	case RESULT_TYPE_NONE:
+		break;
+	case RESULT_TYPE_INDEX:
+	case RESULT_TYPE_REVINDEX:
+		return result_data_set_index_rank_count(result, uindex, count32,
+				list->ele_count);
+	case RESULT_TYPE_RANK:
+	case RESULT_TYPE_REVRANK: {
+		// Everything was removed - ranks are simply 0 .. count32 - 1.
+		if (op.new_ele_count == 0) {
+			return result_data_set_index_rank_count(result, 0, count32,
+					list->ele_count);
+		}
+
+		if (! result->is_multi) {
+			uint32_t rank;
+
+			if (list_is_ordered(list)) {
+				rank = uindex;
+			}
+			else {
+				uint32_t rcount;
+
+				// The removed element itself, as found in the old contents.
+				cdt_payload value = {
+						.ptr = list->contents + op.seg1_sz,
+						.sz = list->content_sz - op.new_content_sz
+				};
+
+				// Rank = number of elements that compare less than it.
+				if (! packed_list_find_rank_range_by_value_interval_unordered(
+						list, &value, &value, &rank, &rcount, NULL, false,
+						false)) {
+					return -AS_PROTO_RESULT_FAIL_PARAMETER;
+				}
+			}
+
+			if (result->type == RESULT_TYPE_REVRANK) {
+				rank = list->ele_count - rank - 1;
+			}
+
+			as_bin_set_int(result->result, (int64_t)rank);
+			break;
+		}
+
+		// Unpacker over just the removed bytes.
+		as_unpacker pk = {
+				.buffer = list->contents + op.seg1_sz,
+				.length = list->content_sz - op.new_content_sz
+		};
+
+		uint32_t rm_count = list->ele_count - op.new_ele_count;
+		define_int_list_builder(builder, result->alloc, rm_count);
+
+		if (list_is_ordered(list)) {
+			cdt_container_builder_add_int_range(&builder, uindex, count32,
+					list->ele_count, result->type == RESULT_TYPE_REVRANK);
+		}
+		else if (! packed_list_builder_add_ranks_by_range(list, &builder, &pk,
+				rm_count, result->type == RESULT_TYPE_REVRANK)) {
+			cf_warning(AS_PARTICLE, "packed_list_get_remove_by_index_range() invalid list");
+			return -AS_PROTO_RESULT_FAIL_PARAMETER;
+		}
+
+		cdt_container_builder_set_result(&builder, result);
+		break;
+	}
+	case RESULT_TYPE_COUNT:
+		as_bin_set_int(result->result, list->ele_count - op.new_ele_count);
+		break;
+	case RESULT_TYPE_VALUE: {
+		// The removed bytes lie between the end of seg1 and the start of seg2
+		// (or the end of the contents when there is no seg2).
+		const uint8_t *result_ptr = list->contents + op.seg1_sz;
+		uint32_t end = (op.seg2_sz != 0) ? op.seg2_offset : list->content_sz;
+		uint32_t result_sz = end - op.seg1_sz;
+		uint32_t result_count = list->ele_count - op.new_ele_count;
+
+		if (result->is_multi) {
+			result->result->particle =
+					list_simple_create_from_buf(result->alloc,
+							result_count, result_ptr, result_sz);
+
+			if (! result->result->particle) {
+				return -AS_PROTO_RESULT_FAIL_UNKNOWN;
+			}
+
+			as_bin_state_set_from_type(result->result, AS_PARTICLE_TYPE_LIST);
+		}
+		else if (result_sz != 0) {
+			cf_assert(count32 <= 1, AS_PARTICLE, "packed_list_get_remove_by_index_range() result must be list for count > 1");
+			// Return value of the allocation call is not checked here.
+			as_bin_particle_alloc_from_msgpack(result->result, result_ptr,
+					result_sz);
+		}
+		// else - leave result bin empty because result_size is 0.
+		break;
+	}
+	case RESULT_TYPE_REVINDEX_RANGE:
+		// This condition is always true inside this case label.
+		if (result->type == RESULT_TYPE_REVINDEX_RANGE) {
+			uindex = list->ele_count - uindex - count32;
+		}
+		// no break
+	case RESULT_TYPE_INDEX_RANGE:
+		result_data_set_list_int2x(result, uindex, count32);
+		break;
+	case RESULT_TYPE_RANK_RANGE:
+	case RESULT_TYPE_REVRANK_RANGE:
+		if (list_is_ordered(list)) {
+			return result_data_set_range(result, uindex, count32,
+					list->ele_count);
+		}
+		// Rank ranges are only supported for ordered lists.
+		// no break
+	default:
+		cf_warning(AS_PARTICLE, "packed_list_get_remove_by_index_range() result_type %d not supported", result->type);
+		return -AS_PROTO_RESULT_FAIL_PARAMETER;
+	}
+
+#ifdef LIST_DEBUG_VERIFY
+	if (! list_verify(b)) {
+		cdt_bin_print(b, "packed_list_get_remove_by_index_range");
+	}
+#endif
+
+	return AS_PROTO_RESULT_OK;
+}
+
+// Get and/or remove the elements whose values fall in an interval. The bin is
+// only rewritten when b is non-NULL. The INVERTED flag is only supported for
+// multi-result ops.
+// value_end itself must not be NULL (the find helpers assert this).
+// value_end->ptr == NULL means looking for: [value_start, largest possible
+// value].
+// value_start == value_end means looking for a single value: [value_start, value_start].
+// Ordered lists are resolved to a rank range and delegated to
+// packed_list_get_remove_by_index_range(). Other lists are scanned into a mask
+// (multi) or a single index (non-multi).
+// Returns AS_PROTO_RESULT_OK or a negative AS_PROTO_RESULT_FAIL_* code.
+static int
+packed_list_get_remove_by_value_interval(const packed_list *list, as_bin *b,
+		rollback_alloc *alloc_buf, const cdt_payload *value_start,
+		const cdt_payload *value_end, cdt_result_data *result)
+{
+	bool inverted = result_data_is_inverted(result);
+
+	if (inverted && ! result->is_multi) {
+		cf_warning(AS_PARTICLE, "packed_list_get_remove_by_value_interval() INVERTED flag not supported for single result ops");
+		return -AS_PROTO_RESULT_FAIL_PARAMETER;
+	}
+
+	uint32_t rank;
+	// NOTE(review): presumably provides a stack-allocated full offset index
+	// when the list does not carry a valid one - confirm against the macro.
+	vla_list_full_offidx_if_invalid(u, list);
+
+	if (list_is_ordered(list)) {
+		uint32_t count;
+
+		if (! packed_list_find_rank_range_by_value_interval_ordered(list,
+				value_start, value_end, &rank, &count, result->is_multi)) {
+			return -AS_PROTO_RESULT_FAIL_PARAMETER;
+		}
+
+		// Index == rank for ordered lists.
+		return packed_list_get_remove_by_index_range(list, b, alloc_buf,
+				(int64_t)rank, (uint64_t)count, result);
+	}
+
+	uint32_t rm_count;
+	// A single slot suffices for non-multi ops - it holds an index, not a mask.
+	define_cdt_idx_mask(rm_mask, result->is_multi ? list->ele_count : 1);
+
+	if (! packed_list_find_rank_range_by_value_interval_unordered(list,
+			value_start, value_end, &rank, &rm_count, rm_mask, inverted,
+			result->is_multi)) {
+		return -AS_PROTO_RESULT_FAIL_PARAMETER;
+	}
+
+	// Stays 0 when nothing is removed from the bin.
+	uint32_t rm_sz = 0;
+
+	if (b) {
+		if (rm_count == list->ele_count) {
+			as_bin_set_unordered_empty_list(b, alloc_buf);
+		}
+		else if (rm_count != 0) {
+			int ret;
+
+			if (result->is_multi) {
+				ret = packed_list_remove_by_mask(list, b, alloc_buf, rm_mask,
+						rm_count, &rm_sz);
+			}
+			else {
+				// rm_mask[0] is an idx for single value finds.
+				ret = packed_list_remove_by_idx(list, b, alloc_buf, rm_mask[0],
+						&rm_sz);
+			}
+
+			if (ret != AS_PROTO_RESULT_OK) {
+				return ret;
+			}
+		}
+		else {
+			// Nothing removed - keep what the scan learned about offsets.
+			packed_list_partial_offidx_update(list);
+		}
+	}
+	else {
+		packed_list_partial_offidx_update(list);
+	}
+
+	switch (result->type) {
+	case RESULT_TYPE_NONE:
+	case RESULT_TYPE_COUNT:
+	case RESULT_TYPE_REVRANK:
+	case RESULT_TYPE_RANK:
+	case RESULT_TYPE_REVRANK_RANGE:
+	case RESULT_TYPE_RANK_RANGE:
+		// NOTE(review): for the inverted case the non-inverted count is
+		// passed - presumably result_data_set_range() applies the inversion
+		// itself, since the flag is still set on result; confirm.
+		return result_data_set_range(result, rank, inverted ?
+				list->ele_count - rm_count : rm_count, list->ele_count);
+	case RESULT_TYPE_INDEX:
+	case RESULT_TYPE_REVINDEX:
+		if (result->is_multi) {
+			result_data_set_int_list_by_mask(result, rm_mask, rm_count,
+					list->ele_count);
+		}
+		else {
+			result_data_set_index_rank_count(result, rm_mask[0], rm_count,
+					list->ele_count);
+		}
+		break;
+	case RESULT_TYPE_VALUE:
+		if (result->is_multi) {
+			list_result_data_set_values_by_mask(result, rm_mask,
+					list_full_offidx_p(list), rm_count, rm_sz);
+		}
+		else {
+			// Wrap the single index in a one-entry order index.
+			define_order_index2(rm_idx, list->ele_count, 1);
+
+			order_index_set(&rm_idx, 0, rm_mask[0]);
+			list_result_data_set_values_by_ordidx(result, &rm_idx, u.offidx,
+					rm_count, rm_sz);
+		}
+		break;
+	case RESULT_TYPE_INDEX_RANGE:
+	case RESULT_TYPE_REVINDEX_RANGE:
+	default:
+		cf_warning(AS_PARTICLE, "packed_list_get_remove_by_value_interval() result_type %d not supported", result->type);
+		return -AS_PROTO_RESULT_FAIL_PARAMETER;
+	}
+
+	return AS_PROTO_RESULT_OK;
+}
+
+static int
+packed_list_get_remove_by_rank_range(const packed_list *list, as_bin *b,
+		rollback_alloc *alloc_buf, int64_t rank, uint64_t count,
+		cdt_result_data *result)
+{
+	bool inverted = result_data_is_inverted(result);
+
+	if (inverted && ! result->is_multi) {
+		cf_warning(AS_PARTICLE, "packed_list_get_remove_by_rank_range() INVERTED flag not supported for single result ops");
+		return -AS_PROTO_RESULT_FAIL_PARAMETER;
+	}
+
+	if (list_is_ordered(list)) {
+		// idx == rank for ordered lists.
+ return packed_list_get_remove_by_index_range(list, b, alloc_buf, rank, + count, result); + } + + uint32_t urank; + uint32_t count32; + + if (! calc_index_count(rank, count, list->ele_count, &urank, &count32, + result->is_multi)) { + cf_warning(AS_PARTICLE, "packed_list_get_remove_by_rank_range() rank %u out of bounds for ele_count %u", urank, list->ele_count); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + vla_list_full_offidx_if_invalid(full, list); + + if (! list_full_offset_index_fill_all(full.offidx)) { + cf_warning(AS_PARTICLE, "packed_list_get_remove_by_rank_range() invalid packed list"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + define_build_order_heap_by_range(heap, urank, count32, list->ele_count, + list, list_order_heap_cmp_fn, success); + + if (! success) { + cf_warning(AS_PARTICLE, "packed_list_get_remove_by_rank_range() invalid packed list"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + uint32_t rm_count = inverted ? list->ele_count - count32 : count32; + + if (rm_count == 0) { + if (! list_result_data_set_not_found(result, urank)) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + packed_list_partial_offidx_update(list); + + return AS_PROTO_RESULT_OK; + } + + define_cdt_idx_mask(rm_mask, list->ele_count); + order_index ret_idx; + + cdt_idx_mask_set_by_ordidx(rm_mask, &heap._, heap.filled, count32, + inverted); + + if (inverted) { + if (result_data_is_return_rank_range(result)) { + cf_warning(AS_PARTICLE, "packed_list_get_remove_by_rank_range() result_type %d not supported with INVERTED flag", result->type); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + if (! 
result->is_multi) { + cf_warning(AS_PARTICLE, "packed_list_get_remove_by_rank_range() singe result type %d not supported with INVERTED flag", result->type); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + } + else { + order_index_init_ref(&ret_idx, &heap._, heap.filled, rm_count); + } + + uint32_t rm_sz = 0; + + if (b) { + int ret = packed_list_remove_by_mask(list, b, alloc_buf, rm_mask, + rm_count, &rm_sz); + + if (ret != AS_PROTO_RESULT_OK) { + return ret; + } + } + else { + packed_list_partial_offidx_update(list); + } + + switch (result->type) { + case RESULT_TYPE_NONE: + case RESULT_TYPE_COUNT: + case RESULT_TYPE_RANK: + case RESULT_TYPE_REVRANK: + case RESULT_TYPE_RANK_RANGE: + case RESULT_TYPE_REVRANK_RANGE: + return result_data_set_range(result, rank, count32, list->ele_count); + case RESULT_TYPE_INDEX: + case RESULT_TYPE_REVINDEX: + result_data_set_int_list_by_mask(result, rm_mask, rm_count, + list->ele_count); + break; + case RESULT_TYPE_VALUE: + if (inverted) { + list_result_data_set_values_by_mask(result, rm_mask, + &list->full_offidx, rm_count, rm_sz); + } + else if (! 
list_result_data_set_values_by_ordidx(result, &ret_idx, + &list->full_offidx, rm_count, rm_sz)) { + cf_warning(AS_PARTICLE, "packed_list_get_remove_by_rank_range() invalid packed list"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + break; + case RESULT_TYPE_INDEX_RANGE: + case RESULT_TYPE_REVINDEX_RANGE: + default: + cf_warning(AS_PARTICLE, "packed_list_get_remove_by_rank_range() result_type %d not supported", result->type); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + return AS_PROTO_RESULT_OK; +} + +static int +packed_list_get_remove_all_by_value_list_ordered(const packed_list *list, + as_bin *b, rollback_alloc *alloc_buf, as_unpacker *items_pk, + uint32_t items_count, cdt_result_data *result) +{ + cf_assert(result->is_multi, AS_PARTICLE, "not supported"); + + define_order_index2(rm_rc, list->ele_count, 2 * items_count); + uint32_t rm_count = 0; + + for (uint32_t i = 0; i < items_count; i++) { + cdt_payload value = { items_pk->buffer + items_pk->offset }; + int64_t sz = as_unpack_size(items_pk); + + if (sz <= 0) { + cf_warning(AS_PARTICLE, "packed_list_get_remove_all_by_value_list_ordered() invalid list"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + value.sz = (uint32_t)sz; + + uint32_t rank; + uint32_t count; + + if (! packed_list_find_rank_range_by_value_interval_ordered(list, + &value, &value, &rank, &count, true)) { + cf_warning(AS_PARTICLE, "packed_list_get_remove_all_by_value_list_ordered() invalid list"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + order_index_set(&rm_rc, 2 * i, rank); + order_index_set(&rm_rc, (2 * i) + 1, count); + rm_count += count; + } + + bool inverted = result_data_is_inverted(result); + uint32_t rm_sz = 0; + bool need_mask = (b || (inverted && + (result_data_is_return_elements(result) || + result_data_is_return_rank(result) || + result_data_is_return_index(result)))); + cond_define_cdt_idx_mask(rm_mask, list->ele_count, need_mask); + + if (inverted) { + if (! 
list_full_offset_index_fill_all(list_full_offidx_p(list))) { + cf_warning(AS_PARTICLE, "packed_list_get_remove_all_by_value_list_ordered() invalid list"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + rm_count = list->ele_count - rm_count; + } + + if (need_mask) { + cdt_idx_mask_set_by_irc(rm_mask, &rm_rc, NULL, inverted); + } + + if (b) { + if (rm_count == list->ele_count) { + as_bin_set_ordered_empty_list(b, alloc_buf); + } + else if (rm_count != 0) { + int ret = packed_list_remove_by_mask(list, b, alloc_buf, rm_mask, + rm_count, &rm_sz); + + if (ret != AS_PROTO_RESULT_OK) { + return ret; + } + } + else { + packed_list_partial_offidx_update(list); + } + } + else { + packed_list_partial_offidx_update(list); + } + + switch (result->type) { + case RESULT_TYPE_NONE: + break; + case RESULT_TYPE_COUNT: + as_bin_set_int(result->result, rm_count); + break; + case RESULT_TYPE_INDEX: + case RESULT_TYPE_REVINDEX: + case RESULT_TYPE_RANK: + case RESULT_TYPE_REVRANK: + if (inverted) { + result_data_set_int_list_by_mask(result, rm_mask, rm_count, + list->ele_count); + } + else { + result_data_set_by_irc(result, &rm_rc, NULL, rm_count); + } + break; + case RESULT_TYPE_VALUE: { + if (inverted) { + list_result_data_set_values_by_mask(result, rm_mask, &list->offidx, + rm_count, rm_sz); + } + else { + list_result_data_set_values_by_idxcount(result, &rm_rc, + &list->offidx); + } + break; + } + default: + cf_warning(AS_PARTICLE, "packed_list_get_remove_all_by_value_list_ordered() result_type %d not supported", result->type); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + +#ifdef LIST_DEBUG_VERIFY + if (! 
list_verify(b)) { + cdt_bin_print(b, "packed_list_get_remove_all_by_value_list_ordered"); + list_print(list, "original"); + cf_crash(AS_PARTICLE, "all_by_value_list_ordered: ele_count %u items_count %u rm_count %u", list->ele_count, items_count, rm_count); + } +#endif + + return AS_PROTO_RESULT_OK; +} + +static int +packed_list_get_remove_all_by_value_list(const packed_list *list, as_bin *b, + rollback_alloc *alloc_buf, const cdt_payload *value_list, + cdt_result_data *result) +{ + if (result_data_is_return_rank_range(result) || + result_data_is_return_index_range(result)) { + cf_warning(AS_PARTICLE, "packed_list_op_get_remove_all_by_value_list() result_type %d not supported", result->type); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + as_unpacker items_pk; + uint32_t items_count; + + if (! list_param_parse(value_list, &items_pk, &items_count)) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + bool inverted = result_data_is_inverted(result); + + if (items_count == 0) { + if (! inverted) { + if (! list_result_data_set_not_found(result, 0)) { + cf_warning(AS_PARTICLE, "packed_list_get_remove_all_by_value_list() invalid result type %d", result->type); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + return AS_PROTO_RESULT_OK; + } + + result->flags &= ~AS_CDT_OP_FLAG_INVERTED; + + return packed_list_get_remove_by_index_range(list, b, alloc_buf, 0, + list->ele_count, result); + } + + vla_list_full_offidx_if_invalid(full, list); + + if (list_is_ordered(list)) { + return packed_list_get_remove_all_by_value_list_ordered(list, b, + alloc_buf, &items_pk, items_count, result); + } + + bool is_ret_rank = result_data_is_return_rank(result); + uint32_t rm_count = 0; + define_order_index(value_list_ordidx, items_count); + define_cdt_idx_mask(rm_mask, list->ele_count); + cond_vla_order_index2(rc, list->ele_count, items_count * 2, is_ret_rank); + + if (! 
offset_index_find_items(full.offidx, + CDT_FIND_ITEMS_IDXS_FOR_LIST_VALUE, &items_pk, &value_list_ordidx, + inverted, rm_mask, &rm_count, is_ret_rank ? &rc.ordidx : NULL)) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + uint32_t rm_sz = 0; + + if (b) { + if (rm_count == list->ele_count) { + as_bin_set_unordered_empty_list(b, alloc_buf); + } + else if (rm_count != 0) { + int ret = packed_list_remove_by_mask(list, b, alloc_buf, rm_mask, + rm_count, &rm_sz); + + if (ret != AS_PROTO_RESULT_OK) { + return ret; + } + } + else { + packed_list_partial_offidx_update(list); + } + } + else { + packed_list_partial_offidx_update(list); + } + + switch (result->type) { + case RESULT_TYPE_NONE: + break; + case RESULT_TYPE_INDEX: + case RESULT_TYPE_REVINDEX: + result_data_set_int_list_by_mask(result, rm_mask, rm_count, + list->ele_count); + break; + case RESULT_TYPE_RANK: + case RESULT_TYPE_REVRANK: + result_data_set_by_itemlist_irc(result, &value_list_ordidx, + &rc.ordidx, rm_count); + break; + case RESULT_TYPE_COUNT: + as_bin_set_int(result->result, rm_count); + break; + case RESULT_TYPE_VALUE: { + list_result_data_set_values_by_mask(result, rm_mask, full.offidx, + rm_count, rm_sz); + break; + } + default: + cf_warning(AS_PARTICLE, "packed_list_op_get_remove_all_by_value_list() result_type %d not supported", result->type); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + return AS_PROTO_RESULT_OK; +} + +static int +packed_list_insert(const packed_list *list, as_bin *b, + rollback_alloc *alloc_buf, int64_t index, const cdt_payload *payload, + bool payload_is_list, uint64_t mod_flags, cdt_result_data *result) +{ + uint32_t param_count = 1; + uint32_t payload_hdr_sz = 0; + + if (payload_is_list) { + int64_t payload_count = + as_unpack_buf_list_element_count(payload->ptr, payload->sz); + + if (payload_count < 0) { + cf_warning(AS_PARTICLE, "packed_list_insert() invalid payload, expected a list"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + if (payload_count == 0) { + 
result_data_set_int(result, list->ele_count); + return AS_PROTO_RESULT_OK; + } + + param_count = (uint32_t)payload_count; + payload_hdr_sz = as_pack_list_header_get_size((uint32_t)payload_count); + + if (payload_hdr_sz > payload->sz) { + cf_warning(AS_PARTICLE, "packed_list_insert() invalid list header: payload->size=%d", payload->sz); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + } + + if (index > INT32_MAX || (index = calc_index(index, list->ele_count)) < 0) { + cf_warning(AS_PARTICLE, "packed_list_insert() index %ld out of bounds for ele_count %d", index > 0 ? index : index - list->ele_count, list->ele_count); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + if (mod_flags_is_bounded(mod_flags) && (uint32_t)index > list->ele_count) { + result_data_set_int(result, list->ele_count); + return AS_PROTO_RESULT_OK; // no-op + } + + uint32_t rm_sz = 0; + uint32_t rm_count = 0; + bool is_unique = mod_flags_is_unique(mod_flags); + cond_define_cdt_idx_mask(rm_mask, param_count, is_unique); + + if (is_unique) { + // Assume only here for the unordered case. + if (payload_is_list) { + as_unpacker pk = { + .buffer = payload->ptr + payload_hdr_sz, + .length = payload->sz - payload_hdr_sz + }; + + for (uint32_t i = 0; i < param_count; i++) { + cdt_payload val = { pk.buffer + pk.offset }; + int64_t sz = as_unpack_size(&pk); + uint32_t rank; + uint32_t count; + + if (sz <= 0) { + cf_warning(AS_PARTICLE, "packed_list_insert() invalid parameters"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + val.sz = (uint32_t)sz; + + if (! 
packed_list_find_rank_range_by_value_interval_unordered( + list, &val, &val, &rank, &count, NULL, false, false)) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + if (count == 0) { + as_unpacker cmp0 = { + .buffer = val.ptr, + .length = val.sz + }; + + as_unpacker cmp1 = pk; + bool found = false; + + cmp1.offset = 0; + + for (uint32_t j = 0; j < i; j++) { + cmp0.offset = 0; + + msgpack_compare_t cmp = as_unpack_compare(&cmp0, &cmp1); + + if (cmp == MSGPACK_COMPARE_EQUAL) { + rm_sz += val.sz; + rm_count++; + found = true; + break; + } + } + + if (! found) { + cdt_idx_mask_set(rm_mask, i); + } + } + else { + // TODO - support NOFAIL + //rm_sz += val.sz; + //rm_count++; + as_bin_set_int(result->result, list->ele_count); + return mod_flags_return_exists(mod_flags); + } + } + + if (param_count == rm_count) { + as_bin_set_int(result->result, list->ele_count); + return mod_flags_return_exists(mod_flags); + } + } + else { + uint32_t rank; + uint32_t count; + + if (! packed_list_find_rank_range_by_value_interval_unordered(list, + payload, payload, &rank, &count, NULL, false, false)) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + if (count != 0) { + as_bin_set_int(result->result, list->ele_count); + return mod_flags_return_exists(mod_flags); + } + } + } + + uint32_t uindex = (uint32_t)index; + define_packed_list_op(op, list); + uint32_t insert_sz = payload->sz - payload_hdr_sz - rm_sz; + uint32_t add_count = param_count - rm_count; + + if (! 
packed_list_op_insert(&op, uindex, add_count, insert_sz)) { + cf_warning(AS_PARTICLE, "packed_list_insert() packed_list_op_insert failed"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + uint8_t *ptr = list_setup_bin(b, alloc_buf, list->ext_flags, + op.new_content_sz, op.new_ele_count, uindex, &list->offidx, NULL); + + ptr += packed_list_op_write_seg1(&op, ptr); + + const uint8_t *p = payload->ptr + payload_hdr_sz; + + if (rm_sz == 0) { + uint32_t sz = payload->sz - payload_hdr_sz; + + memcpy(ptr, p, sz); + ptr += sz; + } + else { + as_unpacker pk = { + .buffer = payload->ptr + payload_hdr_sz, + .length = payload->sz - payload_hdr_sz + }; + + uint32_t idx = 0; + + for (uint32_t i = 0; i < add_count; i++) { + uint32_t next = cdt_idx_mask_find(rm_mask, idx, param_count, false); + uint32_t skip = next - idx; + + for (uint32_t j = 0; j < skip; j++) { + as_unpack_size(&pk); + } + + const uint8_t *begin = pk.buffer + pk.offset; + size_t sz = (size_t)as_unpack_size(&pk); + + memcpy(ptr, begin, sz); + ptr += sz; + idx = next + 1; + } + } + + packed_list_op_write_seg2(&op, ptr); + result_data_set_int(result, op.new_ele_count); + +#ifdef LIST_DEBUG_VERIFY + if (! list_verify(b)) { + cdt_bin_print(b, "packed_list_insert"); + } +#endif + + return AS_PROTO_RESULT_OK; +} + +static int +packed_list_add_ordered(const packed_list *list, as_bin *b, + rollback_alloc *alloc_buf, const cdt_payload *payload, bool unique, + cdt_result_data *result) +{ + vla_list_full_offidx_if_invalid(full, list); + + order_index_find find = { + .target = list->ele_count + 1 + }; + + if (! 
packed_list_find_by_value_ordered(list, payload, &find)) { + cf_warning(AS_PARTICLE, "packed_list_add_ordered() invalid list"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + if (find.found && unique) { + return -AS_PROTO_RESULT_FAIL_ELEMENT_EXISTS; + } + + return packed_list_insert(list, b, alloc_buf, (int64_t)find.result, payload, + false, AS_CDT_LIST_MODIFY_DEFAULT, result); +} + +static int +packed_list_add_items_ordered(const packed_list *list, as_bin *b, + rollback_alloc *alloc_buf, const cdt_payload *items, bool unique, + cdt_result_data *result) +{ + int64_t add_count = as_unpack_buf_list_element_count(items->ptr, items->sz); + + if (add_count < 0) { + cf_warning(AS_PARTICLE, "packed_list_add_items_ordered() invalid payload, expected a list"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + if (add_count == 0) { + result_data_set_int(result, list->ele_count); + return AS_PROTO_RESULT_OK; // no-op + } + + uint32_t val_count = (uint32_t)add_count; + uint32_t hdr_sz = as_pack_list_header_get_size(val_count); + + if (hdr_sz > items->sz) { + cf_warning(AS_PARTICLE, "packed_list_add_items_ordered() invalid list header: payload->size=%d", items->sz); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + // Sort items to add. + define_order_index(val_ord, val_count); + define_offset_index(val_off, items->ptr + hdr_sz, items->sz - hdr_sz, + val_count); + + if (! list_full_offset_index_fill_all(&val_off) || + ! 
list_order_index_sort(&val_ord, &val_off, + AS_CDT_SORT_ASCENDING)) { + cf_warning(AS_PARTICLE, "packed_list_add_items_ordered() invalid list"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + if (unique) { + uint32_t rm_count; + uint32_t rm_sz; + bool success = order_index_sorted_mark_dup_eles(&val_ord, &val_off, + &rm_count, &rm_sz); + cf_assert(success, AS_PARTICLE, "remove dup failed"); + } + + vla_list_full_offidx_if_invalid(full, list); + define_order_index2(insert_idx, list->ele_count, val_count); + uint32_t new_content_sz = list->content_sz; + uint32_t new_ele_count = list->ele_count; + + for (uint32_t i = 0; i < val_count; i++) { + uint32_t val_idx = order_index_get(&val_ord, i); + + if (val_idx == val_count) { + continue; + } + + uint32_t off = offset_index_get_const(&val_off, val_idx); + uint32_t sz = offset_index_get_delta_const(&val_off, val_idx); + + const cdt_payload value = { + .ptr = items->ptr + hdr_sz + off, + .sz = sz + }; + + order_index_find find = { + .target = list->ele_count + 1 + }; + + if (! packed_list_find_by_value_ordered(list, &value, &find)) { + cf_warning(AS_PARTICLE, "packed_list_add_items_ordered() invalid list"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + if (unique && find.found) { + // TODO - order_index_set(&val_ord, i, val_count) for NOFAIL later. + return -AS_PROTO_RESULT_FAIL_ELEMENT_EXISTS; + } + else { + order_index_set(&insert_idx, i, find.result); + new_content_sz += sz; + new_ele_count++; + } + } + + if (! list_full_offset_index_fill_all(full.offidx)) { + cf_warning(AS_PARTICLE, "packed_list_add_items_ordered() invalid list"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + // Construct new list. 
+ offset_index new_offidx; + uint8_t *ptr = list_setup_bin(b, alloc_buf, list->ext_flags, new_content_sz, + new_ele_count, 0, &list->offidx, &new_offidx); + + uint32_t list_start = 0; + uint32_t new_idx = 0; + uint32_t cpy_delta = 0; + uint32_t cur_offset = 0; + + for (uint32_t i = 0; i < val_count; i++) { + uint32_t val_idx = order_index_get(&val_ord, i); + + if (val_idx == val_count) { + continue; + } + + uint32_t list_idx = order_index_get(&insert_idx, i); + + if (list_idx > list_start) { + uint32_t off0 = offset_index_get_const(&list->offidx, list_start); + uint32_t off1 = offset_index_get_const(&list->offidx, list_idx); + uint32_t seg_count = list_idx - list_start; + uint32_t seg_sz = off1 - off0; + + memcpy(ptr, list->contents + off0, seg_sz); + ptr += seg_sz; + offset_index_copy(&new_offidx, &list->offidx, new_idx, list_start, + seg_count, cpy_delta); + list_start = list_idx; + new_idx += seg_count; + cur_offset = off1 + cpy_delta; + } + + offset_index_set(&new_offidx, new_idx++, cur_offset); + + uint32_t off = offset_index_get_const(&val_off, val_idx); + uint32_t val_sz = offset_index_get_delta_const(&val_off, val_idx); + + memcpy(ptr, items->ptr + hdr_sz + off, val_sz); + ptr += val_sz; + cpy_delta += val_sz; + cur_offset += val_sz; + } + + if (list_start < list->ele_count && list->ele_count != 0) { + uint32_t off = offset_index_get_const(&list->offidx, list_start); + uint32_t seg_count = list->ele_count - list_start; + + memcpy(ptr, list->contents + off, list->content_sz - off); + offset_index_copy(&new_offidx, &list->offidx, new_idx, list_start, + seg_count, cpy_delta); + } + + offset_index_set_filled(&new_offidx, new_ele_count); + result_data_set_int(result, new_ele_count); + +#ifdef LIST_DEBUG_VERIFY + if (! 
list_verify(b)) { + cdt_bin_print(b, "packed_list_add_items_ordered"); + list_print(list, "original"); + cf_crash(AS_PARTICLE, "add_items_ordered: val_count %u", val_count); + } +#endif + + return AS_PROTO_RESULT_OK; +} + +static int +packed_list_replace_ordered(const packed_list *list, as_bin *b, + rollback_alloc *alloc_buf, uint32_t index, const cdt_payload *value, + uint64_t mod_flags) +{ + uint32_t rank; + uint32_t count; + vla_list_full_offidx_if_invalid(u, list); + + if (! packed_list_find_rank_range_by_value_interval_ordered(list, + value, value, &rank, &count, false)) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + define_packed_list_op(op, list); + + if (index > list->ele_count) { + cf_warning(AS_PARTICLE, "packed_list_replace_ordered() index %u > ele_count %u out of bounds not allowed for ORDERED lists", index, list->ele_count); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + if (! packed_list_op_remove(&op, index, 1)) { + cf_warning(AS_PARTICLE, "packed_list_replace_ordered() as_packed_list_remove failed"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + if (mod_flags_is_unique(mod_flags) && count != 0) { + if (rank == index) { // uniquely replacing element with same value + return AS_PROTO_RESULT_OK; // no-op + } + + return mod_flags_return_exists(mod_flags); + } + + uint32_t new_ele_count = list->ele_count; + + op.new_content_sz += value->sz; + + if (index == list->ele_count) { + new_ele_count++; + } + + uint8_t *ptr = list_setup_bin(b, alloc_buf, list->ext_flags, + op.new_content_sz, new_ele_count, (rank < index) ? 
rank : index, + &list->offidx, NULL); + uint32_t offset = offset_index_get_const(u.offidx, rank); + + if (rank <= index) { + uint32_t tail_sz = op.seg1_sz - offset; + + memcpy(ptr, list->contents, offset); + ptr += offset; + memcpy(ptr, value->ptr, value->sz); + ptr += value->sz; + memcpy(ptr, list->contents + offset, tail_sz); + ptr += tail_sz; + packed_list_op_write_seg2(&op, ptr); + } + else if (op.seg2_sz == 0) { + ptr += packed_list_op_write_seg1(&op, ptr); + memcpy(ptr, value->ptr, value->sz); + } + else { + uint32_t head_sz = offset - op.seg2_offset; + uint32_t tail_sz = op.seg2_sz - head_sz; + + ptr += packed_list_op_write_seg1(&op, ptr); + memcpy(ptr, list->contents + op.seg2_offset, head_sz); + ptr += head_sz; + memcpy(ptr, value->ptr, value->sz); + ptr += value->sz; + memcpy(ptr, list->contents + offset, tail_sz); + } + + return AS_PROTO_RESULT_OK; +} + +//---------------------------------------------------------- +// packed_list_op +// + +static void +packed_list_op_init(packed_list_op *op, const packed_list *list) +{ + memset(op, 0, sizeof(packed_list_op)); + op->list = list; +} + +// Calculate a packed list split via insert op. +// Return true on success. 
+static bool +packed_list_op_insert(packed_list_op *op, uint32_t index, uint32_t count, + uint32_t insert_sz) +{ + uint32_t ele_count = op->list->ele_count; + + if (index >= ele_count) { // insert off the end + if (index + count >= INT32_MAX) { + cf_warning(AS_PARTICLE, "as_packed_list_insert() index %u + count %u overflow", index, count); + return false; + } + + op->new_ele_count = index + count; + op->nil_ele_sz = index - ele_count; + + op->seg1_sz = op->list->content_sz; + op->seg2_sz = 0; + } + else { // insert front or middle + op->new_ele_count = ele_count + count; + op->nil_ele_sz = 0; + uint32_t offset = packed_list_find_idx_offset(op->list, index); + + if (index != 0 && offset == 0) { + return false; + } + + op->seg1_sz = offset; + op->seg2_offset = offset; + op->seg2_sz = op->list->content_sz - offset; + } + + op->new_content_sz = op->seg1_sz + op->nil_ele_sz + insert_sz + op->seg2_sz; + + return true; +} + +// Calculate a packed list split via remove op. +// Assume count != 0. +// Return true on success. 
+static bool +packed_list_op_remove(packed_list_op *op, uint32_t index, uint32_t count) +{ + uint32_t ele_count = op->list->ele_count; + + if (index >= ele_count) { // nothing to remove + op->seg1_sz = op->list->content_sz; + op->seg2_sz = 0; + op->new_ele_count = ele_count; + op->new_content_sz = op->list->content_sz; + + return true; + } + + uint32_t offset = packed_list_find_idx_offset(op->list, index); + + if (index != 0 && offset == 0) { + return false; + } + + if (count >= ele_count - index) { // remove tail elements + op->new_ele_count = index; + + op->seg1_sz = offset; + op->seg2_offset = 0; + op->seg2_sz = 0; + } + else { // remove front or middle + op->new_ele_count = ele_count - count; + op->seg1_sz = offset; + + uint32_t end_off = packed_list_find_idx_offset_continue(op->list, + index + count, index, offset); + + if (end_off == 0) { + return false; + } + + op->seg2_offset = end_off; + op->seg2_sz = op->list->content_sz - end_off; + } + + op->new_content_sz = op->seg1_sz + op->seg2_sz; + + return true; +} + +// Write segment 1 and trailing nils if any. +// Return number of bytes written. +static uint32_t +packed_list_op_write_seg1(const packed_list_op *op, uint8_t *buf) +{ + memcpy(buf, op->list->contents, op->seg1_sz); + + if (op->nil_ele_sz == 0) { + return op->seg1_sz; + } + + buf += op->seg1_sz; + memset(buf, msgpack_nil[0], op->nil_ele_sz); + + return op->seg1_sz + op->nil_ele_sz; +} + +// Write segment 2 if any. +// Return number of bytes written. 
+static uint32_t +packed_list_op_write_seg2(const packed_list_op *op, uint8_t *buf) +{ + if (op->seg2_sz == 0) { + return 0; + } + + memcpy(buf, op->list->contents + op->seg2_offset, op->seg2_sz); + + return op->seg2_sz; +} + +static bool +packed_list_builder_add_ranks_by_range(const packed_list *list, + cdt_container_builder *builder, as_unpacker *start, uint32_t count, + bool reverse) +{ + for (uint32_t i = 0; i < count; i++) { + cdt_payload value = { + .ptr = start->buffer + start->offset + }; + + int64_t sz = as_unpack_size(start); + uint32_t rank; + uint32_t rcount; + + if (sz <= 0) { + return false; + } + + value.sz = (uint32_t)sz; + + if (! packed_list_find_rank_range_by_value_interval_unordered(list, + &value, &value, &rank, &rcount, NULL, false, false)) { + return false; + } + + cdt_container_builder_add_int64(builder, + reverse ? list->ele_count - rank - 1 : rank); + } + + return true; +} + +//---------------------------------------------------------- +// list +// + +// Create a non-indexed list. +// If alloc_buf is NULL, memory is reserved using cf_malloc. 
+static list_mem * +list_create(rollback_alloc *alloc_buf, uint32_t ele_count, uint32_t content_sz) +{ + uint32_t hdr_sz = as_pack_list_header_get_size(ele_count); + uint32_t sz = hdr_sz + content_sz; + list_mem *p_list_mem = (list_mem *)rollback_alloc_reserve(alloc_buf, + sizeof(list_mem) + sz); + + p_list_mem->type = AS_PARTICLE_TYPE_LIST; + p_list_mem->sz = sz; + + return p_list_mem; +} + +static as_particle * +list_simple_create_from_buf(rollback_alloc *alloc_buf, uint32_t ele_count, + const uint8_t *contents, uint32_t content_sz) +{ + list_mem *p_list_mem = list_create(alloc_buf, ele_count, content_sz); + + if (p_list_mem) { + uint32_t hdr_sz = list_pack_header(p_list_mem->data, ele_count); + + if (content_sz > 0 && contents) { + memcpy(p_list_mem->data + hdr_sz, contents, content_sz); + } + } + + return (as_particle *)p_list_mem; +} + +static as_particle * +list_simple_create(rollback_alloc *alloc_buf, uint32_t ele_count, + uint32_t content_sz, uint8_t **contents_r) +{ + list_mem *p_list_mem = list_create(alloc_buf, ele_count, content_sz); + uint32_t hdr_sz = list_pack_header(p_list_mem->data, ele_count); + + *contents_r = p_list_mem->data + hdr_sz; + + return (as_particle *)p_list_mem; +} + +static int +list_set_flags(as_bin *b, rollback_alloc *alloc_buf, uint8_t set_flags, + cdt_result_data *result) +{ + packed_list list; + + if (! packed_list_init_from_bin(&list, b)) { + cf_warning(AS_PARTICLE, "list_set_flags() invalid packed list"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + bool reorder = false; + bool was_ordered = list_is_ordered(&list); + + if (flags_is_ordered(set_flags)) { + if (was_ordered) { + return AS_PROTO_RESULT_OK; // no-op + } + + if (list.ele_count > 1) { + reorder = true; + } + } + else { + if (! was_ordered) { + return AS_PROTO_RESULT_OK; // no-op + } + } + + offset_index new_offidx; + uint8_t * const ptr = list_setup_bin(b, alloc_buf, set_flags, + list.content_sz, list.ele_count, reorder ? 
0 : list.ele_count, + &list.offidx, &new_offidx); + + if (! reorder) { + memcpy(ptr, list.contents, list.content_sz); + } + else { + vla_list_full_offidx_if_invalid(full, &list); + + if (! list_full_offset_index_fill_all(full.offidx)) { + cf_warning(AS_PARTICLE, "list_set_flags() invalid list"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + define_order_index(ordidx, list.ele_count); + + if (! list_order_index_sort(&ordidx, full.offidx, + AS_CDT_SORT_ASCENDING)) { + cf_warning(AS_PARTICLE, "list_set_flags() invalid list"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + list_order_index_pack(&ordidx, full.offidx, ptr, &new_offidx); + } + +#ifdef LIST_DEBUG_VERIFY + if (! list_verify(b)) { + cdt_bin_print(b, "set_flags"); + list_print(&list, "original"); + cf_crash(AS_PARTICLE, "set_flags: set_flags %u", set_flags); + } +#endif + + return AS_PROTO_RESULT_OK; +} + +static int +list_append(as_bin *b, rollback_alloc *alloc_buf, const cdt_payload *payload, + bool payload_is_list, uint64_t mod_flags, cdt_result_data *result) +{ + packed_list list; + + if (! packed_list_init_from_bin(&list, b)) { + cf_warning(AS_PARTICLE, "list_append() invalid packed list"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + if (list_is_ordered(&list)) { + bool add_unique = mod_flags_is_unique(mod_flags); + + if (! payload_is_list) { + return packed_list_add_ordered(&list, b, alloc_buf, payload, + add_unique, result); + } + + return packed_list_add_items_ordered(&list, b, alloc_buf, payload, + add_unique, result); + } + + return packed_list_insert(&list, b, alloc_buf, (int64_t)list.ele_count, + payload, payload_is_list, mod_flags, result); +} + +static int +list_insert(as_bin *b, rollback_alloc *alloc_buf, int64_t index, + const cdt_payload *payload, bool payload_is_list, uint64_t mod_flags, + cdt_result_data *result) +{ + packed_list list; + + if (! 
packed_list_init_from_bin(&list, b)) { + cf_warning(AS_PARTICLE, "list_insert() invalid list"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + if (list_is_ordered(&list)) { + cf_warning(AS_PARTICLE, "list_insert() invalid op on ORDERED list"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + return packed_list_insert(&list, b, alloc_buf, index, payload, + payload_is_list, mod_flags, result); +} + +static int +list_set(as_bin *b, rollback_alloc *alloc_buf, int64_t index, + const cdt_payload *value, uint64_t mod_flags) +{ + packed_list list; + + if (! packed_list_init_from_bin(&list, b)) { + cf_warning(AS_PARTICLE, "list_set() invalid list"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + if (list_is_ordered(&list)) { + cf_warning(AS_PARTICLE, "list_set() invalid op on ORDERED list"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + uint32_t ele_count = list.ele_count; + + if (index >= ele_count) { + return packed_list_insert(&list, b, alloc_buf, index, value, false, + mod_flags, NULL); + } + + if (index > UINT32_MAX || (index = calc_index(index, ele_count)) < 0) { + cf_warning(AS_PARTICLE, "list_set() index %ld out of bounds for ele_count %d", index > 0 ? index : index - ele_count, ele_count); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + if (mod_flags_is_unique(mod_flags)) { + uint32_t rank; + uint32_t count; + uint64_t idx; + + // Use non-multi-find scan to optimize for 0 or 1 copies of element. + // 2 or more copies will result in an additional multi-find scan below. + if (! packed_list_find_rank_range_by_value_interval_unordered(&list, + value, value, &rank, &count, &idx, false, false)) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + if (count != 0) { + if (idx != (uint64_t)index) { + return mod_flags_return_exists(mod_flags); + } + + // Need second scan since the dup found is at the index being set. + if (! 
packed_list_find_rank_range_by_value_interval_unordered(&list, + value, value, &rank, &count, NULL, false, true)) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + if (count > 1) { + return mod_flags_return_exists(mod_flags); + } + } + } + + uint32_t uindex = (uint32_t)index; + define_packed_list_op(op, &list); + + if (! packed_list_op_remove(&op, uindex, 1)) { + cf_warning(AS_PARTICLE, "list_set() as_packed_list_remove failed"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + op.new_content_sz += value->sz; + + uint8_t *ptr = list_setup_bin(b, alloc_buf, list.ext_flags, + op.new_content_sz, ele_count, uindex, &list.offidx, NULL); + + ptr += packed_list_op_write_seg1(&op, ptr); + + memcpy(ptr, value->ptr, value->sz); + ptr += value->sz; + + packed_list_op_write_seg2(&op, ptr); + + return AS_PROTO_RESULT_OK; +} + +static int +list_increment(as_bin *b, rollback_alloc *alloc_buf, int64_t index, + cdt_payload *delta_value, uint64_t mod_flags, cdt_result_data *result) +{ + packed_list list; + + if (! packed_list_init_from_bin(&list, b)) { + cf_warning(AS_PARTICLE, "list_increment() invalid list"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + if (index > INT32_MAX || (index = calc_index(index, list.ele_count)) < 0) { + cf_warning(AS_PARTICLE, "list_increment() index %ld out of bounds for ele_count %d", index > 0 ? index : index - list.ele_count, list.ele_count); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + uint32_t uindex = (uint32_t)index; + cdt_calc_delta calc_delta; + + if (! cdt_calc_delta_init(&calc_delta, delta_value, false)) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + if (uindex < list.ele_count) { + uint32_t offset = packed_list_find_idx_offset(&list, uindex); + + if (uindex != 0 && offset == 0) { + cf_warning(AS_PARTICLE, "list_increment() unable to unpack element at %u", uindex); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + as_unpacker pk = { + .buffer = list.contents + offset, + .length = list.content_sz - offset + }; + + if (! 
cdt_calc_delta_add(&calc_delta, &pk)) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + } + else { + if (! cdt_calc_delta_add(&calc_delta, NULL)) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + } + + uint8_t value_buf[CDT_MAX_PACKED_INT_SZ]; + cdt_payload value = { value_buf, CDT_MAX_PACKED_INT_SZ }; + + cdt_calc_delta_pack_and_result(&calc_delta, &value, result->result); + + if (list_is_ordered(&list)) { + return packed_list_replace_ordered(&list, b, alloc_buf, uindex, &value, + mod_flags); + } + + return list_set(b, alloc_buf, (int64_t)uindex, &value, mod_flags); +} + +static int +list_sort(as_bin *b, rollback_alloc *alloc_buf, as_cdt_sort_flags sort_flags) +{ + packed_list list; + + if (! packed_list_init_from_bin(&list, b)) { + cf_warning(AS_PARTICLE, "list_sort() invalid list"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + if (list.ele_count <= 1) { + return AS_PROTO_RESULT_OK; + } + + vla_list_full_offidx_if_invalid(full, &list); + + if (! list_full_offset_index_fill_all(full.offidx)) { + cf_warning(AS_PARTICLE, "list_sort() invalid list"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + define_order_index(ordidx, list.ele_count); + + if (list_is_ordered(&list)) { + for (uint32_t i = 0; i < list.ele_count; i++) { + order_index_set(&ordidx, i, i); + } + } + else if (! list_order_index_sort(&ordidx, full.offidx, sort_flags)) { + cf_warning(AS_PARTICLE, "list_sort() invalid list"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + uint32_t rm_count = 0; + uint32_t rm_sz = 0; + + if ((sort_flags & AS_CDT_SORT_DROP_DUPLICATES) != 0 && + ! 
order_index_sorted_mark_dup_eles(&ordidx, full.offidx, + &rm_count, &rm_sz)) { + cf_warning(AS_PARTICLE, "list_sort() invalid list"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + offset_index new_offidx; + uint8_t *ptr = list_setup_bin(b, alloc_buf, list.ext_flags, + list.content_sz - rm_sz, list.ele_count - rm_count, 0, &list.offidx, + &new_offidx); + + ptr = list_order_index_pack(&ordidx, full.offidx, ptr, &new_offidx); + cf_assert(ptr == ((list_mem *)b->particle)->data + ((list_mem *)b->particle)->sz, AS_PARTICLE, + "list_sort() pack mismatch ptr %p data %p sz %u [%p]", ptr, ((list_mem *)b->particle)->data, ((list_mem *)b->particle)->sz, ((list_mem *)b->particle)->data + ((list_mem *)b->particle)->sz); + + return AS_PROTO_RESULT_OK; +} + +static int +list_remove_by_index_range(as_bin *b, rollback_alloc *alloc_buf, int64_t index, + uint64_t count, cdt_result_data *result) +{ + packed_list list; + + if (! packed_list_init_from_bin(&list, b)) { + cf_warning(AS_PARTICLE, "list_remove_by_index_range() invalid list"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + return packed_list_get_remove_by_index_range(&list, b, alloc_buf, index, + count, result); +} + +static int +list_remove_by_value_interval(as_bin *b, rollback_alloc *alloc_buf, + const cdt_payload *value_start, const cdt_payload *value_end, + cdt_result_data *result) +{ + packed_list list; + + if (! packed_list_init_from_bin(&list, b)) { + cf_warning(AS_PARTICLE, "list_remove_by_value_interval() invalid packed list, ele_count=%d", list.ele_count); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + return packed_list_get_remove_by_value_interval(&list, b, alloc_buf, + value_start, value_end, result); +} + +static int +list_remove_by_rank_range(as_bin *b, rollback_alloc *alloc_buf, int64_t rank, + uint64_t count, cdt_result_data *result) +{ + packed_list list; + + if (! 
packed_list_init_from_bin(&list, b)) { + cf_warning(AS_PARTICLE, "list_remove_by_rank_range() invalid list"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + return packed_list_get_remove_by_rank_range(&list, b, alloc_buf, rank, + count, result); +} + +static int +list_remove_all_by_value_list(as_bin *b, rollback_alloc *alloc_buf, + const cdt_payload *value_list, cdt_result_data *result) +{ + packed_list list; + + if (! packed_list_init_from_bin(&list, b)) { + cf_warning(AS_PARTICLE, "list_remove_all_by_value_list() invalid list"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + return packed_list_get_remove_all_by_value_list(&list, b, alloc_buf, + value_list, result); +} + +// Return ptr to packed + ele_start. +static uint8_t * +list_setup_bin(as_bin *b, rollback_alloc *alloc_buf, uint8_t flags, + uint32_t content_sz, uint32_t ele_count, uint32_t idx_trunc, + const offset_index *old_offidx, offset_index *new_offidx) +{ + bool set_ordered = flags_is_ordered(flags); + uint32_t ext_content_sz = list_calc_ext_content_sz(ele_count, content_sz, + set_ordered); + uint32_t ext_sz = (ext_content_sz == 0 && ! set_ordered) ? + 0 : as_pack_ext_header_get_size(ext_content_sz) + ext_content_sz; + list_mem *p_list_mem = list_create(alloc_buf, + ele_count + (ext_sz == 0 ? 0 : 1), ext_sz + content_sz); + + cf_assert(p_list_mem, AS_PARTICLE, "p_list_mem NULL"); + b->particle = (as_particle *)p_list_mem; + + as_packer pk = { + .buffer = p_list_mem->data, + .capacity = p_list_mem->sz + }; + + if (ext_sz == 0) { + as_pack_list_header(&pk, ele_count); + + if (new_offidx) { + list_offset_index_init(new_offidx, NULL, ele_count, NULL, + content_sz); + } + + return pk.buffer + pk.offset; + } + + as_pack_list_header(&pk, ele_count + 1); + as_pack_ext_header(&pk, ext_content_sz, get_ext_flags(set_ordered)); + + uint8_t * const ptr = pk.buffer + pk.offset; + offset_index offidx_temp; + uint8_t * const contents = pk.buffer + pk.offset + ext_content_sz; + + if (! 
new_offidx) { + new_offidx = &offidx_temp; + } + + if (! set_ordered) { + list_offset_index_init(new_offidx, ptr, ele_count, contents, + content_sz); + idx_trunc /= PACKED_LIST_INDEX_STEP; + } + else { + list_full_offset_index_init(new_offidx, ptr, ele_count, contents, + content_sz); + } + + if (idx_trunc == 0 || ! old_offidx || offset_index_is_null(old_offidx)) { + offset_index_set_filled(new_offidx, 1); + } + else { + idx_trunc = MIN(idx_trunc, offset_index_get_filled(old_offidx)); + offset_index_copy(new_offidx, old_offidx, 0, 0, idx_trunc, 0); + offset_index_set_filled(new_offidx, idx_trunc); + } + + return contents; +} + + +//========================================================== +// cdt_list_builder +// + +void +cdt_list_builder_start(cdt_container_builder *builder, + rollback_alloc *alloc_buf, uint32_t ele_count, uint32_t max_sz) +{ + uint32_t sz = sizeof(list_mem) + sizeof(uint64_t) + 1 + max_sz; + list_mem *p_list_mem = (list_mem *)rollback_alloc_reserve(alloc_buf, sz); + + p_list_mem->type = AS_PARTICLE_TYPE_LIST; + p_list_mem->sz = list_pack_header(p_list_mem->data, ele_count); + + builder->particle = (as_particle *)p_list_mem; + builder->write_ptr = p_list_mem->data + p_list_mem->sz; + builder->ele_count = 0; + builder->sz = &p_list_mem->sz; +} + + +//========================================================== +// cdt_process_state_packed_list +// + +bool +cdt_process_state_packed_list_modify_optype(cdt_process_state *state, + cdt_modify_data *cdt_udata) +{ + as_bin *b = cdt_udata->b; + as_cdt_optype optype = state->type; + + if (as_bin_inuse(b) && ! is_list_type(as_bin_get_particle_type(b))) { + cf_warning(AS_PARTICLE, "cdt_process_state_packed_list_modify_optype() invalid type %d", as_bin_get_particle_type(b)); + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_INCOMPATIBLE_TYPE; + return false; + } + + define_rollback_alloc(alloc_buf, cdt_udata->alloc_buf, 5, true); + // Results always on the heap. 
+ define_rollback_alloc(alloc_result, NULL, 1, false); + int ret = AS_PROTO_RESULT_OK; + + cdt_result_data result = { + .result = cdt_udata->result, + .alloc = alloc_result, + }; + + switch (optype) { + case AS_CDT_OP_LIST_SET_TYPE: { + uint64_t list_type; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &list_type)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + as_bin_set_temp_list_if_notinuse(b, AS_PACKED_LIST_FLAG_NONE); + ret = list_set_flags(b, alloc_buf, (uint8_t)list_type, &result); + break; + } + case AS_CDT_OP_LIST_APPEND: { + cdt_payload value; + uint64_t create_type = AS_PACKED_LIST_FLAG_NONE; + uint64_t modify = AS_CDT_LIST_MODIFY_DEFAULT; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &value, &create_type, &modify)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + as_bin_set_temp_list_if_notinuse(b, create_type); + ret = list_append(b, alloc_buf, &value, false, modify, &result); + break; + } + case AS_CDT_OP_LIST_APPEND_ITEMS: { + cdt_payload items; + uint64_t create_type = AS_PACKED_LIST_FLAG_NONE; + uint64_t modify = AS_CDT_LIST_MODIFY_DEFAULT; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &items, &create_type, &modify)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + as_bin_set_temp_list_if_notinuse(b, create_type); + ret = list_append(b, alloc_buf, &items, true, modify, &result); + break; + } + case AS_CDT_OP_LIST_INSERT: { + int64_t index; + cdt_payload value; + uint64_t modify = AS_CDT_LIST_MODIFY_DEFAULT; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &index, &value, &modify)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + as_bin_set_temp_list_if_notinuse(b, AS_PACKED_LIST_FLAG_NONE); + ret = list_insert(b, alloc_buf, index, &value, false, modify, &result); + break; + } + case AS_CDT_OP_LIST_INSERT_ITEMS: { + int64_t index; + cdt_payload items; + uint64_t modify = AS_CDT_LIST_MODIFY_DEFAULT; + + if (! 
CDT_OP_TABLE_GET_PARAMS(state, &index, &items, &modify)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + as_bin_set_temp_list_if_notinuse(b, AS_PACKED_LIST_FLAG_NONE); + ret = list_insert(b, alloc_buf, index, &items, true, modify, &result); + break; + } + case AS_CDT_OP_LIST_SET: { + int64_t index; + cdt_payload value; + uint64_t modify = AS_CDT_LIST_MODIFY_DEFAULT; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &index, &value, &modify)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + as_bin_set_temp_list_if_notinuse(b, AS_PACKED_LIST_FLAG_NONE); + ret = list_set(b, alloc_buf, index, &value, modify); + break; + } + case AS_CDT_OP_LIST_REMOVE: + case AS_CDT_OP_LIST_POP: { + if (! as_bin_inuse(b)) { + return true; // no-op + } + + int64_t index; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &index)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + result_data_set(&result, optype == AS_CDT_OP_LIST_REMOVE ? + RESULT_TYPE_COUNT : RESULT_TYPE_VALUE, false); + ret = list_remove_by_index_range(b, alloc_buf, index, 1, &result); + break; + } + case AS_CDT_OP_LIST_REMOVE_RANGE: + case AS_CDT_OP_LIST_POP_RANGE: { + if (! as_bin_inuse(b)) { + return true; // no-op + } + + int64_t index; + uint64_t count = UINT32_MAX; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &index, &count)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + result_data_set(&result, optype == AS_CDT_OP_LIST_REMOVE_RANGE ? + RESULT_TYPE_COUNT : RESULT_TYPE_VALUE, true); + ret = list_remove_by_index_range(b, alloc_buf, index, count, &result); + break; + } + case AS_CDT_OP_LIST_TRIM: { + if (! as_bin_inuse(b)) { + return true; // no-op + } + + int64_t index; + uint64_t count = UINT32_MAX; + + if (! 
CDT_OP_TABLE_GET_PARAMS(state, &index, &count)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + result.type = RESULT_TYPE_COUNT; + result.flags = AS_CDT_OP_FLAG_INVERTED; + result.is_multi = true; + ret = list_remove_by_index_range(b, alloc_buf, index, count, &result); + break; + } + case AS_CDT_OP_LIST_CLEAR: { + if (! as_bin_inuse(b)) { + return true; // no-op + } + + packed_list list; + + if (! packed_list_init_from_bin(&list, b)) { + cf_warning(AS_PARTICLE, "LIST_CLEAR: invalid list"); + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + as_bin_set_empty_list(b, alloc_buf, list_is_ordered(&list)); + break; + } + case AS_CDT_OP_LIST_INCREMENT: { + int64_t index; + cdt_payload delta = { NULL }; + uint64_t create = AS_PACKED_LIST_FLAG_NONE; + uint64_t modify = AS_CDT_LIST_MODIFY_DEFAULT; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &index, &delta, &create, + &modify)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + as_bin_set_temp_list_if_notinuse(b, create); + ret = list_increment(b, alloc_buf, index, &delta, modify, &result); + break; + } + case AS_CDT_OP_LIST_SORT: { + if (! as_bin_inuse(b)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_INCOMPATIBLE_TYPE; + return false; + } + + uint64_t flags = 0; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &flags)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + ret = list_sort(b, alloc_buf, (as_cdt_sort_flags)flags); + break; + } + case AS_CDT_OP_LIST_REMOVE_BY_INDEX: { + if (! as_bin_inuse(b)) { + return true; // no-op + } + + uint64_t result_type; + int64_t index; + + if (! 
CDT_OP_TABLE_GET_PARAMS(state, &result_type, &index)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + result_data_set(&result, result_type, false); + ret = list_remove_by_index_range(b, alloc_buf, index, 1, &result); + break; + } + case AS_CDT_OP_LIST_REMOVE_ALL_BY_VALUE: + case AS_CDT_OP_LIST_REMOVE_BY_VALUE: { + if (! as_bin_inuse(b)) { + return true; // no-op + } + + uint64_t result_type; + cdt_payload value; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &result_type, &value)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + result_data_set(&result, result_type, + optype == AS_CDT_OP_LIST_REMOVE_ALL_BY_VALUE); + ret = list_remove_by_value_interval(b, alloc_buf, &value, &value, + &result); + break; + } + case AS_CDT_OP_LIST_REMOVE_BY_RANK: { + if (! as_bin_inuse(b)) { + return true; // no-op + } + + uint64_t result_type; + int64_t rank; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &result_type, &rank)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + result_data_set(&result, result_type, false); + ret = list_remove_by_rank_range(b, alloc_buf, rank, 1, &result); + break; + } + case AS_CDT_OP_LIST_REMOVE_ALL_BY_VALUE_LIST: { + if (! as_bin_inuse(b)) { + return true; // no-op + } + + uint64_t result_type; + cdt_payload items; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &result_type, &items)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + result_data_set(&result, result_type, true); + ret = list_remove_all_by_value_list(b, alloc_buf, &items, &result); + break; + } + case AS_CDT_OP_LIST_REMOVE_BY_INDEX_RANGE: { + if (! as_bin_inuse(b)) { + return true; // no-op + } + + uint64_t result_type; + int64_t index; + uint64_t count = UINT32_MAX; + + if (! 
CDT_OP_TABLE_GET_PARAMS(state, &result_type, &index, &count)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + result_data_set(&result, result_type, true); + ret = list_remove_by_index_range(b, alloc_buf, index, count, &result); + break; + } + case AS_CDT_OP_LIST_REMOVE_BY_VALUE_INTERVAL: { + if (! as_bin_inuse(b)) { + return true; // no-op + } + + uint64_t result_type; + cdt_payload value_start; + cdt_payload value_end = { NULL }; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &result_type, &value_start, + &value_end)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + result_data_set(&result, result_type, true); + ret = list_remove_by_value_interval(b, alloc_buf, &value_start, + &value_end, &result); + break; + } + case AS_CDT_OP_LIST_REMOVE_BY_RANK_RANGE: { + if (! as_bin_inuse(b)) { + return true; // no-op + } + + uint64_t result_type; + int64_t rank; + uint64_t count = UINT32_MAX; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &result_type, &rank, &count)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + result_data_set(&result, result_type, true); + ret = list_remove_by_rank_range(b, alloc_buf, rank, count, &result); + break; + } + default: + cf_warning(AS_PARTICLE, "cdt_process_state_packed_list_modify_optype() invalid cdt op: %d", optype); + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + if (ret != AS_PROTO_RESULT_OK) { + cf_warning(AS_PARTICLE, "%s: failed", cdt_process_state_get_op_name(state)); + cdt_udata->ret_code = ret; + rollback_alloc_rollback(alloc_result); + rollback_alloc_rollback(alloc_buf); + return false; + } + + // In case of no-op. 
+ if (b->particle == (const as_particle *)&list_mem_empty) { + as_bin_set_unordered_empty_list(b, alloc_buf); + } + else if (b->particle == (const as_particle *)&list_ordered_empty) { + as_bin_set_ordered_empty_list(b, alloc_buf); + } + + return true; +} + +bool +cdt_process_state_packed_list_read_optype(cdt_process_state *state, + cdt_read_data *cdt_udata) +{ + const as_bin *b = cdt_udata->b; + as_cdt_optype optype = state->type; + + if (! is_list_type(as_bin_get_particle_type(b))) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_INCOMPATIBLE_TYPE; + return false; + } + + packed_list list; + + if (! packed_list_init_from_bin(&list, b)) { + cf_warning(AS_PARTICLE, "%s: invalid list", cdt_process_state_get_op_name(state)); + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_INCOMPATIBLE_TYPE; + return false; + } + + // Just one entry needed for results bin. + define_rollback_alloc(alloc_result, NULL, 1, false); + int ret = AS_PROTO_RESULT_OK; + + cdt_result_data result = { + .result = cdt_udata->result, + .alloc = alloc_result, + }; + + switch (optype) { + case AS_CDT_OP_LIST_GET: { + int64_t index; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &index)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + result_data_set(&result, RESULT_TYPE_VALUE, false); + ret = packed_list_get_remove_by_index_range(&list, NULL, NULL, index, + 1, &result); + break; + } + case AS_CDT_OP_LIST_GET_RANGE: { + int64_t index; + uint64_t count = UINT32_MAX; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &index, &count)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + result_data_set(&result, RESULT_TYPE_VALUE, true); + ret = packed_list_get_remove_by_index_range(&list, NULL, NULL, index, + count, &result); + break; + } + case AS_CDT_OP_LIST_SIZE: { + as_bin_set_int(result.result, list.ele_count); + break; + } + case AS_CDT_OP_LIST_GET_BY_INDEX: { + uint64_t result_type; + int64_t index; + + if (! 
CDT_OP_TABLE_GET_PARAMS(state, &result_type, &index)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + result_data_set(&result, result_type, false); + ret = packed_list_get_remove_by_index_range(&list, NULL, NULL, index, + 1, &result); + break; + } + case AS_CDT_OP_LIST_GET_ALL_BY_VALUE: + case AS_CDT_OP_LIST_GET_BY_VALUE: { + uint64_t result_type; + cdt_payload value; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &result_type, &value)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + result_data_set(&result, result_type, + optype == AS_CDT_OP_LIST_GET_ALL_BY_VALUE); + ret = packed_list_get_remove_by_value_interval(&list, NULL, NULL, + &value, &value, &result); + break; + } + case AS_CDT_OP_LIST_GET_BY_RANK: { + uint64_t result_type; + int64_t rank; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &result_type, &rank)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + result_data_set(&result, result_type, false); + ret = packed_list_get_remove_by_rank_range(&list, NULL, NULL, rank, 1, + &result); + break; + } + case AS_CDT_OP_LIST_GET_ALL_BY_VALUE_LIST: { + uint64_t result_type; + cdt_payload value_list; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &result_type, &value_list)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + result_data_set(&result, result_type, true); + ret = packed_list_get_remove_all_by_value_list(&list, NULL, NULL, + &value_list, &result); + break; + } + case AS_CDT_OP_LIST_GET_BY_INDEX_RANGE: { + uint64_t result_type; + int64_t index; + uint64_t count = UINT32_MAX; + + if (! 
CDT_OP_TABLE_GET_PARAMS(state, &result_type, &index, &count)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + result_data_set(&result, result_type, true); + ret = packed_list_get_remove_by_index_range(&list, NULL, NULL, index, + count, &result); + break; + } + case AS_CDT_OP_LIST_GET_BY_VALUE_INTERVAL: { + uint64_t result_type; + cdt_payload value_start; + cdt_payload value_end = { NULL }; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &result_type, &value_start, + &value_end)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + result_data_set(&result, result_type, true); + ret = packed_list_get_remove_by_value_interval(&list, NULL, NULL, + &value_start, &value_end, &result); + break; + } + case AS_CDT_OP_LIST_GET_BY_RANK_RANGE: { + uint64_t result_type; + int64_t rank; + uint64_t count = UINT32_MAX; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &result_type, &rank, &count)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + result_data_set(&result, result_type, true); + ret = packed_list_get_remove_by_rank_range(&list, NULL, NULL, rank, + count, &result); + break; + } + default: + cf_warning(AS_PARTICLE, "cdt_process_state_packed_list_read_optype() invalid cdt op: %d", optype); + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + if (ret != AS_PROTO_RESULT_OK) { + cf_warning(AS_PARTICLE, "%s: failed", cdt_process_state_get_op_name(state)); + cdt_udata->ret_code = ret; + rollback_alloc_rollback(alloc_result); + return false; + } + + return true; +} + + +//========================================================== +// list_offset_index +// + +static inline void +list_offset_index_init(offset_index *offidx, uint8_t *idx_mem_ptr, + uint32_t ele_count, const uint8_t *contents, uint32_t content_sz) +{ + ele_count /= PACKED_LIST_INDEX_STEP; + + if (ele_count != 0) { + ele_count++; + } + + offset_index_init(offidx, idx_mem_ptr, ele_count, contents, 
content_sz); + offidx->is_partial = true; +} + +static void +list_offset_index_rm_mask_cpy(offset_index *dst, const offset_index *full_src, + const uint64_t *rm_mask, uint32_t rm_count) +{ + cf_assert(rm_mask && rm_count != 0, AS_PARTICLE, "list_offset_index_rm_mask_cpy() should not do no-op copy"); + + uint32_t ele_count = full_src->_.ele_count; + + if (! dst->is_partial) { + uint32_t delta = 0; + uint32_t prev = 0; + uint32_t idx = 0; + + for (uint32_t i = 0; i < rm_count; i++) { + idx = cdt_idx_mask_find(rm_mask, idx, ele_count, false); + uint32_t sz = offset_index_get_delta_const(full_src, idx); + uint32_t diff = idx - prev; + + for (uint32_t j = 1; j < diff; j++) { + uint32_t offset = offset_index_get_const(full_src, prev + j); + + offset_index_set(dst, prev + j - i, offset - delta); + } + + prev = idx; + delta += sz; + idx++; + } + + uint32_t diff = full_src->_.ele_count - prev; + + for (uint32_t i = 1; i < diff; i++) { + uint32_t offset = offset_index_get_const(full_src, prev + i); + offset_index_set(dst, prev + i - rm_count, offset - delta); + } + + offset_index_set_filled(dst, dst->_.ele_count); + return; + } + + uint32_t delta = 0; + uint32_t prev_par_idx = 0; + uint32_t idx = 0; + + for (uint32_t i = 0; i < rm_count; i++) { + idx = cdt_idx_mask_find(rm_mask, idx, ele_count, false); + uint32_t sz = offset_index_get_delta_const(full_src, idx); + uint32_t par_idx = (idx - i) / PACKED_LIST_INDEX_STEP; + uint32_t diff = par_idx - prev_par_idx + 1; + + for (uint32_t j = 1; j < diff; j++) { + uint32_t offset = offset_index_get_const(full_src, + (prev_par_idx + j) * PACKED_LIST_INDEX_STEP + i); + offset_index_set(dst, prev_par_idx + j, offset - delta); + } + + prev_par_idx = par_idx; + delta += sz; + idx++; + } + + uint32_t par_idx = (full_src->_.ele_count - rm_count) / + PACKED_LIST_INDEX_STEP; + uint32_t diff = par_idx - prev_par_idx + 1; + + for (uint32_t j = 1; j < diff; j++) { + uint32_t offset = offset_index_get_const(full_src, + (prev_par_idx + j) * 
PACKED_LIST_INDEX_STEP + rm_count); + offset_index_set(dst, prev_par_idx + j, offset - delta); + } + + offset_index_set_filled(dst, par_idx + 1); +} + + +//========================================================== +// list_full_offset_index +// + +static inline void +list_full_offset_index_init(offset_index *offidx, uint8_t *idx_mem_ptr, + uint32_t ele_count, const uint8_t *contents, uint32_t content_sz) +{ + offset_index_init(offidx, idx_mem_ptr, ele_count, contents, content_sz); +} + +static bool +list_full_offset_index_fill_to(offset_index *offidx, uint32_t index) +{ + uint32_t start = offset_index_get_filled(offidx); + + index = MIN(index + 1, offidx->_.ele_count); + + if (start >= index) { + return true; + } + + as_unpacker pk = { + .buffer = offidx->contents, + .offset = offset_index_get_const(offidx, start - 1), + .length = offidx->content_sz + }; + + for (uint32_t i = start; i < index; i++) { + if (as_unpack_size(&pk) <= 0) { + return false; + } + + offset_index_set(offidx, i, pk.offset); + } + + offset_index_set_filled(offidx, index); + + return true; +} + +bool +list_full_offset_index_fill_all(offset_index *offidx) +{ + return list_full_offset_index_fill_to(offidx, offidx->_.ele_count); +} + + +//========================================================== +// list_order_index +// + +static int +list_order_index_sort_cmp_fn(const void *x, const void *y, void *p) +{ + list_order_index_sort_userdata *udata = p; + + if (udata->error) { + return 0; + } + + const order_index *order = udata->order; + uint32_t a = order_index_ptr2value(order, x); + uint32_t b = order_index_ptr2value(order, y); + + const offset_index *offsets = udata->offsets; + const uint8_t *buf = udata->offsets->contents; + uint32_t len = udata->offsets->content_sz; + uint32_t x_off = offset_index_get_const(offsets, a); + uint32_t y_off = offset_index_get_const(offsets, b); + + as_unpacker x_pk = { + .buffer = buf + x_off, + .offset = 0, + .length = len - x_off + }; + + as_unpacker y_pk = { + 
.buffer = buf + y_off, + .offset = 0, + .length = len - y_off + }; + + msgpack_compare_t cmp = as_unpack_compare(&x_pk, &y_pk); + + switch (cmp) { + case MSGPACK_COMPARE_EQUAL: + return 0; + case MSGPACK_COMPARE_LESS: + if (udata->flags & AS_CDT_SORT_DESCENDING) { + cmp = MSGPACK_COMPARE_GREATER; + } + break; + case MSGPACK_COMPARE_GREATER: + if (udata->flags & AS_CDT_SORT_DESCENDING) { + cmp = MSGPACK_COMPARE_LESS; + } + break; + default: + udata->error = true; + return 0; + } + + return (cmp == MSGPACK_COMPARE_LESS) ? -1 : 1; +} + +bool +list_order_index_sort(order_index *ordidx, const offset_index *full_offidx, + as_cdt_sort_flags flags) +{ + uint32_t ele_count = ordidx->_.ele_count; + list_order_index_sort_userdata udata = { + .order = ordidx, + .offsets = full_offidx, + .flags = flags + }; + + for (uint32_t i = 0; i < ele_count; i++) { + order_index_set(ordidx, i, i); + } + + qsort_r(order_index_get_mem(ordidx, 0), ele_count, ordidx->_.ele_sz, + list_order_index_sort_cmp_fn, (void *)&udata); + + return ! udata.error; +} + +static uint8_t * +list_order_index_pack(const order_index *ordidx, + const offset_index *full_offidx, uint8_t *buf, offset_index *new_offidx) +{ + cf_assert(new_offidx, AS_PARTICLE, "new_offidx null"); + cf_assert(full_offidx->_.ele_count != 0, AS_PARTICLE, "ele_count == 0"); + + const uint8_t *contents = full_offidx->contents; + uint32_t buf_off = 0; + uint32_t write_count = 0; + + for (uint32_t i = 0; i < full_offidx->_.ele_count; i++) { + uint32_t idx = order_index_get(ordidx, i); + + if (idx == full_offidx->_.ele_count) { + continue; + } + + uint32_t off = offset_index_get_const(full_offidx, idx); + uint32_t sz = offset_index_get_delta_const(full_offidx, idx); + + memcpy(buf + buf_off, contents + off, sz); + buf_off += sz; + write_count++; + + if (offset_index_is_null(new_offidx)) { + continue; + } + + if (! 
new_offidx->is_partial) { + offset_index_set(new_offidx, write_count, buf_off); + } + else if (write_count % PACKED_LIST_INDEX_STEP == 0) { + uint32_t new_idx = write_count / PACKED_LIST_INDEX_STEP; + offset_index_set(new_offidx, new_idx, buf_off); + } + } + + if (offset_index_is_valid(new_offidx)) { + offset_index_set_filled(new_offidx, (new_offidx->is_partial ? + (write_count / PACKED_LIST_INDEX_STEP) + 1 : write_count)); + } + + return buf + buf_off; +} + + +//========================================================== +// list_order_heap +// + +static msgpack_compare_t +list_order_heap_cmp_fn(const void *udata, uint32_t idx1, uint32_t idx2) +{ + const packed_list *list = (const packed_list *)udata; + const offset_index *offidx = &list->full_offidx; + + as_unpacker pk1 = { + .buffer = list->contents, + .offset = offset_index_get_const(offidx, idx1), + .length = list->content_sz + }; + + as_unpacker pk2 = { + .buffer = list->contents, + .offset = offset_index_get_const(offidx, idx2), + .length = list->content_sz + }; + + return as_unpack_compare(&pk1, &pk2); +} + + +//========================================================== +// list_result_data +// + +static bool +list_result_data_set_not_found(cdt_result_data *rd, int64_t index) +{ + switch (rd->type) { + case RESULT_TYPE_KEY: + case RESULT_TYPE_MAP: + return false; + default: + break; + } + + return result_data_set_not_found(rd, index); +} + +// Does not respect inverted flag. 
+static void +list_result_data_set_values_by_mask(cdt_result_data *rd, const uint64_t *mask, + const offset_index *full_offidx, uint32_t count, uint32_t sz) +{ + if (sz == 0) { + sz = cdt_idx_mask_get_content_sz(mask, count, full_offidx); + } + + cdt_container_builder builder; + cdt_list_builder_start(&builder, rd->alloc, count, sz); + + const uint8_t *end = cdt_idx_mask_write_eles(mask, count, full_offidx, + builder.write_ptr, false); + + cf_assert(end - builder.write_ptr == sz, AS_PARTICLE, "size mismatch end - ptr %zu != sz %u", end - builder.write_ptr, sz); + cdt_container_builder_add_n(&builder, NULL, count, sz); + cdt_container_builder_set_result(&builder, rd); +} + +// Does not respect inverted flag. +static void +list_result_data_set_values_by_idxcount(cdt_result_data *rd, + const order_index *idxcnt, const offset_index *full_offidx) +{ + uint32_t items_count = idxcnt->_.ele_count / 2; + uint32_t sz = 0; + uint32_t ret_count = 0; + + for (uint32_t i = 0; i < items_count; i++) { + uint32_t idx = order_index_get(idxcnt, 2 * i); + uint32_t count = order_index_get(idxcnt, (2 * i) + 1); + + for (uint32_t j = 0; j < count; j++) { + sz += offset_index_get_delta_const(full_offidx, idx + j); + } + } + + cdt_container_builder builder; + cdt_list_builder_start(&builder, rd->alloc, ret_count, sz); + + for (uint32_t i = 0; i < items_count; i++) { + uint32_t idx = order_index_get(idxcnt, 2 * i); + uint32_t count = order_index_get(idxcnt, (2 * i) + 1); + + if (count == 0) { + continue; + } + + uint32_t offset = offset_index_get_const(full_offidx, idx); + uint32_t end = offset_index_get_const(full_offidx, idx + count); + + cdt_container_builder_add_n(&builder, full_offidx->contents + offset, + count, end - offset); + } + + cdt_container_builder_set_result(&builder, rd); +} + +// Does not respect inverted flag. 
+static bool +list_result_data_set_values_by_ordidx(cdt_result_data *rd, + const order_index *ordidx, const offset_index *full_offidx, + uint32_t count, uint32_t sz) +{ + if (! rd->is_multi) { + if (count != 0) { + uint32_t i = order_index_get(ordidx, 0); + uint32_t offset = offset_index_get_const(full_offidx, i); + uint32_t sz = offset_index_get_delta_const(full_offidx, i); + + return as_bin_particle_alloc_from_msgpack(rd->result, + full_offidx->contents + offset, sz) == AS_PROTO_RESULT_OK; + } + + return true; + } + + if (sz == 0) { + sz = order_index_get_ele_size(ordidx, count, full_offidx); + } + + uint8_t *ptr; + + rd->result->particle = list_simple_create(rd->alloc, count, sz, + &ptr); + order_index_write_eles(ordidx, count, full_offidx, ptr, false); + as_bin_state_set_from_type(rd->result, AS_PARTICLE_TYPE_LIST); + + return true; +} + + +//========================================================== +// Debugging support. +// + +static void +list_print(const packed_list *list, const char *name) +{ + print_packed(list->packed, list->packed_sz, name); +} + +static bool +list_verify(const as_bin *b) +{ + if (! b) { + return true; + } + + packed_list list; + uint8_t type = as_bin_get_particle_type(b); + + if (type != AS_PARTICLE_TYPE_LIST) { + cf_warning(AS_PARTICLE, "list_verify() non-list type: %u", type); + return false; + } + + // Check header. + if (! packed_list_init_from_bin(&list, b)) { + cf_warning(AS_PARTICLE, "list_verify() invalid packed list"); + return false; + } + + offset_index *offidx = list_full_offidx_p(&list); + bool check_offidx = offset_index_is_valid(offidx); + uint32_t filled = 0; + define_offset_index(temp_offidx, list.contents, list.content_sz, + list.ele_count); + + as_unpacker pk = { + .buffer = list.contents, + .length = list.content_sz + }; + + if (check_offidx) { + filled = offset_index_get_filled(offidx); + + if (list.ele_count != 0) { + offset_index_copy(&temp_offidx, offidx, 0, 0, filled, 0); + } + } + + // Check offsets. 
+ for (uint32_t i = 0; i < list.ele_count; i++) { + uint32_t offset; + + if (check_offidx) { + if (list_is_ordered(&list)) { + if (i < filled) { + offset = offset_index_get_const(offidx, i); + + if (pk.offset != offset) { + cf_warning(AS_PARTICLE, "list_verify() i=%u offset=%u expected=%u", i, offset, pk.offset); + return false; + } + } + else { + offset_index_set(&temp_offidx, i, pk.offset); + } + } + else if ((i % PACKED_LIST_INDEX_STEP) == 0) { + uint32_t step_i = i / PACKED_LIST_INDEX_STEP; + + if (i < filled) { + offset = offset_index_get_const(offidx, i); + + if (pk.offset != offset) { + cf_warning(AS_PARTICLE, "list_verify() i=%u step %u offset=%u expected=%u", i, step_i, offset, pk.offset); + return false; + } + } + } + } + else { + offset_index_set(&temp_offidx, i, pk.offset); + } + + offset = pk.offset; + + if (as_unpack_size(&pk) <= 0) { + cf_warning(AS_PARTICLE, "list_verify() i=%u offset=%u pk.offset=%u invalid key", i, offset, pk.offset); + return false; + } + } + + // Check packed size. + if (list.content_sz != pk.offset) { + cf_warning(AS_PARTICLE, "list_verify() content_sz=%u expected=%u", list.content_sz, pk.offset); + return false; + } + + pk.offset = 0; + + as_unpacker pk_value = pk; + + // Check ordered list. + if (list_is_ordered(&list) && list.ele_count > 0) { + if (as_unpack_size(&pk) <= 0) { + cf_warning(AS_PARTICLE, "list_verify() pk.offset=%u invalid value", pk.offset); + return false; + } + + for (uint32_t i = 1; i < list.ele_count; i++) { + uint32_t offset = pk.offset; + msgpack_compare_t cmp = as_unpack_compare(&pk_value, &pk); + + if (cmp == MSGPACK_COMPARE_ERROR) { + cf_warning(AS_PARTICLE, "list_verify() i=%u offset=%u pk.offset=%u invalid key", i, offset, pk.offset); + return false; + } + + if (cmp == MSGPACK_COMPARE_GREATER) { + cf_warning(AS_PARTICLE, "list_verify() i=%u offset=%u pk.offset=%u keys not in order", i, offset, pk.offset); + return false; + } + } + } + + return true; +} + +// Quash warnings for debug function. 
+void +as_cdt_list_debug_dummy() +{ + list_verify(NULL); + list_print(NULL, NULL); +} diff --git a/as/src/base/particle_map.c b/as/src/base/particle_map.c new file mode 100644 index 00000000..86d71c20 --- /dev/null +++ b/as/src/base/particle_map.c @@ -0,0 +1,6886 @@ +/* + * particle_map.c + * + * Copyright (C) 2015-2018 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#include +#include +#include +#include + +#include "aerospike/as_buffer.h" +#include "aerospike/as_msgpack.h" +#include "aerospike/as_serializer.h" +#include "aerospike/as_val.h" +#include "citrusleaf/alloc.h" +#include "citrusleaf/cf_byte_order.h" + +#include "bits.h" +#include "fault.h" + +#include "base/cdt.h" +#include "base/datamodel.h" +#include "base/particle.h" +#include "base/proto.h" + + +//========================================================== +// MAP particle interface - function declarations. +// + +// Destructor, etc. +void map_destruct(as_particle *p); +uint32_t map_size(const as_particle *p); + +// Handle "wire" format. 
+int32_t map_concat_size_from_wire(as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp); +int map_append_from_wire(as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp); +int map_prepend_from_wire(as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp); +int map_incr_from_wire(as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp); +int32_t map_size_from_wire(const uint8_t *wire_value, uint32_t value_size); +int map_from_wire(as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size, as_particle **pp); +int map_compare_from_wire(const as_particle *p, as_particle_type wire_type, const uint8_t *wire_value, uint32_t value_size); +uint32_t map_wire_size(const as_particle *p); +uint32_t map_to_wire(const as_particle *p, uint8_t *wire); + +// Handle as_val translation. +uint32_t map_size_from_asval(const as_val *val); +void map_from_asval(const as_val *val, as_particle **pp); +as_val *map_to_asval(const as_particle *p); +uint32_t map_asval_wire_size(const as_val *val); +uint32_t map_asval_to_wire(const as_val *val, uint8_t *wire); + +// Handle msgpack translation. +uint32_t map_size_from_msgpack(const uint8_t *packed, uint32_t packed_size); +void map_from_msgpack(const uint8_t *packed, uint32_t packed_size, as_particle **pp); + +// Handle on-device "flat" format. +int32_t map_size_from_flat(const uint8_t *flat, uint32_t flat_size); +int map_cast_from_flat(uint8_t *flat, uint32_t flat_size, as_particle **pp); +int map_from_flat(const uint8_t *flat, uint32_t flat_size, as_particle **pp); +uint32_t map_flat_size(const as_particle *p); +uint32_t map_to_flat(const as_particle *p, uint8_t *flat); + + +//========================================================== +// MAP particle interface - vtable. 
+// + +const as_particle_vtable map_vtable = { + map_destruct, + map_size, + + map_concat_size_from_wire, + map_append_from_wire, + map_prepend_from_wire, + map_incr_from_wire, + map_size_from_wire, + map_from_wire, + map_compare_from_wire, + map_wire_size, + map_to_wire, + + map_size_from_asval, + map_from_asval, + map_to_asval, + map_asval_wire_size, + map_asval_to_wire, + + map_size_from_msgpack, + map_from_msgpack, + + map_size_from_flat, + map_cast_from_flat, + map_from_flat, + map_flat_size, + map_to_flat +}; + + +//========================================================== +// Typedefs & constants. +// + +//#define MAP_DEBUG_VERIFY + +#define LINEAR_FIND_RANK_MAX_COUNT 16 // switch to linear search when the count drops to this number + +#define AS_PACKED_MAP_FLAG_RESERVED_0 0x04 // placeholder for multimap +#define AS_PACKED_MAP_FLAG_OFF_IDX 0x10 // has list offset index +#define AS_PACKED_MAP_FLAG_ORD_IDX 0x20 // has value order index +#define AS_PACKED_MAP_FLAG_ON_STACK 0x40 // map on stack + +struct packed_map_s; + +typedef bool (*packed_map_get_by_idx_func)(const struct packed_map_s *userdata, cdt_payload *contents, uint32_t index); + +typedef struct offidx_op_s { + offset_index *dest; + const offset_index *src; + uint32_t d_i; + uint32_t s_i; + int delta; +} offidx_op; + +typedef struct packed_map_s { + const uint8_t *packed; + const uint8_t *contents; // where elements start (excludes ext) + uint32_t packed_sz; + uint32_t content_sz; + + // Mutable field member (Is considered mutable in const objects). + offset_index offidx; // offset start at contents (excluding ext metadata pair) + uint8_t flags; + // Mutable field member. 
+ order_index value_idx; + + uint32_t ele_count; // excludes ext pair +} packed_map; + +typedef struct packed_map_op_s { + const packed_map *map; + + uint32_t new_ele_count; + uint32_t ele_removed; + + uint32_t seg1_sz; + uint32_t seg2_offset; + uint32_t seg2_sz; + + uint32_t key1_offset; + uint32_t key1_sz; + uint32_t key2_offset; + uint32_t key2_sz; +} packed_map_op; + +typedef struct map_packer_s { + uint8_t *write_ptr; + const uint8_t *contents; + + offset_index offset_idx; // offset start at ele_start (excluding ext metadata pair) + order_index value_idx; + + uint32_t ele_count; + uint32_t content_sz; // does not include map header or ext + uint32_t ext_content_sz; + + uint32_t ext_sz; + uint32_t ext_header_sz; + + uint8_t flags; +} map_packer; + +typedef struct map_mem_s { + uint8_t type; + uint32_t sz; + uint8_t data[]; +} __attribute__ ((__packed__)) map_mem; + +typedef struct map_flat_s { + uint8_t type; + uint32_t sz; + uint8_t data[]; +} __attribute__ ((__packed__)) map_flat; + +typedef struct msgpack_map_empty_flagged_s { + uint8_t map_hdr; + uint8_t ext_hdr; + uint8_t ext_sz; + uint8_t ext_flags; + uint8_t nil; +} __attribute__ ((__packed__)) msgpack_map_empty_flagged; + +typedef struct map_mem_empty_flagged_s { + map_mem mem; + msgpack_map_empty_flagged map; +} map_mem_empty_flagged; + +#define MSGPACK_MAP_FLAGGED(__flags) { \ + .map_hdr = 0x81, \ + .ext_hdr = 0xC7, \ + .ext_sz = 0, \ + .ext_flags = __flags, \ + .nil = 0xC0 \ +} + +#define MAP_MEM_EMPTY_FLAGGED_ENTRY(__flag) { \ + { \ + .type = AS_PARTICLE_TYPE_MAP, \ + .sz = sizeof(msgpack_map_empty_flagged) \ + }, \ + MSGPACK_MAP_FLAGGED(__flag) \ +} + +static const map_mem_empty_flagged map_mem_empty_flagged_table[] = { + MAP_MEM_EMPTY_FLAGGED_ENTRY(AS_PACKED_MAP_FLAG_K_ORDERED | AS_PACKED_MAP_FLAG_OFF_IDX), + MAP_MEM_EMPTY_FLAGGED_ENTRY(AS_PACKED_MAP_FLAG_KV_ORDERED | AS_PACKED_MAP_FLAG_OFF_IDX | AS_PACKED_MAP_FLAG_ORD_IDX), +}; +static const map_mem map_mem_empty = { + .type = 
AS_PARTICLE_TYPE_MAP, + .sz = 1, + .data = {0x80}, +}; + +typedef enum sort_by_e { + SORT_BY_KEY, + SORT_BY_VALUE +} sort_by_t; + +typedef struct index_sort_userdata_s { + const offset_index *offsets; + order_index *order; + const uint8_t *contents; + uint32_t content_sz; + bool error; + sort_by_t sort_by; +} index_sort_userdata; + +typedef struct map_add_control_s { + bool allow_overwrite; // if key exists and map is unique-keyed - may overwrite + bool allow_create; // if key does not exist - may create +} map_add_control; + +typedef struct map_ele_find_s { + bool found_key; + bool found_value; + + uint32_t idx; + uint32_t rank; + + uint32_t key_offset; // offset start at map header + uint32_t value_offset; // offset start at map header + uint32_t sz; + + uint32_t upper; + uint32_t lower; +} map_ele_find; + +// TODO - refactor params using this. +typedef struct map_getrem_s { + const packed_map *map; + as_bin *b; + rollback_alloc *alloc_buf; + cdt_result_data *result; +} map_getrem; + +#define as_bin_use_static_map_mem_if_notinuse(__b, __flags) \ + if (! as_bin_inuse(b)) { \ + if (is_kv_ordered(__flags)) { \ + (__b)->particle = (as_particle *)(map_mem_empty_flagged_table + 1); \ + } \ + else if (is_k_ordered(__flags)) { \ + (__b)->particle = (as_particle *)map_mem_empty_flagged_table; \ + } \ + else { \ + (__b)->particle = (as_particle *)&map_mem_empty; \ + } \ + as_bin_state_set_from_type(__b, AS_PARTICLE_TYPE_MAP); \ + } + +#define vla_map_offidx_if_invalid(__name, __map_p) \ + union { \ + offset_index *offidx; \ + uint8_t mem_temp[sizeof(offset_index *) + (offset_index_is_valid(&(__map_p)->offidx) ? 
0 : offset_index_size(&(__map_p)->offidx))]; \ + } __name; \ + __name.offidx = (offset_index *)&(__map_p)->offidx; \ + if (offset_index_is_null(__name.offidx)) { \ + __name.offidx->_.ptr = __name.mem_temp + sizeof(offset_index *); \ + offset_index_set_filled(__name.offidx, 1); \ + } + +#define vla_map_allidx_if_invalid(__name, __map_p) \ + union { \ + struct { \ + offset_index *offidx; \ + order_index *ordidx; \ + }; \ + uint8_t mem_temp[sizeof(offset_index *) + sizeof(order_index *) + \ + (offset_index_is_valid(&(__map_p)->offidx) ? 0 : offset_index_size(&(__map_p)->offidx)) + \ + (order_index_is_valid(&(__map_p)->value_idx) ? 0 : order_index_size(&(__map_p)->value_idx))]; \ + } __name; \ + __name.offidx = (offset_index *)&(__map_p)->offidx; \ + __name.ordidx = (order_index *)&(__map_p)->value_idx; \ + if (offset_index_is_null(__name.offidx)) { \ + __name.offidx->_.ptr = __name.mem_temp + sizeof(offset_index *) + sizeof(order_index *); \ + offset_index_set_filled(__name.offidx, 1); \ + if (order_index_is_null(__name.ordidx)) { \ + __name.ordidx->_.ptr = __name.offidx->_.ptr + offset_index_size(__name.offidx); \ + order_index_set(__name.ordidx, 0, (__map_p)->ele_count); \ + } \ + } \ + else if (order_index_is_null(__name.ordidx)) { \ + __name.ordidx->_.ptr = __name.mem_temp + sizeof(offset_index *) + sizeof(order_index *); \ + order_index_set(__name.ordidx, 0, (__map_p)->ele_count); \ + } + +#define define_map_unpacker(__name, __map_ptr) \ + as_unpacker __name = { \ + .buffer = (__map_ptr)->contents, \ + .length = (__map_ptr)->content_sz \ + } + +#define define_map_op(__name, __map_ptr) \ + packed_map_op __name; \ + packed_map_op_init(&__name, __map_ptr) + +#define define_map_packer(__name, __ele_count, __flags, __content_sz) \ + map_packer __name; \ + map_packer_init(&__name, __ele_count, __flags, __content_sz) + + +//========================================================== +// Forward declarations. 
+// + +static inline bool is_map_type(uint8_t type); +static inline bool is_k_ordered(uint8_t flags); +static inline bool is_kv_ordered(uint8_t flags); +static uint32_t map_calc_ext_content_sz(uint8_t flags, uint32_t ele_count, uint32_t content_sz); +static uint8_t map_adjust_incoming_flags(uint8_t flags); + +static inline uint32_t map_ext_content_sz(const packed_map *map); +static inline bool map_is_k_ordered(const packed_map *map); +static inline bool map_is_kv_ordered(const packed_map *map); +static inline bool map_has_offidx(const packed_map *map); +static inline bool map_fill_offidx(const packed_map *map); + +static inline bool skip_map_pair(as_unpacker *pk); + +// map_packer +static as_particle *map_packer_create_particle(map_packer *pk, rollback_alloc *alloc_buf); +static void map_packer_init(map_packer *pk, uint32_t ele_count, uint8_t flags, uint32_t content_sz); +static void map_packer_setup_bin(map_packer *pk, as_bin *b, rollback_alloc *alloc_buf); +static void map_packer_write_hdridx(map_packer *pk); +static bool map_packer_fill_offset_index(map_packer *mpk); +static int map_packer_fill_index_sort_compare(const void *x, const void *y, void *p); +static bool map_packer_fill_ordidx(map_packer *mpk, const uint8_t *contents, uint32_t content_sz); +static bool map_packer_add_op_copy_index(map_packer *mpk, const packed_map_op *add_op, map_ele_find *remove_info, const map_ele_find *add_info, uint32_t kv_sz); +static inline void map_packer_write_seg1(map_packer *pk, const packed_map_op *op); +static inline void map_packer_write_seg2(map_packer *pk, const packed_map_op *op); +static inline void map_packer_write_msgpack_seg(map_packer *pk, const cdt_payload *seg); + +// map +static int map_set_flags(as_bin *b, rollback_alloc *alloc_buf, as_bin *result, uint8_t set_flags); +static int map_increment(as_bin *b, rollback_alloc *alloc_buf, const cdt_payload *key, const cdt_payload *delta_value, as_bin *result, bool is_decrement); +static int map_add(as_bin *b, 
rollback_alloc *alloc_buf, const cdt_payload *key, const cdt_payload *value, as_bin *result, const map_add_control *control); +static int map_add_items(as_bin *b, rollback_alloc *alloc_buf, const cdt_payload *items, as_bin *result, const map_add_control *control); + +static int map_remove_by_key_interval(as_bin *b, rollback_alloc *alloc_buf, const cdt_payload *key_start, const cdt_payload *key_end, cdt_result_data *result); +static int map_remove_by_index_range(as_bin *b, rollback_alloc *alloc_buf, int64_t index, uint64_t count, cdt_result_data *result); +static int map_remove_by_value_interval(as_bin *b, rollback_alloc *alloc_buf, const cdt_payload *value_start, const cdt_payload *value_end, cdt_result_data *result); +static int map_remove_by_rank_range(as_bin *b, rollback_alloc *alloc_buf, int64_t rank, uint64_t count, cdt_result_data *result); + +static int map_remove_all_by_key_list(as_bin *b, rollback_alloc *alloc_buf, const cdt_payload *key_list, cdt_result_data *result); +static int map_remove_all_by_value_list(as_bin *b, rollback_alloc *alloc_buf, const cdt_payload *value_list, cdt_result_data *result); + +static int map_clear(as_bin *b, rollback_alloc *alloc_buf, as_bin *result); + +// packed_map +static bool packed_map_init(packed_map *map, const uint8_t *buf, uint32_t sz, bool fill_idxs); +static inline bool packed_map_init_from_particle(packed_map *map, const as_particle *p, bool fill_idxs); +static bool packed_map_init_from_bin(packed_map *map, const as_bin *b, bool fill_idxs); +static bool packed_map_unpack_hdridx(packed_map *map, bool fill_idxs); + +static void packed_map_init_indexes(const packed_map *map, as_packer *pk); + +static bool packed_map_ensure_ordidx_filled(const packed_map *op); + +static uint32_t packed_map_find_index_by_idx_unordered(const packed_map *map, uint32_t idx); +static uint32_t packed_map_find_index_by_key_unordered(const packed_map *map, const cdt_payload *key); + +static void packed_map_find_rank_indexed_linear(const 
packed_map *map, map_ele_find *find, uint32_t start, uint32_t len); +static bool packed_map_find_rank_indexed(const packed_map *map, map_ele_find *find); +static bool packed_map_find_rank_by_value_indexed(const packed_map *map, map_ele_find *find, const cdt_payload *value); +static bool packed_map_find_rank_range_by_value_interval_indexed(const packed_map *map, const cdt_payload *value_start, const cdt_payload *value_end, uint32_t *rank, uint32_t *count, bool is_multi); +static bool packed_map_find_rank_range_by_value_interval_unordered(const packed_map *map, const cdt_payload *value_start, const cdt_payload *value_end, uint32_t *rank, uint32_t *count, uint64_t *mask); +static bool packed_map_find_key_indexed(const packed_map *map, map_ele_find *find, const cdt_payload *key); +static bool packed_map_find_key(const packed_map *map, map_ele_find *find, const cdt_payload *key); + +static int packed_map_get_remove_by_key_interval(const packed_map *map, as_bin *b, rollback_alloc *alloc_buf, const cdt_payload *key_start, const cdt_payload *key_end, cdt_result_data *result); +static int packed_map_get_remove_by_index_range(const packed_map *map, as_bin *b, rollback_alloc *alloc_buf, int64_t index, uint64_t count, cdt_result_data *result); + +static int packed_map_get_remove_by_value_interval(const packed_map *map, as_bin *b, rollback_alloc *alloc_buf, const cdt_payload *value_start, const cdt_payload *value_end, cdt_result_data *result); +static int packed_map_get_remove_by_rank_range(const packed_map *map, as_bin *b, rollback_alloc *alloc_buf, int64_t rank, uint64_t count, cdt_result_data *result); + +static int packed_map_get_remove_all_by_key_list(const packed_map *map, as_bin *b, rollback_alloc *alloc_buf, const cdt_payload *key_list, cdt_result_data *result); +static int packed_map_get_remove_all_by_key_list_ordered(const packed_map *map, as_bin *b, rollback_alloc *alloc_buf, as_unpacker *items_pk, uint32_t items_count, cdt_result_data *result); +static int 
packed_map_get_remove_all_by_key_list_unordered(const packed_map *map, as_bin *b, rollback_alloc *alloc_buf, as_unpacker *items_pk, uint32_t items_count, cdt_result_data *result); +static int packed_map_get_remove_all_by_value_list(const packed_map *map, as_bin *b, rollback_alloc *alloc_buf, const cdt_payload *value_list, cdt_result_data *result); +static int packed_map_get_remove_all_by_value_list_ordered(const packed_map *map, as_bin *b, rollback_alloc *alloc_buf, as_unpacker *items_pk, uint32_t items_count, cdt_result_data *result); + +static int packed_map_get_remove_all(const packed_map *map, as_bin *b, rollback_alloc *alloc_buf, cdt_result_data *result); + +static int packed_map_remove_by_mask(const packed_map *map, as_bin *b, rollback_alloc *alloc_buf, const uint64_t *rm_mask, uint32_t count, uint32_t *rm_sz_r); +static int packed_map_remove_idx_range(const packed_map *map, as_bin *b, rollback_alloc *alloc_buf, uint32_t idx, uint32_t count); + +static bool packed_map_get_range_by_key_interval_unordered(const packed_map *map, const cdt_payload *key_start, const cdt_payload *key_end, uint32_t *index, uint32_t *count, uint64_t *mask); +static bool packed_map_get_range_by_key_interval_ordered(const packed_map *map, const cdt_payload *key_start, const cdt_payload *key_end, uint32_t *index, uint32_t *count); +static int packed_map_build_rank_result_by_ele_idx(const packed_map *map, const order_index *ele_idx, uint32_t start, uint32_t count, cdt_result_data *result); +static int packed_map_build_rank_result_by_mask(const packed_map *map, const uint64_t *mask, uint32_t count, cdt_result_data *result); +static int packed_map_build_rank_result_by_index_range(const packed_map *map, uint32_t index, uint32_t count, cdt_result_data *result); + +static bool packed_map_get_key_by_idx(const packed_map *map, cdt_payload *key, uint32_t index); +static bool packed_map_get_value_by_idx(const packed_map *map, cdt_payload *value, uint32_t idx); +static bool 
packed_map_get_pair_by_idx(const packed_map *map, cdt_payload *value, uint32_t index); + +static int packed_map_build_index_result_by_ele_idx(const packed_map *map, const order_index *ele_idx, uint32_t start, uint32_t count, cdt_result_data *result); +static int packed_map_build_index_result_by_mask(const packed_map *map, const uint64_t *mask, uint32_t count, cdt_result_data *result); +static bool packed_map_build_ele_result_by_idx_range(const packed_map *map, uint32_t start_idx, uint32_t count, cdt_result_data *result); +static bool packed_map_build_ele_result_by_ele_idx(const packed_map *map, const order_index *ele_idx, uint32_t start, uint32_t count, uint32_t rm_sz, cdt_result_data *result); +static bool packed_map_build_ele_result_by_mask(const packed_map *map, const uint64_t *mask, uint32_t count, uint32_t rm_sz, cdt_result_data *result); +static int packed_map_build_result_by_key(const packed_map *map, const cdt_payload *key, uint32_t idx, uint32_t count, cdt_result_data *result); + +static int64_t packed_map_get_rank_by_idx(const packed_map *map, uint32_t idx); +static int packed_map_build_rank_result_by_idx(const packed_map *map, uint32_t idx, cdt_result_data *result); +static int packed_map_build_rank_result_by_idx_range(const packed_map *map, uint32_t idx, uint32_t count, cdt_result_data *result); + +static msgpack_compare_t packed_map_compare_key_by_idx(const void *ptr, uint32_t idx1, uint32_t idx2); +static msgpack_compare_t packed_map_compare_values(as_unpacker *pk1, as_unpacker *pk2); +static msgpack_compare_t packed_map_compare_value_by_idx(const void *ptr, uint32_t idx1, uint32_t idx2); + +static bool packed_map_write_k_ordered(const packed_map *map, uint8_t *write_ptr, offset_index *offsets_new); + +// packed_map_op +static void packed_map_op_init(packed_map_op *op, const packed_map *map); +static int32_t packed_map_op_add(packed_map_op *op, const map_ele_find *found); +static int32_t packed_map_op_remove(packed_map_op *op, const map_ele_find 
*found, uint32_t count, uint32_t remove_sz); + +static uint8_t *packed_map_op_write_seg1(const packed_map_op *op, uint8_t *buf); +static uint8_t *packed_map_op_write_seg2(const packed_map_op *op, uint8_t *buf); +static bool packed_map_op_write_new_offidx(const packed_map_op *op, const map_ele_find *remove_info, const map_ele_find *add_info, offset_index *new_offidx, uint32_t kv_sz); +static bool packed_map_op_write_new_ordidx(const packed_map_op *op, const map_ele_find *remove_info, const map_ele_find *add_info, order_index *value_idx); + +// map_particle +static as_particle *map_particle_create(rollback_alloc *alloc_buf, uint32_t ele_count, const uint8_t *buf, uint32_t content_sz, uint8_t flags); +static int64_t map_particle_strip_indexes(const as_particle *p, uint8_t *dest); + +// map_ele_find +static void map_ele_find_init(map_ele_find *find, const packed_map *map); +static void map_ele_find_continue_from_lower(map_ele_find *find, const map_ele_find *found, uint32_t ele_count); +static void map_ele_find_init_from_idx(map_ele_find *find, const packed_map *map, uint32_t idx); + +// map_offset_index +static bool map_offset_index_fill(offset_index *offidx, uint32_t index); +static int64_t map_offset_index_get(offset_index *offidx, uint32_t index); +static int64_t map_offset_index_get_delta(offset_index *offidx, uint32_t index); + +// offidx_op +static void offidx_op_init(offidx_op *op, offset_index *dest, const offset_index *src); +static void offidx_op_remove(offidx_op *op, uint32_t index); +static void offidx_op_remove_range(offidx_op *op, uint32_t index, uint32_t count); +static void offidx_op_end(offidx_op *op); + +// order_index +static bool order_index_sort(order_index *ordidx, const offset_index *offsets, const uint8_t *contents, uint32_t content_sz, sort_by_t sort_by); +static inline bool order_index_set_sorted(order_index *ordidx, const offset_index *offsets, const uint8_t *ele_start, uint32_t tot_ele_sz, sort_by_t sort_by); +static bool 
order_index_set_sorted_with_offsets(order_index *ordidx, const offset_index *offsets, sort_by_t sort_by); + +static uint32_t order_index_find_idx(const order_index *ordidx, uint32_t idx, uint32_t start, uint32_t len); + +// order_index_adjust +static uint32_t order_index_adjust_lower(const order_index_adjust *via, uint32_t src); + +// order_index_op +static inline void order_index_op_add(order_index *dest, const order_index *src, uint32_t add_idx, uint32_t add_rank); +static inline void order_index_op_replace1_internal(order_index *dest, const order_index *src, uint32_t add_idx, uint32_t add_rank, uint32_t remove_rank, const order_index_adjust *adjust); +static inline void order_index_op_replace1(order_index *dest, const order_index *src, uint32_t add_rank, uint32_t remove_rank); +static void order_index_op_remove_idx_mask(order_index *dest, const order_index *src, const uint64_t *mask, uint32_t count); + +// result_data +static bool result_data_set_key_not_found(cdt_result_data *rd, int64_t index); +static bool result_data_set_value_not_found(cdt_result_data *rd, int64_t rank); + +// Debugging support +static void map_print(const packed_map *map, const char *name); +static bool map_verify(const as_bin *b); + + +//========================================================== +// MAP particle interface - function definitions. +// + +//------------------------------------------------ +// Destructor, etc. +// + +void +map_destruct(as_particle *p) +{ + cf_free(p); +} + +uint32_t +map_size(const as_particle *p) +{ + const map_mem *p_map_mem = (const map_mem *)p; + return (uint32_t)sizeof(map_mem) + p_map_mem->sz; +} + +//------------------------------------------------ +// Handle "wire" format. 
+// + +int32_t +map_concat_size_from_wire(as_particle_type wire_type, const uint8_t *wire_value, + uint32_t value_size, as_particle **pp) +{ + cf_warning(AS_PARTICLE, "concat size for map"); + return -AS_PROTO_RESULT_FAIL_INCOMPATIBLE_TYPE; +} + +int +map_append_from_wire(as_particle_type wire_type, const uint8_t *wire_value, + uint32_t value_size, as_particle **pp) +{ + cf_warning(AS_PARTICLE, "append to map"); + return -AS_PROTO_RESULT_FAIL_INCOMPATIBLE_TYPE; +} + +int +map_prepend_from_wire(as_particle_type wire_type, const uint8_t *wire_value, + uint32_t value_size, as_particle **pp) +{ + cf_warning(AS_PARTICLE, "prepend to map"); + return -AS_PROTO_RESULT_FAIL_INCOMPATIBLE_TYPE; +} + +int +map_incr_from_wire(as_particle_type wire_type, const uint8_t *wire_value, + uint32_t value_size, as_particle **pp) +{ + cf_warning(AS_PARTICLE, "increment of map"); + return -AS_PROTO_RESULT_FAIL_INCOMPATIBLE_TYPE; +} + +int32_t +map_size_from_wire(const uint8_t *wire_value, uint32_t value_size) +{ + // TODO - CDT can't determine in memory or not. + packed_map map; + + if (! packed_map_init(&map, wire_value, value_size, false)) { + cf_warning(AS_PARTICLE, "map_size_from_wire() invalid packed map"); + return -AS_PROTO_RESULT_FAIL_UNKNOWN; + } + + if (map.flags == 0) { + return (int32_t)(sizeof(map_mem) + value_size); + } + + uint32_t extra_sz = map_ext_content_sz(&map); + + // 1 byte for header, 1 byte for type, 1 byte for length for existing ext. + extra_sz += as_pack_ext_header_get_size(extra_sz) - 3; + + return (int32_t)(sizeof(map_mem) + value_size + extra_sz); +} + +int +map_from_wire(as_particle_type wire_type, const uint8_t *wire_value, + uint32_t value_size, as_particle **pp) +{ + // TODO - CDT can't determine in memory or not. + // It works for data-not-in-memory but we'll incur a memcpy that could be + // eliminated. + packed_map map; + + if (! 
packed_map_init(&map, wire_value, value_size, false)) { + cf_warning(AS_PARTICLE, "map_size_from_wire() invalid packed map"); + return -AS_PROTO_RESULT_FAIL_UNKNOWN; + } + + map_mem *p_map_mem = (map_mem *)*pp; + + p_map_mem->type = wire_type; + + if (map.flags == 0) { + p_map_mem->sz = value_size; + memcpy(p_map_mem->data, wire_value, value_size); + return AS_PROTO_RESULT_OK; + } + + // TODO - May want to check key order here but for now we'll trust the client/other node. + uint32_t ext_content_sz = map_ext_content_sz(&map); + // 1 byte for header, 1 byte for type, 1 byte for length for existing ext. + uint32_t extra_sz = as_pack_ext_header_get_size(ext_content_sz) - 3; + + as_packer pk = { + .buffer = p_map_mem->data, + .capacity = value_size + extra_sz + }; + + as_pack_map_header(&pk, map.ele_count + 1); + as_pack_ext_header(&pk, ext_content_sz, + map_adjust_incoming_flags(map.flags)); + packed_map_init_indexes(&map, &pk); + as_pack_val(&pk, &as_nil); + memcpy(pk.buffer + pk.offset, map.contents, map.content_sz); + p_map_mem->sz = value_size + ext_content_sz + extra_sz; + +#ifdef MAP_DEBUG_VERIFY + { + as_bin b; + b.particle = *pp; + as_bin_state_set_from_type(&b, AS_PARTICLE_TYPE_MAP); + + if (! map_verify(&b)) { + offset_index_print(&map.offidx, "verify"); + cf_warning(AS_PARTICLE, "map_from_wire: pp=%p wire_value=%p", pp, wire_value); + } + } +#endif + + return AS_PROTO_RESULT_OK; +} + +int +map_compare_from_wire(const as_particle *p, as_particle_type wire_type, + const uint8_t *wire_value, uint32_t value_size) +{ + // TODO + cf_warning(AS_PARTICLE, "map_compare_from_wire() not implemented"); + return -AS_PROTO_RESULT_FAIL_INCOMPATIBLE_TYPE; +} + +uint32_t +map_wire_size(const as_particle *p) +{ + packed_map map; + + if (! 
packed_map_init_from_particle(&map, p, false)) { + cf_crash(AS_PARTICLE, "map_wire_size() invalid packed map"); + } + + if (map.flags == 0) { + return map.packed_sz; + } + + uint32_t sz = map.content_sz; + sz += as_pack_list_header_get_size(map.ele_count + 1); + sz += 3 + 1; // 3 for min ext hdr and 1 for nil pair + + return sz; +} + +uint32_t +map_to_wire(const as_particle *p, uint8_t *wire) +{ + int64_t sz = map_particle_strip_indexes(p, wire); + cf_assert(sz >= 0, AS_PARTICLE, "map_to_wire() strip failed with sz %ld", sz); + return (uint32_t)sz; +} + +//------------------------------------------------ +// Handle as_val translation. +// + +uint32_t +map_size_from_asval(const as_val *val) +{ + as_serializer s; + as_msgpack_init(&s); + + uint32_t sz = as_serializer_serialize_getsize(&s, (as_val *)val); + + as_serializer_destroy(&s); + + const as_map *map = (const as_map *)val; + + if (map->flags == 0) { + return (uint32_t)sizeof(map_mem) + sz; + } + + uint32_t ele_count = as_map_size(map); + uint32_t map_hdr_sz = as_pack_list_header_get_size(ele_count); + uint32_t content_sz = sz - map_hdr_sz; + uint32_t ext_content_sz = map_calc_ext_content_sz(map->flags, ele_count, + content_sz); + + sz = (uint32_t)sizeof(map_mem); + sz += as_pack_list_header_get_size(ele_count + 1) + content_sz; + sz += as_pack_ext_header_get_size(ext_content_sz); // ext header and length field + sz += ext_content_sz; // ext content + sz++; // nil pair + + return (uint32_t)sizeof(map_mem) + sz; +} + +void +map_from_asval(const as_val *val, as_particle **pp) +{ + map_mem *p_map_mem = (map_mem *)*pp; + const as_map *av_map = (const as_map *)val; + + p_map_mem->type = AS_PARTICLE_TYPE_MAP; + + as_serializer s; + as_msgpack_init(&s); + + int32_t sz = as_serializer_serialize_presized(&s, val, p_map_mem->data); + + cf_assert(sz >= 0, AS_PARTICLE, "map_from_asval() failed to presize"); + as_serializer_destroy(&s); + + if (av_map->flags == 0) { + p_map_mem->sz = (uint32_t)sz; + return; + } + + uint8_t 
*temp_mem = NULL; + uint8_t buf[sizeof(packed_map) + (sz < CDT_MAX_STACK_OBJ_SZ ? sz : 0)]; + packed_map *map = (packed_map *)buf; + bool success; + + if (sz < CDT_MAX_STACK_OBJ_SZ) { + memcpy(buf + sizeof(packed_map), p_map_mem->data, sz); + success = packed_map_init(map, buf + sizeof(packed_map), sz, false); + } + else { + temp_mem = cf_malloc(sz); + memcpy(temp_mem, p_map_mem->data, sz); + success = packed_map_init(map, temp_mem, sz, false); + } + + cf_assert(success, AS_PARTICLE, "map_from_asval() failed to unpack header"); + + uint8_t map_flags = map_adjust_incoming_flags(av_map->flags); + define_map_packer(mpk, map->ele_count, map_flags, map->content_sz); + + mpk.write_ptr = p_map_mem->data; + map_packer_write_hdridx(&mpk); + + if (! packed_map_write_k_ordered(map, mpk.write_ptr, &mpk.offset_idx)) { + cf_crash(AS_PARTICLE, "map_from_asval() sort on key failed"); + } + + p_map_mem->sz = + (uint32_t)(mpk.contents - p_map_mem->data + map->content_sz); + + if (order_index_is_valid(&mpk.value_idx)) { + order_index_set(&mpk.value_idx, 0, map->ele_count); + } + + cf_free(temp_mem); + +#ifdef MAP_DEBUG_VERIFY + { + as_bin b; + b.particle = (as_particle *)p_map_mem; + as_bin_state_set_from_type(&b, AS_PARTICLE_TYPE_MAP); + if (! map_verify(&b)) { + cdt_bin_print(&b, "map_from_asval"); + } + } +#endif +} + +as_val * +map_to_asval(const as_particle *p) +{ + map_mem *p_map_mem = (map_mem *)p; + + as_buffer buf = { + .capacity = p_map_mem->sz, + .size = p_map_mem->sz, + .data = p_map_mem->data + }; + + as_serializer s; + as_msgpack_init(&s); + + as_val *val = NULL; + + as_serializer_deserialize(&s, &buf, &val); + as_serializer_destroy(&s); + + if (! 
val) { + return (as_val *)as_hashmap_new(0); + } + + packed_map map; + + packed_map_init_from_particle(&map, p, false); + ((as_map *)val)->flags = (uint32_t)map.flags; + + return val; +} + +uint32_t +map_asval_wire_size(const as_val *val) +{ + as_serializer s; + as_msgpack_init(&s); + + uint32_t sz = as_serializer_serialize_getsize(&s, (as_val *)val); + + as_serializer_destroy(&s); + + return sz; +} + +uint32_t +map_asval_to_wire(const as_val *val, uint8_t *wire) +{ + as_serializer s; + as_msgpack_init(&s); + + int32_t sz = as_serializer_serialize_presized(&s, val, wire); + + as_serializer_destroy(&s); + cf_assert(sz > 0, AS_PARTICLE, "map_asval_to_wire() sz %d failed to serialize", sz); + + return (uint32_t)sz; +} + +//------------------------------------------------ +// Handle msgpack translation. +// + +uint32_t +map_size_from_msgpack(const uint8_t *packed, uint32_t packed_size) +{ + return (uint32_t)sizeof(map_mem) + packed_size; +} + +void +map_from_msgpack(const uint8_t *packed, uint32_t packed_size, as_particle **pp) +{ + map_mem *p_map_mem = (map_mem *)*pp; + + p_map_mem->type = AS_PARTICLE_TYPE_MAP; + p_map_mem->sz = packed_size; + memcpy(p_map_mem->data, packed, p_map_mem->sz); +} + +//------------------------------------------------ +// Handle on-device "flat" format. +// + +int32_t +map_size_from_flat(const uint8_t *flat, uint32_t flat_size) +{ + // TODO - maybe never used + return -1; +} + +int +map_cast_from_flat(uint8_t *flat, uint32_t flat_size, as_particle **pp) +{ + // Cast temp buffer from disk to data-not-in-memory. + map_flat *p_map_flat = (map_flat *)flat; + + // This assumes map_flat is the same as map_mem. + *pp = (as_particle *)p_map_flat; + + return 0; +} + +int +map_from_flat(const uint8_t *flat, uint32_t flat_size, as_particle **pp) +{ + const map_flat *p_map_flat = (const map_flat *)flat; + packed_map map; + + // This path implies disk-backed data-in-memory so fill_idxs -> true. + if (! 
packed_map_init(&map, p_map_flat->data, p_map_flat->sz, true)) { + cf_warning(AS_PARTICLE, "map_from_flat() invalid packed map"); + return -1; + } + + if (map.flags == 0) { + // Convert temp buffer from disk to data-in-memory. + map_mem *p_map_mem = cf_malloc_ns(sizeof(map_mem) + p_map_flat->sz); + + p_map_mem->type = p_map_flat->type; + p_map_mem->sz = p_map_flat->sz; + memcpy(p_map_mem->data, p_map_flat->data, p_map_mem->sz); + + *pp = (as_particle *)p_map_mem; + + return 0; + } + + uint8_t flags = map_adjust_incoming_flags(map.flags); + define_map_packer(mpk, map.ele_count, flags, map.content_sz); + as_particle *p = map_packer_create_particle(&mpk, NULL); + + if (! p) { + return -1; + } + + map_packer_write_hdridx(&mpk); + memcpy(mpk.write_ptr, map.contents, map.content_sz); + + if (! map_packer_fill_offset_index(&mpk)) { + cf_free(p); + return -1; + } + + if (order_index_is_valid(&mpk.value_idx)) { + if (! order_index_set_sorted(&mpk.value_idx, &map.offidx, + map.contents, map.content_sz, SORT_BY_VALUE)) { + cf_free(p); + return -1; + } + } + + *pp = p; + + return 0; +} + +uint32_t +map_flat_size(const as_particle *p) +{ + const map_mem *p_map_mem = (const map_mem *)p; + + packed_map map; + + if (! 
packed_map_init_from_particle(&map, p, false)) { + const as_bin b = { + .particle = (as_particle *)p + }; + + cdt_bin_print(&b, "map"); + cf_crash(AS_PARTICLE, "map_flat_size() invalid packed map"); + } + + if (map.flags == 0) { + return sizeof(map_flat) + p_map_mem->sz; + } + + uint32_t sz = map.content_sz; + sz += as_pack_list_header_get_size(map.ele_count + 1); + sz += 3 + 1; // 3 for min ext hdr and 1 for nil pair + + return (uint32_t)sizeof(map_flat) + sz; +} + +uint32_t +map_to_flat(const as_particle *p, uint8_t *flat) +{ + map_flat *p_map_flat = (map_flat *)flat; + int64_t sz = map_particle_strip_indexes(p, p_map_flat->data); + + cf_assert(sz >= 0, AS_PARTICLE, "map_to_flat() strip indexes failed with sz %ld", sz); + p_map_flat->sz = (uint32_t)sz; + + // Already wrote the type. + + return sizeof(map_flat) + p_map_flat->sz; +} + + +//========================================================== +// Global API. +// + +void +as_bin_set_empty_packed_map(as_bin *b, rollback_alloc *alloc_buf, uint8_t flags) +{ + b->particle = map_particle_create(alloc_buf, 0, NULL, 0, flags); + as_bin_state_set_from_type(b, AS_PARTICLE_TYPE_MAP); +} + + +//========================================================== +// Local helpers. 
+// + +static inline bool +is_map_type(uint8_t type) +{ + return type == AS_PARTICLE_TYPE_MAP; +} + +static inline bool +is_k_ordered(uint8_t flags) +{ + return flags & AS_PACKED_MAP_FLAG_K_ORDERED; +} + +static inline bool +is_kv_ordered(uint8_t flags) +{ + return (flags & AS_PACKED_MAP_FLAG_KV_ORDERED) == + AS_PACKED_MAP_FLAG_KV_ORDERED; +} + +static uint32_t +map_calc_ext_content_sz(uint8_t flags, uint32_t ele_count, uint32_t content_sz) +{ + uint32_t sz = 0; + + if (is_k_ordered(flags)) { + offset_index offidx; + + offset_index_init(&offidx, NULL, ele_count, NULL, content_sz); + sz += offset_index_size(&offidx); + } + + if (is_kv_ordered(flags)) { + order_index ordidx; + + order_index_init(&ordidx, NULL, ele_count); + sz += order_index_size(&ordidx); + } + + return sz; +} + +static uint8_t +map_adjust_incoming_flags(uint8_t flags) +{ + static const uint8_t mask = AS_PACKED_MAP_FLAG_KV_ORDERED | + AS_PACKED_MAP_FLAG_OFF_IDX | AS_PACKED_MAP_FLAG_ORD_IDX; + + if (is_k_ordered(flags)) { + flags |= AS_PACKED_MAP_FLAG_OFF_IDX; + } + + if (is_kv_ordered(flags)) { + flags |= AS_PACKED_MAP_FLAG_ORD_IDX; + } + + return flags & mask; +} + +static inline uint32_t +map_ext_content_sz(const packed_map *map) +{ + return map_calc_ext_content_sz(map->flags, map->ele_count, map->content_sz); +} + +static inline bool +map_is_k_ordered(const packed_map *map) +{ + return is_k_ordered(map->flags); +} + +static inline bool +map_is_kv_ordered(const packed_map *map) +{ + return is_kv_ordered(map->flags); +} + +static inline bool +map_has_offidx(const packed_map *map) +{ + return offset_index_is_valid(&map->offidx); +} + +static inline bool +map_fill_offidx(const packed_map *map) +{ + offset_index *offidx = (offset_index *)&map->offidx; + return map_offset_index_fill(offidx, map->ele_count); +} + +static inline bool +skip_map_pair(as_unpacker *pk) +{ + if (as_unpack_size(pk) <= 0) { + return false; + } + + if (as_unpack_size(pk) <= 0) { + return false; + } + + return true; +} + 
+//------------------------------------------------ +// map_packer + +static as_particle * +map_packer_create_particle(map_packer *pk, rollback_alloc *alloc_buf) +{ + uint32_t sz = pk->ext_sz + pk->content_sz + + as_pack_map_header_get_size(pk->ele_count + (pk->flags ? 1 : 0)); + map_mem *p_map_mem = (map_mem *)(alloc_buf + ? rollback_alloc_reserve(alloc_buf, sizeof(map_mem) + sz) + : cf_malloc(sizeof(map_mem) + sz)); // response, so not cf_malloc_ns() + + p_map_mem->type = AS_PARTICLE_TYPE_MAP; + p_map_mem->sz = sz; + pk->write_ptr = p_map_mem->data; + + return (as_particle *)p_map_mem; +} + +static void +map_packer_init(map_packer *pk, uint32_t ele_count, uint8_t flags, + uint32_t content_sz) +{ + pk->ele_count = ele_count; + pk->content_sz = content_sz; + pk->ext_content_sz = 0; + + offset_index_init(&pk->offset_idx, NULL, ele_count, NULL, content_sz); + + if (flags & AS_PACKED_MAP_FLAG_OFF_IDX) { + pk->ext_content_sz += offset_index_size(&pk->offset_idx); + } + + order_index_init(&pk->value_idx, NULL, ele_count); + + if (flags & AS_PACKED_MAP_FLAG_ORD_IDX) { + pk->ext_content_sz += order_index_size(&pk->value_idx); + } + + pk->flags = flags; + + if (flags == AS_PACKED_MAP_FLAG_NONE) { + pk->ext_header_sz = 0; + pk->ext_sz = 0; + } + else { + pk->ext_header_sz = as_pack_ext_header_get_size(pk->ext_content_sz); + pk->ext_sz = pk->ext_header_sz + pk->ext_content_sz + 1; // +1 for packed nil + } + + pk->write_ptr = NULL; + pk->contents = NULL; +} + +static void +map_packer_setup_bin(map_packer *pk, as_bin *b, rollback_alloc *alloc_buf) +{ + b->particle = map_packer_create_particle(pk, alloc_buf); +} + +static void +map_packer_write_hdridx(map_packer *pk) +{ + as_packer write = { + .buffer = pk->write_ptr, + .capacity = INT_MAX + }; + + as_pack_map_header(&write, pk->ele_count + + (pk->flags == AS_PACKED_MAP_FLAG_NONE ? 
0 : 1)); + + if (pk->flags == AS_PACKED_MAP_FLAG_NONE) { + pk->write_ptr += write.offset; + pk->contents = pk->write_ptr; + + return; + } + + as_pack_ext_header(&write, pk->ext_content_sz, pk->flags); + + if (pk->ext_content_sz > 0) { + uint8_t *ptr = pk->write_ptr + write.offset; + uint32_t index_sz_left = pk->ext_content_sz; + uint32_t sz = offset_index_size(&pk->offset_idx); + + if ((pk->flags & AS_PACKED_MAP_FLAG_OFF_IDX) && index_sz_left >= sz) { + offset_index_set_ptr(&pk->offset_idx, ptr, + ptr + pk->ext_content_sz + 1); // +1 for nil pair + ptr += sz; + index_sz_left -= sz; + } + + sz = order_index_size(&pk->value_idx); + + if ((pk->flags & AS_PACKED_MAP_FLAG_ORD_IDX) && index_sz_left >= sz) { + order_index_set_ptr(&pk->value_idx, ptr); + } + } + + // Pack nil. + write.offset += pk->ext_content_sz; + write.buffer[write.offset++] = msgpack_nil[0]; + + pk->write_ptr += write.offset; + pk->contents = pk->write_ptr; + pk->offset_idx.contents = pk->contents; +} + +static bool +map_packer_fill_offset_index(map_packer *mpk) +{ + if (offset_index_is_null(&mpk->offset_idx)) { + return true; + } + + offset_index_set_filled(&mpk->offset_idx, 1); + + return map_offset_index_fill(&mpk->offset_idx, mpk->ele_count); +} + +// qsort_r callback function. 
+static int +map_packer_fill_index_sort_compare(const void *x, const void *y, void *p) +{ + index_sort_userdata *udata = (index_sort_userdata *)p; + + if (udata->error) { + return 0; + } + + order_index *ordidx = udata->order; + uint32_t x_idx = order_index_ptr2value(ordidx, x); + uint32_t y_idx = order_index_ptr2value(ordidx, y); + const offset_index *offidx = udata->offsets; + const uint8_t *contents = udata->contents; + uint32_t content_sz = udata->content_sz; + uint32_t x_off = offset_index_get_const(offidx, x_idx); + uint32_t y_off = offset_index_get_const(offidx, y_idx); + + as_unpacker x_pk = { + .buffer = contents, + .offset = x_off, + .length = content_sz + }; + + as_unpacker y_pk = { + .buffer = contents, + .offset = y_off, + .length = content_sz + }; + + if (udata->sort_by == SORT_BY_VALUE) { + // Skip keys. + if (as_unpack_size(&x_pk) <= 0) { + udata->error = true; + return 0; + } + + if (as_unpack_size(&y_pk) <= 0) { + udata->error = true; + return 0; + } + } + + msgpack_compare_t cmp = as_unpack_compare(&x_pk, &y_pk); + + if (cmp == MSGPACK_COMPARE_EQUAL) { + if (udata->sort_by == SORT_BY_KEY) { + if ((cmp = as_unpack_compare(&x_pk, &y_pk)) == + MSGPACK_COMPARE_EQUAL) { + return 0; + } + } + else { + return 0; + } + } + + if (cmp == MSGPACK_COMPARE_LESS) { + return -1; + } + + if (cmp == MSGPACK_COMPARE_GREATER) { + return 1; + } + + udata->error = true; + + return 0; +} + +static bool +map_packer_fill_ordidx(map_packer *mpk, const uint8_t *contents, + uint32_t content_sz) +{ + if (order_index_is_null(&mpk->value_idx)) { + return true; + } + + return order_index_set_sorted(&mpk->value_idx, &mpk->offset_idx, contents, + content_sz, SORT_BY_VALUE); +} + +static bool +map_packer_add_op_copy_index(map_packer *mpk, const packed_map_op *add_op, + map_ele_find *remove_info, const map_ele_find *add_info, uint32_t kv_sz) +{ + // No elements left. + if (add_op->new_ele_count == 0) { + return true; + } + + if (offset_index_is_valid(&mpk->offset_idx)) { + if (! 
packed_map_op_write_new_offidx(add_op, remove_info, add_info, + &mpk->offset_idx, kv_sz) && + ! map_packer_fill_offset_index(mpk)) { + return false; + } + } + + if (order_index_is_valid(&mpk->value_idx)) { + if (remove_info->found_key && + order_index_is_filled(&add_op->map->value_idx)) { + if (! packed_map_find_rank_indexed(add_op->map, remove_info)) { + cf_warning(AS_PARTICLE, "map_packer_add_op_copy_index() remove_info find rank failed"); + return false; + } + + if (! remove_info->found_value) { + cf_warning(AS_PARTICLE, "map_packer_add_op_copy_index() remove_info rank not found: idx=%u found=%d ele_count=%u", remove_info->idx, remove_info->found_key, add_op->map->ele_count); + return false; + } + } + + if (! packed_map_op_write_new_ordidx( + add_op, remove_info, add_info, &mpk->value_idx) && + ! map_packer_fill_ordidx(mpk, mpk->contents, mpk->content_sz)) { + return false; + } + } + + return true; +} + +static inline void +map_packer_write_seg1(map_packer *pk, const packed_map_op *op) +{ + pk->write_ptr = packed_map_op_write_seg1(op, pk->write_ptr); +} + +static inline void +map_packer_write_seg2(map_packer *pk, const packed_map_op *op) +{ + pk->write_ptr = packed_map_op_write_seg2(op, pk->write_ptr); +} + +static inline void +map_packer_write_msgpack_seg(map_packer *pk, const cdt_payload *seg) +{ + memcpy(pk->write_ptr, seg->ptr, seg->sz); + pk->write_ptr += seg->sz; +} + +//------------------------------------------------ +// map + +static int +map_set_flags(as_bin *b, rollback_alloc *alloc_buf, as_bin *result, + uint8_t set_flags) +{ + packed_map map; + + if (! 
packed_map_init_from_bin(&map, b, false)) { + cf_warning(AS_PARTICLE, "packed_map_set_flags() invalid packed map"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + uint8_t map_flags = map.flags; + uint32_t ele_count = map.ele_count; + bool reorder = false; + + if ((set_flags & AS_PACKED_MAP_FLAG_KV_ORDERED) == + AS_PACKED_MAP_FLAG_V_ORDERED) { + cf_warning(AS_PARTICLE, "packed_map_set_flags() invalid flags 0x%x", set_flags); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + if (is_kv_ordered(set_flags)) { + if (! is_kv_ordered(map_flags)) { + if (ele_count > 1 && ! is_k_ordered(map_flags)) { + reorder = true; + } + + map_flags |= AS_PACKED_MAP_FLAG_KV_ORDERED; + map_flags |= AS_PACKED_MAP_FLAG_OFF_IDX; + map_flags |= AS_PACKED_MAP_FLAG_ORD_IDX; + } + } + else if (is_k_ordered(set_flags)) { + if (is_kv_ordered(map_flags)) { + map_flags &= ~AS_PACKED_MAP_FLAG_V_ORDERED; + map_flags &= ~AS_PACKED_MAP_FLAG_ORD_IDX; + } + else if (! is_k_ordered(map_flags)) { + if (ele_count > 1) { + reorder = true; + } + + map_flags |= AS_PACKED_MAP_FLAG_K_ORDERED; + map_flags |= AS_PACKED_MAP_FLAG_OFF_IDX; + } + } + else if ((set_flags & AS_PACKED_MAP_FLAG_KV_ORDERED) == 0) { + map_flags &= ~AS_PACKED_MAP_FLAG_KV_ORDERED; + map_flags &= ~AS_PACKED_MAP_FLAG_OFF_IDX; + map_flags &= ~AS_PACKED_MAP_FLAG_ORD_IDX; + } + + define_map_packer(mpk, ele_count, map_flags, map.content_sz); + + map_packer_setup_bin(&mpk, b, alloc_buf); + map_packer_write_hdridx(&mpk); + + if (reorder) { + vla_map_offidx_if_invalid(u, &map); + + if (! packed_map_write_k_ordered(&map, mpk.write_ptr, + &mpk.offset_idx)) { + cf_warning(AS_PARTICLE, "packed_map_set_flags() sort on key failed, set_flags = 0x%x", set_flags); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + } + else { + memcpy(mpk.write_ptr, map.contents, map.content_sz); + + if (offset_index_is_valid(&mpk.offset_idx)) { + if (offset_index_is_full(&map.offidx)) { + offset_index_copy(&mpk.offset_idx, &map.offidx, 0, 0, + ele_count, 0); + } + else if (! 
map_packer_fill_offset_index(&mpk)) { + cf_warning(AS_PARTICLE, "packed_map_set_flags() fill index failed"); + return -AS_PROTO_RESULT_FAIL_UNKNOWN; + } + } + } + + if (order_index_is_valid(&mpk.value_idx)) { + if (order_index_is_filled(&map.value_idx)) { + order_index_copy(&mpk.value_idx, &map.value_idx, 0, 0, ele_count, + NULL); + } + else { + map_packer_fill_ordidx(&mpk, mpk.contents, mpk.content_sz); + } + } + +#ifdef MAP_DEBUG_VERIFY + if (! map_verify(b)) { + cdt_bin_print(b, "packed_map_set_flags"); + } +#endif + + return AS_PROTO_RESULT_OK; +} + +static int +map_increment(as_bin *b, rollback_alloc *alloc_buf, const cdt_payload *key, + const cdt_payload *delta_value, as_bin *result, bool is_decrement) +{ + packed_map map; + + if (! packed_map_init_from_bin(&map, b, true)) { + cf_warning(AS_PARTICLE, "packed_map_increment() invalid packed map, ele_count=%u", map.ele_count); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + map_ele_find find_key; + map_ele_find_init(&find_key, &map); + + if (! packed_map_find_key(&map, &find_key, key)) { + cf_warning(AS_PARTICLE, "packed_map_increment() invalid packed map"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + cdt_calc_delta calc_delta; + + if (! cdt_calc_delta_init(&calc_delta, delta_value, is_decrement)) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + if (find_key.found_key) { + define_map_unpacker(pk_map_value, &map); + + pk_map_value.offset = find_key.value_offset; + + if (! cdt_calc_delta_add(&calc_delta, &pk_map_value)) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + } + else { + if (! 
cdt_calc_delta_add(&calc_delta, NULL)) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + } + + uint8_t value_buf[CDT_MAX_PACKED_INT_SZ]; + + cdt_payload value = { + .ptr = value_buf, + .sz = 0 + }; + + cdt_calc_delta_pack_and_result(&calc_delta, &value, result); + + map_add_control control = { + .allow_overwrite = true, + .allow_create = true, + }; + + return map_add(b, alloc_buf, key, &value, NULL, &control); +} + +static int +map_add(as_bin *b, rollback_alloc *alloc_buf, const cdt_payload *key, + const cdt_payload *value, as_bin *result, + const map_add_control *control) +{ + packed_map map; + + if (! packed_map_init_from_bin(&map, b, true)) { + cf_warning(AS_PARTICLE, "map_add() invalid packed map, ele_count=%u", map.ele_count); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + map_ele_find find_key_to_remove; + map_ele_find_init(&find_key_to_remove, &map); + + if (! packed_map_find_key(&map, &find_key_to_remove, key)) { + cf_warning(AS_PARTICLE, "map_add() find key failed, ele_count=%u", map.ele_count); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + if (find_key_to_remove.found_key) { + // ADD for [unique] & [key exist]. + if (! control->allow_overwrite) { + return -AS_PROTO_RESULT_FAIL_ELEMENT_EXISTS; + } + } + else { + // REPLACE for ![key exist]. + if (! 
control->allow_create) { + return -AS_PROTO_RESULT_FAIL_ELEMENT_NOT_FOUND; + } + + // Normal cases handled by packed_map_op_add(): + // ADD for (![unique] & [key exist]) or ![key exist] + // PUT for all cases + // REPLACE for ([unique] & [key exist]) + // UPDATE for ([unique] & [key exist]) or ![key exist] + } + + define_map_op(op, &map); + int32_t new_sz = packed_map_op_add(&op, &find_key_to_remove); + + if (new_sz < 0) { + cf_warning(AS_PARTICLE, "map_add() failed with ret=%d, ele_count=%u", new_sz, map.ele_count); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + uint32_t content_sz = (uint32_t)new_sz + key->sz + value->sz; + define_map_packer(mpk, op.new_ele_count, map.flags, content_sz); + + map_packer_setup_bin(&mpk, b, alloc_buf); + map_packer_write_hdridx(&mpk); + + map_ele_find find_value_to_add; + + map_ele_find_init(&find_value_to_add, &map); + find_value_to_add.idx = find_key_to_remove.idx; // Find closest matching position for multiple same values. + + if (order_index_is_valid(&mpk.value_idx) && + order_index_is_filled(&map.value_idx)) { + if (! packed_map_find_rank_by_value_indexed(&map, + &find_value_to_add, value)) { + cf_warning(AS_PARTICLE, "map_add() find_value_to_add rank failed"); + return -AS_PROTO_RESULT_FAIL_UNKNOWN; + } + } + + map_packer_write_seg1(&mpk, &op); + map_packer_write_msgpack_seg(&mpk, key); + map_packer_write_msgpack_seg(&mpk, value); + map_packer_write_seg2(&mpk, &op); + + if (! map_packer_add_op_copy_index(&mpk, &op, &find_key_to_remove, + &find_value_to_add, key->sz + value->sz)) { + cf_warning(AS_PARTICLE, "map_add() copy index failed"); + return -AS_PROTO_RESULT_FAIL_UNKNOWN; + } + + if (result) { + as_bin_set_int(result, op.new_ele_count); + } + +#ifdef MAP_DEBUG_VERIFY + if (! 
map_verify(b)) { + cdt_bin_print(b, "map_add"); + } +#endif + + return AS_PROTO_RESULT_OK; +} + +static int +map_add_items_unordered(const packed_map *map, as_bin *b, + rollback_alloc *alloc_buf, const offset_index *val_off, + order_index *val_ord, as_bin *result, const map_add_control *control) +{ + define_cdt_idx_mask(rm_mask, map->ele_count); + uint32_t rm_count = 0; + uint32_t rm_sz = 0; + + for (uint32_t i = 0; i < map->ele_count; i++) { + uint32_t offset = offset_index_get_const(&map->offidx, i); + + cdt_payload value = { + .ptr = map->contents + offset, + .sz = map->content_sz - offset + }; + + order_index_find find = { + .count = val_ord->max_idx, + .target = 0 // find first occurrence of value + }; + + order_index_find_rank_by_value(val_ord, &value, val_off, &find); + + if (find.found) { + // ADD for [unique] & [key exist]. + if (! control->allow_overwrite) { + return -AS_PROTO_RESULT_FAIL_ELEMENT_EXISTS; + } + + cdt_idx_mask_set(rm_mask, i); + rm_count++; + rm_sz += offset_index_get_delta_const(&map->offidx, i); + } + else { + // REPLACE for ![key exist]. + if (! control->allow_create) { + return -AS_PROTO_RESULT_FAIL_ELEMENT_NOT_FOUND; + } + } + } + + uint32_t dup_count; + uint32_t dup_sz; + + order_index_sorted_mark_dup_eles(val_ord, val_off, &dup_count, &dup_sz); + + uint32_t new_ele_count = map->ele_count - rm_count + + val_ord->max_idx - dup_count; + uint32_t new_content_sz = map->content_sz - rm_sz + + val_off->content_sz - dup_sz; + define_map_packer(mpk, new_ele_count, map->flags, new_content_sz); + + map_packer_setup_bin(&mpk, b, alloc_buf); + map_packer_write_hdridx(&mpk); + mpk.write_ptr = cdt_idx_mask_write_eles(rm_mask, rm_count, &map->offidx, + mpk.write_ptr, true); + mpk.write_ptr = order_index_write_eles(val_ord, val_ord->max_idx, val_off, + mpk.write_ptr, false); + as_bin_set_int(result, new_ele_count); + +#ifdef MAP_DEBUG_VERIFY + if (! 
map_verify(b)) { + cdt_bin_print(b, "map_add_items_unordered"); + } +#endif + + return AS_PROTO_RESULT_OK; +} + +static int +map_add_items_ordered(const packed_map *map, as_bin *b, + rollback_alloc *alloc_buf, const offset_index *val_off, + order_index *val_ord, as_bin *result, const map_add_control *control) +{ + uint32_t dup_count; + uint32_t dup_sz; + + order_index_sorted_mark_dup_eles(val_ord, val_off, &dup_count, &dup_sz); + + if (map->ele_count == 0) { + uint32_t new_content_sz = order_index_get_ele_size(val_ord, + val_ord->max_idx, val_off); + uint32_t new_ele_count = val_ord->max_idx - dup_count; + define_map_packer(mpk, new_ele_count, map->flags, new_content_sz); + + map_packer_setup_bin(&mpk, b, alloc_buf); + map_packer_write_hdridx(&mpk); + order_index_write_eles(val_ord, val_ord->max_idx, val_off, + mpk.write_ptr, false); + + if (offset_index_is_valid(&mpk.offset_idx)) { + offset_index_set_filled(&mpk.offset_idx, 1); + + for (uint32_t i = 0; i < val_ord->max_idx; i++) { + uint32_t val_idx = order_index_get(val_ord, i); + + if (val_idx == val_ord->max_idx) { + continue; + } + + uint32_t sz = offset_index_get_delta_const(val_off, val_idx); + + offset_index_append_size(&mpk.offset_idx, sz); + } + } + + if (order_index_is_valid(&mpk.value_idx)) { + order_index_set(&mpk.value_idx, 0, new_ele_count); + } + + as_bin_set_int(result, new_ele_count); + +#ifdef MAP_DEBUG_VERIFY + if (! 
map_verify(b)) { + cdt_bin_print(b, "map_add_items_ordered"); + map_print(map, "original"); + offset_index_print(val_off, "val_off"); + order_index_print(val_ord, "val_ord"); + cf_crash(AS_PARTICLE, "ele_count 0 dup_count %u dup_sz %u new_ele_count %u new_content_sz %u", dup_count, dup_sz, new_ele_count, new_content_sz); + } +#endif + + return AS_PROTO_RESULT_OK; + } + + define_cdt_idx_mask(rm_mask, map->ele_count); + uint32_t rm_count = 0; + uint32_t rm_sz = 0; + define_order_index2(insert_idx, map->ele_count, val_ord->max_idx); + + for (uint32_t i = 0; i < val_ord->max_idx; i++) { + uint32_t val_idx = order_index_get(val_ord, i); + + if (val_idx == val_ord->max_idx) { + continue; + } + + uint32_t off = offset_index_get_const(val_off, val_idx); + uint32_t sz = offset_index_get_delta_const(val_off, val_idx); + + const cdt_payload value = { + .ptr = val_off->contents + off, + .sz = sz + }; + + map_ele_find find; + map_ele_find_init(&find, map); + + if (! packed_map_find_key_indexed(map, &find, &value)) { + cf_warning(AS_PARTICLE, "map_add_items_ordered() invalid packed map"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + if (find.found_key) { + // ADD for [unique] & [key exist]. + if (! control->allow_overwrite) { + return -AS_PROTO_RESULT_FAIL_ELEMENT_EXISTS; + } + + if (! cdt_idx_mask_is_set(rm_mask, find.idx)) { + cdt_idx_mask_set(rm_mask, find.idx); + rm_count++; + rm_sz += offset_index_get_delta_const(&map->offidx, find.idx); + } + } + else { + // REPLACE for ![key exist]. + if (! 
control->allow_create) { + return -AS_PROTO_RESULT_FAIL_ELEMENT_NOT_FOUND; + } + } + + cf_assert(find.idx <= map->ele_count, AS_PARTICLE, "Invalid find.idx %u > ele_count %u", find.idx, map->ele_count); + order_index_set(&insert_idx, i, find.idx); + } + + uint32_t new_ele_count = map->ele_count - rm_count + val_ord->max_idx - + dup_count; + uint32_t new_content_sz = map->content_sz - rm_sz + val_off->content_sz - + dup_sz; + define_map_packer(mpk, new_ele_count, map->flags, new_content_sz); + map_packer_setup_bin(&mpk, b, alloc_buf); + map_packer_write_hdridx(&mpk); + uint32_t start_off = 0; + + for (uint32_t i = 0; i < val_ord->max_idx; i++) { + uint32_t val_idx = order_index_get(val_ord, i); + + if (val_idx == val_ord->max_idx) { + continue; + } + + uint32_t index = order_index_get(&insert_idx, i); + uint32_t off = offset_index_get_const(&map->offidx, index); + + if (start_off < off) { + uint32_t sz = off - start_off; + + memcpy(mpk.write_ptr, map->contents + start_off, sz); + mpk.write_ptr += sz; + + if (index == map->ele_count) { + start_off = map->content_sz; + } + else if (cdt_idx_mask_is_set(rm_mask, index)) { + start_off = offset_index_get_const(&map->offidx, index + 1); + } + else { + start_off = off; + } + } + else if (index == map->ele_count) { + start_off = map->content_sz; + } + else if (start_off == off && cdt_idx_mask_is_set(rm_mask, index)) { + start_off = offset_index_get_const(&map->offidx, index + 1); + } + + uint32_t val_offset = offset_index_get_const(val_off, val_idx); + uint32_t val_sz = offset_index_get_delta_const(val_off, val_idx); + + memcpy(mpk.write_ptr, val_off->contents + val_offset, val_sz); + mpk.write_ptr += val_sz; + } + + uint32_t sz = map->content_sz - start_off; + + if (sz != 0) { + memcpy(mpk.write_ptr, map->contents + start_off, sz); + } + + if (offset_index_is_valid(&mpk.offset_idx)) { + uint32_t read_index = 0; + uint32_t write_index = 1; + int delta = 0; + + offset_index_set_filled(&mpk.offset_idx, 1); + + for (uint32_t i 
= 0; i < val_ord->max_idx; i++) { + uint32_t val_idx = order_index_get(val_ord, i); + + if (val_idx == val_ord->max_idx) { + continue; + } + + uint32_t index = order_index_get(&insert_idx, i); + + if (index > read_index) { + uint32_t count = index - read_index; + + if (read_index + count == map->ele_count) { + count--; + } + + offset_index_copy(&mpk.offset_idx, &map->offidx, write_index, + read_index + 1, count, delta); + write_index += count; + read_index += count; + offset_index_set_filled(&mpk.offset_idx, write_index); + + if (index != map->ele_count && + cdt_idx_mask_is_set(rm_mask, index)) { + read_index++; + delta -= offset_index_get_delta_const(&map->offidx, index); + } + } + else if (index != map->ele_count && index == read_index && + cdt_idx_mask_is_set(rm_mask, index)) { + read_index++; + delta -= offset_index_get_delta_const(&map->offidx, index); + } + + uint32_t sz = offset_index_get_delta_const(val_off, val_idx); + + offset_index_append_size(&mpk.offset_idx, sz); + write_index++; + delta += sz; + } + + if (read_index + 1 < map->ele_count && write_index < new_ele_count) { + offset_index_copy(&mpk.offset_idx, &map->offidx, write_index, + read_index + 1, map->ele_count - read_index - 1, delta); + } + + offset_index_set_filled(&mpk.offset_idx, map->ele_count); + } + + if (order_index_is_valid(&mpk.value_idx)) { + order_index_set(&mpk.value_idx, 0, new_ele_count); + } + + as_bin_set_int(result, new_ele_count); + +#ifdef MAP_DEBUG_VERIFY + if (! 
map_verify(b)) { + cdt_bin_print(b, "map_add_items_ordered"); + map_print(map, "original"); + offset_index_print(val_off, "val_off"); + order_index_print(val_ord, "val_ord"); + cf_crash(AS_PARTICLE, "ele_count %u dup_count %u dup_sz %u new_ele_count %u new_content_sz %u", map->ele_count, dup_count, dup_sz, new_ele_count, new_content_sz); + } +#endif + + return AS_PROTO_RESULT_OK; +} + +static int +map_add_items(as_bin *b, rollback_alloc *alloc_buf, const cdt_payload *items, + as_bin *result, const map_add_control *control) +{ + as_unpacker pk = { + .buffer = items->ptr, + .length = items->sz + }; + + int64_t items_count = as_unpack_map_header_element_count(&pk); + + if (items_count < 0) { + cf_warning(AS_PARTICLE, "map_add_items() invalid parameter, expected packed map"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + if (items_count > 0 && as_unpack_peek_is_ext(&pk)) { + if (! skip_map_pair(&pk)) { + cf_warning(AS_PARTICLE, "map_add_items() invalid parameter"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + items_count--; + } + + if (items_count == 0) { + return AS_PROTO_RESULT_OK; // no-op + } + + packed_map map; + + if (! packed_map_init_from_bin(&map, b, true)) { + cf_warning(AS_PARTICLE, "map_add_items() invalid packed map, ele_count=%u", map.ele_count); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + vla_map_offidx_if_invalid(u, &map); + + // Pre-fill index. + if (! map_offset_index_fill(u.offidx, map.ele_count)) { + cf_warning(AS_PARTICLE, "map_add_items() invalid packed map"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + const uint8_t *val_contents = pk.buffer + pk.offset; + uint32_t val_content_sz = pk.length - pk.offset; + uint32_t val_count = (uint32_t)items_count; + define_order_index(val_ord, val_count); + define_offset_index(val_off, val_contents, val_content_sz, val_count); + + // Sort items to add. + if (! map_offset_index_fill(&val_off, val_count) || + ! 
order_index_set_sorted(&val_ord, &val_off, val_contents, + val_content_sz, SORT_BY_KEY)) { + cf_warning(AS_PARTICLE, "map_add_items() invalid packed map"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + if (map_is_k_ordered(&map)) { + return map_add_items_ordered(&map, b, alloc_buf, &val_off, &val_ord, + result, control); + } + + return map_add_items_unordered(&map, b, alloc_buf, &val_off, &val_ord, + result, control); +} + +static int +map_remove_by_key_interval(as_bin *b, rollback_alloc *alloc_buf, + const cdt_payload *key_start, const cdt_payload *key_end, + cdt_result_data *result) +{ + packed_map map; + + if (! packed_map_init_from_bin(&map, b, true)) { + cf_warning(AS_PARTICLE, "packed_map_remove_by_key_interval() invalid packed map, ele_count=%u", map.ele_count); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + return packed_map_get_remove_by_key_interval(&map, b, alloc_buf, key_start, + key_end, result); +} + +static int +map_remove_by_index_range(as_bin *b, rollback_alloc *alloc_buf, + int64_t index, uint64_t count, cdt_result_data *result) +{ + packed_map map; + + if (! packed_map_init_from_bin(&map, b, true)) { + cf_warning(AS_PARTICLE, "packed_map_remove_by_index_range() invalid packed map index, ele_count=%u", map.ele_count); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + return packed_map_get_remove_by_index_range(&map, b, alloc_buf, index, + count, result); +} + +// value_end == NULL means looking for: [value_start, largest possible value]. +// value_start == value_end means looking for a single value: [value_start, value_start]. +static int +map_remove_by_value_interval(as_bin *b, rollback_alloc *alloc_buf, + const cdt_payload *value_start, const cdt_payload *value_end, + cdt_result_data *result) +{ + packed_map map; + + if (! 
packed_map_init_from_bin(&map, b, true)) { + cf_warning(AS_PARTICLE, "packed_map_remove_by_value_interval() invalid packed map, ele_count=%u", map.ele_count); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + return packed_map_get_remove_by_value_interval(&map, b, alloc_buf, + value_start, value_end, result); +} + +static int +map_remove_by_rank_range(as_bin *b, rollback_alloc *alloc_buf, + int64_t rank, uint64_t count, cdt_result_data *result) +{ + packed_map map; + + if (! packed_map_init_from_bin(&map, b, true)) { + cf_warning(AS_PARTICLE, "packed_map_remove_by_index_range() invalid packed map index, ele_count=%u", map.ele_count); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + return packed_map_get_remove_by_rank_range(&map, b, alloc_buf, rank, count, + result); +} + +static int +map_remove_all_by_key_list(as_bin *b, rollback_alloc *alloc_buf, + const cdt_payload *key_list, cdt_result_data *result) +{ + packed_map map; + + if (! packed_map_init_from_bin(&map, b, true)) { + cf_warning(AS_PARTICLE, "map_remove_all_by_key_list() invalid packed map, ele_count=%u", map.ele_count); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + return packed_map_get_remove_all_by_key_list(&map, b, alloc_buf, key_list, + result); +} + +static int +map_remove_all_by_value_list(as_bin *b, rollback_alloc *alloc_buf, + const cdt_payload *value_list, cdt_result_data *result) +{ + packed_map map; + + if (! packed_map_init_from_bin(&map, b, true)) { + cf_warning(AS_PARTICLE, "map_get_remove_all_value_items() invalid packed map, ele_count=%u", map.ele_count); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + return packed_map_get_remove_all_by_value_list(&map, b, alloc_buf, + value_list, result); +} + +static int +map_clear(as_bin *b, rollback_alloc *alloc_buf, as_bin *result) +{ + packed_map map; + + if (! 
packed_map_init_from_bin(&map, b, false)) { + cf_warning(AS_PARTICLE, "packed_map_clear() invalid packed map, ele_count=%u", map.ele_count); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + define_map_packer(mpk, 0, map.flags, 0); + + map_packer_setup_bin(&mpk, b, alloc_buf); + map_packer_write_hdridx(&mpk); + + return AS_PROTO_RESULT_OK; +} + +//------------------------------------------------ +// packed_map + +static bool +packed_map_init(packed_map *map, const uint8_t *buf, uint32_t sz, + bool fill_idxs) +{ + map->packed = buf; + map->packed_sz = sz; + + map->ele_count = 0; + + return packed_map_unpack_hdridx(map, fill_idxs); +} + +static inline bool +packed_map_init_from_particle(packed_map *map, const as_particle *p, + bool fill_idxs) +{ + const map_mem *p_map_mem = (const map_mem *)p; + return packed_map_init(map, p_map_mem->data, p_map_mem->sz, fill_idxs); +} + +static bool +packed_map_init_from_bin(packed_map *map, const as_bin *b, bool fill_idxs) +{ + uint8_t type = as_bin_get_particle_type(b); + + cf_assert(is_map_type(type), AS_PARTICLE, "as_packed_map_init_from_bin() invalid type %d", type); + + return packed_map_init_from_particle(map, b->particle, fill_idxs); +} + +static bool +packed_map_unpack_hdridx(packed_map *map, bool fill_idxs) +{ + as_unpacker pk = { + .buffer = map->packed, + .length = map->packed_sz + }; + + if (map->packed_sz == 0) { + map->flags = 0; + return false; + } + + int64_t ele_count = as_unpack_map_header_element_count(&pk); + + if (ele_count < 0) { + return false; + } + + map->ele_count = (uint32_t)ele_count; + + if (ele_count != 0 && as_unpack_peek_is_ext(&pk)) { + as_msgpack_ext ext; + + if (as_unpack_ext(&pk, &ext) != 0) { + return false; + } + + if (as_unpack_size(&pk) <= 0) { // skip the packed nil + return false; + } + + map->flags = ext.type; + map->ele_count--; + + map->contents = map->packed + pk.offset; + map->content_sz = map->packed_sz - pk.offset; + offset_index_init(&map->offidx, NULL, map->ele_count, 
map->contents, + map->content_sz); + order_index_init(&map->value_idx, NULL, map->ele_count); + + uint32_t index_sz_left = ext.size; + uint8_t *ptr = (uint8_t *)ext.data; + uint32_t sz = offset_index_size(&map->offidx); + + if ((map->flags & AS_PACKED_MAP_FLAG_OFF_IDX) && index_sz_left >= sz) { + offset_index_set_ptr(&map->offidx, ptr, map->packed + pk.offset); + ptr += sz; + index_sz_left -= sz; + + if (fill_idxs) { + map_fill_offidx(map); + } + } + + sz = order_index_size(&map->value_idx); + + if ((map->flags & AS_PACKED_MAP_FLAG_ORD_IDX) && index_sz_left >= sz) { + order_index_set_ptr(&map->value_idx, ptr); + } + } + else { + map->contents = map->packed + pk.offset; + map->content_sz = map->packed_sz - pk.offset; + + offset_index_init(&map->offidx, NULL, ele_count, map->contents, + map->content_sz); + order_index_init(&map->value_idx, NULL, ele_count); + map->flags = AS_PACKED_MAP_FLAG_NONE; + } + + return true; +} + +static void +packed_map_init_indexes(const packed_map *map, as_packer *pk) +{ + uint8_t *ptr = pk->buffer + pk->offset; + + if (map_is_k_ordered(map)) { + offset_index offidx; + + offset_index_init(&offidx, ptr, map->ele_count, map->contents, + map->content_sz); + + uint32_t offidx_sz = offset_index_size(&offidx); + + ptr += offidx_sz; + offset_index_set_filled(&offidx, 1); + pk->offset += offidx_sz; + } + + if (map_is_kv_ordered(map)) { + order_index ordidx; + + order_index_init(&ordidx, ptr, map->ele_count); + order_index_set(&ordidx, 0, map->ele_count); + pk->offset += order_index_size(&ordidx); + } +} + +static bool +packed_map_ensure_ordidx_filled(const packed_map *op) +{ + order_index *ordidx = (order_index *)&op->value_idx; + + if (! order_index_is_filled(ordidx)) { + if (! 
map_fill_offidx(op)) { + cf_warning(AS_PARTICLE, "packed_map_ensure_ordidx_filled() failed to fill offset_idx"); + return false; + } + + return order_index_set_sorted(ordidx, &op->offidx, + op->contents, op->content_sz, SORT_BY_VALUE); + } + + return true; +} + +static uint32_t +packed_map_find_index_by_idx_unordered(const packed_map *map, uint32_t idx) +{ + uint32_t pk_offset = offset_index_get_const(&map->offidx, idx); + + cdt_payload key = { + .ptr = map->contents + pk_offset, + .sz = map->content_sz - pk_offset + }; + + return packed_map_find_index_by_key_unordered(map, &key); +} + +static uint32_t +packed_map_find_index_by_key_unordered(const packed_map *map, + const cdt_payload *key) +{ + as_unpacker pk_key = { + .buffer = key->ptr, + .length = key->sz + }; + + uint32_t index = 0; + define_map_unpacker(pk, map); + + for (uint32_t i = 0; i < map->ele_count; i++) { + pk_key.offset = 0; + msgpack_compare_t cmp = as_unpack_compare(&pk, &pk_key); + + if (cmp == MSGPACK_COMPARE_ERROR) { + return map->ele_count; + } + + if (cmp == MSGPACK_COMPARE_LESS) { + index++; + } + + if (as_unpack_size(&pk) <= 0) { + return map->ele_count; + } + } + + return index; +} + +static void +packed_map_find_rank_indexed_linear(const packed_map *map, map_ele_find *find, + uint32_t start, uint32_t len) +{ + uint32_t rank = order_index_find_idx(&map->value_idx, find->idx, start, + len); + + if (rank < start + len) { + find->found_value = true; + find->rank = rank; + } +} + +// Find rank given index (find->idx). +// Return true on success. 
+static bool +packed_map_find_rank_indexed(const packed_map *map, map_ele_find *find) +{ + uint32_t ele_count = map->ele_count; + + if (ele_count == 0) { + return true; + } + + if (find->idx >= ele_count) { + find->found_value = false; + return true; + } + + const offset_index *offset_idx = &map->offidx; + const order_index *value_idx = &map->value_idx; + + uint32_t rank = ele_count / 2; + uint32_t upper = ele_count; + uint32_t lower = 0; + + as_unpacker pk_value = { + .buffer = map->contents + find->value_offset, + .length = find->key_offset + find->sz - find->value_offset + }; + + find->found_value = false; + + while (true) { + if (upper - lower < LINEAR_FIND_RANK_MAX_COUNT) { + packed_map_find_rank_indexed_linear(map, find, lower, + upper - lower); + return true; + } + + uint32_t idx = order_index_get(value_idx, rank); + + if (find->idx == idx) { + find->found_value = true; + find->rank = rank; + break; + } + + as_unpacker pk_buf = { + .buffer = map->contents, + .offset = offset_index_get_const(offset_idx, idx), + .length = map->content_sz + }; + + if (as_unpack_size(&pk_buf) <= 0) { // skip key + cf_warning(AS_PARTICLE, "packed_map_find_rank_indexed() unpack key failed at rank=%u", rank); + return false; + } + + pk_value.offset = 0; // reset + + msgpack_compare_t cmp = as_unpack_compare(&pk_value, &pk_buf); + + if (cmp == MSGPACK_COMPARE_EQUAL) { + if (find->idx < idx) { + cmp = MSGPACK_COMPARE_LESS; + } + else if (find->idx > idx) { + cmp = MSGPACK_COMPARE_GREATER; + } + + find->found_value = true; + } + + if (cmp == MSGPACK_COMPARE_EQUAL) { + find->rank = rank; + break; + } + + if (cmp == MSGPACK_COMPARE_GREATER) { + if (rank >= upper - 1) { + find->rank = rank + 1; + break; + } + + lower = rank + 1; + rank += upper; + rank /= 2; + } + else if (cmp == MSGPACK_COMPARE_LESS) { + if (rank == lower) { + find->rank = rank; + break; + } + + upper = rank; + rank += lower; + rank /= 2; + } + else { + cf_warning(AS_PARTICLE, "packed_map_find_rank_indexed() error=%d 
lower=%u rank=%u upper=%u", (int)cmp, lower, rank, upper); + return false; + } + } + + return true; +} + +// Find (closest) rank given value. +// Find closest rank for find->idx (0 means first instance of value). +// FIXME - this is mechanically different from order_index_find_rank_by_value() +// where target = ele_count finds the largest rank; here it finds the largest +// rank + 1 in the case that the value exist; fix to conform. +// Return true on success. +static bool +packed_map_find_rank_by_value_indexed(const packed_map *map, map_ele_find *find, + const cdt_payload *value) +{ + const offset_index *offset_idx = &map->offidx; + const order_index *value_idx = &map->value_idx; + + find->found_value = false; + + if (map->ele_count == 0) { + return true; + } + + uint32_t rank = map->ele_count / 2; + + as_unpacker pk_value = { + .buffer = value->ptr, + .length = value->sz + }; + + while (true) { + uint32_t idx = order_index_get(value_idx, rank); + uint32_t pk_offset = offset_index_get_const(offset_idx, idx); + + as_unpacker pk_buf = { + .buffer = map->contents + pk_offset, + .length = map->content_sz - pk_offset + }; + + if (as_unpack_size(&pk_buf) <= 0) { // skip key + return false; + } + + pk_value.offset = 0; // reset + + msgpack_compare_t cmp = as_unpack_compare(&pk_value, &pk_buf); + + if (cmp == MSGPACK_COMPARE_EQUAL) { + if (find->idx < idx) { + cmp = MSGPACK_COMPARE_LESS; + } + else if (find->idx > idx) { + cmp = MSGPACK_COMPARE_GREATER; + } + + find->found_value = true; + } + + if (cmp == MSGPACK_COMPARE_EQUAL) { + find->found_value = true; + find->rank = rank; + break; + } + + if (cmp == MSGPACK_COMPARE_GREATER) { + if (rank >= find->upper - 1) { + find->rank = rank + 1; + break; + } + + find->lower = rank + 1; + rank += find->upper; + rank /= 2; + } + else if (cmp == MSGPACK_COMPARE_LESS) { + if (rank == find->lower) { + find->rank = rank; + break; + } + + find->upper = rank; + rank += find->lower; + rank /= 2; + } + else { + return false; + } + } + + 
return true; +} + +// value_end == NULL means looking for: [value_start, largest possible value]. +// value_start == value_end means looking for a single value: [value_start, value_start]. +static bool +packed_map_find_rank_range_by_value_interval_indexed(const packed_map *map, + const cdt_payload *value_start, const cdt_payload *value_end, + uint32_t *rank, uint32_t *count, bool is_multi) +{ + cf_assert(map_has_offidx(map), AS_PARTICLE, "packed_map_find_rank_range_by_value_interval_indexed() offset_index needs to be valid"); + + map_ele_find find_start; + + map_ele_find_init(&find_start, map); + find_start.idx = 0; // find least ranked entry with value == value_start + + if (! packed_map_find_rank_by_value_indexed(map, &find_start, + value_start)) { + cf_warning(AS_PARTICLE, "packed_map_find_rank_range_by_value_interval_indexed() invalid packed map"); + return false; + } + + *rank = find_start.rank; + *count = 1; + + if (! value_end || ! value_end->ptr) { + *count = map->ele_count - *rank; + } + else { + map_ele_find find_end; + + map_ele_find_init(&find_end, map); + + if (value_end != value_start) { + find_end.idx = 0; + + if (! packed_map_find_rank_by_value_indexed(map, &find_end, + value_end)) { + cf_warning(AS_PARTICLE, "packed_map_find_rank_range_by_value_interval_indexed() invalid packed map"); + return false; + } + + *count = (find_end.rank > find_start.rank) ? + find_end.rank - find_start.rank : 0; + } + else { + if (! find_start.found_value) { + *count = 0; + } + else if (is_multi) { + find_end.idx = map->ele_count; // find highest ranked entry with value == value_start + + if (! packed_map_find_rank_by_value_indexed(map, &find_end, + value_start)) { + cf_warning(AS_PARTICLE, "packed_map_find_rank_range_by_value_interval_indexed() invalid packed map"); + return false; + } + + *count = find_end.rank - find_start.rank; + } + } + } + + return true; +} + +// value_end == NULL means looking for: [value_start, largest possible value]. 
+// value_start == value_end means looking for a single value: [value_start, value_start]. +static bool +packed_map_find_rank_range_by_value_interval_unordered(const packed_map *map, + const cdt_payload *value_start, const cdt_payload *value_end, + uint32_t *rank, uint32_t *count, uint64_t *mask) +{ + cf_assert(map_has_offidx(map), AS_PARTICLE, "packed_map_find_rank_range_by_value_interval_unordered() offset_index needs to be valid"); + cf_assert(value_end, AS_PARTICLE, "value_end == NULL"); + + as_unpacker pk_start = { + .buffer = value_start->ptr, + .length = value_start->sz + }; + + as_unpacker pk_end = { + .buffer = value_end->ptr, + .length = value_end->sz + }; + + // Pre-check parameters. + if (as_unpack_size(&pk_start) <= 0) { + cf_warning(AS_PARTICLE, "packed_map_find_rank_range_by_value_interval_unordered() invalid start value"); + return false; + } + + if (value_end != value_start) { + // Pre-check parameters. + if (value_end->ptr && as_unpack_size(&pk_end) < 0) { + cf_warning(AS_PARTICLE, "packed_map_find_rank_range_by_value_interval_unordered() invalid end value"); + return false; + } + } + + *rank = 0; + *count = 0; + + offset_index *offidx = (offset_index *)&map->offidx; + define_map_unpacker(pk, map); + + for (uint32_t i = 0; i < map->ele_count; i++) { + offset_index_set(offidx, i, pk.offset); + + if (as_unpack_size(&pk) <= 0) { // skip key + cf_warning(AS_PARTICLE, "packed_map_find_rank_range_by_value_interval_unordered() invalid packed map at index %u", i); + return false; + } + + uint32_t value_offset = pk.offset; // save for pk_end + + pk_start.offset = 0; // reset + + msgpack_compare_t cmp_start = as_unpack_compare(&pk, &pk_start); + + if (cmp_start == MSGPACK_COMPARE_ERROR) { + cf_warning(AS_PARTICLE, "packed_map_find_rank_range_by_value_interval_unordered() invalid packed map at index %u", i); + return false; + } + + if (cmp_start == MSGPACK_COMPARE_LESS) { + (*rank)++; + } + else if (value_start != value_end) { + msgpack_compare_t cmp_end = 
MSGPACK_COMPARE_LESS; + + // NULL value_end means largest possible value. + if (value_end->ptr) { + pk.offset = value_offset; + pk_end.offset = 0; + cmp_end = as_unpack_compare(&pk, &pk_end); + } + + if (cmp_end == MSGPACK_COMPARE_LESS) { + cdt_idx_mask_set(mask, i); + (*count)++; + } + } + // Single value case. + else if (cmp_start == MSGPACK_COMPARE_EQUAL) { + cdt_idx_mask_set(mask, i); + (*count)++; + } + } + + offset_index_set_filled(offidx, map->ele_count); + + return true; +} + +// Find key given list index. +// Return true on success. +static bool +packed_map_find_key_indexed(const packed_map *map, map_ele_find *find, + const cdt_payload *key) +{ + const offset_index *offidx = &map->offidx; + uint32_t ele_count = map->ele_count; + + find->lower = 0; + find->upper = ele_count; + + uint32_t idx = (find->lower + find->upper) / 2; + + as_unpacker pk_key = { + .buffer = key->ptr, + .length = key->sz + }; + + find->found_key = false; + + if (ele_count == 0) { + find->idx = 0; + return true; + } + + while (true) { + uint32_t offset = offset_index_get_const(offidx, idx); + uint32_t content_sz = map->content_sz; + uint32_t sz = content_sz - offset; + + as_unpacker pk_buf = { + .buffer = map->contents + offset, + .length = sz + }; + + pk_key.offset = 0; // reset + + msgpack_compare_t cmp = as_unpack_compare(&pk_key, &pk_buf); + uint32_t key_sz = pk_buf.offset; + + if (cmp == MSGPACK_COMPARE_EQUAL) { + if (! find->found_key) { + find->found_key = true; + find->key_offset = offset; + find->value_offset = offset + key_sz; + find->idx = idx++; + find->sz = (idx >= ele_count) ? + sz : offset_index_get_const(offidx, idx) - offset; + } + + break; + } + + if (cmp == MSGPACK_COMPARE_GREATER) { + if (idx >= find->upper - 1) { + if (++idx >= ele_count) { + find->key_offset = content_sz; + find->value_offset = content_sz; + find->idx = idx; + find->sz = 0; + break; + } + + if (! 
find->found_key) { + uint32_t offset = offset_index_get_const(offidx, idx); + uint32_t tail = content_sz - offset; + + as_unpacker pk = { + .buffer = map->contents + offset, + .length = tail + }; + + if (as_unpack_size(&pk) <= 0) { + cf_warning(AS_PARTICLE, "packed_map_find_key_indexed() invalid packed map"); + return false; + } + + find->key_offset = offset; + find->value_offset = offset + pk.offset; + find->idx = idx++; + find->sz = (idx >= ele_count) ? + tail : offset_index_get_const(offidx, idx) - offset; + } + + break; + } + + find->lower = idx + 1; + idx += find->upper; + idx /= 2; + } + else if (cmp == MSGPACK_COMPARE_LESS) { + if (idx == find->lower) { + find->key_offset = offset; + find->value_offset = offset + key_sz; + find->idx = idx++; + find->sz = (idx >= ele_count) ? + sz : offset_index_get_const(offidx, idx) - offset; + break; + } + + find->upper = idx; + idx += find->lower; + idx /= 2; + } + else { + cf_warning(AS_PARTICLE, "packed_map_find_key_indexed() compare error=%d", (int)cmp); + return false; + } + } + + return true; +} + +static bool +packed_map_find_key(const packed_map *map, map_ele_find *find, + const cdt_payload *key) +{ + uint32_t ele_count = map->ele_count; + offset_index *offidx = (offset_index *)&map->offidx; + + if (ele_count == 0) { + return true; + } + + if (map_is_k_ordered(map) && offset_index_is_full(offidx)) { + if (! packed_map_find_key_indexed(map, find, key)) { + cf_warning(AS_PARTICLE, "packed_map_find_key() packed_map_op_find_key_indexed failed"); + return false; + } + + return true; + } + + as_unpacker pk_key = { + .buffer = key->ptr, + .length = key->sz + }; + + find->found_key = false; + + define_map_unpacker(pk, map); + uint32_t content_sz = pk.length; + + if (! offset_index_is_valid(offidx)) { + offidx = NULL; + } + + if (map_is_k_ordered(map)) { + // Ordered compare. + + // Allows for continuation from last search. 
+ if (find->lower > 0) { + pk.offset = find->key_offset; + } + + for (uint32_t i = find->lower; i < find->upper; i++) { + uint32_t key_offset = pk.offset; + uint32_t sz; + + pk_key.offset = 0; // reset + + msgpack_compare_t cmp = as_unpack_compare(&pk_key, &pk); + + if (cmp == MSGPACK_COMPARE_ERROR) { + return false; + } + + find->value_offset = pk.offset; + + if (offidx) { + int64_t ret = map_offset_index_get_delta(offidx, i); + + if (ret < 0) { + return false; + } + + pk.offset = (uint32_t)map_offset_index_get(offidx, i + 1); + sz = (uint32_t)ret; + } + else { + // Skip value. + if (as_unpack_size(&pk) <= 0) { + return false; + } + + sz = pk.offset - key_offset; + } + + if (cmp != MSGPACK_COMPARE_GREATER) { + if (cmp == MSGPACK_COMPARE_EQUAL) { + find->found_key = true; + } + + find->idx = i; + find->key_offset = key_offset; + find->sz = sz; + + return true; + } + } + + if (find->upper == ele_count) { + find->key_offset = content_sz; + find->value_offset = content_sz; + find->sz = 0; + } + else { + if (offidx && ! offset_index_set_next(offidx, find->upper, + pk.offset)) { + cf_warning(AS_PARTICLE, "offset mismatch at i=%u offset=%u offidx_offset=%u", find->upper, pk.offset, offset_index_get_const(offidx, find->upper)); + } + + find->key_offset = pk.offset; + + // Skip key. + if (as_unpack_size(&pk) <= 0) { + return false; + } + + find->value_offset = pk.offset; + + // Skip value. + if (as_unpack_size(&pk) <= 0) { + return false; + } + + find->sz = pk.offset - find->key_offset; + } + + find->idx = find->upper; + } + else { + // Unordered compare. + // Assumes same keys are clustered. + for (uint32_t i = 0; i < ele_count; i++) { + uint32_t offset = pk.offset; + + pk_key.offset = 0; // reset + + msgpack_compare_t cmp = as_unpack_compare(&pk_key, &pk); + + if (cmp == MSGPACK_COMPARE_ERROR) { + return false; + } + + uint32_t value_offset = pk.offset; + + if (cmp == MSGPACK_COMPARE_EQUAL) { + // Skip value. 
+ if (as_unpack_size(&pk) <= 0) { + return false; + } + + if (! find->found_key) { + find->found_key = true; + find->idx = i; + find->key_offset = offset; + find->value_offset = value_offset; + find->sz = pk.offset - offset; + } + + if (offidx && ! offset_index_set_next(offidx, i + 1, + pk.offset)) { + cf_warning(AS_PARTICLE, "offset mismatch at i=%u offset=%u offidx_offset=%u", i + 1, pk.offset, offset_index_get_const(offidx, i + 1)); + } + + return true; + } + else if (find->found_key) { + return true; + } + else if (as_unpack_size(&pk) <= 0) { // skip value + return false; + } + + if (offidx && ! offset_index_set_next(offidx, i + 1, pk.offset)) { + cf_warning(AS_PARTICLE, "offset mismatch at i=%u offset=%u offidx_offset=%u", i + 1, pk.offset, offset_index_get_const(offidx, i + 1)); + } + } + + find->key_offset = content_sz; + find->value_offset = content_sz; + find->sz = 0; + find->idx = ele_count; + } + + return true; +} + +static int +packed_map_get_remove_by_key_interval(const packed_map *map, as_bin *b, + rollback_alloc *alloc_buf, const cdt_payload *key_start, + const cdt_payload *key_end, cdt_result_data *result) +{ + if (result_data_is_return_rank_range(result)) { + cf_warning(AS_PARTICLE, "packed_map_get_remove_by_key_interval() result_type %d not supported", result->type); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + vla_map_offidx_if_invalid(u, map); + uint32_t index = 0; + uint32_t count = 0; + + if (map_is_k_ordered(map)) { + if (! packed_map_get_range_by_key_interval_ordered(map, key_start, + key_end, &index, &count)) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + return packed_map_get_remove_by_index_range(map, b, alloc_buf, index, + count, result); + } + + bool inverted = result_data_is_inverted(result); + + if (inverted && ! 
result->is_multi) { + cf_warning(AS_PARTICLE, "packed_map_get_remove_by_key_interval() INVERTED flag not supported for single result ops"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + if (key_start == key_end) { + map_ele_find find_key; + map_ele_find_init(&find_key, map); + + if (! packed_map_find_key(map, &find_key, key_start)) { + cf_warning(AS_PARTICLE, "packed_map_get_remove_by_key_interval() find key failed, ele_count=%u", map->ele_count); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + if (! find_key.found_key) { + if (! result_data_set_key_not_found(result, -1)) { + cf_warning(AS_PARTICLE, "packed_map_get_remove_by_key_interval() invalid result_type %d", result->type); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + return AS_PROTO_RESULT_OK; + } + + if (b) { + define_map_op(op, map); + int32_t new_sz = packed_map_op_remove(&op, &find_key, 1, + find_key.sz); + + if (new_sz < 0) { + cf_warning(AS_PARTICLE, "packed_map_get_remove_by_key_interval() packed_map_transform_remove_key failed with ret=%d, ele_count=%u", new_sz, map->ele_count); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + define_map_packer(mpk, op.new_ele_count, map->flags, + (uint32_t)new_sz); + + map_packer_setup_bin(&mpk, b, alloc_buf); + map_packer_write_hdridx(&mpk); + map_packer_write_seg1(&mpk, &op); + map_packer_write_seg2(&mpk, &op); + } + +#ifdef MAP_DEBUG_VERIFY + if (b && ! map_verify(b)) { + cdt_bin_print(b, "packed_map_get_remove_by_key_interval"); + map_print(map, "original"); + cf_crash(AS_PARTICLE, "ele_count %u index %u count 1 is_multi %d inverted %d", map->ele_count, index, result->is_multi, inverted); + } +#endif + + return packed_map_build_result_by_key(map, key_start, find_key.idx, + 1, result); + } + + define_cdt_idx_mask(rm_mask, map->ele_count); + + if (! 
packed_map_get_range_by_key_interval_unordered(map, key_start, + key_end, &index, &count, rm_mask)) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + uint32_t rm_count = count; + + if (inverted) { + rm_count = map->ele_count - count; + cdt_idx_mask_invert(rm_mask, map->ele_count); + } + + int ret = AS_PROTO_RESULT_OK; + uint32_t rm_sz = 0; + + if (b) { + if ((ret = packed_map_remove_by_mask(map, b, alloc_buf, rm_mask, + rm_count, &rm_sz)) != AS_PROTO_RESULT_OK) { + return ret; + } + } + + if (result_data_is_return_elements(result)) { + if (! packed_map_build_ele_result_by_mask(map, rm_mask, rm_count, rm_sz, + result)) { + return -AS_PROTO_RESULT_FAIL_UNKNOWN; + } + } + else if (result_data_is_return_rank(result)) { + ret = packed_map_build_rank_result_by_mask(map, rm_mask, rm_count, + result); + } + else { + ret = result_data_set_range(result, index, count, map->ele_count); + } + + if (ret != AS_PROTO_RESULT_OK) { + return ret; + } + +#ifdef MAP_DEBUG_VERIFY + if (b && ! map_verify(b)) { + cdt_bin_print(b, "packed_map_get_remove_by_key_interval"); + map_print(map, "original"); + cf_crash(AS_PARTICLE, "ele_count %u index %u count %u rm_count %u inverted %d", map->ele_count, index, count, rm_count, inverted); + } +#endif + + return AS_PROTO_RESULT_OK; +} + +static int +packed_map_trim_ordered(const packed_map *map, as_bin *b, rollback_alloc *alloc_buf, + uint32_t index, uint32_t count, cdt_result_data *result) +{ + cf_assert(result->is_multi, AS_PARTICLE, "packed_map_trim_ordered() required to be a multi op"); + cf_assert(! result_data_is_inverted(result), AS_PARTICLE, "packed_map_trim_ordered() INVERTED flag not supported"); + + vla_map_offidx_if_invalid(u, map); + uint32_t rm_count = map->ele_count - count; + uint32_t index1 = index + count; + + // Pre-fill index. + if (! 
map_offset_index_fill(u.offidx, index + count)) { + cf_warning(AS_PARTICLE, "packed_map_trim_ordered() invalid packed map"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + uint32_t offset0 = offset_index_get_const(u.offidx, index); + uint32_t offset1 = offset_index_get_const(u.offidx, index1); + uint32_t content_sz = offset1 - offset0; + + if (b) { + define_map_packer(mpk, count, map->flags, content_sz); + + map_packer_setup_bin(&mpk, b, alloc_buf); + map_packer_write_hdridx(&mpk); + memcpy(mpk.write_ptr, map->contents + offset0, content_sz); + } + + switch (result->type) { + case RESULT_TYPE_NONE: + break; + case RESULT_TYPE_COUNT: + as_bin_set_int(result->result, rm_count); + break; + case RESULT_TYPE_REVINDEX: + case RESULT_TYPE_INDEX: { + bool is_rev = (result->type == RESULT_TYPE_REVINDEX); + define_int_list_builder(builder, result->alloc, rm_count); + + cdt_container_builder_add_int_range(&builder, 0, index, map->ele_count, + is_rev); + cdt_container_builder_add_int_range(&builder, index1, + map->ele_count - index1, map->ele_count, is_rev); + cdt_container_builder_set_result(&builder, result); + break; + } + case RESULT_TYPE_RANK: + case RESULT_TYPE_REVRANK: + result->flags = AS_CDT_OP_FLAG_INVERTED; + + return packed_map_build_rank_result_by_index_range(map, index, count, + result); + case RESULT_TYPE_KEY: + case RESULT_TYPE_VALUE: + case RESULT_TYPE_MAP: + result->flags = AS_CDT_OP_FLAG_INVERTED; + + if (! packed_map_build_ele_result_by_idx_range(map, index, count, + result)) { + return -AS_PROTO_RESULT_FAIL_UNKNOWN; + } + + break; + default: + cf_warning(AS_PARTICLE, "packed_map_trim_ordered() result_type %d not supported", result->type); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + return AS_PROTO_RESULT_OK; +} + +// Set b = NULL for get_by_index_range operation. 
+static int +packed_map_get_remove_by_index_range(const packed_map *map, as_bin *b, + rollback_alloc *alloc_buf, int64_t index, uint64_t count, + cdt_result_data *result) +{ + uint32_t uindex; + uint32_t count32; + + if (! calc_index_count(index, count, map->ele_count, &uindex, &count32, + result->is_multi)) { + cf_warning(AS_PARTICLE, "packed_map_get_remove_by_index_range() index %ld out of bounds for ele_count %u", index, map->ele_count); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + if (result_data_is_return_rank_range(result)) { + cf_warning(AS_PARTICLE, "packed_map_get_remove_by_index_range() result_type %d not supported", result->type); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + if (result_data_is_inverted(result)) { + if (! result->is_multi) { + cf_warning(AS_PARTICLE, "packed_map_get_remove_by_index_range() INVERTED flag not supported for single result ops"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + result->flags &= ~AS_CDT_OP_FLAG_INVERTED; + + if (count32 == 0) { + // Reduce to remove all. + uindex = 0; + count32 = map->ele_count; + } + else if (uindex == 0) { + // Reduce to remove tail section. + uindex = count32; + count32 = map->ele_count - count32; + } + else if (uindex + count32 >= map->ele_count) { + // Reduce to remove head section. + count32 = uindex; + uindex = 0; + } + else if (map_is_k_ordered(map)) { + return packed_map_trim_ordered(map, b, alloc_buf, uindex, count32, + result); + } + else { + result->flags |= AS_CDT_OP_FLAG_INVERTED; + } + } + + if (count32 == 0) { + if (! 
result_data_set_key_not_found(result, uindex)) { + cf_warning(AS_PARTICLE, "packed_map_get_remove_by_index_range() invalid result type %d", result->type); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + return AS_PROTO_RESULT_OK; + } + + vla_map_offidx_if_invalid(u, map); + + if (count32 == map->ele_count) { + return packed_map_get_remove_all(map, b, alloc_buf, result); + } + + int ret = AS_PROTO_RESULT_OK; + + if (map_is_k_ordered(map)) { + // Pre-fill index. + if (! map_offset_index_fill(u.offidx, uindex + count32)) { + cf_warning(AS_PARTICLE, "packed_map_get_remove_by_index_range() invalid packed map"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + if (b) { + ret = packed_map_remove_idx_range(map, b, alloc_buf, uindex, + count32); + + if (ret != AS_PROTO_RESULT_OK) { + return ret; + } + } + + if (result_data_is_return_elements(result)) { + if (! packed_map_build_ele_result_by_idx_range(map, uindex, count32, + result)) { + return -AS_PROTO_RESULT_FAIL_UNKNOWN; + } + } + else if (result_data_is_return_rank(result)) { + ret = packed_map_build_rank_result_by_index_range(map, uindex, + count32, result); + } + else { + ret = result_data_set_range(result, uindex, count32, + map->ele_count); + } + } + else { + // Pre-fill index. + if (! map_fill_offidx(map)) { + cf_warning(AS_PARTICLE, "packed_map_get_remove_by_index_range() invalid packed map"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + define_build_order_heap_by_range(heap, uindex, count32, map->ele_count, + map, packed_map_compare_key_by_idx, success); + + if (! success) { + cf_warning(AS_PARTICLE, "packed_map_get_remove_by_index_range() invalid packed map"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + uint32_t rm_sz = 0; + bool inverted = result_data_is_inverted(result); + define_cdt_idx_mask(rm_mask, map->ele_count); + uint32_t rm_count = (inverted ? 
map->ele_count - count32 : count32); + + cdt_idx_mask_set_by_ordidx(rm_mask, &heap._, heap.filled, count32, + inverted); + + if (b) { + int ret = packed_map_remove_by_mask(map, b, alloc_buf, rm_mask, + rm_count, &rm_sz); + + if (ret != AS_PROTO_RESULT_OK) { + return ret; + } + } + + switch (result->type) { + case RESULT_TYPE_RANK: + case RESULT_TYPE_REVRANK: + if (inverted) { + ret = packed_map_build_rank_result_by_mask(map, rm_mask, + rm_count, result); + } + else { + if (heap.cmp == MSGPACK_COMPARE_LESS) { + order_heap_reverse_end(&heap, count32); + } + + ret = packed_map_build_rank_result_by_ele_idx(map, &heap._, + heap.filled, count32, result); + } + break; + case RESULT_TYPE_KEY: + case RESULT_TYPE_VALUE: + case RESULT_TYPE_MAP: { + bool success; + + if (inverted) { + success = packed_map_build_ele_result_by_mask(map, rm_mask, + rm_count, rm_sz, result); + } + else { + if (heap.cmp == MSGPACK_COMPARE_LESS) { + order_heap_reverse_end(&heap, count32); + } + + success = packed_map_build_ele_result_by_ele_idx(map, &heap._, + heap.filled, count32, rm_sz, result); + } + + if (! success) { + cf_warning(AS_PARTICLE, "packed_map_get_remove_by_index_range() invalid packed map"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + break; + } + default: + ret = result_data_set_range(result, uindex, count32, + map->ele_count); + break; + } + } + + if (ret != AS_PROTO_RESULT_OK) { + return ret; + } + +#ifdef MAP_DEBUG_VERIFY + if (b && ! map_verify(b)) { + cdt_bin_print(b, "packed_map_get_remove_by_index_range"); + map_print(map, "original"); + cf_crash(AS_PARTICLE, "ele_count %u uindex %u count32 %u", map->ele_count, uindex, count32); + } +#endif + + return AS_PROTO_RESULT_OK; +} + +// value_end == NULL means looking for: [value_start, largest possible value]. +// value_start == value_end means looking for a single value: [value_start, value_start]. 
+static int +packed_map_get_remove_by_value_interval(const packed_map *map, as_bin *b, + rollback_alloc *alloc_buf, const cdt_payload *value_start, + const cdt_payload *value_end, cdt_result_data *result) +{ + if (result_data_is_return_index_range(result)) { + cf_warning(AS_PARTICLE, "packed_map_get_remove_by_value_interval() result_type %d not supported", result->type); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + bool inverted = result_data_is_inverted(result); + + if (inverted && ! result->is_multi) { + cf_warning(AS_PARTICLE, "packed_map_get_remove_by_value_interval() INVERTED flag not supported for single result ops"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + if (map->ele_count == 0) { + if (! result_data_set_value_not_found(result, -1)) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + return AS_PROTO_RESULT_OK; + } + + vla_map_offidx_if_invalid(u, map); + + // Pre-fill index. + if (! map_fill_offidx(map)) { + cf_warning(AS_PARTICLE, "packed_map_get_remove_by_value_interval() invalid packed map"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + uint32_t rank = 0; + uint32_t count = 0; + int ret = AS_PROTO_RESULT_OK; + + if (order_index_is_valid(&map->value_idx)) { + if (! packed_map_ensure_ordidx_filled(map)) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + if (! packed_map_find_rank_range_by_value_interval_indexed(map, + value_start, value_end, &rank, &count, result->is_multi)) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + uint32_t rm_count = (inverted ? 
map->ele_count - count : count); + bool need_mask = b || (inverted && + (result_data_is_return_elements(result) || + result_data_is_return_index(result))); + cond_define_cdt_idx_mask(rm_mask, map->ele_count, need_mask); + uint32_t rm_sz = 0; + + if (need_mask) { + cdt_idx_mask_set_by_ordidx(rm_mask, &map->value_idx, rank, count, + inverted); + } + + if (b) { + int ret = packed_map_remove_by_mask(map, b, alloc_buf, rm_mask, + rm_count, &rm_sz); + + if (ret != AS_PROTO_RESULT_OK) { + return ret; + } + } + + if (result_data_is_return_elements(result)) { + if (inverted) { + if (! packed_map_build_ele_result_by_mask(map, rm_mask, + rm_count, rm_sz, result)) { + cf_warning(AS_PARTICLE, "packed_map_get_remove_by_value_interval() invalid packed map"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + } + else if (! packed_map_build_ele_result_by_ele_idx(map, + &map->value_idx, rank, count, rm_sz, result)) { + cf_warning(AS_PARTICLE, "packed_map_get_remove_by_value_interval() invalid packed map"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + } + else if (result_data_is_return_index(result)) { + if (inverted) { + ret = packed_map_build_index_result_by_mask(map, rm_mask, + rm_count, result); + } + else { + ret = packed_map_build_index_result_by_ele_idx(map, + &map->value_idx, rank, count, result); + } + } + else { + ret = result_data_set_range(result, rank, count, map->ele_count); + } + } + else { + define_cdt_idx_mask(rm_mask, map->ele_count); + + if (! packed_map_find_rank_range_by_value_interval_unordered(map, + value_start, value_end, &rank, &count, rm_mask)) { + cf_warning(AS_PARTICLE, "packed_map_get_remove_by_value_interval() invalid packed map"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + if (count == 0) { + if (inverted) { + result->flags &= ~AS_CDT_OP_FLAG_INVERTED; + + return packed_map_get_remove_all(map, b, alloc_buf, result); + } + else if (! 
result_data_set_value_not_found(result, rank)) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + } + else { + if (! result->is_multi) { + count = 1; + } + + uint32_t rm_sz = 0; + uint32_t rm_count = count; + + if (inverted) { + cdt_idx_mask_invert(rm_mask, map->ele_count); + rm_count = map->ele_count - count; + } + + if (b) { + ret = packed_map_remove_by_mask(map, b, alloc_buf, rm_mask, + rm_count, &rm_sz); + } + + if (result_data_is_return_elements(result)) { + if (! packed_map_build_ele_result_by_mask(map, rm_mask, + rm_count, rm_sz, result)) { + return -AS_PROTO_RESULT_FAIL_UNKNOWN; + } + } + else if (result_data_is_return_index(result)) { + ret = packed_map_build_index_result_by_mask(map, rm_mask, + rm_count, result); + } + else { + ret = result_data_set_range(result, rank, count, + map->ele_count); + } + } + } + + if (ret != AS_PROTO_RESULT_OK) { + return ret; + } + +#ifdef MAP_DEBUG_VERIFY + if (b && ! map_verify(b)) { + cdt_bin_print(b, "packed_map_get_remove_by_value_interval"); + map_print(map, "original"); + cf_crash(AS_PARTICLE, "ele_count %u rank %u count %u inverted %d", map->ele_count, rank, count, inverted); + } +#endif + + return AS_PROTO_RESULT_OK; +} + +static int +packed_map_get_remove_by_rank_range(const packed_map *map, as_bin *b, + rollback_alloc *alloc_buf, int64_t rank, uint64_t count, + cdt_result_data *result) +{ + uint32_t urank; + uint32_t count32; + + if (! calc_index_count(rank, count, map->ele_count, &urank, &count32, + result->is_multi)) { + cf_warning(AS_PARTICLE, "packed_map_get_remove_by_rank_range() rank %ld out of bounds for ele_count %u", rank, map->ele_count); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + if (result_data_is_return_index_range(result)) { + cf_warning(AS_PARTICLE, "packed_map_get_remove_by_rank_range() result_type %d not supported", result->type); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + if (result_data_is_inverted(result)) { + if (! 
result->is_multi) { + cf_warning(AS_PARTICLE, "packed_map_get_remove_by_index_range() INVERTED flag not supported for single result ops"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + result->flags &= ~AS_CDT_OP_FLAG_INVERTED; + + if (count32 == 0) { + // Reduce to remove all. + urank = 0; + count32 = map->ele_count; + } + else if (urank == 0) { + // Reduce to remove tail section. + urank = count32; + count32 = map->ele_count - count32; + } + else if (urank + count32 >= map->ele_count) { + // Reduce to remove head section. + count32 = urank; + urank = 0; + } + else { + result->flags |= AS_CDT_OP_FLAG_INVERTED; + } + } + + if (count32 == 0) { + if (! result_data_set_value_not_found(result, urank)) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + return AS_PROTO_RESULT_OK; + } + + vla_map_offidx_if_invalid(u, map); + + if (! map_fill_offidx(map)) { + cf_warning(AS_PARTICLE, "packed_map_get_remove_by_rank_range() invalid packed map"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + bool inverted = result_data_is_inverted(result); + uint32_t rm_count = inverted ? map->ele_count - count32 : count32; + const order_index *ordidx = &map->value_idx; + define_cdt_idx_mask(rm_mask, map->ele_count); + order_index ret_idxs; + + if (order_index_is_valid(ordidx)) { + if (! packed_map_ensure_ordidx_filled(map)) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + cdt_idx_mask_set_by_ordidx(rm_mask, ordidx, urank, count32, inverted); + order_index_init_ref(&ret_idxs, ordidx, urank, count32); + } + else { + define_build_order_heap_by_range(heap, urank, count32, map->ele_count, + map, packed_map_compare_value_by_idx, success); + + if (! success) { + cf_warning(AS_PARTICLE, "packed_map_get_remove_by_rank_range() invalid packed map"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + cdt_idx_mask_set_by_ordidx(rm_mask, &heap._, heap.filled, count32, + inverted); + + if (! inverted) { + if (heap.cmp == MSGPACK_COMPARE_LESS) { + // Reorder results from lowest to highest order. 
+ order_heap_reverse_end(&heap, count32); + } + + if (result_data_is_return_index(result)) { + int ret = packed_map_build_index_result_by_ele_idx(map, + &heap._, heap.filled, count32, result); + + if (ret != AS_PROTO_RESULT_OK) { + cf_warning(AS_PARTICLE, "packed_map_get_remove_by_rank_range() build index result failed"); + return ret; + } + } + else if (result_data_is_return_elements(result)) { + if (! packed_map_build_ele_result_by_ele_idx(map, &heap._, + heap.filled, count32, 0, result)) { + cf_warning(AS_PARTICLE, "packed_map_get_remove_by_rank_range() invalid packed map"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + } + } + } + + uint32_t rm_sz = 0; + int ret = AS_PROTO_RESULT_OK; + + if (b) { + ret = packed_map_remove_by_mask(map, b, alloc_buf, rm_mask, rm_count, + &rm_sz); + + if (ret != AS_PROTO_RESULT_OK) { + return ret; + } + } + + switch (result->type) { + case RESULT_TYPE_NONE: + case RESULT_TYPE_COUNT: + case RESULT_TYPE_RANK: + case RESULT_TYPE_REVRANK: + ret = result_data_set_index_rank_count(result, urank, count32, + map->ele_count); + break; + case RESULT_TYPE_INDEX: + case RESULT_TYPE_REVINDEX: + if (inverted) { + ret = packed_map_build_index_result_by_mask(map, rm_mask, rm_count, + result); + } + else if (! as_bin_inuse(result->result)) { + ret = packed_map_build_index_result_by_ele_idx(map, &ret_idxs, + 0, rm_count, result); + } + break; + case RESULT_TYPE_KEY: + case RESULT_TYPE_VALUE: + case RESULT_TYPE_MAP: + if (inverted) { + if (! packed_map_build_ele_result_by_mask(map, rm_mask, + rm_count, rm_sz, result)) { + cf_warning(AS_PARTICLE, "packed_map_get_remove_by_rank_range() invalid packed map"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + } + else if (! as_bin_inuse(result->result) && + ! 
packed_map_build_ele_result_by_ele_idx(map, &ret_idxs, 0, + rm_count, rm_sz, result)) { + cf_warning(AS_PARTICLE, "packed_map_get_remove_by_rank_range() invalid packed map"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + break; + case RESULT_TYPE_REVRANK_RANGE: + case RESULT_TYPE_RANK_RANGE: + ret = result_data_set_range(result, urank, rm_count, map->ele_count); + break; + case RESULT_TYPE_INDEX_RANGE: + case RESULT_TYPE_REVINDEX_RANGE: + default: + cf_warning(AS_PARTICLE, "packed_map_get_remove_by_rank_range() result_type %d not supported", result->type); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + if (ret != AS_PROTO_RESULT_OK) { + return ret; + } + +#ifdef MAP_DEBUG_VERIFY + if (b && ! map_verify(b)) { + cdt_bin_print(b, "packed_map_get_remove_by_rank_range"); + } +#endif + + return AS_PROTO_RESULT_OK; +} + +static int +packed_map_get_remove_all_by_key_list(const packed_map *map, as_bin *b, + rollback_alloc *alloc_buf, const cdt_payload *key_list, + cdt_result_data *result) +{ + as_unpacker items_pk; + uint32_t items_count; + + if (! list_param_parse(key_list, &items_pk, &items_count)) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + bool inverted = result_data_is_inverted(result); + + if (items_count == 0) { + switch (result->type) { + case RESULT_TYPE_RANK: + case RESULT_TYPE_REVRANK: + case RESULT_TYPE_INDEX_RANGE: + case RESULT_TYPE_REVINDEX_RANGE: + case RESULT_TYPE_RANK_RANGE: + case RESULT_TYPE_REVRANK_RANGE: + cf_warning(AS_PARTICLE, "packed_map_get_remove_all_by_key_list() invalid result type %d", result->type); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + default: + break; + } + + if (! 
inverted) { + result_data_set_key_not_found(result, 0); + + return AS_PROTO_RESULT_OK; + } + + result->flags &= ~AS_CDT_OP_FLAG_INVERTED; + + return packed_map_get_remove_all(map, b, alloc_buf, result); + } + + vla_map_offidx_if_invalid(u, map); + + if (map_is_k_ordered(map)) { + return packed_map_get_remove_all_by_key_list_ordered(map, b, alloc_buf, + &items_pk, items_count, result); + } + + return packed_map_get_remove_all_by_key_list_unordered(map, b, alloc_buf, + &items_pk, items_count, result); +} + +static int +packed_map_get_remove_all_by_key_list_ordered(const packed_map *map, + as_bin *b, rollback_alloc *alloc_buf, as_unpacker *items_pk, + uint32_t items_count, cdt_result_data *result) +{ + define_order_index2(rm_ic, map->ele_count, 2 * items_count); + uint32_t rm_count = 0; + + for (uint32_t i = 0; i < items_count; i++) { + cdt_payload key = { + .ptr = items_pk->buffer + items_pk->offset, + .sz = items_pk->offset + }; + + if (as_unpack_size(items_pk) <= 0) { + cf_warning(AS_PARTICLE, "packed_map_get_remove_all_by_key_list_ordered() invalid parameter"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + key.sz = items_pk->offset - key.sz; + + map_ele_find find_key; + map_ele_find_init(&find_key, map); + + if (! packed_map_find_key(map, &find_key, &key)) { + if (cdt_payload_is_int(&key)) { + cf_warning(AS_PARTICLE, "packed_map_get_remove_all_by_key_list_ordered() find key=%ld failed, ele_count=%u", cdt_payload_get_int64(&key), map->ele_count); + } + else { + cf_warning(AS_PARTICLE, "packed_map_get_remove_all_by_key_list_ordered() find key failed, ele_count=%u", map->ele_count); + } + + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + uint32_t count = find_key.found_key ? 
1 : 0; + + order_index_set(&rm_ic, 2 * i, find_key.idx); + order_index_set(&rm_ic, (2 * i) + 1, count); + rm_count += count; + } + + bool inverted = result_data_is_inverted(result); + bool need_mask = b || result_data_is_return_elements(result) || + (inverted && result_data_is_return_index(result)); + cond_define_cdt_idx_mask(rm_mask, map->ele_count, need_mask); + uint32_t rm_sz = 0; + + if (inverted) { + rm_count = map->ele_count - rm_count; + } + + if (need_mask) { + cdt_idx_mask_set_by_irc(rm_mask, &rm_ic, NULL, inverted); + } + + if (b) { + int ret = packed_map_remove_by_mask(map, b, alloc_buf, rm_mask, + rm_count, &rm_sz); + + if (ret != AS_PROTO_RESULT_OK) { + return ret; + } + } + + switch (result->type) { + case RESULT_TYPE_NONE: + break; + case RESULT_TYPE_REVINDEX: + case RESULT_TYPE_INDEX: + if (inverted) { + result_data_set_int_list_by_mask(result, rm_mask, rm_count, + map->ele_count); + } + else { + result_data_set_by_irc(result, &rm_ic, NULL, items_count); + } + break; + case RESULT_TYPE_COUNT: + as_bin_set_int(result->result, rm_count); + break; + case RESULT_TYPE_KEY: + case RESULT_TYPE_VALUE: + case RESULT_TYPE_MAP: + if (! packed_map_build_ele_result_by_mask(map, rm_mask, rm_count, + rm_sz, result)) { + cf_warning(AS_PARTICLE, "packed_map_get_remove_all_by_key_list_ordered() invalid packed map"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + break; + case RESULT_TYPE_REVRANK: + case RESULT_TYPE_RANK: + default: + cf_warning(AS_PARTICLE, "packed_map_get_remove_all_by_key_list_ordered() invalid return type %d", result->type); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + +#ifdef MAP_DEBUG_VERIFY + if (b && ! 
map_verify(b)) { + cdt_bin_print(b, "packed_map_get_remove_all_by_key_list_ordered"); + } +#endif + + return AS_PROTO_RESULT_OK; +} + +static int +packed_map_get_remove_all_by_key_list_unordered(const packed_map *map, + as_bin *b, rollback_alloc *alloc_buf, as_unpacker *items_pk, + uint32_t items_count, cdt_result_data *result) +{ + bool inverted = result_data_is_inverted(result); + bool is_ret_index = result_data_is_return_index(result); + uint32_t rm_count; + define_cdt_idx_mask(rm_mask, map->ele_count); + define_order_index(key_list_ordidx, items_count); + cond_vla_order_index2(ic, map->ele_count, items_count * 2, is_ret_index); + + if (! offset_index_find_items((offset_index *)&map->offidx, + CDT_FIND_ITEMS_IDXS_FOR_MAP_KEY, items_pk, &key_list_ordidx, + inverted, rm_mask, &rm_count, is_ret_index ? &ic.ordidx : NULL)) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + uint32_t rm_sz = 0; + + if (b) { + int ret = packed_map_remove_by_mask(map, b, alloc_buf, rm_mask, + rm_count, &rm_sz); + + if (ret < 0) { + return ret; + } + } + + switch (result->type) { + case RESULT_TYPE_NONE: + break; + case RESULT_TYPE_REVINDEX: + case RESULT_TYPE_INDEX: { + result_data_set_by_itemlist_irc(result, &key_list_ordidx, &ic.ordidx, + rm_count); + break; + } + case RESULT_TYPE_COUNT: + as_bin_set_int(result->result, rm_count); + break; + case RESULT_TYPE_KEY: + case RESULT_TYPE_VALUE: + case RESULT_TYPE_MAP: { + if (! packed_map_build_ele_result_by_mask(map, rm_mask, rm_count, rm_sz, + result)) { + return -AS_PROTO_RESULT_FAIL_UNKNOWN; + } + break; + } + default: + cf_warning(AS_PARTICLE, "packed_map_get_remove_all_by_key_list_unordered() invalid return type %d", result->type); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + +#ifdef MAP_DEBUG_VERIFY + if (b && ! 
map_verify(b)) { + cdt_bin_print(b, "packed_map_get_remove_all_by_key_list_unordered"); + } +#endif + + return AS_PROTO_RESULT_OK; +} + +static int +packed_map_get_remove_all_by_value_list(const packed_map *map, as_bin *b, + rollback_alloc *alloc_buf, const cdt_payload *value_list, + cdt_result_data *result) +{ + as_unpacker items_pk; + uint32_t items_count; + + if (! list_param_parse(value_list, &items_pk, &items_count)) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + bool inverted = result_data_is_inverted(result); + + if (items_count == 0) { + switch (result->type) { + case RESULT_TYPE_INDEX_RANGE: + case RESULT_TYPE_REVINDEX_RANGE: + case RESULT_TYPE_RANK_RANGE: + case RESULT_TYPE_REVRANK_RANGE: + cf_warning(AS_PARTICLE, "packed_map_get_remove_all_by_value_list() invalid result type %d", result->type); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + default: + break; + } + + if (! inverted) { + result_data_set_not_found(result, 0); + return AS_PROTO_RESULT_OK; + } + + result->flags &= ~AS_CDT_OP_FLAG_INVERTED; + + return packed_map_get_remove_all(map, b, alloc_buf, result); + } + + vla_map_offidx_if_invalid(u, map); + + if (order_index_is_valid(&map->value_idx)) { + return packed_map_get_remove_all_by_value_list_ordered(map, b, + alloc_buf, &items_pk, items_count, result); + } + + bool is_ret_rank = result_data_is_return_rank(result); + define_cdt_idx_mask(rm_mask, map->ele_count); + uint32_t rm_count = 0; + define_order_index(value_list_ordidx, items_count); + cond_vla_order_index2(rc, map->ele_count, items_count * 2, is_ret_rank); + + if (! offset_index_find_items(u.offidx, + CDT_FIND_ITEMS_IDXS_FOR_MAP_VALUE, &items_pk, &value_list_ordidx, + inverted, rm_mask, &rm_count, is_ret_rank ? 
&rc.ordidx : NULL)) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + uint32_t rm_sz = 0; + + if (b) { + int ret = packed_map_remove_by_mask(map, b, alloc_buf, rm_mask, + rm_count, &rm_sz); + + if (ret < 0) { + return ret; + } + } + + switch (result->type) { + case RESULT_TYPE_NONE: + break; + case RESULT_TYPE_REVINDEX: + case RESULT_TYPE_INDEX: { + int ret = packed_map_build_index_result_by_mask(map, rm_mask, rm_count, + result); + + if (ret != AS_PROTO_RESULT_OK) { + return ret; + } + + break; + } + case RESULT_TYPE_REVRANK: + case RESULT_TYPE_RANK: { + result_data_set_by_itemlist_irc(result, &value_list_ordidx, + &rc.ordidx, rm_count); + break; + } + case RESULT_TYPE_COUNT: + as_bin_set_int(result->result, rm_count); + break; + case RESULT_TYPE_KEY: + case RESULT_TYPE_VALUE: + case RESULT_TYPE_MAP: { + if (! packed_map_build_ele_result_by_mask(map, rm_mask, rm_count, rm_sz, + result)) { + return -AS_PROTO_RESULT_FAIL_UNKNOWN; + } + break; + } + default: + cf_warning(AS_PARTICLE, "packed_map_get_remove_all_by_value_list() invalid return type %d", result->type); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + +#ifdef MAP_DEBUG_VERIFY + if (b && ! map_verify(b)) { + cdt_bin_print(b, "packed_map_get_remove_all_by_value_list"); + } +#endif + + return AS_PROTO_RESULT_OK; +} + +static int +packed_map_get_remove_all_by_value_list_ordered(const packed_map *map, + as_bin *b, rollback_alloc *alloc_buf, as_unpacker *items_pk, + uint32_t items_count, cdt_result_data *result) +{ + define_order_index2(rm_rc, map->ele_count, 2 * items_count); + + if (! 
packed_map_ensure_ordidx_filled(map)) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + uint32_t rm_count = 0; + + for (uint32_t i = 0; i < items_count; i++) { + cdt_payload value = { + .ptr = items_pk->buffer + items_pk->offset, + .sz = items_pk->offset + }; + + if (as_unpack_size(items_pk) <= 0) { + cf_warning(AS_PARTICLE, "packed_map_remove_all_value_items_ordered() invalid parameter"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + value.sz = items_pk->offset - value.sz; + + uint32_t rank = 0; + uint32_t count = 0; + + if (! packed_map_find_rank_range_by_value_interval_indexed(map, + &value, &value, &rank, &count, result->is_multi)) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + order_index_set(&rm_rc, 2 * i, rank); + order_index_set(&rm_rc, (2 * i) + 1, count); + rm_count += count; + } + + bool inverted = result_data_is_inverted(result); + bool need_mask = b || result_data_is_return_elements(result) || + (inverted && (result_data_is_return_index(result) || + result_data_is_return_rank(result))); + cond_define_cdt_idx_mask(rm_mask, map->ele_count, need_mask); + uint32_t rm_sz = 0; + + if (inverted) { + rm_count = map->ele_count - rm_count; + } + + if (need_mask) { + cdt_idx_mask_set_by_irc(rm_mask, &rm_rc, &map->value_idx, inverted); + } + + if (b) { + int ret = packed_map_remove_by_mask(map, b, alloc_buf, rm_mask, + rm_count, &rm_sz); + + if (ret != AS_PROTO_RESULT_OK) { + return ret; + } + } + + switch (result->type) { + case RESULT_TYPE_NONE: + break; + case RESULT_TYPE_REVINDEX: + case RESULT_TYPE_INDEX: { + if (inverted) { + int ret = packed_map_build_index_result_by_mask(map, rm_mask, + rm_count, result); + + if (ret != AS_PROTO_RESULT_OK) { + return ret; + } + } + else { + result_data_set_by_irc(result, &rm_rc, &map->value_idx, rm_count); + } + break; + } + case RESULT_TYPE_REVRANK: + case RESULT_TYPE_RANK: + if (inverted) { + int ret = packed_map_build_rank_result_by_mask(map, rm_mask, + rm_count, result); + + if (ret != AS_PROTO_RESULT_OK) { 
+ return ret; + } + } + else { + result_data_set_by_irc(result, &rm_rc, NULL, rm_count); + } + break; + case RESULT_TYPE_COUNT: + as_bin_set_int(result->result, rm_count); + break; + case RESULT_TYPE_KEY: + case RESULT_TYPE_VALUE: + case RESULT_TYPE_MAP: { + if (! packed_map_build_ele_result_by_mask(map, rm_mask, rm_count, rm_sz, + result)) { + cf_warning(AS_PARTICLE, "packed_map_remove_all_value_items_ordered() invalid packed map"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + break; + } + default: + cf_warning(AS_PARTICLE, "packed_map_remove_all_value_items_ordered() invalid return type %d", result->type); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + +#ifdef MAP_DEBUG_VERIFY + if (b && ! map_verify(b)) { + cdt_bin_print(b, "packed_map_remove_all_value_items_ordered"); + } +#endif + + return AS_PROTO_RESULT_OK; +} + +static int +packed_map_get_remove_all(const packed_map *map, as_bin *b, + rollback_alloc *alloc_buf, cdt_result_data *result) +{ + cf_assert(! result_data_is_inverted(result), AS_PARTICLE, "packed_map_get_remove_all() INVERTED flag is invalid here"); + + if (b) { + as_bin_set_empty_packed_map(b, alloc_buf, map->flags); + } + + bool is_rev = false; + + switch (result->type) { + case RESULT_TYPE_NONE: + break; + case RESULT_TYPE_REVINDEX: + case RESULT_TYPE_REVRANK: + is_rev = true; + // no break + case RESULT_TYPE_INDEX: + case RESULT_TYPE_RANK: { + define_int_list_builder(builder, result->alloc, map->ele_count); + + cdt_container_builder_add_int_range(&builder, 0, map->ele_count, + map->ele_count, is_rev); + cdt_container_builder_set_result(&builder, result); + break; + } + case RESULT_TYPE_COUNT: + as_bin_set_int(result->result, map->ele_count); + break; + case RESULT_TYPE_KEY: + case RESULT_TYPE_VALUE: + case RESULT_TYPE_MAP: { + if (! 
packed_map_build_ele_result_by_idx_range(map, 0, map->ele_count, + result)) { + return -AS_PROTO_RESULT_FAIL_UNKNOWN; + } + break; + } + case RESULT_TYPE_INDEX_RANGE: + case RESULT_TYPE_REVINDEX_RANGE: + case RESULT_TYPE_RANK_RANGE: + case RESULT_TYPE_REVRANK_RANGE: + result_data_set_list_int2x(result, 0, map->ele_count); + break; + default: + cf_warning(AS_PARTICLE, "packed_map_get_remove_all() invalid return type %d", result->type); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + +#ifdef MAP_DEBUG_VERIFY + if (b && ! map_verify(b)) { + cdt_bin_print(b, "packed_map_get_remove_all"); + } +#endif + + return AS_PROTO_RESULT_OK; +} + +static int +packed_map_remove_by_mask(const packed_map *map, as_bin *b, + rollback_alloc *alloc_buf, const uint64_t *rm_mask, uint32_t count, + uint32_t *rm_sz_r) +{ + if (count == 0) { + return AS_PROTO_RESULT_OK; + } + + const offset_index *offidx = &map->offidx; + uint32_t rm_sz = cdt_idx_mask_get_content_sz(rm_mask, count, offidx); + + if (rm_sz_r) { + *rm_sz_r = rm_sz; + } + + uint32_t new_ele_count = map->ele_count - count; + uint32_t content_sz = map->content_sz - rm_sz; + define_map_packer(mpk, new_ele_count, map->flags, content_sz); + + map_packer_setup_bin(&mpk, b, alloc_buf); + map_packer_write_hdridx(&mpk); + mpk.write_ptr = cdt_idx_mask_write_eles(rm_mask, count, offidx, + mpk.write_ptr, true); + + if (offset_index_is_valid(&mpk.offset_idx)) { + if (offset_index_is_full(offidx)) { + offidx_op off_op; + offidx_op_init(&off_op, &mpk.offset_idx, offidx); + uint32_t rm_idx = 0; + + for (uint32_t i = 0; i < count; i++) { + rm_idx = cdt_idx_mask_find(rm_mask, rm_idx, map->ele_count, + false); + offidx_op_remove(&off_op, rm_idx); + rm_idx++; + } + + offidx_op_end(&off_op); + } + else { + offset_index_set_filled(&mpk.offset_idx, 1); + map_offset_index_fill(&mpk.offset_idx, new_ele_count); + } + } + + if (order_index_is_valid(&mpk.value_idx)) { + if (order_index_is_filled(&map->value_idx)) { + 
order_index_op_remove_idx_mask(&mpk.value_idx, &map->value_idx, + rm_mask, count); + } + else if (! order_index_set_sorted(&mpk.value_idx, &mpk.offset_idx, + mpk.contents, mpk.content_sz, SORT_BY_VALUE)) { + cf_warning(AS_PARTICLE, "packed_map_remove_indexes() failed to sort new value_idex"); + return -AS_PROTO_RESULT_FAIL_UNKNOWN; + } + } + + return AS_PROTO_RESULT_OK; +} + +static int +packed_map_remove_idx_range(const packed_map *map, as_bin *b, + rollback_alloc *alloc_buf, uint32_t idx, uint32_t count) +{ + offset_index *offidx = (offset_index *)&map->offidx; + uint32_t offset0 = offset_index_get_const(offidx, idx); + uint32_t idx_end = idx + count; + uint32_t offset1 = offset_index_get_const(offidx, idx_end); + uint32_t content_sz = map->content_sz - offset1 + offset0; + uint32_t new_ele_count = map->ele_count - count; + define_map_packer(mpk, new_ele_count, map->flags, content_sz); + + map_packer_setup_bin(&mpk, b, alloc_buf); + map_packer_write_hdridx(&mpk); + + uint32_t tail_sz = map->content_sz - offset1; + + memcpy(mpk.write_ptr, map->contents, offset0); + mpk.write_ptr += offset0; + memcpy(mpk.write_ptr, map->contents + offset1, tail_sz); + + if (offset_index_is_valid(&mpk.offset_idx)) { + if (offset_index_is_full(offidx)) { + offidx_op offop; + + offidx_op_init(&offop, &mpk.offset_idx, offidx); + offidx_op_remove_range(&offop, idx, count); + offidx_op_end(&offop); + } + else { + offset_index_set_filled(&mpk.offset_idx, 1); + map_offset_index_fill(&mpk.offset_idx, new_ele_count); + } + } + + if (order_index_is_valid(&mpk.value_idx)) { + if (order_index_is_filled(&map->value_idx)) { + uint32_t count0 = 0; + + for (uint32_t i = 0; i < map->ele_count; i++) { + uint32_t idx0 = order_index_get(&map->value_idx, i); + + if (idx0 >= idx && idx0 < idx_end) { + continue; + } + + if (idx0 >= idx_end) { + idx0 -= count; + } + + order_index_set(&mpk.value_idx, count0++, idx0); + } + } + else if (! 
order_index_set_sorted(&mpk.value_idx, &mpk.offset_idx, + mpk.contents, mpk.content_sz, SORT_BY_VALUE)) { + cf_warning(AS_PARTICLE, "packed_map_remove_idx_range() failed to sort new value_idex"); + return -AS_PROTO_RESULT_FAIL_UNKNOWN; + } + } + + return AS_PROTO_RESULT_OK; +} + +static bool +packed_map_get_range_by_key_interval_unordered(const packed_map *map, + const cdt_payload *key_start, const cdt_payload *key_end, + uint32_t *index, uint32_t *count, uint64_t *mask) +{ + cf_assert(key_end, AS_PARTICLE, "key_end == NULL"); + + as_unpacker pk_start = { + .buffer = key_start->ptr, + .length = key_start->sz + }; + + as_unpacker pk_end = { + .buffer = key_end->ptr, + .length = key_end->sz + }; + + // Pre-check parameters. + if (as_unpack_size(&pk_start) <= 0) { + cf_warning(AS_PARTICLE, "packed_map_get_range_by_key_interval_unordered() invalid start key"); + return false; + } + + if (key_end->ptr) { + // Pre-check parameters. + if (as_unpack_size(&pk_end) <= 0) { + cf_warning(AS_PARTICLE, "packed_map_get_range_by_key_interval_unordered() invalid end key"); + return false; + } + } + + *index = 0; + *count = 0; + + offset_index *offidx = (offset_index *)&map->offidx; + define_map_unpacker(pk, map); + + for (uint32_t i = 0; i < map->ele_count; i++) { + uint32_t key_offset = pk.offset; // start of key + + offset_index_set(offidx, i, key_offset); + + pk_start.offset = 0; + + msgpack_compare_t cmp_start = as_unpack_compare(&pk, &pk_start); + + if (cmp_start == MSGPACK_COMPARE_ERROR) { + cf_warning(AS_PARTICLE, "packed_map_get_range_by_key_interval_unordered() invalid packed map at index %u", i); + return false; + } + + if (cmp_start == MSGPACK_COMPARE_LESS) { + (*index)++; + } + else { + msgpack_compare_t cmp_end = MSGPACK_COMPARE_LESS; + + // NULL key_end->ptr means largest possible value. 
+ if (key_end->ptr) { + pk.offset = key_offset; + pk_end.offset = 0; + cmp_end = as_unpack_compare(&pk, &pk_end); + } + + if (cmp_end == MSGPACK_COMPARE_LESS) { + cdt_idx_mask_set(mask, i); + (*count)++; + } + } + + // Skip value. + if (as_unpack_size(&pk) <= 0) { + cf_warning(AS_PARTICLE, "packed_map_get_range_by_key_interval_unordered() invalid packed map at index %u", i); + return false; + } + } + + offset_index_set_filled(offidx, map->ele_count); + + return true; +} + +static bool +packed_map_get_range_by_key_interval_ordered(const packed_map *map, + const cdt_payload *key_start, const cdt_payload *key_end, + uint32_t *index, uint32_t *count) +{ + map_ele_find find_key_start; + map_ele_find_init(&find_key_start, map); + + if (! packed_map_find_key(map, &find_key_start, key_start)) { + cf_warning(AS_PARTICLE, "packed_map_get_range_by_key_interval_ordered() find key failed, ele_count=%u", map->ele_count); + return false; + } + + *index = find_key_start.idx; + + if (key_start == key_end) { + if (find_key_start.found_key) { + *count = 1; + } + else { + *count = 0; + } + } + else if (key_end && key_end->ptr) { + map_ele_find find_key_end; + + map_ele_find_continue_from_lower(&find_key_end, &find_key_start, + map->ele_count); + + if (! packed_map_find_key(map, &find_key_end, key_end)) { + cf_warning(AS_PARTICLE, "packed_map_get_range_by_key_interval_ordered() find key failed, ele_count=%u", map->ele_count); + return false; + } + + if (find_key_end.idx <= find_key_start.idx) { + *count = 0; + } + else { + *count = find_key_end.idx - find_key_start.idx; + } + } + else { + *count = map->ele_count - find_key_start.idx; + } + + return true; +} + +// Does not respect invert flag. +static int +packed_map_build_rank_result_by_ele_idx(const packed_map *map, + const order_index *ele_idx, uint32_t start, uint32_t count, + cdt_result_data *result) +{ + if (! 
result->is_multi) { + uint32_t idx = order_index_get(ele_idx, start); + + return packed_map_build_rank_result_by_idx(map, idx, result); + } + + define_int_list_builder(builder, result->alloc, count); + bool is_rev = result->type == RESULT_TYPE_REVRANK; + + vla_map_allidx_if_invalid(uv, map); + + if (! packed_map_ensure_ordidx_filled(map)) { + cf_warning(AS_PARTICLE, "packed_map_build_rank_result_by_ele_idx() ordidx fill failed"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + for (uint32_t i = 0; i < count; i++) { + uint32_t idx = order_index_get(ele_idx, start + i); + map_ele_find find; + + map_ele_find_init_from_idx(&find, map, idx); + packed_map_find_rank_indexed(map, &find); + + if (! find.found_value) { + cf_warning(AS_PARTICLE, "packed_map_build_rank_result_by_ele_idx() idx %u not found find.rank %u", idx, find.rank); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + uint32_t rank = find.rank; + + cdt_container_builder_add_int_range(&builder, rank, 1, map->ele_count, + is_rev); + } + + cdt_container_builder_set_result(&builder, result); + + return AS_PROTO_RESULT_OK; +} + +// Does not respect invert flag. +static int +packed_map_build_rank_result_by_mask(const packed_map *map, + const uint64_t *mask, uint32_t count, cdt_result_data *result) +{ + uint32_t idx = 0; + + if (! result->is_multi) { + idx = cdt_idx_mask_find(mask, idx, map->ele_count, false); + + return packed_map_build_rank_result_by_idx(map, idx, result); + } + + define_int_list_builder(builder, result->alloc, count); + bool is_rev = result->type == RESULT_TYPE_REVRANK; + + vla_map_allidx_if_invalid(uv, map); + + if (! 
packed_map_ensure_ordidx_filled(map)) { + cf_warning(AS_PARTICLE, "packed_map_build_rank_result_by_mask() ordidx fill failed"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + for (uint32_t i = 0; i < count; i++) { + idx = cdt_idx_mask_find(mask, idx, map->ele_count, false); + + map_ele_find find; + + map_ele_find_init_from_idx(&find, map, idx); + packed_map_find_rank_indexed(map, &find); + + if (! find.found_value) { + cf_warning(AS_PARTICLE, "packed_map_build_rank_result_by_mask() idx %u not found find.rank %u", idx, find.rank); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + uint32_t rank = find.rank; + + cdt_container_builder_add_int_range(&builder, rank, 1, map->ele_count, + is_rev); + idx++; + } + + cdt_container_builder_set_result(&builder, result); + + return AS_PROTO_RESULT_OK; +} + +static int +packed_map_build_rank_result_by_index_range(const packed_map *map, + uint32_t index, uint32_t count, cdt_result_data *result) +{ + if (! result->is_multi) { + return packed_map_build_rank_result_by_idx(map, index, result); + } + + cf_assert(map_is_k_ordered(map), AS_PARTICLE, "map must be K_ORDERED"); + + bool inverted = result_data_is_inverted(result); + uint32_t ret_count = (inverted ? map->ele_count - count : count); + define_int_list_builder(builder, result->alloc, ret_count); + vla_map_allidx_if_invalid(uv, map); + + if (! packed_map_ensure_ordidx_filled(map)) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + bool is_rev = result->type == RESULT_TYPE_REVRANK; + + if (inverted) { + for (uint32_t i = 0; i < index; i++) { + map_ele_find find; + + map_ele_find_init_from_idx(&find, map, i); + packed_map_find_rank_indexed(map, &find); + + if (! 
find.found_value) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + uint32_t rank = find.rank; + + if (is_rev) { + rank = map->ele_count - rank - 1; + } + + cdt_container_builder_add_int64(&builder, rank); + } + + for (uint32_t i = index + count; i < map->ele_count; i++) { + map_ele_find find; + + map_ele_find_init_from_idx(&find, map, i); + packed_map_find_rank_indexed(map, &find); + + if (! find.found_value) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + uint32_t rank = find.rank; + + if (is_rev) { + rank = map->ele_count - rank - 1; + } + + cdt_container_builder_add_int64(&builder, rank); + } + } + else { + for (uint32_t i = 0; i < count; i++) { + map_ele_find find; + + map_ele_find_init_from_idx(&find, map, index + i); + packed_map_find_rank_indexed(map, &find); + + if (! find.found_value) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + uint32_t rank = find.rank; + + if (result->type == RESULT_TYPE_REVRANK) { + rank = map->ele_count - rank - 1; + } + + cdt_container_builder_add_int64(&builder, rank); + } + } + + cdt_container_builder_set_result(&builder, result); + + return AS_PROTO_RESULT_OK; +} + +static bool +packed_map_get_key_by_idx(const packed_map *map, cdt_payload *key, + uint32_t index) +{ + uint32_t pk_offset = offset_index_get_const(&map->offidx, index); + + as_unpacker pk = { + .buffer = map->contents + pk_offset, + .length = map->content_sz - pk_offset + }; + + int64_t sz = as_unpack_size(&pk); // read key + + if (sz <= 0) { + cf_warning(AS_PARTICLE, "packed_map_get_key_by_idx() read key failed sz %ld", sz); + return false; + } + + key->ptr = pk.buffer; + key->sz = (uint32_t)sz; + + return true; +} + +static bool +packed_map_get_value_by_idx(const packed_map *map, cdt_payload *value, + uint32_t idx) +{ + uint32_t pk_offset = offset_index_get_const(&map->offidx, idx); + uint32_t sz = offset_index_get_delta_const(&map->offidx, idx); + + as_unpacker pk = { + .buffer = map->contents + pk_offset, + .length = map->content_sz - pk_offset + }; + 
+ int64_t key_sz = as_unpack_size(&pk); // read key + + if (key_sz <= 0) { + cf_warning(AS_PARTICLE, "packed_map_get_value_by_idx() read key failed key_sz %ld", key_sz); + return false; + } + + value->ptr = pk.buffer + (uint32_t)key_sz; + value->sz = sz - (uint32_t)key_sz; + + return true; +} + +static bool +packed_map_get_pair_by_idx(const packed_map *map, cdt_payload *value, + uint32_t index) +{ + uint32_t pk_offset = offset_index_get_const(&map->offidx, index); + uint32_t sz = offset_index_get_delta_const(&map->offidx, index); + + value->ptr = map->contents + pk_offset; + value->sz = sz; + + return true; +} + +// Does not respect invert flag. +static int +packed_map_build_index_result_by_ele_idx(const packed_map *map, + const order_index *ele_idx, uint32_t start, uint32_t count, + cdt_result_data *result) +{ + if (count == 0) { + if (! result_data_set_not_found(result, start)) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + return AS_PROTO_RESULT_OK; + } + + if (! result->is_multi) { + uint32_t index = order_index_get(ele_idx, start); + + if (! map_is_k_ordered(map)) { + index = packed_map_find_index_by_idx_unordered(map, index); + } + + if (result->type == RESULT_TYPE_REVINDEX) { + index = map->ele_count - index - 1; + } + + as_bin_set_int(result->result, index); + + return AS_PROTO_RESULT_OK; + } + + define_int_list_builder(builder, result->alloc, count); + + if (map_is_k_ordered(map)) { + for (uint32_t i = 0; i < count; i++) { + uint32_t index = order_index_get(ele_idx, start + i); + + if (result->type == RESULT_TYPE_REVINDEX) { + index = map->ele_count - index - 1; + } + + cdt_container_builder_add_int64(&builder, index); + } + } + else { + offset_index *offidx = (offset_index *)&map->offidx; + + // Preset offsets if necessary. + if (! map_offset_index_fill(offidx, map->ele_count)) { + cf_warning(AS_PARTICLE, "packed_map_build_index_result_by_ele_idx() invalid packed map"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + // Make order index on stack. 
+ define_order_index(keyordidx, map->ele_count); + bool success = order_index_set_sorted(&keyordidx, offidx, map->contents, + map->content_sz, SORT_BY_KEY); + + cf_assert(success, AS_PARTICLE, "invalid packed map with full offidx"); + + for (uint32_t i = 0; i < count; i++) { + uint32_t idx = order_index_get(ele_idx, start + i); + uint32_t index = order_index_find_idx(&keyordidx, idx, 0, + map->ele_count); + + if (index >= map->ele_count) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + if (result->type == RESULT_TYPE_REVINDEX) { + index = map->ele_count - index - 1; + } + + cdt_container_builder_add_int64(&builder, index); + } + } + + cdt_container_builder_set_result(&builder, result); + + return AS_PROTO_RESULT_OK; +} + +// Does not respect invert flag. +static int +packed_map_build_index_result_by_mask(const packed_map *map, + const uint64_t *mask, uint32_t count, cdt_result_data *result) +{ + if (count == 0) { + result_data_set_not_found(result, -1); + return AS_PROTO_RESULT_OK; + } + + if (! result->is_multi) { + uint32_t index = cdt_idx_mask_find(mask, 0, map->ele_count, false); + + if (! map_is_k_ordered(map)) { + index = packed_map_find_index_by_idx_unordered(map, index); + } + + if (result->type == RESULT_TYPE_REVINDEX) { + index = map->ele_count - index - 1; + } + + as_bin_set_int(result->result, index); + + return AS_PROTO_RESULT_OK; + } + + define_int_list_builder(builder, result->alloc, count); + + if (map_is_k_ordered(map)) { + uint32_t index = 0; + + for (uint32_t i = 0; i < count; i++) { + index = cdt_idx_mask_find(mask, index, map->ele_count, false); + cdt_container_builder_add_int64(&builder, + result->type == RESULT_TYPE_REVINDEX ? + map->ele_count - index - 1 : index); + index++; + } + } + else { + offset_index *offidx = (offset_index *)&map->offidx; + + // Preset offsets if necessary. + if (! 
map_offset_index_fill(offidx, map->ele_count)) { + cf_warning(AS_PARTICLE, "packed_map_build_index_result_by_ele_idx() invalid packed map"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + // Make order index on stack. + define_order_index(keyordidx, map->ele_count); + bool success = order_index_set_sorted(&keyordidx, offidx, map->contents, + map->content_sz, SORT_BY_KEY); + uint32_t idx = 0; + + cf_assert(success, AS_PARTICLE, "invalid packed map with full offidx"); + + for (uint32_t i = 0; i < count; i++) { + idx = cdt_idx_mask_find(mask, idx, map->ele_count, false); + + uint32_t index = order_index_find_idx(&keyordidx, idx, 0, + map->ele_count); + + if (index >= map->ele_count) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + if (result->type == RESULT_TYPE_REVINDEX) { + index = map->ele_count - index - 1; + } + + cdt_container_builder_add_int64(&builder, index); + idx++; + } + } + + cdt_container_builder_set_result(&builder, result); + + return AS_PROTO_RESULT_OK; +} + +// Build by map ele_idx range. +static bool +packed_map_build_ele_result_by_idx_range(const packed_map *map, + uint32_t start_idx, uint32_t count, cdt_result_data *result) +{ + offset_index *offidx = (offset_index *)&map->offidx; + + if (! 
map_offset_index_fill(offidx, map->ele_count)) { + cf_warning(AS_PARTICLE, "packed_map_build_ele_result_by_idx_range() invalid packed map"); + return false; + } + + bool inverted = result_data_is_inverted(result); + uint32_t offset0 = offset_index_get_const(offidx, start_idx); + uint32_t offset1 = offset_index_get_const(offidx, start_idx + count); + uint32_t max_sz = offset1 - offset0; + uint32_t ret_count = count; + cdt_container_builder builder; + + if (inverted) { + ret_count = map->ele_count - count; + max_sz = map->content_sz - max_sz; + } + + if (result->type == RESULT_TYPE_MAP) { + cdt_map_builder_start(&builder, result->alloc, ret_count, max_sz, + AS_PACKED_MAP_FLAG_PRESERVE_ORDER); + + if (inverted) { + uint32_t tail_sz = map->content_sz - offset1; + + memcpy(builder.write_ptr, map->contents, offset0); + builder.write_ptr += offset0; + memcpy(builder.write_ptr, map->contents + offset1, tail_sz); + } + else { + memcpy(builder.write_ptr, map->contents + offset0, max_sz); + } + + *builder.sz += max_sz; + cdt_container_builder_set_result(&builder, result); + + return true; + } + + packed_map_get_by_idx_func get_by_idx_func; + + if (result->type == RESULT_TYPE_KEY) { + get_by_idx_func = packed_map_get_key_by_idx; + } + else { + get_by_idx_func = packed_map_get_value_by_idx; + } + + if (result->is_multi) { + cdt_list_builder_start(&builder, result->alloc, ret_count, max_sz); + } + else { + cdt_payload packed; + + if (! get_by_idx_func(map, &packed, start_idx)) { + return false; + } + + return rollback_alloc_from_msgpack(result->alloc, result->result, + &packed); + } + + if (inverted) { + for (uint32_t i = 0; i < start_idx; i++) { + cdt_payload packed; + + if (! get_by_idx_func(map, &packed, i)) { + return false; + } + + cdt_container_builder_add(&builder, packed.ptr, packed.sz); + } + + for (uint32_t i = start_idx + count; i < map->ele_count; i++) { + cdt_payload packed; + + if (! 
get_by_idx_func(map, &packed, i)) { + return false; + } + + cdt_container_builder_add(&builder, packed.ptr, packed.sz); + } + } + else { + for (uint32_t i = 0; i < count; i++) { + cdt_payload packed; + + if (! get_by_idx_func(map, &packed, start_idx + i)) { + return false; + } + + cdt_container_builder_add(&builder, packed.ptr, packed.sz); + } + } + + cdt_container_builder_set_result(&builder, result); + + return true; +} + +// Does not respect invert flag. +static bool +packed_map_build_ele_result_by_ele_idx(const packed_map *map, + const order_index *ele_idx, uint32_t start, uint32_t count, + uint32_t rm_sz, cdt_result_data *result) +{ + if (rm_sz == 0) { + if (start != 0) { + order_index ref; + + order_index_init_ref(&ref, ele_idx, start, count); + rm_sz = order_index_get_ele_size(&ref, count, &map->offidx); + } + else { + rm_sz = order_index_get_ele_size(ele_idx, count, &map->offidx); + } + } + + packed_map_get_by_idx_func get_by_index_func; + cdt_container_builder builder; + uint32_t max_sz = (count != 0 ? rm_sz : 0); + + if (result->type == RESULT_TYPE_MAP) { + get_by_index_func = packed_map_get_pair_by_idx; + + cdt_map_builder_start(&builder, result->alloc, count, max_sz, + AS_PACKED_MAP_FLAG_PRESERVE_ORDER); + } + else { + if (result->type == RESULT_TYPE_KEY) { + get_by_index_func = packed_map_get_key_by_idx; + } + else { + get_by_index_func = packed_map_get_value_by_idx; + } + + if (result->is_multi) { + cdt_list_builder_start(&builder, result->alloc, count, + max_sz - count); + } + else if (count == 0) { + return true; + } + else { + uint32_t index = order_index_get(ele_idx, start); + cdt_payload packed; + + if (! get_by_index_func(map, &packed, index)) { + return false; + } + + return rollback_alloc_from_msgpack(result->alloc, result->result, + &packed); + } + } + + for (uint32_t i = 0; i < count; i++) { + uint32_t index = order_index_get(ele_idx, i + start); + cdt_payload packed; + + if (! 
get_by_index_func(map, &packed, index)) { + return false; + } + + cdt_container_builder_add(&builder, packed.ptr, packed.sz); + } + + cdt_container_builder_set_result(&builder, result); + + return true; +} + +// Does not respect invert flag. +static bool +packed_map_build_ele_result_by_mask(const packed_map *map, const uint64_t *mask, + uint32_t count, uint32_t rm_sz, cdt_result_data *result) +{ + if (! result->is_multi) { + uint32_t idx = cdt_idx_mask_find(mask, 0, map->ele_count, false); + define_order_index2(ele_idx, map->ele_count, 1); + + order_index_set(&ele_idx, 0, idx); + + return packed_map_build_ele_result_by_ele_idx(map, &ele_idx, 0, 1, + rm_sz, result); + } + + if (rm_sz == 0) { + rm_sz = cdt_idx_mask_get_content_sz(mask, count, &map->offidx); + } + + packed_map_get_by_idx_func get_by_index_func; + cdt_container_builder builder; + uint32_t max_sz = (count != 0 ? rm_sz : 0); + + if (result->type == RESULT_TYPE_MAP) { + get_by_index_func = packed_map_get_pair_by_idx; + + cdt_map_builder_start(&builder, result->alloc, count, max_sz, + AS_PACKED_MAP_FLAG_PRESERVE_ORDER); + } + else { + if (result->type == RESULT_TYPE_KEY) { + get_by_index_func = packed_map_get_key_by_idx; + } + else { + get_by_index_func = packed_map_get_value_by_idx; + } + + cdt_list_builder_start(&builder, result->alloc, count, max_sz - count); + } + + uint32_t index = 0; + + for (uint32_t i = 0; i < count; i++) { + cdt_payload packed; + + index = cdt_idx_mask_find(mask, index, map->ele_count, false); + + if (! 
get_by_index_func(map, &packed, index)) { + return false; + } + + cdt_container_builder_add(&builder, packed.ptr, packed.sz); + index++; + } + + cdt_container_builder_set_result(&builder, result); + + return true; +} + +static int +packed_map_build_result_by_key(const packed_map *map, const cdt_payload *key, + uint32_t idx, uint32_t count, cdt_result_data *result) +{ + switch (result->type) { + case RESULT_TYPE_NONE: + break; + case RESULT_TYPE_INDEX_RANGE: + case RESULT_TYPE_REVINDEX_RANGE: + case RESULT_TYPE_INDEX: + case RESULT_TYPE_REVINDEX: { + uint32_t index = idx; + + if (! map_is_k_ordered(map)) { + index = packed_map_find_index_by_key_unordered(map, key); + } + + if (result_data_is_return_index_range(result)) { + if (result->type == RESULT_TYPE_REVINDEX_RANGE) { + index = map->ele_count - index - count; + } + + result_data_set_list_int2x(result, index, count); + } + else { + if (result->type == RESULT_TYPE_REVINDEX) { + index = map->ele_count - index - count; + } + + as_bin_set_int(result->result, index); + } + + break; + } + case RESULT_TYPE_RANK: + case RESULT_TYPE_REVRANK: + if (result->is_multi) { + return packed_map_build_rank_result_by_idx_range(map, idx, count, + result); + } + + return packed_map_build_rank_result_by_idx(map, idx, result); + case RESULT_TYPE_COUNT: + as_bin_set_int(result->result, count); + break; + case RESULT_TYPE_KEY: + case RESULT_TYPE_VALUE: + case RESULT_TYPE_MAP: + if (! packed_map_build_ele_result_by_idx_range(map, idx, count, + result)) { + return -AS_PROTO_RESULT_FAIL_UNKNOWN; + } + + break; + case RESULT_TYPE_RANK_RANGE: + case RESULT_TYPE_REVRANK_RANGE: + default: + cf_warning(AS_PARTICLE, "packed_map_build_result_by_key() invalid result_type %d", result->type); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + return AS_PROTO_RESULT_OK; +} + +// Return negative codes on error. 
+static int64_t +packed_map_get_rank_by_idx(const packed_map *map, uint32_t idx) +{ + cf_assert(map_has_offidx(map), AS_PARTICLE, "packed_map_get_rank_by_idx() offset_index needs to be valid"); + + uint32_t rank; + + if (order_index_is_valid(&map->value_idx)) { + if (! packed_map_ensure_ordidx_filled(map)) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + map_ele_find find_key; + map_ele_find_init_from_idx(&find_key, map, idx); + + if (! packed_map_find_rank_indexed(map, &find_key)) { + cf_warning(AS_PARTICLE, "packed_map_get_rank_by_idx() packed_map_find_rank_indexed failed"); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + if (! find_key.found_value) { + cf_warning(AS_PARTICLE, "packed_map_get_rank_by_idx() rank not found, idx=%u rank=%u", find_key.idx, find_key.rank); + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + rank = find_key.rank; + } + else { + const offset_index *offidx = &map->offidx; + uint32_t pk_offset = offset_index_get_const(offidx, idx); + define_map_unpacker(pk, map); + + as_unpacker pk_entry = { + .buffer = map->contents + pk_offset, + .length = map->content_sz - pk_offset + }; + + rank = 0; + + for (uint32_t i = 0; i < map->ele_count; i++) { + pk_entry.offset = 0; + + msgpack_compare_t cmp = packed_map_compare_values(&pk, &pk_entry); + + if (cmp == MSGPACK_COMPARE_ERROR) { + return -AS_PROTO_RESULT_FAIL_PARAMETER; + } + + if (cmp == MSGPACK_COMPARE_LESS) { + rank++; + } + } + } + + return (int64_t)rank; +} + +static int +packed_map_build_rank_result_by_idx(const packed_map *map, uint32_t idx, + cdt_result_data *result) +{ + int64_t rank = packed_map_get_rank_by_idx(map, idx); + + if (rank < 0) { + return (int)rank; + } + + if (result->type == RESULT_TYPE_REVRANK) { + as_bin_set_int(result->result, (int64_t)map->ele_count - rank - 1); + } + else { + as_bin_set_int(result->result, rank); + } + + return AS_PROTO_RESULT_OK; +} + +static int +packed_map_build_rank_result_by_idx_range(const packed_map *map, uint32_t idx, + uint32_t count, 
cdt_result_data *result) +{ + define_int_list_builder(builder, result->alloc, count); + + for (uint32_t i = 0; i < count; i++) { + int64_t rank = packed_map_get_rank_by_idx(map, idx); + + if (rank < 0) { + return (int)rank; + } + + if (result->type == RESULT_TYPE_REVRANK) { + rank = (int64_t)map->ele_count - rank - 1; + } + + cdt_container_builder_add_int64(&builder, rank); + } + + cdt_container_builder_set_result(&builder, result); + + return AS_PROTO_RESULT_OK; +} + +static msgpack_compare_t +packed_map_compare_key_by_idx(const void *ptr, uint32_t idx1, uint32_t idx2) +{ + const packed_map *map = ptr; + const offset_index *offidx = &map->offidx; + + as_unpacker pk1 = { + .buffer = map->contents, + .offset = offset_index_get_const(offidx, idx1), + .length = map->content_sz + }; + + as_unpacker pk2 = { + .buffer = map->contents, + .offset = offset_index_get_const(offidx, idx2), + .length = map->content_sz + }; + + msgpack_compare_t ret = as_unpack_compare(&pk1, &pk2); + + if (ret == MSGPACK_COMPARE_EQUAL) { + ret = as_unpack_compare(&pk1, &pk2); + } + + return ret; +} + +static msgpack_compare_t +packed_map_compare_values(as_unpacker *pk1, as_unpacker *pk2) +{ + msgpack_compare_t keycmp = as_unpack_compare(pk1, pk2); + + if (keycmp == MSGPACK_COMPARE_ERROR) { + return MSGPACK_COMPARE_ERROR; + } + + msgpack_compare_t ret = as_unpack_compare(pk1, pk2); + + if (ret == MSGPACK_COMPARE_EQUAL) { + return keycmp; + } + + return ret; +} + +static msgpack_compare_t +packed_map_compare_value_by_idx(const void *ptr, uint32_t idx1, uint32_t idx2) +{ + const packed_map *map = ptr; + const offset_index *offidx = &map->offidx; + + as_unpacker pk1 = { + .buffer = map->contents, + .offset = offset_index_get_const(offidx, idx1), + .length = map->content_sz + }; + + as_unpacker pk2 = { + .buffer = map->contents, + .offset = offset_index_get_const(offidx, idx2), + .length = map->content_sz + }; + + return packed_map_compare_values(&pk1, &pk2); +} + +static bool 
+packed_map_write_k_ordered(const packed_map *map, uint8_t *write_ptr, + offset_index *offsets_new) +{ + uint32_t ele_count = map->ele_count; + define_order_index(key_ordidx, ele_count); + vla_map_offidx_if_invalid(old, map); + + if (! map_fill_offidx(map)) { + cf_warning(AS_PARTICLE, "packed_map_op_write_k_ordered() offset fill failed"); + return false; + } + + if (! order_index_set_sorted_with_offsets(&key_ordidx, old.offidx, + SORT_BY_KEY)) { + return false; + } + + const uint8_t *ptr = old.offidx->contents; + + offset_index_set_filled(offsets_new, 1); + + for (uint32_t i = 0; i < ele_count; i++) { + uint32_t index = order_index_get(&key_ordidx, i); + uint32_t offset = offset_index_get_const(old.offidx, index); + uint32_t sz = offset_index_get_delta_const(old.offidx, index); + + memcpy(write_ptr, ptr + offset, sz); + write_ptr += sz; + offset_index_append_size(offsets_new, sz); + } + + return true; +} + +//------------------------------------------------ +// packed_map_op + +static void +packed_map_op_init(packed_map_op *op, const packed_map *map) +{ + op->map = map; + + op->new_ele_count = 0; + op->ele_removed = 0; + + op->seg1_sz = 0; + op->seg2_offset = 0; + op->seg2_sz = 0; + + op->key1_offset = 0; + op->key1_sz = 0; + op->key2_offset = 0; + op->key2_sz = 0; +} + +// Return new size of map elements. +static int32_t +packed_map_op_add(packed_map_op *op, const map_ele_find *found) +{ + // Replace at offset. + if (found->found_key) { + op->new_ele_count = op->map->ele_count; + op->seg2_offset = found->key_offset + found->sz; + } + // Insert at offset. 
+ else { + op->new_ele_count = op->map->ele_count + 1; + op->seg2_offset = found->key_offset; + } + + op->seg1_sz = found->key_offset; + op->seg2_sz = op->map->content_sz - op->seg2_offset; + + return (int32_t)(op->seg1_sz + op->seg2_sz); +} + +static int32_t +packed_map_op_remove(packed_map_op *op, const map_ele_find *found, + uint32_t count, uint32_t remove_sz) +{ + op->new_ele_count = op->map->ele_count - count; + op->seg1_sz = found->key_offset; + op->seg2_offset = found->key_offset + remove_sz; + op->seg2_sz = op->map->content_sz - op->seg2_offset; + + op->ele_removed = count; + + return (int32_t)(op->seg1_sz + op->seg2_sz); +} + +static uint8_t * +packed_map_op_write_seg1(const packed_map_op *op, uint8_t *buf) +{ + const uint8_t *src = op->map->contents; + + memcpy(buf, src, op->seg1_sz); + memcpy(buf + op->seg1_sz, src + op->key1_offset, op->key1_sz); + + return buf + op->seg1_sz + op->key1_sz; +} + +static uint8_t * +packed_map_op_write_seg2(const packed_map_op *op, uint8_t *buf) +{ + const uint8_t *src = op->map->contents; + + memcpy(buf, src + op->key2_offset, op->key2_sz); + memcpy(buf + op->key2_sz, src + op->seg2_offset, op->seg2_sz); + + return buf + op->key2_sz + op->seg2_sz; +} + +static bool +packed_map_op_write_new_offidx(const packed_map_op *op, + const map_ele_find *remove_info, const map_ele_find *add_info, + offset_index *new_offidx, uint32_t kv_sz) +{ + const offset_index *offidx = &op->map->offidx; + + if (! offset_index_is_full(offidx)) { + return false; + } + + cf_assert(op->new_ele_count >= op->map->ele_count, AS_PARTICLE, "op->new_ele_count %u < op->map->ele_count %u", op->new_ele_count, op->map->ele_count); + + uint32_t ele_count = op->map->ele_count; + + if (op->new_ele_count - op->map->ele_count != 0) { // add 1 + // Insert at end. + if (remove_info->idx == ele_count) { + offset_index_copy(new_offidx, offidx, 0, 0, ele_count, 0); + offset_index_set(new_offidx, ele_count, op->seg1_sz + op->seg2_sz); + } + // Insert at offset. 
+ else { + offset_index_copy(new_offidx, offidx, 0, 0, + remove_info->idx + 1, 0); + offset_index_copy(new_offidx, offidx, remove_info->idx + 1, + remove_info->idx, (ele_count - remove_info->idx), kv_sz); + } + } + else { // replace 1 + cf_assert(remove_info->idx == add_info->idx, AS_PARTICLE, "remove_info->idx %u != add_info->idx %u", remove_info->idx, add_info->idx); + + offset_index_copy(new_offidx, offidx, 0, 0, remove_info->idx, 0); + offset_index_set(new_offidx, remove_info->idx, remove_info->key_offset); + + int delta = (int)kv_sz - (int)remove_info->sz; + + offset_index_copy(new_offidx, offidx, remove_info->idx + 1, + remove_info->idx + 1, ele_count - remove_info->idx - 1, delta); + } + + offset_index_set_filled(new_offidx, op->new_ele_count); + + return true; +} + +static bool +packed_map_op_write_new_ordidx(const packed_map_op *op, + const map_ele_find *remove_info, const map_ele_find *add_info, + order_index *value_idx) +{ + const order_index *ordidx = &op->map->value_idx; + + if (order_index_is_null(ordidx)) { + return false; + } + + cf_assert(op->new_ele_count >= op->map->ele_count, AS_PARTICLE, "op->new_ele_count %u < op->map->ele_count %u", op->new_ele_count, op->map->ele_count); + + if (op->new_ele_count - op->map->ele_count != 0) { // add 1 + order_index_op_add(value_idx, ordidx, add_info->idx, add_info->rank); + } + else { // replace 1 + cf_assert(remove_info->idx == add_info->idx, AS_PARTICLE, "remove_info->idx %u != add_info->idx %u", remove_info->idx, add_info->idx); + + order_index_op_replace1(value_idx, ordidx, add_info->rank, + remove_info->rank); + } + + return true; +} + +//------------------------------------------------ +// map_particle + +static as_particle * +map_particle_create(rollback_alloc *alloc_buf, uint32_t ele_count, + const uint8_t *buf, uint32_t content_sz, uint8_t flags) +{ + define_map_packer(mpk, ele_count, flags, content_sz); + map_mem *p_map_mem = (map_mem *)map_packer_create_particle(&mpk, alloc_buf); + + if (! 
p_map_mem) { + return NULL; + } + + map_packer_write_hdridx(&mpk); + + if (buf) { + memcpy(mpk.write_ptr, buf, content_sz); + } + + return (as_particle *)p_map_mem; +} + +// Return new size on success, negative values on failure. +static int64_t +map_particle_strip_indexes(const as_particle *p, uint8_t *dest) +{ + const map_mem *p_map_mem = (const map_mem *)p; + + if (p_map_mem->sz == 0) { + return 0; + } + + as_unpacker upk = { + .buffer = p_map_mem->data, + .length = p_map_mem->sz + }; + + int64_t ele_count = as_unpack_map_header_element_count(&upk); + + if (ele_count < 0) { + return -1; + } + + as_packer pk = { + .buffer = dest, + .capacity = INT_MAX + }; + + if (ele_count > 0 && as_unpack_peek_is_ext(&upk)) { + as_msgpack_ext ext; + + if (as_unpack_ext(&upk, &ext) != 0) { + return -2; + } + + // Skip nil val. + if (as_unpack_size(&upk) <= 0) { + return -3; + } + + uint8_t flags = ext.type; + + if (flags != AS_PACKED_MAP_FLAG_NONE) { + ele_count--; + } + + flags &= ~(AS_PACKED_MAP_FLAG_OFF_IDX | AS_PACKED_MAP_FLAG_ORD_IDX); + + if (flags != AS_PACKED_MAP_FLAG_NONE) { + as_pack_map_header(&pk, (uint32_t)ele_count + 1); + as_pack_ext_header(&pk, 0, flags); + pk.buffer[pk.offset++] = msgpack_nil[0]; + } + else { + as_pack_map_header(&pk, (uint32_t)ele_count); + } + } + else { + // Copy header. + as_pack_map_header(&pk, (uint32_t)ele_count); + } + + // Copy elements. 
+ size_t ele_sz = (size_t)(upk.length - upk.offset); + + memcpy(pk.buffer + pk.offset, upk.buffer + upk.offset, ele_sz); + + return (int64_t)pk.offset + (int64_t)ele_sz; +} + +//------------------------------------------------ +// map_ele_find + +static void +map_ele_find_init(map_ele_find *find, const packed_map *map) +{ + find->found_key = false; + find->found_value = false; + find->idx = map->ele_count; + find->rank = map->ele_count; + + find->key_offset = 0; + find->value_offset = 0; + find->sz = 0; + + find->lower = 0; + find->upper = map->ele_count; +} + +static void +map_ele_find_continue_from_lower(map_ele_find *find, const map_ele_find *found, + uint32_t ele_count) +{ + find->found_key = false; + find->found_value = false; + + find->idx = ele_count + found->idx; + find->idx /= 2; + find->rank = find->idx; + + find->key_offset = found->key_offset; + find->value_offset = found->value_offset; + find->sz = found->sz; + + find->lower = found->idx; + find->upper = ele_count; +} + +static void +map_ele_find_init_from_idx(map_ele_find *find, const packed_map *map, + uint32_t idx) +{ + map_ele_find_init(find, map); + find->found_key = true; + find->idx = idx; + find->key_offset = offset_index_get_const(&map->offidx, idx); + + as_unpacker pk = { + .buffer = map->contents, + .offset = find->key_offset, + .length = map->content_sz + }; + + as_unpack_size(&pk); + find->value_offset = pk.offset; + find->sz = offset_index_get_const(&map->offidx, idx + 1) - find->key_offset; +} + +//------------------------------------------------ +// map_offset_index + +static bool +map_offset_index_fill(offset_index *offidx, uint32_t index) +{ + uint32_t ele_filled = offset_index_get_filled(offidx); + + if (index < ele_filled || offidx->_.ele_count == ele_filled) { + return true; + } + + as_unpacker pk = { + .buffer = offidx->contents, + .length = offidx->content_sz + }; + + pk.offset = offset_index_get_const(offidx, ele_filled - 1); + + for (uint32_t i = ele_filled; i < index; i++) { + 
if (as_unpack_size(&pk) <= 0) { + return false; + } + + if (as_unpack_size(&pk) <= 0) { + return false; + } + + offset_index_set(offidx, i, pk.offset); + } + + if (as_unpack_size(&pk) <= 0) { + return false; + } + + if (as_unpack_size(&pk) <= 0) { + return false; + } + + // Make sure last iteration is in range for set. + if (index < offidx->_.ele_count) { + offset_index_set(offidx, index, pk.offset); + offset_index_set_filled(offidx, index + 1); + } + // Check if sizes match. + else if (pk.offset != offidx->content_sz) { + cf_warning(AS_PARTICLE, "map_offset_index_fill() offset mismatch %u, expected %u", pk.offset, offidx->content_sz); + return false; + } + else { + offset_index_set_filled(offidx, offidx->_.ele_count); + } + + return true; +} + +static int64_t +map_offset_index_get(offset_index *offidx, uint32_t index) +{ + if (index > offidx->_.ele_count) { + index = offidx->_.ele_count; + } + + if (! map_offset_index_fill(offidx, index)) { + return -1; + } + + return (int64_t)offset_index_get_const(offidx, index); +} + +static int64_t +map_offset_index_get_delta(offset_index *offidx, uint32_t index) +{ + int64_t offset = map_offset_index_get(offidx, index); + + if (offset < 0) { + return offset; + } + + if (index == offidx->_.ele_count - 1) { + return (int64_t)offidx->content_sz - offset; + } + + return map_offset_index_get(offidx, index + 1) - offset; +} + +//------------------------------------------------ +// offidx_op + +static void +offidx_op_init(offidx_op *op, offset_index *dest, const offset_index *src) +{ + op->dest = dest; + op->src = src; + op->d_i = 0; + op->s_i = 0; + op->delta = 0; +} + +static void +offidx_op_remove(offidx_op *op, uint32_t index) +{ + uint32_t count = index - op->s_i; + uint32_t mem_sz = offset_index_get_delta_const(op->src, index); + + offset_index_copy(op->dest, op->src, op->d_i, op->s_i, count, op->delta); + + op->delta -= mem_sz; + op->d_i += count; + op->s_i += count + 1; +} + +static void +offidx_op_remove_range(offidx_op 
*op, uint32_t index, uint32_t count) +{ + uint32_t ele_count = op->src->_.ele_count; + uint32_t delta_count = index - op->s_i; + uint32_t offset = offset_index_get_const(op->src, index); + uint32_t mem_sz; + + if (index + count == ele_count) { + mem_sz = op->src->content_sz - offset; + } + else { + mem_sz = offset_index_get_const(op->src, index + count) - offset; + } + + offset_index_copy(op->dest, op->src, op->d_i, op->s_i, delta_count, + op->delta); + + op->delta -= mem_sz; + op->d_i += delta_count; + op->s_i += delta_count + count; +} + +static void +offidx_op_end(offidx_op *op) +{ + uint32_t ele_count = op->src->_.ele_count; + uint32_t count = ele_count - op->s_i; + + offset_index_copy(op->dest, op->src, op->d_i, op->s_i, count, op->delta); + op->d_i += count; + offset_index_set_filled(op->dest, op->d_i); +} + +//------------------------------------------------ +// order_index + +static bool +order_index_sort(order_index *ordidx, const offset_index *offsets, + const uint8_t *contents, uint32_t content_sz, sort_by_t sort_by) +{ + uint32_t ele_count = ordidx->_.ele_count; + + index_sort_userdata udata = { + .order = ordidx, + .offsets = offsets, + .contents = contents, + .content_sz = content_sz, + .error = false, + .sort_by = sort_by + }; + + qsort_r(order_index_get_mem(ordidx, 0), ele_count, ordidx->_.ele_sz, + map_packer_fill_index_sort_compare, (void *)&udata); + + if (udata.error) { + return false; + } + + return true; +} + +static inline bool +order_index_set_sorted(order_index *ordidx, const offset_index *offsets, + const uint8_t *ele_start, uint32_t tot_ele_sz, sort_by_t sort_by) +{ + uint32_t ele_count = ordidx->_.ele_count; + + for (uint32_t i = 0; i < ele_count; i++) { + order_index_set(ordidx, i, i); + } + + return order_index_sort(ordidx, offsets, ele_start, tot_ele_sz, sort_by); +} + +static bool +order_index_set_sorted_with_offsets(order_index *ordidx, + const offset_index *offsets, sort_by_t sort_by) +{ + return order_index_set_sorted(ordidx, 
offsets, offsets->contents, + offsets->content_sz, sort_by); +} + +static uint32_t +order_index_find_idx(const order_index *ordidx, uint32_t idx, uint32_t start, + uint32_t len) +{ + for (uint32_t i = start; i < start + len; i++) { + if (order_index_get(ordidx, i) == idx) { + return i; + } + } + + return start + len; +} + +//------------------------------------------------ +// order_index_adjust + +static uint32_t +order_index_adjust_lower(const order_index_adjust *via, uint32_t src) +{ + if (src >= via->lower) { + return src + via->delta; + } + + return src; +} + +//------------------------------------------------ +// order_index_op + +static inline void +order_index_op_add(order_index *dest, const order_index *src, uint32_t add_idx, + uint32_t add_rank) +{ + uint32_t ele_count = src->_.ele_count; + + order_index_adjust adjust = { + .f = order_index_adjust_lower, + .lower = add_idx, + .upper = 0, + .delta = 1 + }; + + cf_assert(add_rank <= ele_count, AS_PARTICLE, "order_index_op_add() add_rank(%u) > ele_count(%u)", add_rank, ele_count); + order_index_copy(dest, src, 0, 0, add_rank, &adjust); + order_index_set(dest, add_rank, add_idx); + order_index_copy(dest, src, add_rank + 1, add_rank, ele_count - add_rank, + &adjust); +} + +static inline void +order_index_op_replace1_internal(order_index *dest, const order_index *src, + uint32_t add_idx, uint32_t add_rank, uint32_t remove_rank, + const order_index_adjust *adjust) +{ + uint32_t ele_count = src->_.ele_count; + + if (add_rank == remove_rank) { + order_index_copy(dest, src, 0, 0, ele_count, NULL); + } + else if (add_rank > remove_rank) { + order_index_copy(dest, src, 0, 0, remove_rank, adjust); + order_index_copy(dest, src, remove_rank, remove_rank + 1, + add_rank - remove_rank - 1, adjust); + order_index_set(dest, add_rank - 1, add_idx); + order_index_copy(dest, src, add_rank, add_rank, ele_count - add_rank, + adjust); + } + else { + order_index_copy(dest, src, 0, 0, add_rank, adjust); + order_index_set(dest, 
add_rank, add_idx); + order_index_copy(dest, src, add_rank + 1, add_rank, + remove_rank - add_rank, adjust); + order_index_copy(dest, src, remove_rank + 1, remove_rank + 1, + ele_count - remove_rank - 1, adjust); + } +} + +// Replace remove_rank with add_rank in dest. +static inline void +order_index_op_replace1(order_index *dest, const order_index *src, + uint32_t add_rank, uint32_t remove_rank) +{ + uint32_t add_idx = order_index_get(src, remove_rank); + + order_index_op_replace1_internal(dest, src, add_idx, add_rank, remove_rank, + NULL); +} + +static void +order_index_op_remove_idx_mask(order_index *dest, const order_index *src, + const uint64_t *mask, uint32_t count) +{ + if (count == 0) { + return; + } + + uint32_t ele_count = src->max_idx; + uint32_t mask_count = cdt_idx_mask_count(ele_count); + define_order_index2(cntidx, ele_count, mask_count); + + order_index_set(&cntidx, 0, cf_bit_count64(mask[0])); + + for (uint32_t i = 1; i < mask_count; i++) { + uint32_t prev = order_index_get(&cntidx, i - 1); + + order_index_set(&cntidx, i, prev + cf_bit_count64(mask[i])); + } + + uint32_t di = 0; + + for (uint32_t i = 0; i < ele_count; i++) { + uint32_t idx = order_index_get(src, i); + + if (idx >= ele_count || cdt_idx_mask_is_set(mask, idx)) { + continue; + } + + uint32_t mask_i = idx / 64; + uint32_t offset = idx % 64; + uint64_t bits = cdt_idx_mask_get(mask, idx) & ((1ULL << offset) - 1); + + if (mask_i == 0) { + idx -= cf_bit_count64(bits); + } + else { + idx -= cf_bit_count64(bits) + order_index_get(&cntidx, mask_i - 1); + } + + order_index_set(dest, di++, idx); + } + + cf_assert(dest->_.ele_count == di, AS_PARTICLE, "count mismatch ele_count %u != di %u", dest->_.ele_count, di); +} + + +//========================================================== +// result_data + +static bool +result_data_set_key_not_found(cdt_result_data *rd, int64_t index) +{ + switch (rd->type) { + case RESULT_TYPE_RANK_RANGE: + case RESULT_TYPE_REVRANK_RANGE: + break; + default: + return 
result_data_set_not_found(rd, index); + } + + return false; +} + +static bool +result_data_set_value_not_found(cdt_result_data *rd, int64_t rank) +{ + switch (rd->type) { + case RESULT_TYPE_REVINDEX_RANGE: + case RESULT_TYPE_INDEX_RANGE: + return false; + default: + return result_data_set_not_found(rd, rank); + } + + return true; +} + + +//========================================================== +// cdt_map_builder +// + +void +cdt_map_builder_start(cdt_container_builder *builder, rollback_alloc *alloc_buf, + uint32_t ele_count, uint32_t max_sz, uint8_t flags) +{ + uint32_t sz = sizeof(map_mem) + sizeof(uint64_t) + 1 + 3 + max_sz; + map_mem *p_map_mem = (map_mem *)rollback_alloc_reserve(alloc_buf, sz); + + as_packer pk = { + .buffer = p_map_mem->data, + .capacity = INT_MAX + }; + + if (flags != AS_PACKED_MAP_FLAG_NONE) { + as_pack_map_header(&pk, ele_count + 1); + as_pack_ext_header(&pk, 0, flags); + pk.buffer[pk.offset++] = msgpack_nil[0]; + } + else { + as_pack_map_header(&pk, ele_count); + } + + p_map_mem->type = AS_PARTICLE_TYPE_MAP; + p_map_mem->sz = pk.offset; + + builder->particle = (as_particle *)p_map_mem; + builder->write_ptr = p_map_mem->data + p_map_mem->sz; + builder->ele_count = 0; + builder->sz = &p_map_mem->sz; +} + + +//========================================================== +// cdt_process_state_packed_map +// + +bool +cdt_process_state_packed_map_modify_optype(cdt_process_state *state, + cdt_modify_data *cdt_udata) +{ + as_bin *b = cdt_udata->b; + as_cdt_optype optype = state->type; + + if (! is_map_type(as_bin_get_particle_type(b)) && as_bin_inuse(b)) { + cf_warning(AS_PARTICLE, "cdt_process_state_packed_map_modify_optype() invalid type %d", as_bin_get_particle_type(b)); + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_INCOMPATIBLE_TYPE; + return false; + } + + define_rollback_alloc(alloc_buf, cdt_udata->alloc_buf, 1, true); + // Results always on the heap. 
+ define_rollback_alloc(alloc_result, NULL, 1, false); + int ret = AS_PROTO_RESULT_OK; + + cdt_result_data result = { + .result = cdt_udata->result, + .alloc = alloc_result, + }; + + switch (optype) { + case AS_CDT_OP_MAP_SET_TYPE: { + uint64_t flags; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &flags)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + as_bin_use_static_map_mem_if_notinuse(b, 0); + ret = map_set_flags(b, alloc_buf, result.result, (uint8_t)flags); + break; + } + case AS_CDT_OP_MAP_ADD: { + cdt_payload key; + cdt_payload value; + uint64_t flags = 0; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &key, &value, &flags)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + map_add_control control = { + .allow_overwrite = false, + .allow_create = true, + }; + + as_bin_use_static_map_mem_if_notinuse(b, flags); + ret = map_add(b, alloc_buf, &key, &value, result.result, &control); + break; + } + case AS_CDT_OP_MAP_ADD_ITEMS: { + cdt_payload items; + uint64_t flags = 0; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &items, &flags)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + map_add_control control = { + .allow_overwrite = false, + .allow_create = true, + }; + + as_bin_use_static_map_mem_if_notinuse(b, flags); + ret = map_add_items(b, alloc_buf, &items, result.result, &control); + break; + } + case AS_CDT_OP_MAP_PUT: { + cdt_payload key; + cdt_payload value; + uint64_t flags = 0; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &key, &value, &flags)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + map_add_control control = { + .allow_overwrite = true, + .allow_create = true, + }; + + as_bin_use_static_map_mem_if_notinuse(b, flags); + ret = map_add(b, alloc_buf, &key, &value, result.result, &control); + break; + } + case AS_CDT_OP_MAP_PUT_ITEMS: { + cdt_payload items; + uint64_t flags = 0; + + if (! 
CDT_OP_TABLE_GET_PARAMS(state, &items, &flags)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + map_add_control control = { + .allow_overwrite = true, + .allow_create = true, + }; + + as_bin_use_static_map_mem_if_notinuse(b, flags); + ret = map_add_items(b, alloc_buf, &items, result.result, &control); + break; + } + case AS_CDT_OP_MAP_REPLACE: { + cdt_payload key; + cdt_payload value; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &key, &value)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + map_add_control control = { + .allow_overwrite = true, + .allow_create = false, + }; + + as_bin_use_static_map_mem_if_notinuse(b, 0); + ret = map_add(b, alloc_buf, &key, &value, result.result, &control); + break; + } + case AS_CDT_OP_MAP_REPLACE_ITEMS: { + if (! as_bin_inuse(b)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_ELEMENT_NOT_FOUND; + return false; + } + + cdt_payload items; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &items)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + map_add_control control = { + .allow_overwrite = true, + .allow_create = false, + }; + + ret = map_add_items(b, alloc_buf, &items, result.result, &control); + break; + } + case AS_CDT_OP_MAP_INCREMENT: + case AS_CDT_OP_MAP_DECREMENT: { + cdt_payload key; + cdt_payload delta_value = { NULL }; + uint64_t flags = 0; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &key, &delta_value, &flags)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + as_bin_use_static_map_mem_if_notinuse(b, flags); + ret = map_increment(b, alloc_buf, &key, &delta_value, result.result, + optype == AS_CDT_OP_MAP_DECREMENT); + break; + } + case AS_CDT_OP_MAP_REMOVE_BY_KEY: { + if (! as_bin_inuse(b)) { + return true; // no-op + } + + uint64_t op_flags; + cdt_payload key; + + if (! 
CDT_OP_TABLE_GET_PARAMS(state, &op_flags, &key)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + result_data_set(&result, op_flags, false); + ret = map_remove_by_key_interval(b, alloc_buf, &key, &key, &result); + break; + } + case AS_CDT_OP_MAP_REMOVE_BY_INDEX: { + if (! as_bin_inuse(b)) { + return true; // no-op + } + + uint64_t result_type; + int64_t index; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &result_type, &index)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + result_data_set(&result, result_type, false); + ret = map_remove_by_index_range(b, alloc_buf, index, 1, &result); + break; + } + case AS_CDT_OP_MAP_REMOVE_BY_VALUE: { + if (! as_bin_inuse(b)) { + return true; // no-op + } + + uint64_t result_type; + cdt_payload value; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &result_type, &value)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + result_data_set(&result, result_type, false); + ret = map_remove_by_value_interval(b, alloc_buf, &value, &value, + &result); + break; + } + case AS_CDT_OP_MAP_REMOVE_BY_RANK: { + if (! as_bin_inuse(b)) { + return true; // no-op + } + + uint64_t result_type; + int64_t index; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &result_type, &index)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + result_data_set(&result, result_type, false); + ret = map_remove_by_rank_range(b, alloc_buf, index, 1, &result); + break; + } + case AS_CDT_OP_MAP_REMOVE_BY_KEY_LIST: { + if (! as_bin_inuse(b)) { + return true; // no-op + } + + uint64_t result_type; + cdt_payload items; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &result_type, &items)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + result_data_set(&result, result_type, true); + ret = map_remove_all_by_key_list(b, alloc_buf, &items, &result); + break; + } + case AS_CDT_OP_MAP_REMOVE_ALL_BY_VALUE: { + if (! 
as_bin_inuse(b)) { + return true; // no-op + } + + uint64_t result_type; + cdt_payload value; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &result_type, &value)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + result_data_set(&result, result_type, true); + ret = map_remove_by_value_interval(b, alloc_buf, &value, &value, + &result); + break; + } + case AS_CDT_OP_MAP_REMOVE_BY_VALUE_LIST: { + if (! as_bin_inuse(b)) { + return true; // no-op + } + + uint64_t result_type; + cdt_payload items; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &result_type, &items)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + result_data_set(&result, result_type, true); + ret = map_remove_all_by_value_list(b, alloc_buf, &items, &result); + break; + } + case AS_CDT_OP_MAP_REMOVE_BY_KEY_INTERVAL: { + if (! as_bin_inuse(b)) { + return true; // no-op + } + + uint64_t result_type; + cdt_payload key_start; + cdt_payload key_end = { NULL }; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &result_type, &key_start, + &key_end)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + result_data_set(&result, result_type, true); + ret = map_remove_by_key_interval(b, alloc_buf, &key_start, &key_end, + &result); + break; + } + case AS_CDT_OP_MAP_REMOVE_BY_INDEX_RANGE: { + if (! as_bin_inuse(b)) { + return true; // no-op + } + + uint64_t result_type; + int64_t index; + uint64_t count = UINT32_MAX; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &result_type, &index, &count)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + result_data_set(&result, result_type, true); + ret = map_remove_by_index_range(b, alloc_buf, index, count, &result); + break; + } + case AS_CDT_OP_MAP_REMOVE_BY_VALUE_INTERVAL: { + if (! as_bin_inuse(b)) { + return true; // no-op + } + + uint64_t result_type; + cdt_payload value_start; + cdt_payload value_end = { NULL }; + + if (! 
CDT_OP_TABLE_GET_PARAMS(state, &result_type, &value_start, + &value_end)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + result_data_set(&result, result_type, true); + ret = map_remove_by_value_interval(b, alloc_buf, &value_start, + &value_end, &result); + break; + } + case AS_CDT_OP_MAP_REMOVE_BY_RANK_RANGE: { + if (! as_bin_inuse(b)) { + return true; // no-op + } + + uint64_t result_type; + int64_t rank; + uint64_t count = UINT32_MAX; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &result_type, &rank, &count)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + result_data_set(&result, result_type, true); + ret = map_remove_by_rank_range(b, alloc_buf, rank, count, &result); + break; + } + case AS_CDT_OP_MAP_CLEAR: { + if (! as_bin_inuse(b)) { + return true; // no-op + } + + ret = map_clear(b, alloc_buf, result.result); + break; + } + default: + cf_warning(AS_PARTICLE, "cdt_process_state_packed_map_modify_optype() invalid cdt op: %d", optype); + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + if (ret != AS_PROTO_RESULT_OK) { + cf_warning(AS_PARTICLE, "%s: failed", cdt_process_state_get_op_name(state)); + cdt_udata->ret_code = ret; + rollback_alloc_rollback(alloc_result); + rollback_alloc_rollback(alloc_buf); + return false; + } + + if (b->particle == (const as_particle *)&map_mem_empty) { + as_bin_set_empty_packed_map(b, alloc_buf, 0); + } + else if (b->particle == (const as_particle *)map_mem_empty_flagged_table) { + as_bin_set_empty_packed_map(b, alloc_buf, + map_mem_empty_flagged_table[0].map.ext_flags); + } + else if (b->particle == + (const as_particle *)(map_mem_empty_flagged_table + 1)) { + as_bin_set_empty_packed_map(b, alloc_buf, + map_mem_empty_flagged_table[1].map.ext_flags); + } + + return true; +} + +bool +cdt_process_state_packed_map_read_optype(cdt_process_state *state, + cdt_read_data *cdt_udata) +{ + const as_bin *b = cdt_udata->b; + as_cdt_optype optype = 
state->type; + + if (! is_map_type(as_bin_get_particle_type(b))) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_INCOMPATIBLE_TYPE; + return false; + } + + packed_map map; + + if (! packed_map_init_from_bin(&map, b, false)) { + cf_warning(AS_PARTICLE, "%s: invalid map", cdt_process_state_get_op_name(state)); + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + // Just one entry needed for results bin. + define_rollback_alloc(alloc_result, NULL, 1, false); + int ret = AS_PROTO_RESULT_OK; + + cdt_result_data result = { + .result = cdt_udata->result, + .alloc = alloc_result, + }; + + switch (optype) { + case AS_CDT_OP_MAP_SIZE: { + as_bin_set_int(result.result, map.ele_count); + break; + } + case AS_CDT_OP_MAP_GET_BY_KEY: { + uint64_t op_flags; + cdt_payload key; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &op_flags, &key)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + result_data_set(&result, op_flags, false); + ret = packed_map_get_remove_by_key_interval(&map, NULL, NULL, &key, + &key, &result); + break; + } + case AS_CDT_OP_MAP_GET_BY_VALUE: { + uint64_t result_type; + cdt_payload value; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &result_type, &value)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + result_data_set(&result, result_type, false); + ret = packed_map_get_remove_by_value_interval(&map, NULL, NULL, + &value, &value, &result); + break; + } + case AS_CDT_OP_MAP_GET_BY_INDEX: { + uint64_t result_type; + int64_t index; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &result_type, &index)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + result_data_set(&result, result_type, false); + ret = packed_map_get_remove_by_index_range(&map, NULL, NULL, index, 1, + &result); + break; + } + case AS_CDT_OP_MAP_GET_BY_RANK: { + uint64_t result_type; + int64_t rank; + + if (! 
CDT_OP_TABLE_GET_PARAMS(state, &result_type, &rank)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + result_data_set(&result, result_type, false); + ret = packed_map_get_remove_by_rank_range(&map, NULL, NULL, rank, 1, + &result); + break; + } + case AS_CDT_OP_MAP_GET_ALL_BY_VALUE: { + uint64_t result_type; + cdt_payload value; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &result_type, &value)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + result_data_set(&result, result_type, true); + ret = packed_map_get_remove_by_value_interval(&map, NULL, NULL, + &value, &value, &result); + break; + } + case AS_CDT_OP_MAP_GET_BY_KEY_INTERVAL: { + uint64_t result_type; + cdt_payload key_start; + cdt_payload key_end = { NULL }; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &result_type, &key_start, + &key_end)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + result_data_set(&result, result_type, true); + ret = packed_map_get_remove_by_key_interval(&map, NULL, NULL, + &key_start, &key_end, &result); + break; + } + case AS_CDT_OP_MAP_GET_BY_VALUE_INTERVAL: { + uint64_t result_type; + cdt_payload value_start; + cdt_payload value_end = { NULL }; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &result_type, &value_start, + &value_end)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + result_data_set(&result, result_type, true); + ret = packed_map_get_remove_by_value_interval(&map, NULL, NULL, + &value_start, &value_end, &result); + break; + } + case AS_CDT_OP_MAP_GET_BY_INDEX_RANGE: { + uint64_t result_type; + int64_t index; + uint64_t count = UINT32_MAX; + + if (! 
CDT_OP_TABLE_GET_PARAMS(state, &result_type, &index, &count)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + result_data_set(&result, result_type, true); + ret = packed_map_get_remove_by_index_range(&map, NULL, NULL, index, + count, &result); + break; + } + case AS_CDT_OP_MAP_GET_BY_RANK_RANGE: { + uint64_t result_type; + int64_t rank; + uint64_t count = UINT32_MAX; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &result_type, &rank, &count)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + result_data_set(&result, result_type, true); + ret = packed_map_get_remove_by_rank_range(&map, NULL, NULL, rank, count, + &result); + break; + } + case AS_CDT_OP_MAP_GET_BY_KEY_LIST: { + uint64_t result_type; + cdt_payload items; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &result_type, &items)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + result_data_set(&result, result_type, true); + ret = packed_map_get_remove_all_by_key_list(&map, NULL, NULL, &items, + &result); + break; + } + case AS_CDT_OP_MAP_GET_BY_VALUE_LIST: { + uint64_t result_type; + cdt_payload items; + + if (! CDT_OP_TABLE_GET_PARAMS(state, &result_type, &items)) { + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + result_data_set(&result, result_type, true); + ret = packed_map_get_remove_all_by_value_list(&map, NULL, NULL, &items, + &result); + break; + } + default: + cf_warning(AS_PARTICLE, "cdt_process_state_packed_map_read_optype() invalid cdt op: %d", optype); + cdt_udata->ret_code = -AS_PROTO_RESULT_FAIL_PARAMETER; + return false; + } + + if (ret != AS_PROTO_RESULT_OK) { + cf_warning(AS_PARTICLE, "%s: failed", cdt_process_state_get_op_name(state)); + cdt_udata->ret_code = ret; + rollback_alloc_rollback(alloc_result); + return false; + } + + return true; +} + + +//========================================================== +// Debugging support. 
+// + +static void +map_print(const packed_map *map, const char *name) +{ + print_packed(map->packed, map->packed_sz, name); +} + +static bool +map_verify(const as_bin *b) +{ + packed_map map; + + uint8_t type = as_bin_get_particle_type(b); + + if (type != AS_PARTICLE_TYPE_MAP) { + cf_warning(AS_PARTICLE, "map_verify() non-map type: %u", type); + return false; + } + + // Check header. + if (! packed_map_init_from_bin(&map, b, false)) { + cf_warning(AS_PARTICLE, "map_verify() invalid packed map"); + return false; + } + + if (map.flags != 0) { + const uint8_t *byte = map.contents - 1; + + if (*byte != 0xC0) { + cf_warning(AS_PARTICLE, "map_verify() invalid ext header, expected C0 for pair.2"); + } + } + + const order_index *ordidx = &map.value_idx; + bool check_offidx = map_has_offidx(&map); + define_map_unpacker(pk, &map); + vla_map_offidx_if_invalid(u, &map); + + uint32_t filled = offset_index_get_filled(u.offidx); + define_offset_index(temp_offidx, u.offidx->contents, u.offidx->content_sz, + u.offidx->_.ele_count); + + if (map.ele_count != 0) { + offset_index_copy(&temp_offidx, u.offidx, 0, 0, filled, 0); + } + + // Check offsets. 
+ for (uint32_t i = 0; i < map.ele_count; i++) { + uint32_t offset; + + if (check_offidx) { + if (i < filled) { + offset = offset_index_get_const(u.offidx, i); + + if (pk.offset != offset) { + cf_warning(AS_PARTICLE, "map_verify() i=%u offset=%u expected=%d", i, offset, pk.offset); + return false; + } + } + else { + offset_index_set(&temp_offidx, i, pk.offset); + } + } + else { + offset_index_set(u.offidx, i, pk.offset); + } + + offset = pk.offset; + + if (as_unpack_size(&pk) <= 0) { + cf_warning(AS_PARTICLE, "map_verify() i=%u offset=%u pk.offset=%u invalid key", i, offset, pk.offset); + return false; + } + + offset = pk.offset; + + if (as_unpack_size(&pk) <= 0) { + cf_warning(AS_PARTICLE, "map_verify() i=%u offset=%u pk.offset=%u invalid value", i, offset, pk.offset); + return false; + } + } + + if (check_offidx && filled < map.ele_count) { + u.offidx->_.ptr = temp_offidx._.ptr; + } + + // Check packed size. + if (map.content_sz != pk.offset) { + cf_warning(AS_PARTICLE, "map_verify() content_sz=%u expected=%u", map.content_sz, pk.offset); + return false; + } + + // Check key orders. + if (map_is_k_ordered(&map) && map.ele_count > 0) { + pk.offset = 0; + + define_map_unpacker(pk_key, &map); + + for (uint32_t i = 1; i < map.ele_count; i++) { + uint32_t offset = pk.offset; + msgpack_compare_t cmp = as_unpack_compare(&pk_key, &pk); + + if (cmp == MSGPACK_COMPARE_ERROR) { + cf_warning(AS_PARTICLE, "map_verify() i=%u offset=%u pk.offset=%u invalid key", i, offset, pk.offset); + return false; + } + + if (cmp == MSGPACK_COMPARE_GREATER) { + cf_warning(AS_PARTICLE, "map_verify() i=%u offset=%u pk.offset=%u keys not in order", i, offset, pk.offset); + return false; + } + + pk_key.offset = offset; + + if (as_unpack_size(&pk) <= 0) { + cf_warning(AS_PARTICLE, "map_verify() i=%u offset=%u pk.offset=%u invalid value", i, offset, pk.offset); + return false; + } + } + } + + // Check value orders. 
+ if (order_index_is_filled(ordidx) && map.ele_count > 0) { + // Compare with freshly sorted. + define_order_index(cmp_order, map.ele_count); + + order_index_set_sorted(&cmp_order, u.offidx, map.contents, + map.content_sz, SORT_BY_VALUE); + + for (uint32_t i = 0; i < map.ele_count; i++) { + uint32_t expected = order_index_get(&cmp_order, i); + uint32_t index = order_index_get(ordidx, i); + + if (index != expected) { + cf_warning(AS_PARTICLE, "map_verify() i=%u index=%u expected=%u invalid order index", i, index, expected); + return false; + } + } + + // Walk index and check value order. + pk.offset = 0; + + define_map_unpacker(prev_value, &map); + uint32_t index = order_index_get(ordidx, 0); + + prev_value.offset = offset_index_get_const(u.offidx, index); + + if (as_unpack_size(&prev_value) <= 0) { + cf_warning(AS_PARTICLE, "map_verify() index=%u pk.offset=%u invalid key", index, pk.offset); + return false; + } + + for (uint32_t i = 1; i < map.ele_count; i++) { + index = order_index_get(ordidx, i); + pk.offset = offset_index_get_const(u.offidx, index); + + if (as_unpack_size(&pk) <= 0) { + cf_warning(AS_PARTICLE, "map_verify() i=%u index=%u pk.offset=%u invalid key", i, index, pk.offset); + return false; + } + + uint32_t offset = pk.offset; + msgpack_compare_t cmp = as_unpack_compare(&prev_value, &pk); + + if (cmp == MSGPACK_COMPARE_ERROR) { + cf_warning(AS_PARTICLE, "map_verify() i=%u offset=%u pk.offset=%u invalid value", i, offset, pk.offset); + return false; + } + + if (cmp == MSGPACK_COMPARE_GREATER) { + cf_warning(AS_PARTICLE, "map_verify() i=%u offset=%u pk.offset=%u value index not in order", i, offset, pk.offset); + return false; + } + + prev_value.offset = offset; + } + } + + return true; +} + +// Quash warnings for debug function. 
// Exists only so that the static debug helpers map_verify() and map_print()
// are referenced somewhere. The NULL arguments are placeholders: map_print()
// dereferences its map argument, so this function must not be called at
// runtime.
void
as_cdt_map_debug_dummy()
{
	map_verify(NULL);
	map_print(NULL, NULL);
}
diff --git a/as/src/base/particle_string.c b/as/src/base/particle_string.c new file mode 100644 index 00000000..4f43f623 --- /dev/null +++ b/as/src/base/particle_string.c @@ -0,0 +1,173 @@
/*
 * particle_string.c
 *
 * Copyright (C) 2015 Aerospike, Inc.
 *
 * Portions may be licensed to Aerospike, Inc. under one or more contributor
 * license agreements.
 *
 * This program is free software: you can redistribute it and/or modify it under
 * the terms of the GNU Affero General Public License as published by the Free
 * Software Foundation, either version 3 of the License, or (at your option) any
 * later version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
 * details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see http://www.gnu.org/licenses/
 */


// NOTE(review): the targets of the next two includes are missing in this copy
// (the angle-bracket text appears to have been stripped) - TODO restore them
// from the upstream file.
#include
#include

#include "aerospike/as_string.h"
#include "aerospike/as_val.h"

#include "fault.h"

#include "base/datamodel.h"
#include "base/particle.h"
#include "base/particle_blob.h"


//==========================================================
// STRING particle interface - function declarations.
//

// Most STRING particle table functions just use the equivalent BLOB particle
// functions. Here are the differences...

// Handle as_val translation.
// Forward declarations of the STRING-specific functions that are defined
// further down in this file and referenced by string_vtable below.
uint32_t string_size_from_asval(const as_val *val);
void string_from_asval(const as_val *val, as_particle **pp);
as_val *string_to_asval(const as_particle *p);
uint32_t string_asval_wire_size(const as_val *val);
uint32_t string_asval_to_wire(const as_val *val, uint8_t *wire);


//==========================================================
// STRING particle interface - vtable.
//

// Positional initializer: only the five as_val entries are STRING-specific,
// every other slot reuses the BLOB implementation.
// NOTE(review): entries must stay in the member order of as_particle_vtable,
// which is declared outside this file - confirm against its declaration
// before adding or moving entries.
const as_particle_vtable string_vtable = {
	blob_destruct,
	blob_size,

	blob_concat_size_from_wire,
	blob_append_from_wire,
	blob_prepend_from_wire,
	blob_incr_from_wire,
	blob_size_from_wire,
	blob_from_wire,
	blob_compare_from_wire,
	blob_wire_size,
	blob_to_wire,

	string_size_from_asval,
	string_from_asval,
	string_to_asval,
	string_asval_wire_size,
	string_asval_to_wire,

	blob_size_from_msgpack,
	blob_from_msgpack,

	blob_size_from_flat,
	blob_cast_from_flat,
	blob_from_flat,
	blob_flat_size,
	blob_to_flat
};


//==========================================================
// Typedefs & constants.
//

// Same as related BLOB struct. TODO - just expose BLOB structs?

// In-memory layout of a STRING particle. Declared packed, so no padding is
// inserted between type and sz.
typedef struct string_mem_s {
	uint8_t type;    // particle type tag; string_from_asval() stores AS_PARTICLE_TYPE_STRING
	uint32_t sz;     // number of bytes in data[]; string_to_asval() adds a terminator when copying out
	uint8_t data[];  // the string bytes (flexible array member)
} __attribute__ ((__packed__)) string_mem;


//==========================================================
// STRING particle interface - function definitions.
//

// Most STRING particle table functions just use the equivalent BLOB particle
// functions. Here are the differences...

//------------------------------------------------
// Handle as_val translation.
+//
+
+// Bytes of particle memory needed to hold the string in val: the string_mem
+// header plus the string's length.
+uint32_t
+string_size_from_asval(const as_val *val)
+{
+    as_string *str = as_string_fromval(val);
+
+    return (uint32_t)(sizeof(string_mem) + as_string_len(str));
+}
+
+// Fill the particle memory that *pp already points at from the string in val.
+// Nothing is allocated here - the caller provides the memory.
+void
+string_from_asval(const as_val *val, as_particle **pp)
+{
+    as_string *str = as_string_fromval(val);
+    string_mem *mem = (string_mem *)*pp;
+    uint32_t len = (uint32_t)as_string_len(str);
+
+    mem->type = AS_PARTICLE_TYPE_STRING;
+    mem->sz = len;
+    memcpy(mem->data, as_string_tostring(str), len);
+}
+
+// Build a new as_string from the particle. The bytes are copied into a fresh
+// buffer with a terminating 0 appended, and that buffer is passed to the new
+// as_string with its free flag set.
+as_val *
+string_to_asval(const as_particle *p)
+{
+    const string_mem *mem = (const string_mem *)p;
+    uint32_t len = mem->sz;
+
+    uint8_t *copy = cf_malloc(len + 1);
+
+    memcpy(copy, mem->data, len);
+    copy[len] = 0;
+
+    return (as_val *)as_string_new_wlen((char *)copy, len, true);
+}
+
+// Wire size of the string in val - just its length.
+uint32_t
+string_asval_wire_size(const as_val *val)
+{
+    as_string *str = as_string_fromval(val);
+
+    return (uint32_t)as_string_len(str);
+}
+
+// Copy the string in val to wire and return the number of bytes written.
+uint32_t
+string_asval_to_wire(const as_val *val, uint8_t *wire)
+{
+    as_string *str = as_string_fromval(val);
+    uint32_t len = (uint32_t)as_string_len(str);
+
+    memcpy(wire, as_string_tostring(str), len);
+
+    return len;
+}
+
+
+//==========================================================
+// as_bin particle functions specific to STRING.
+//
+
+// Point *p_value at the particle's string bytes (in place, no copy, not
+// 0-terminated) and return their length.
+uint32_t
+as_bin_particle_string_ptr(const as_bin *b, char **p_value)
+{
+    // Caller must ensure this is called only for STRING particles.
+    string_mem *mem = (string_mem *)b->particle;
+
+    *p_value = (char *)mem->data;
+
+    return mem->sz;
+}
diff --git a/as/src/base/predexp.c b/as/src/base/predexp.c
new file mode 100644
index 00000000..ac1add56
--- /dev/null
+++ b/as/src/base/predexp.c
@@ -0,0 +1,2149 @@
+/*
+ * predexp.c
+ *
+ * Copyright (C) 2016-2017 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+#include "base/predexp.h"
+
+// NOTE(review): the header names on the six system #include lines were missing
+// from this patch text. <inttypes.h> (PRId64) and <regex.h> (regex_t, regcomp)
+// are restored from usage in this file - confirm against upstream. The other
+// four could not be inferred and still need their names filled in.
+#include <inttypes.h>
+#include <regex.h>
+
+#include
+#include
+#include
+#include
+
+#include "citrusleaf/alloc.h"
+#include "citrusleaf/cf_byte_order.h"
+#include "citrusleaf/cf_clock.h"
+
+#include "fault.h"
+
+#include "base/particle.h"
+#include "geospatial/geospatial.h"
+
+// Result of evaluating a node. Matching (boolean) nodes return the first
+// three; value nodes return the last two.
+typedef enum {
+    PREDEXP_FALSE = 0,   // Matching nodes only
+    PREDEXP_TRUE = 1,    // Matching nodes only
+    PREDEXP_UNKNOWN = 2, // Matching nodes only
+    PREDEXP_VALUE = 3,   // Value nodes only
+    PREDEXP_NOVALUE = 4  // Value nodes only
+} predexp_retval_t;
+
+// A bin handed back by a value node, plus whether the receiver must free it.
+typedef struct wrapped_as_bin_s {
+    as_bin bin;
+    bool must_free;
+} wrapped_as_bin_t;
+
+// Called to destroy a predexp when no longer needed.
+typedef void (*predexp_eval_dtor_fn)(predexp_eval_t* bp);
+
+// Evaluates a node. Value nodes fill in *wbinp; matching nodes are called
+// with wbinp == NULL by their parents.
+typedef predexp_retval_t (*predexp_eval_eval_fn)(predexp_eval_t* bp,
+                                                 predexp_args_t* argsp,
+                                                 wrapped_as_bin_t* wbinp);
+
+// Convenience macro, converts boolean to retval.
+#define PREDEXP_RETVAL(bb) ((bb) ? PREDEXP_TRUE : PREDEXP_FALSE)
+
+#define PREDEXP_VALUE_NODE 0x01     // represents a value
+#define PREDEXP_IMMEDIATE_NODE 0x02 // constant per-query value
+
+// Common header of every node; each concrete node struct starts with one.
+struct predexp_eval_base_s {
+    predexp_eval_t* next;         // link in the build stack or a child list
+    predexp_eval_dtor_fn dtor_fn;
+    predexp_eval_eval_fn eval_fn;
+    uint8_t flags;                // PREDEXP_*_NODE bits
+    uint8_t type;                 // particle type produced (value nodes)
+};
+
+// A named variable binding, kept in a singly linked list.
+struct predexp_var_s {
+    char vname[AS_ID_BIN_SZ];
+    as_bin bin;
+    as_predexp_var_t* next;
+};
+
+// Particle vtables indexed by particle type (defined elsewhere). build_value
+// uses them to construct immediate values from their wire form.
+
+extern const as_particle_vtable *particle_vtable[];
+
+#if 0
+static void predexp_eval_base_dtor(predexp_eval_t* bp)
+{
+    cf_free(bp);
+}
+#endif
+
+// Initialize the common header of a freshly allocated node. The node starts
+// unlinked (next == NULL).
+static void predexp_eval_base_init(predexp_eval_t* bp,
+                                   predexp_eval_dtor_fn dtor_fn,
+                                   predexp_eval_eval_fn eval_fn,
+                                   uint8_t flags,
+                                   uint8_t type)
+{
+    bp->next = NULL;
+    bp->dtor_fn = dtor_fn;
+    bp->eval_fn = eval_fn;
+    bp->flags = flags;
+    bp->type = type;
+}
+
+// ----------------------------------------------------------------
+// Helper Functions
+// ----------------------------------------------------------------
+
+// Destroy every node on a next-linked list using each node's own destructor.
+static void
+destroy_list(predexp_eval_t* bp)
+{
+    while (bp != NULL) {
+        // Grab the link first - the destructor frees the node.
+        predexp_eval_t* next = bp->next;
+        (*bp->dtor_fn)(bp);
+        bp = next;
+    }
+}
+
+// ----------------------------------------------------------------
+// Tag Definitions
+// ----------------------------------------------------------------
+
+// FIXME - these need to be in common w/ the clients
+//
+#define AS_PREDEXP_AND 1
+#define AS_PREDEXP_OR 2
+#define AS_PREDEXP_NOT 3
+
+#define AS_PREDEXP_INTEGER_VALUE 10
+#define AS_PREDEXP_STRING_VALUE 11
+#define AS_PREDEXP_GEOJSON_VALUE 12
+
+#define AS_PREDEXP_INTEGER_BIN 100
+#define AS_PREDEXP_STRING_BIN 101
+#define AS_PREDEXP_GEOJSON_BIN 102
+#define AS_PREDEXP_LIST_BIN 103
+#define AS_PREDEXP_MAP_BIN 104
+
+#define AS_PREDEXP_INTEGER_VAR 120
+#define AS_PREDEXP_STRING_VAR 121
+#define AS_PREDEXP_GEOJSON_VAR 122
+
+#define AS_PREDEXP_REC_DEVICE_SIZE 150
+#define AS_PREDEXP_REC_LAST_UPDATE 151
+#define AS_PREDEXP_REC_VOID_TIME 152
+#define AS_PREDEXP_REC_DIGEST_MODULO 153
+
+#define AS_PREDEXP_INTEGER_EQUAL 200
+#define AS_PREDEXP_INTEGER_UNEQUAL 201
+#define AS_PREDEXP_INTEGER_GREATER 202
+#define AS_PREDEXP_INTEGER_GREATEREQ 203
+#define AS_PREDEXP_INTEGER_LESS 204
+#define AS_PREDEXP_INTEGER_LESSEQ 205
+
+#define AS_PREDEXP_STRING_EQUAL 210
+#define AS_PREDEXP_STRING_UNEQUAL 211
+#define AS_PREDEXP_STRING_REGEX 212
+
+#define AS_PREDEXP_GEOJSON_WITHIN 220
+#define AS_PREDEXP_GEOJSON_CONTAINS 221
+
+#define AS_PREDEXP_LIST_ITERATE_OR 250
+#define AS_PREDEXP_MAPKEY_ITERATE_OR 251
+#define AS_PREDEXP_MAPVAL_ITERATE_OR 252
+#define AS_PREDEXP_LIST_ITERATE_AND 253
+#define AS_PREDEXP_MAPKEY_ITERATE_AND 254
+#define AS_PREDEXP_MAPVAL_ITERATE_AND 255
+
+// ----------------------------------------------------------------
+// AS_PREDEXP_AND
+// ----------------------------------------------------------------
+
+typedef struct {
+    predexp_eval_t base;
+    predexp_eval_t* child; // head of the next-linked list of terms
+} predexp_eval_and_t;
+
+// Destroy an AND node together with all of its children.
+static void
+destroy_and(predexp_eval_t* bp)
+{
+    predexp_eval_and_t* dp = (predexp_eval_and_t *) bp;
+    destroy_list(dp->child);
+    cf_free(dp);
+}
+
+// Three-valued AND: FALSE as soon as any child is FALSE, otherwise UNKNOWN if
+// any child was UNKNOWN, otherwise TRUE. wbinp is not used by matching nodes.
+static predexp_retval_t
+eval_and(predexp_eval_t* bp, predexp_args_t* argsp, wrapped_as_bin_t* wbinp)
+{
+    predexp_eval_and_t* dp = (predexp_eval_and_t *) bp;
+
+    // Start optimistically.
+    predexp_retval_t retval = PREDEXP_TRUE;
+
+    // Scan the children.
+    for (predexp_eval_t* cp = dp->child; cp != NULL; cp = cp->next) {
+
+        switch ((*cp->eval_fn)(cp, argsp, NULL)) {
+        case PREDEXP_FALSE:
+            // Shortcut, skip remaining children.
+            return PREDEXP_FALSE;
+        case PREDEXP_UNKNOWN:
+            // Downgrade our return value, continue scanning children.
+            retval = PREDEXP_UNKNOWN;
+            break;
+        case PREDEXP_TRUE:
+            // Continue scanning children.
+            break;
+        case PREDEXP_VALUE:
+        case PREDEXP_NOVALUE:
+            // Child can't be value node; shouldn't ever happen.
+            cf_crash(AS_PREDEXP, "eval_and child was value node");
+        }
+    }
+
+    return retval;
+}
+
+// Build an AND node. The payload is a single big-endian uint16_t term count;
+// that many matching nodes are popped off the build stack to become children
+// and the new node is pushed in their place. Returns false, with the partly
+// built node and any children already popped destroyed, on a malformed
+// payload, a short stack, or a value-node child.
+static bool
+build_and(predexp_eval_t** stackpp, uint32_t len, uint8_t* pp)
+{
+    if (len != sizeof(uint16_t)) {
+        cf_warning(AS_PREDEXP, "predexp_and: unexpected size %u", len);
+        return false;
+    }
+
+    // pp is a byte pointer with no alignment guarantee, so copy the count out
+    // rather than dereferencing it through a cast uint16_t pointer.
+    uint16_t nterms_be;
+    memcpy(&nterms_be, pp, sizeof(nterms_be));
+    uint16_t nterms = cf_swap_from_be16(nterms_be);
+
+    predexp_eval_and_t* dp =
+        (predexp_eval_and_t *) cf_malloc(sizeof(predexp_eval_and_t));
+
+    predexp_eval_base_init((predexp_eval_t *) dp,
+                           destroy_and,
+                           eval_and,
+                           0,
+                           AS_PARTICLE_TYPE_NULL);
+    dp->child = NULL;
+
+    for (uint16_t ndx = 0; ndx < nterms; ++ndx) {
+        // If there is not an available child expr cleanup and fail.
+        if (! *stackpp) {
+            cf_warning(AS_PREDEXP, "predexp_and: missing child %d", ndx);
+            (*dp->base.dtor_fn)((predexp_eval_t *) dp);
+            return false;
+        }
+
+        // Transfer the expr at the top of the stack to our child list.
+        predexp_eval_t* child;
+        child = *stackpp;           // Child from the top of the stack.
+        *stackpp = child->next;     // Stack points around the child.
+        child->next = dp->child;    // Child now points to prior list head.
+        dp->child = child;          // Child is now the top of our list.
+
+        // Make sure the child is not a value node.
+        if (dp->child->flags & PREDEXP_VALUE_NODE) {
+            cf_warning(AS_PREDEXP, "predexp_and: child %d is value node", ndx);
+            (*dp->base.dtor_fn)((predexp_eval_t *) dp);
+            return false;
+        }
+    }
+
+    // Success, push ourself onto the stack.
+    dp->base.next = *stackpp;           // We point next at the old top.
+    *stackpp = (predexp_eval_t *) dp;   // We're the new top
+
+    cf_debug(AS_PREDEXP, "%p: predexp_and(%d)", stackpp, nterms);
+
+    return true;
+}
+
+// ----------------------------------------------------------------
+// AS_PREDEXP_OR
+// ----------------------------------------------------------------
+
+typedef struct {
+    predexp_eval_t base;
+    predexp_eval_t* child; // head of the next-linked list of terms
+} predexp_eval_or_t;
+
+// Destroy an OR node together with all of its children.
+static void
+destroy_or(predexp_eval_t* bp)
+{
+    predexp_eval_or_t* dp = (predexp_eval_or_t *) bp;
+    destroy_list(dp->child);
+    cf_free(dp);
+}
+
+// Three-valued OR: TRUE as soon as any child is TRUE, otherwise UNKNOWN if
+// any child was UNKNOWN, otherwise FALSE. wbinp is not used by matching nodes.
+static predexp_retval_t
+eval_or(predexp_eval_t* bp, predexp_args_t* argsp, wrapped_as_bin_t* wbinp)
+{
+    predexp_eval_or_t* dp = (predexp_eval_or_t *) bp;
+
+    // Start pessimistically.
+    predexp_retval_t retval = PREDEXP_FALSE;
+
+    // Scan the children.
+    for (predexp_eval_t* cp = dp->child; cp != NULL; cp = cp->next) {
+        switch ((*cp->eval_fn)(cp, argsp, NULL)) {
+        case PREDEXP_TRUE:
+            // Shortcut, skip remaining children.
+            return PREDEXP_TRUE;
+        case PREDEXP_UNKNOWN:
+            // Upgrade our return value, continue scanning children.
+            retval = PREDEXP_UNKNOWN;
+            break;
+        case PREDEXP_FALSE:
+            // Continue scanning children.
+            break;
+        case PREDEXP_VALUE:
+        case PREDEXP_NOVALUE:
+            // Child can't be value node; shouldn't ever happen.
+            cf_crash(AS_PREDEXP, "eval_or child was value node");
+        }
+    }
+
+    return retval;
+}
+
+// Build an OR node. Same payload, stack handling and failure behavior as
+// build_and.
+static bool
+build_or(predexp_eval_t** stackpp, uint32_t len, uint8_t* pp)
+{
+    if (len != sizeof(uint16_t)) {
+        cf_warning(AS_PREDEXP, "predexp_or: unexpected size %u", len);
+        return false;
+    }
+
+    // pp is a byte pointer with no alignment guarantee, so copy the count out
+    // rather than dereferencing it through a cast uint16_t pointer.
+    uint16_t nterms_be;
+    memcpy(&nterms_be, pp, sizeof(nterms_be));
+    uint16_t nterms = cf_swap_from_be16(nterms_be);
+
+    predexp_eval_or_t* dp =
+        (predexp_eval_or_t *) cf_malloc(sizeof(predexp_eval_or_t));
+
+    predexp_eval_base_init((predexp_eval_t *) dp,
+                           destroy_or,
+                           eval_or,
+                           0,
+                           AS_PARTICLE_TYPE_NULL);
+    dp->child = NULL;
+
+    for (uint16_t ndx = 0; ndx < nterms; ++ndx) {
+        // If there is not an available child expr cleanup and fail.
+        if (! *stackpp) {
+            cf_warning(AS_PREDEXP, "predexp_or: missing child %d", ndx);
+            (*dp->base.dtor_fn)((predexp_eval_t *) dp);
+            return false;
+        }
+        // Transfer the expr at the top of the stack to our child list.
+        predexp_eval_t* child;
+        child = *stackpp;           // Child from the top of the stack.
+        *stackpp = child->next;     // Stack points around the child.
+        child->next = dp->child;    // Child now points to prior list head.
+        dp->child = child;          // Child is now the top of our list.
+
+        // Make sure the child is not a value node.
+        if (dp->child->flags & PREDEXP_VALUE_NODE) {
+            cf_warning(AS_PREDEXP, "predexp_or: child %d is value node", ndx);
+            (*dp->base.dtor_fn)((predexp_eval_t *) dp);
+            return false;
+        }
+    }
+
+    // Success, push ourself onto the stack.
+    dp->base.next = *stackpp;           // We point next at the old top.
+    *stackpp = (predexp_eval_t *) dp;   // We're the new top
+
+    cf_debug(AS_PREDEXP, "%p: predexp_or(%d)", stackpp, nterms);
+
+    return true;
+}
+
+// ----------------------------------------------------------------
+// AS_PREDEXP_NOT
+// ----------------------------------------------------------------
+
+typedef struct {
+    predexp_eval_t base;
+    predexp_eval_t* child; // the single negated term
+} predexp_eval_not_t;
+
+// Destroy a NOT node together with its child.
+static void
+destroy_not(predexp_eval_t* bp)
+{
+    predexp_eval_not_t* dp = (predexp_eval_not_t *) bp;
+    destroy_list(dp->child);
+    cf_free(dp);
+}
+
+// Three-valued NOT: swaps TRUE and FALSE, leaves UNKNOWN as UNKNOWN. wbinp is
+// not used by matching nodes.
+static predexp_retval_t
+eval_not(predexp_eval_t* bp, predexp_args_t* argsp, wrapped_as_bin_t* wbinp)
+{
+    predexp_eval_not_t* dp = (predexp_eval_not_t *) bp;
+
+    predexp_eval_t* cp = dp->child;
+
+    switch ((*cp->eval_fn)(cp, argsp, NULL)) {
+    case PREDEXP_FALSE:
+        return PREDEXP_TRUE;
+    case PREDEXP_UNKNOWN:
+        return PREDEXP_UNKNOWN;
+    case PREDEXP_TRUE:
+        return PREDEXP_FALSE;
+    case PREDEXP_VALUE:
+    case PREDEXP_NOVALUE:
+        // Child can't be value node; shouldn't ever happen.
+        cf_crash(AS_PREDEXP, "eval_not child was value node");
+    }
+
+    return PREDEXP_UNKNOWN; // Can't get here, makes compiler happy.
+}
+
+// Build a NOT node. The payload must be empty; one matching node is popped
+// off the build stack to become the child and the new node is pushed in its
+// place. Returns false, with the partly built node destroyed, on a non-empty
+// payload, an empty stack, or a value-node child.
+static bool
+build_not(predexp_eval_t** stackpp, uint32_t len, uint8_t* pp)
+{
+    if (len != 0) {
+        cf_warning(AS_PREDEXP, "predexp_not: unexpected size %u", len);
+        return false;
+    }
+
+    predexp_eval_not_t* dp =
+        (predexp_eval_not_t *) cf_malloc(sizeof(predexp_eval_not_t));
+
+    predexp_eval_base_init((predexp_eval_t *) dp,
+                           destroy_not,
+                           eval_not,
+                           0,
+                           AS_PARTICLE_TYPE_NULL);
+    dp->child = NULL;
+
+    // If there is not an available child expr cleanup and fail.
+    if (! *stackpp) {
+        cf_warning(AS_PREDEXP, "predexp_not: missing child");
+        (*dp->base.dtor_fn)((predexp_eval_t *) dp);
+        return false;
+    }
+    // Transfer the expr at the top of the stack to our child list.
+    predexp_eval_t* child;
+    child = *stackpp;           // Child from the top of the stack.
+    *stackpp = child->next;     // Stack points around the child.
+    child->next = dp->child;    // Child now points to prior list head.
+    dp->child = child;          // Child is now the top of our list.
+
+    // Make sure the child is not a value node.
+    if (dp->child->flags & PREDEXP_VALUE_NODE) {
+        cf_warning(AS_PREDEXP, "predexp_not: child is value node");
+        (*dp->base.dtor_fn)((predexp_eval_t *) dp);
+        return false;
+    }
+
+    // Success, push ourself onto the stack.
+    dp->base.next = *stackpp;           // We point next at the old top.
+    *stackpp = (predexp_eval_t *) dp;   // We're the new top
+
+    cf_debug(AS_PREDEXP, "%p: predexp_not", stackpp);
+
+    return true;
+}
+
+// ----------------------------------------------------------------
+// AS_PREDEXP_*_COMPARE
+// ----------------------------------------------------------------
+
+// GEOSPATIAL NOTES:
+//
+// We want to perform all possible computation on the query region
+// once, prior to visiting all the points. The current value
+// interface is opaque; it returns a bin particle only; there is no
+// way to pass associated precomputed state. So we keep the
+// precomputed region query state here in the comparison node instead.
+//
+// IMPROVEMENTS:
+//
+// We currently parse the incoming query (IMMEDIATE) region twice;
+// once in the from_wire_fn and again explicitly in the build_compare
+// routine, this time retaining the region. Maybe we should make an
+// exposed as_geojson_from_wire which additionally returns the
+// computed region; the particle geojson_from_wire could call this
+// routine and then discard the region.
+//
+// We can improve the performance of the comparison by covering the
+// region at build time and saving all of the cell min/max ranges.
+// Candidate points can first be checked against the list of ranges to
+// make sure they are a rough match before performing the more
+// expensive strict region match. This change requires a bunch more
+// state; probably we'll want a pointer to the
+// predexp_eval_geojson_state_t instead of using a union at this
+// point.
+
+// Query-side GeoJSON state, parsed once in build_compare.
+typedef struct predexp_eval_geojson_state_s {
+    uint64_t cellid;
+    geo_region_t region;
+} predexp_eval_geojson_state_t;
+
+// Compiled regex; iscompiled records whether regfree() is needed.
+typedef struct predexp_eval_regex_state_s {
+    regex_t regex;
+    bool iscompiled;
+} predexp_eval_regex_state_t;
+
+typedef struct predexp_eval_compare_s {
+    predexp_eval_t base;
+    uint16_t tag;            // AS_PREDEXP_* comparison tag
+    uint8_t type;            // particle type both children must produce
+    predexp_eval_t* lchild;
+    predexp_eval_t* rchild;
+    union {
+        predexp_eval_geojson_state_t geojson;
+        predexp_eval_regex_state_t regex;
+    } state;                 // which member is live is selected by type/tag
+} predexp_eval_compare_t;
+
+// Destroy a comparison node, whichever children it has acquired so far, and
+// any precomputed geojson/regex state. Safe on a partly built node because
+// build_compare NULLs the children and zeroes the state up front.
+static void
+destroy_compare(predexp_eval_t* bp)
+{
+    predexp_eval_compare_t* dp = (predexp_eval_compare_t *) bp;
+    if (dp->lchild) {
+        (*dp->lchild->dtor_fn)(dp->lchild);
+    }
+    if (dp->rchild) {
+        (*dp->rchild->dtor_fn)(dp->rchild);
+    }
+    if (dp->type == AS_PARTICLE_TYPE_GEOJSON && dp->state.geojson.region) {
+        geo_region_destroy(dp->state.geojson.region);
+    }
+    if (dp->tag == AS_PREDEXP_STRING_REGEX && dp->state.regex.iscompiled) {
+        regfree(&dp->state.regex.regex);
+    }
+    cf_free(dp);
+}
+
+// Evaluate a comparison: fetch both child values and compare them according
+// to the node's type and tag. If either child has no value the result is
+// FALSE when record data is present (argsp->rd) and UNKNOWN otherwise. wbinp
+// is not used by matching nodes.
+static predexp_retval_t
+eval_compare(predexp_eval_t* bp,
+             predexp_args_t* argsp,
+             wrapped_as_bin_t* wbinp)
+{
+    predexp_eval_compare_t* dp = (predexp_eval_compare_t *) bp;
+
+    predexp_retval_t retval = PREDEXP_UNKNOWN;
+
+    wrapped_as_bin_t lwbin;
+    wrapped_as_bin_t rwbin;
+    lwbin.must_free = false;
+    rwbin.must_free = false;
+
+    // Fetch the child values. Are either of the values unknown?
+    // During the metadata phase this returns PREDEXP_UNKNOWN. During
+    // the record phase we consider a comparison with an unknown value
+    // to be PREDEXP_FALSE (missing bin or bin of wrong type).
+
+    if ((*dp->lchild->eval_fn)(dp->lchild, argsp, &lwbin) ==
+        PREDEXP_NOVALUE) {
+        retval = argsp->rd ? PREDEXP_FALSE : PREDEXP_UNKNOWN;
+        goto Cleanup;
+    }
+
+    if ((*dp->rchild->eval_fn)(dp->rchild, argsp, &rwbin) ==
+        PREDEXP_NOVALUE) {
+        retval = argsp->rd ? PREDEXP_FALSE : PREDEXP_UNKNOWN;
+        goto Cleanup;
+    }
+
+    switch (dp->type) {
+    case AS_PARTICLE_TYPE_INTEGER: {
+        int64_t lval = as_bin_particle_integer_value(&lwbin.bin);
+        int64_t rval = as_bin_particle_integer_value(&rwbin.bin);
+        switch (dp->tag) {
+        case AS_PREDEXP_INTEGER_EQUAL:
+            retval = PREDEXP_RETVAL(lval == rval);
+            goto Cleanup;
+        case AS_PREDEXP_INTEGER_UNEQUAL:
+            retval = PREDEXP_RETVAL(lval != rval);
+            goto Cleanup;
+        case AS_PREDEXP_INTEGER_GREATER:
+            retval = PREDEXP_RETVAL(lval > rval);
+            goto Cleanup;
+        case AS_PREDEXP_INTEGER_GREATEREQ:
+            retval = PREDEXP_RETVAL(lval >= rval);
+            goto Cleanup;
+        case AS_PREDEXP_INTEGER_LESS:
+            retval = PREDEXP_RETVAL(lval < rval);
+            goto Cleanup;
+        case AS_PREDEXP_INTEGER_LESSEQ:
+            retval = PREDEXP_RETVAL(lval <= rval);
+            goto Cleanup;
+        default:
+            cf_crash(AS_PREDEXP, "eval_compare integer unknown tag %d",
+                     dp->tag);
+        }
+    }
+    case AS_PARTICLE_TYPE_STRING: {
+        // We always need to fetch the left argument.
+        char* lptr;
+        uint32_t llen = as_bin_particle_string_ptr(&lwbin.bin, &lptr);
+        switch (dp->tag) {
+        case AS_PREDEXP_STRING_EQUAL:
+        case AS_PREDEXP_STRING_UNEQUAL: {
+            // These comparisons need the right argument too.
+            char* rptr;
+            uint32_t rlen = as_bin_particle_string_ptr(&rwbin.bin, &rptr);
+            bool isequal = (llen == rlen) && (memcmp(lptr, rptr, llen) == 0);
+            // Go through PREDEXP_RETVAL like every other comparison rather
+            // than relying on bool and the enum sharing values.
+            retval = PREDEXP_RETVAL(dp->tag == AS_PREDEXP_STRING_EQUAL ?
+                                    isequal : ! isequal);
+            goto Cleanup;
+        }
+        case AS_PREDEXP_STRING_REGEX: {
+            // regexec() needs a 0-terminated string; the particle's bytes are
+            // not terminated, so match against a temporary copy.
+            char* tmpstr = cf_strndup(lptr, llen);
+            int rv = regexec(&dp->state.regex.regex, tmpstr, 0, NULL, 0);
+            cf_free(tmpstr);
+            retval = PREDEXP_RETVAL(rv == 0);
+            goto Cleanup;
+        }
+        default:
+            cf_crash(AS_PREDEXP, "eval_compare string unknown tag %d", dp->tag);
+        }
+    }
+    case AS_PARTICLE_TYPE_GEOJSON: {
+        // as_particle* lpart = lbinp->particle;
+        // as_particle* rpart = rbinp->particle;
+
+        switch (dp->tag) {
+        case AS_PREDEXP_GEOJSON_WITHIN:
+        case AS_PREDEXP_GEOJSON_CONTAINS: {
+            // The right-hand side was parsed once at build time; match the
+            // left particle against that saved cellid/region.
+            bool isstrict = true;
+            bool ismatch = as_particle_geojson_match(lwbin.bin.particle,
+                                                     dp->state.geojson.cellid,
+                                                     dp->state.geojson.region,
+                                                     isstrict);
+            retval = PREDEXP_RETVAL(ismatch);
+            goto Cleanup;
+        }
+        default:
+            cf_crash(AS_PREDEXP, "eval_compare geojson unknown tag %d",
+                     dp->tag);
+        }
+    }
+    default:
+        cf_crash(AS_PREDEXP, "eval_compare unknown type %d", dp->type);
+    }
+
+ Cleanup:
+    if (lwbin.must_free) {
+        cf_crash(AS_PREDEXP, "eval_compare need bin cleanup, didn't before");
+    }
+    if (rwbin.must_free) {
+        cf_crash(AS_PREDEXP, "eval_compare need bin cleanup, didn't before");
+    }
+    return retval;
+}
+
+// Build a comparison node for tag. Only the regex tag carries a payload (a
+// big-endian uint32_t of regcomp flags). The right and then the left operand
+// are popped off the build stack; both must be value nodes of the type the
+// tag implies. For geojson and regex tags the right operand must be an
+// immediate value, which is parsed/compiled once here. On success the node is
+// pushed on the stack. Returns false, with the partly built node and any
+// operands already popped destroyed, on any validation failure.
+static bool
+build_compare(predexp_eval_t** stackpp,
+              uint32_t len,
+              uint8_t* pp,
+              uint16_t tag)
+{
+    predexp_eval_compare_t* dp = (predexp_eval_compare_t *)
+        cf_malloc(sizeof(predexp_eval_compare_t));
+
+    predexp_eval_base_init((predexp_eval_t *) dp,
+                           destroy_compare,
+                           eval_compare,
+                           0,
+                           AS_PARTICLE_TYPE_NULL);
+
+    dp->tag = tag;
+    dp->lchild = NULL;
+    dp->rchild = NULL;
+
+    // IMPORTANT - If your state doesn't want to be initialized
+    // to all 0 rethink this ...
+    //
+    memset(&dp->state, 0, sizeof(dp->state));
+
+    switch (tag) {
+    case AS_PREDEXP_INTEGER_EQUAL:
+    case AS_PREDEXP_INTEGER_UNEQUAL:
+    case AS_PREDEXP_INTEGER_GREATER:
+    case AS_PREDEXP_INTEGER_GREATEREQ:
+    case AS_PREDEXP_INTEGER_LESS:
+    case AS_PREDEXP_INTEGER_LESSEQ:
+        dp->type = AS_PARTICLE_TYPE_INTEGER;
+        break;
+    case AS_PREDEXP_STRING_EQUAL:
+    case AS_PREDEXP_STRING_UNEQUAL:
+    case AS_PREDEXP_STRING_REGEX:
+        dp->type = AS_PARTICLE_TYPE_STRING;
+        break;
+    case AS_PREDEXP_GEOJSON_WITHIN:
+    case AS_PREDEXP_GEOJSON_CONTAINS:
+        dp->type = AS_PARTICLE_TYPE_GEOJSON;
+        break;
+    default:
+        cf_crash(AS_PREDEXP, "build_compare called with bogus tag: %d", tag);
+        break;
+    }
+
+    uint8_t* endp = pp + len;
+
+    uint32_t regex_opts = 0;
+    if (tag == AS_PREDEXP_STRING_REGEX) {
+        // This comparison takes a uint32_t opts argument. Compare lengths
+        // rather than forming a pointer beyond the end of the buffer.
+        if ((size_t) (endp - pp) < sizeof(uint32_t)) {
+            cf_warning(AS_PREDEXP, "build_compare: regex opts past end");
+            (*dp->base.dtor_fn)((predexp_eval_t *) dp);
+            return false;
+        }
+        // pp is a byte pointer with no alignment guarantee; copy the value
+        // out instead of dereferencing a cast uint32_t pointer.
+        uint32_t opts_be;
+        memcpy(&opts_be, pp, sizeof(opts_be));
+        regex_opts = cf_swap_from_be32(opts_be);
+        pp += sizeof(uint32_t);
+    }
+
+    // No (further) arguments - the whole payload must have been consumed.
+    if (pp != endp) {
+        cf_warning(AS_PREDEXP, "build_compare: msg unaligned");
+        (*dp->base.dtor_fn)((predexp_eval_t *) dp);
+        return false;
+    }
+
+    // ---- Pop the right child off the stack.
+
+    if (! *stackpp) {
+        cf_warning(AS_PREDEXP, "predexp_compare: missing right child");
+        (*dp->base.dtor_fn)((predexp_eval_t *) dp);
+        return false;
+    }
+
+    dp->rchild = *stackpp;
+    *stackpp = dp->rchild->next;
+    dp->rchild->next = NULL;
+
+    if ((dp->rchild->flags & PREDEXP_VALUE_NODE) == 0) {
+        cf_warning(AS_PREDEXP,
+                   "predexp compare: right child is not value node");
+        (*dp->base.dtor_fn)((predexp_eval_t *) dp);
+        return false;
+    }
+
+    if (dp->rchild->type != dp->type) {
+        cf_warning(AS_PREDEXP, "predexp compare: right child is wrong type");
+        (*dp->base.dtor_fn)((predexp_eval_t *) dp);
+        return false;
+    }
+
+    // ---- Pop the left child off the stack.
+
+    if (! *stackpp) {
+        cf_warning(AS_PREDEXP, "predexp_compare: missing left child");
+        (*dp->base.dtor_fn)((predexp_eval_t *) dp);
+        return false;
+    }
+
+    dp->lchild = *stackpp;
+    *stackpp = dp->lchild->next;
+    dp->lchild->next = NULL;
+
+    if ((dp->lchild->flags & PREDEXP_VALUE_NODE) == 0) {
+        cf_warning(AS_PREDEXP, "predexp compare: left child is not value node");
+        (*dp->base.dtor_fn)((predexp_eval_t *) dp);
+        return false;
+    }
+
+    if (dp->lchild->type != dp->type) {
+        cf_warning(AS_PREDEXP, "predexp compare: left child is wrong type");
+        (*dp->base.dtor_fn)((predexp_eval_t *) dp);
+        return false;
+    }
+
+    switch (tag) {
+    case AS_PREDEXP_GEOJSON_WITHIN:
+    case AS_PREDEXP_GEOJSON_CONTAINS: {
+        // The right child needs to be an immediate value.
+        if ((dp->rchild->flags & PREDEXP_IMMEDIATE_NODE) == 0) {
+            cf_warning(AS_PREDEXP,
+                       "predexp compare: within arg not immediate GeoJSON");
+            (*dp->base.dtor_fn)((predexp_eval_t *) dp);
+            return false;
+        }
+
+        // Extract the query GeoJSON value.
+        predexp_args_t* argsp = NULL;   // immediate values don't need args
+        wrapped_as_bin_t rwbin;
+        rwbin.must_free = false;
+        if ((*dp->rchild->eval_fn)(dp->rchild, argsp, &rwbin) ==
+            PREDEXP_NOVALUE) {
+            cf_warning(AS_PREDEXP,
+                       "predexp compare: within arg had unknown value");
+            (*dp->base.dtor_fn)((predexp_eval_t *) dp);
+            return false;
+        }
+        size_t sz;
+        char const * ptr = as_geojson_mem_jsonstr(rwbin.bin.particle, &sz);
+
+        // Parse the child, save the computed state.
+        if (!geo_parse(NULL, ptr, sz,
+                       &dp->state.geojson.cellid,
+                       &dp->state.geojson.region)) {
+            cf_warning(AS_PREDEXP, "predexp compare: failed to parse GeoJSON");
+            (*dp->base.dtor_fn)((predexp_eval_t *) dp);
+            if (rwbin.must_free) {
+                cf_crash(AS_PREDEXP,
+                         "predexp compare now needs bin destructor");
+            }
+            return false;
+        }
+        if (rwbin.must_free) {
+            cf_crash(AS_PREDEXP, "predexp compare now needs bin destructor");
+        }
+        break;
+    }
+    case AS_PREDEXP_STRING_REGEX: {
+        // The right child needs to be an immediate value.
+        if ((dp->rchild->flags & PREDEXP_IMMEDIATE_NODE) == 0) {
+            cf_warning(AS_PREDEXP,
+                       "predexp compare: regex arg not immediate string");
+            (*dp->base.dtor_fn)((predexp_eval_t *) dp);
+            return false;
+        }
+
+        // Extract the query regex value.
+        predexp_args_t* argsp2 = NULL;  // immediate values don't need args
+        wrapped_as_bin_t rwbin2;
+        rwbin2.must_free = false;
+        if ((*dp->rchild->eval_fn)(dp->rchild, argsp2, &rwbin2) ==
+            PREDEXP_NOVALUE) {
+            cf_warning(AS_PREDEXP,
+                       "predexp compare: regex arg had unknown value");
+            (*dp->base.dtor_fn)((predexp_eval_t *) dp);
+            return false;
+        }
+        char* rptr;
+        uint32_t rlen = as_bin_particle_string_ptr(&rwbin2.bin, &rptr);
+        // regcomp() needs a 0-terminated pattern; compile a temporary copy.
+        char* tmpregexp = cf_strndup(rptr, rlen);
+        int rv = regcomp(&dp->state.regex.regex, tmpregexp, regex_opts);
+        cf_free(tmpregexp);
+        if (rv != 0) {
+            char errbuf[1024];
+            regerror(rv, &dp->state.regex.regex, errbuf, sizeof(errbuf));
+            cf_warning(AS_PREDEXP, "predexp compare: regex compile failed: %s",
+                       errbuf);
+            (*dp->base.dtor_fn)((predexp_eval_t *) dp);
+            if (rwbin2.must_free) {
+                cf_crash(AS_PREDEXP,
+                         "predexp compare now needs bin destructor");
+            }
+            return false;
+        }
+        dp->state.regex.iscompiled = true;
+        if (rwbin2.must_free) {
+            cf_crash(AS_PREDEXP, "predexp compare now needs bin destructor");
+        }
+        break;
+    }
+
+    default:
+        // Don't do anything for the others ...
+        break;
+    }
+
+    // Success, push ourself onto the stack.
+    dp->base.next = *stackpp;           // We point next at the old top.
+    *stackpp = (predexp_eval_t *) dp;   // We're the new top
+
+    switch (tag) {
+    case AS_PREDEXP_INTEGER_EQUAL:
+        cf_debug(AS_PREDEXP, "%p: predexp_integer_equal", stackpp);
+        break;
+    case AS_PREDEXP_INTEGER_UNEQUAL:
+        cf_debug(AS_PREDEXP, "%p: predexp_integer_unequal", stackpp);
+        break;
+    case AS_PREDEXP_INTEGER_GREATER:
+        cf_debug(AS_PREDEXP, "%p: predexp_integer_greater", stackpp);
+        break;
+    case AS_PREDEXP_INTEGER_GREATEREQ:
+        cf_debug(AS_PREDEXP, "%p: predexp_integer_greatereq", stackpp);
+        break;
+    case AS_PREDEXP_INTEGER_LESS:
+        cf_debug(AS_PREDEXP, "%p: predexp_integer_less", stackpp);
+        break;
+    case AS_PREDEXP_INTEGER_LESSEQ:
+        cf_debug(AS_PREDEXP, "%p: predexp_integer_lesseq", stackpp);
+        break;
+    case AS_PREDEXP_STRING_EQUAL:
+        cf_debug(AS_PREDEXP, "%p: predexp_string_equal", stackpp);
+        break;
+    case AS_PREDEXP_STRING_UNEQUAL:
+        cf_debug(AS_PREDEXP, "%p: predexp_string_unequal", stackpp);
+        break;
+    case AS_PREDEXP_STRING_REGEX:
+        cf_debug(AS_PREDEXP, "%p: predexp_string_regex(%u)", stackpp,
+                 regex_opts);
+        break;
+    case AS_PREDEXP_GEOJSON_WITHIN:
+        cf_debug(AS_PREDEXP, "%p: predexp_geojson_within", stackpp);
+        break;
+    case AS_PREDEXP_GEOJSON_CONTAINS:
+        cf_debug(AS_PREDEXP, "%p: predexp_geojson_contains", stackpp);
+        break;
+    default:
+        cf_crash(AS_PREDEXP, "build_compare called with bogus tag: %d", tag);
+        break;
+    }
+
+    return true;
+}
+
+// ----------------------------------------------------------------
+// AS_PREDEXP_*_VALUE
+// ----------------------------------------------------------------
+
+// Immediate (constant) value node: owns a bin built from the wire payload.
+typedef struct predexp_eval_value_s {
+    predexp_eval_t base;
+    as_bin bin;
+    uint8_t type;
+} predexp_eval_value_t;
+
+// Destroy a value node and the particle held in its bin.
+static void
+destroy_value(predexp_eval_t* bp)
+{
+    predexp_eval_value_t* dp = (predexp_eval_value_t *) bp;
+    as_bin_particle_destroy(&dp->bin, true);
+    cf_free(dp);
+}
+
+// Hand the constant bin to the caller through *wbinp and return
+// PREDEXP_VALUE. The node keeps ownership (must_free is false). Crashes if
+// called without a wbinp, i.e. as if it were a matching node.
+static predexp_retval_t
+eval_value(predexp_eval_t* bp, predexp_args_t* argsp, wrapped_as_bin_t* wbinp)
+{
+    if (wbinp == NULL) {
+        cf_crash(AS_PREDEXP, "eval_value called outside value context");
+    }
+
+    predexp_eval_value_t* dp = (predexp_eval_value_t *) bp;
+    // We don't have a ns in this context. But the source bin doesn't
+    // have any name index stuff anyway ...
+    as_single_bin_copy(&wbinp->bin, &dp->bin);
+    wbinp->must_free = false; // bin is constant, destroyed after query above
+    return PREDEXP_VALUE;
+}
+
+// Build an immediate value node of the type implied by tag from the wire
+// payload (pp, len) and push it on the build stack. Returns false, with the
+// node destroyed, if the particle cannot be built from the payload.
+static bool
+build_value(predexp_eval_t** stackpp, uint32_t len, uint8_t* pp, uint16_t tag)
+{
+    predexp_eval_value_t* dp = (predexp_eval_value_t *)
+        cf_malloc(sizeof(predexp_eval_value_t));
+
+    uint8_t type;
+    switch (tag) {
+    case AS_PREDEXP_INTEGER_VALUE: type = AS_PARTICLE_TYPE_INTEGER; break;
+    case AS_PREDEXP_STRING_VALUE: type = AS_PARTICLE_TYPE_STRING; break;
+    case AS_PREDEXP_GEOJSON_VALUE: type = AS_PARTICLE_TYPE_GEOJSON; break;
+    default:
+        cf_crash(AS_PREDEXP, "build_value called with bogus tag: %d", tag);
+        return false;
+    }
+
+    predexp_eval_base_init((predexp_eval_t *) dp,
+                           destroy_value,
+                           eval_value,
+                           PREDEXP_VALUE_NODE | PREDEXP_IMMEDIATE_NODE,
+                           type);
+
+    as_bin_set_empty(&dp->bin);
+    dp->bin.particle = NULL;
+
+    uint8_t* endp = pp + len;
+
+    // The whole payload is the value.
+    size_t vallen = len;
+    void* valptr = (char*) pp;
+    pp += vallen;
+
+    // Equal by construction (pp advanced by exactly len); kept as a guard.
+    if (pp != endp) {
+        cf_warning(AS_PREDEXP, "predexp value: msg unaligned");
+        goto Failed;
+    }
+
+    int32_t mem_size = particle_vtable[type]->size_from_wire_fn(valptr, vallen);
+
+    // The size is signed. Never let a negative value be cast to a huge
+    // size_t for the allocation below - treat it as a bad value instead.
+    if (mem_size < 0) {
+        cf_warning(AS_PREDEXP, "predexp value: bad wire size %d", mem_size);
+        goto Failed;
+    }
+
+    // A size of 0 means no separate particle memory is needed.
+    if (mem_size != 0) {
+        dp->bin.particle = cf_malloc((size_t)mem_size);
+    }
+
+    int result = particle_vtable[type]->from_wire_fn(type,
+                                                     valptr,
+                                                     vallen,
+                                                     &dp->bin.particle);
+
+    // Set the bin's iparticle metadata.
+    if (result == 0) {
+        as_bin_state_set_from_type(&dp->bin, type);
+    }
+    else {
+        cf_warning(AS_PREDEXP, "failed to build predexp value with err %d",
+                   result);
+        if (mem_size != 0) {
+            cf_free(dp->bin.particle);
+        }
+        // Leave the bin empty so the destructor has nothing to free twice.
+        as_bin_set_empty(&dp->bin);
+        dp->bin.particle = NULL;
+        goto Failed;
+    }
+
+    // Success, push ourself onto the stack.
+    dp->base.next = *stackpp;           // We point next at the old top.
+    *stackpp = (predexp_eval_t *) dp;   // We're the new top
+
+    switch (tag) {
+    case AS_PREDEXP_INTEGER_VALUE:
+        cf_debug(AS_PREDEXP, "%p: predexp_integer_value(%"PRId64")", stackpp,
+                 (int64_t) dp->bin.particle);
+        break;
+    case AS_PREDEXP_STRING_VALUE: {
+        cf_debug(AS_PREDEXP, "%p: predexp_string_value(\"%s\")", stackpp,
+                 CF_ZSTR1K(valptr, vallen));
+        break;
+    }
+    case AS_PREDEXP_GEOJSON_VALUE: {
+        size_t jsonsz;
+        char const * jsonptr =
+            as_geojson_mem_jsonstr(dp->bin.particle, &jsonsz);
+        cf_debug(AS_PREDEXP, "%p: predexp_geojson_value(%s)", stackpp,
+                 CF_ZSTR1K(jsonptr, jsonsz));
+        break;
+    }
+    default:
+        cf_crash(AS_PREDEXP, "build_value called with bogus tag: %d", tag);
+        break;
+    }
+
+    return true;
+
+ Failed:
+    (*dp->base.dtor_fn)((predexp_eval_t *) dp);
+    return false;
+}
+
+// ----------------------------------------------------------------
+// AS_PREDEXP_*_BIN
+// ----------------------------------------------------------------
+
+// Value node that reads a named bin of a given type from the record.
+typedef struct predexp_eval_bin_s {
+    predexp_eval_t base;
+    char bname[AS_ID_BIN_SZ];
+    uint8_t type;
+} predexp_eval_bin_t;
+
+// Destroy a bin node - it owns nothing beyond itself.
+static void
+destroy_bin(predexp_eval_t* bp)
+{
+    predexp_eval_bin_t* dp = (predexp_eval_bin_t *) bp;
+    cf_free(dp);
+}
+
+// Copy the named bin into *wbinp and return PREDEXP_VALUE. Returns
+// PREDEXP_NOVALUE when there is no record data, the bin is absent, or its
+// particle type is not the one this node expects. Crashes if called without
+// a wbinp.
+static predexp_retval_t
+eval_bin(predexp_eval_t* bp, predexp_args_t* argsp, wrapped_as_bin_t* wbinp)
+{
+    if (wbinp == NULL) {
+        cf_crash(AS_PREDEXP, "eval_bin called outside value context");
+    }
+
+    predexp_eval_bin_t* dp = (predexp_eval_bin_t *) bp;
+
+    // We require record data to operate.
+    if (! argsp->rd) {
+        return PREDEXP_NOVALUE;
+    }
+
+    as_bin* bb = as_bin_get(argsp->rd, dp->bname);
+    if (! bb) {
+        return PREDEXP_NOVALUE;
+    }
+
+    if (as_bin_get_particle_type(bb) != dp->type) {
+        return PREDEXP_NOVALUE;
+    }
+
+    as_bin_copy(argsp->ns, &wbinp->bin, bb);
+    wbinp->must_free = false; // bin is owned by record, in caller
+    return PREDEXP_VALUE;
+}
+
+// Build a bin node of the type implied by tag. The payload is the bin name,
+// not 0-terminated, and must be shorter than AS_ID_BIN_SZ. Pushes the node on
+// the build stack; returns false, with the node destroyed, if the name is
+// too long.
+static bool
+build_bin(predexp_eval_t** stackpp, uint32_t len, uint8_t* pp, uint16_t tag)
+{
+    predexp_eval_bin_t* dp = (predexp_eval_bin_t *)
+        cf_malloc(sizeof(predexp_eval_bin_t));
+
+    switch (tag) {
+    case AS_PREDEXP_INTEGER_BIN:
+        dp->type = AS_PARTICLE_TYPE_INTEGER;
+        break;
+    case AS_PREDEXP_STRING_BIN:
+        dp->type = AS_PARTICLE_TYPE_STRING;
+        break;
+    case AS_PREDEXP_GEOJSON_BIN:
+        dp->type = AS_PARTICLE_TYPE_GEOJSON;
+        break;
+    case AS_PREDEXP_LIST_BIN:
+        dp->type = AS_PARTICLE_TYPE_LIST;
+        break;
+    case AS_PREDEXP_MAP_BIN:
+        dp->type = AS_PARTICLE_TYPE_MAP;
+        break;
+    default:
+        cf_crash(AS_PREDEXP, "build_bin called with bogus tag: %d", tag);
+        break;
+    }
+
+    predexp_eval_base_init((predexp_eval_t *) dp,
+                           destroy_bin,
+                           eval_bin,
+                           PREDEXP_VALUE_NODE,
+                           dp->type);
+
+    uint8_t* endp = pp + len;
+
+    // Leave room for the terminator added below.
+    if (len >= sizeof(dp->bname)) {
+        cf_warning(AS_PREDEXP, "build_bin: binname too long");
+        goto Failed;
+    }
+    uint8_t bnlen = (uint8_t) len;
+    memcpy(dp->bname, pp, bnlen);
+    dp->bname[bnlen] = '\0';
+    pp += bnlen;
+
+    if (pp != endp) {
+        cf_warning(AS_PREDEXP, "build_bin: msg unaligned");
+        goto Failed;
+    }
+
+    // Success, push ourself onto the stack.
+    dp->base.next = *stackpp;           // We point next at the old top.
+    *stackpp = (predexp_eval_t *) dp;   // We're the new top
+
+    switch (tag) {
+    case AS_PREDEXP_INTEGER_BIN:
+        cf_debug(AS_PREDEXP, "%p: predexp_integer_bin(\"%s\")", stackpp,
+                 dp->bname);
+        break;
+    case AS_PREDEXP_STRING_BIN:
+        cf_debug(AS_PREDEXP, "%p: predexp_string_bin(\"%s\")", stackpp,
+                 dp->bname);
+        break;
+    case AS_PREDEXP_GEOJSON_BIN:
+        cf_debug(AS_PREDEXP, "%p: predexp_geojson_bin(\"%s\")", stackpp,
+                 dp->bname);
+        break;
+    case AS_PREDEXP_LIST_BIN:
+        cf_debug(AS_PREDEXP, "%p: predexp_list_bin(\"%s\")", stackpp,
+                 dp->bname);
+        break;
+    case AS_PREDEXP_MAP_BIN:
+        cf_debug(AS_PREDEXP, "%p: predexp_map_bin(\"%s\")", stackpp,
+                 dp->bname);
+        break;
+    default:
+        cf_crash(AS_PREDEXP, "build_bin called with bogus tag: %d", tag);
+        break;
+    }
+
+    return true;
+
+ Failed:
+    (*dp->base.dtor_fn)((predexp_eval_t *) dp);
+    return false;
+}
+
+// ----------------------------------------------------------------
+// AS_PREDEXP_*_VAR
+// ----------------------------------------------------------------
+
+// Value node that reads a named variable of a given type from argsp->vl.
+typedef struct predexp_eval_var_s {
+    predexp_eval_t base;
+    char vname[AS_ID_BIN_SZ];
+    uint8_t type;
+} predexp_eval_var_t;
+
+// Destroy a var node - it owns nothing beyond itself.
+static void
+destroy_var(predexp_eval_t* bp)
+{
+    predexp_eval_var_t* dp = (predexp_eval_var_t *) bp;
+    cf_free(dp);
+}
+
+// Look the variable up by name in the argsp->vl list. The first entry with a
+// matching name decides the result: PREDEXP_VALUE with its bin copied into
+// *wbinp if the particle type matches, PREDEXP_NOVALUE if it does not. Also
+// PREDEXP_NOVALUE when no entry has that name. Crashes if called without a
+// wbinp.
+static predexp_retval_t
+eval_var(predexp_eval_t* bp, predexp_args_t* argsp, wrapped_as_bin_t* wbinp)
+{
+    if (wbinp == NULL) {
+        cf_crash(AS_PREDEXP, "eval_var called outside value context");
+    }
+
+    predexp_eval_var_t* dp = (predexp_eval_var_t *) bp;
+
+    for (as_predexp_var_t* vp = argsp->vl; vp != NULL; vp = vp->next) {
+        if (strcmp(dp->vname, vp->vname) == 0) {
+            // Is it the correct type?
+            if (as_bin_get_particle_type(&vp->bin) != dp->type) {
+                return PREDEXP_NOVALUE;
+            }
+
+            // Return it.
+            as_bin_copy(argsp->ns, &wbinp->bin, &vp->bin);
+            wbinp->must_free = false; // bin is owned by iterator
+            return PREDEXP_VALUE;
+        }
+    }
+
+    // If we get here we didn't find the named variable in the list.
+    return PREDEXP_NOVALUE;
+}
+
+static bool
+build_var(predexp_eval_t** stackpp, uint32_t len, uint8_t* pp, uint16_t tag)
+{
+    predexp_eval_var_t* dp = (predexp_eval_var_t *)
+        cf_malloc(sizeof(predexp_eval_var_t));
+
+    switch (tag) {
+    case AS_PREDEXP_INTEGER_VAR:
+        dp->type = AS_PARTICLE_TYPE_INTEGER;
+        break;
+    case AS_PREDEXP_STRING_VAR:
+        dp->type = AS_PARTICLE_TYPE_STRING;
+        break;
+    case AS_PREDEXP_GEOJSON_VAR:
+        dp->type = AS_PARTICLE_TYPE_GEOJSON;
+        break;
+    default:
+        cf_crash(AS_PREDEXP, "build_var called with bogus tag: %d", tag);
+        break;
+    }
+
+    predexp_eval_base_init((predexp_eval_t *) dp,
+                           destroy_var,
+                           eval_var,
+                           PREDEXP_VALUE_NODE,
+                           dp->type);
+
+    uint8_t* endp = pp + len;
+
+    if (len >= sizeof(dp->vname)) {
+        cf_warning(AS_PREDEXP, "build_var: varname too long");
+        goto Failed;
+    }
+    uint8_t bnlen = (uint8_t) len;
+    memcpy(dp->vname, pp, bnlen);
+    dp->vname[bnlen] = '\0';
+    pp += bnlen;
+
+    if (pp != endp) {
+        cf_warning(AS_PREDEXP, "build_var: msg unaligned");
+        goto Failed;
+    }
+
+    // Success, push ourself onto the stack.
+    dp->base.next = *stackpp;           // We point next at the old top.
+ *stackpp = (predexp_eval_t *) dp; // We're the new top + + switch (tag) { + case AS_PREDEXP_INTEGER_VAR: + cf_debug(AS_PREDEXP, "%p: predexp_integer_var(\"%s\")", stackpp, + dp->vname); + break; + case AS_PREDEXP_STRING_VAR: + cf_debug(AS_PREDEXP, "%p: predexp_string_var(\"%s\")", stackpp, + dp->vname); + break; + case AS_PREDEXP_GEOJSON_VAR: + cf_debug(AS_PREDEXP, "%p: predexp_geojson_var(\"%s\")", stackpp, + dp->vname); + break; + default: + cf_crash(AS_PREDEXP, "build_var called with bogus tag: %d", tag); + break; + } + + return true; + + Failed: + (*dp->base.dtor_fn)((predexp_eval_t *) dp); + return false; +} + +// ---------------------------------------------------------------- +// AS_PREDEXP_REC_DEVICE_SIZE +// ---------------------------------------------------------------- + +typedef struct predexp_eval_rec_device_size_s { + predexp_eval_t base; +} predexp_eval_rec_device_size_t; + +static void +destroy_rec_device_size(predexp_eval_t* bp) +{ + predexp_eval_rec_device_size_t* dp = (predexp_eval_rec_device_size_t *) bp; + cf_free(dp); +} + +static predexp_retval_t +eval_rec_device_size(predexp_eval_t* bp, + predexp_args_t* argsp, + wrapped_as_bin_t* wbinp) +{ + if (wbinp == NULL) { + cf_crash(AS_PREDEXP, + "eval_rec_device_size called outside value context"); + } + + // predexp_eval_rec_device_size_t* dp = + // (predexp_eval_rec_device_size_t *) bp; + + int64_t rec_device_size = argsp->md->n_rblocks * 128; + + as_bin_state_set_from_type(&wbinp->bin, AS_PARTICLE_TYPE_INTEGER); + as_bin_particle_integer_set(&wbinp->bin, rec_device_size); + return PREDEXP_VALUE; +} + +static bool +build_rec_device_size(predexp_eval_t** stackpp, uint32_t len, uint8_t* pp) +{ + predexp_eval_rec_device_size_t* dp = (predexp_eval_rec_device_size_t *) + cf_malloc(sizeof(predexp_eval_rec_device_size_t)); + + predexp_eval_base_init((predexp_eval_t *) dp, + destroy_rec_device_size, + eval_rec_device_size, + PREDEXP_VALUE_NODE, + AS_PARTICLE_TYPE_INTEGER); + + uint8_t* endp = pp + len; 
+ + if (pp != endp) { + cf_warning(AS_PREDEXP, "build_rec_device_size: msg unaligned"); + goto Failed; + } + + // Success, push ourself onto the stack. + dp->base.next = *stackpp; // We point next at the old top. + *stackpp = (predexp_eval_t *) dp; // We're the new top + + cf_debug(AS_PREDEXP, "%p: predexp_rec_device_size()", stackpp); + + return true; + + Failed: + (*dp->base.dtor_fn)((predexp_eval_t *) dp); + return false; +} + +// ---------------------------------------------------------------- +// AS_PREDEXP_REC_LAST_UPDATE +// ---------------------------------------------------------------- + +typedef struct predexp_eval_rec_last_update_s { + predexp_eval_t base; + as_bin bin; +} predexp_eval_rec_last_update_t; + +static void +destroy_rec_last_update(predexp_eval_t* bp) +{ + predexp_eval_rec_last_update_t* dp = (predexp_eval_rec_last_update_t *) bp; + cf_free(dp); +} + +static predexp_retval_t +eval_rec_last_update(predexp_eval_t* bp, + predexp_args_t* argsp, + wrapped_as_bin_t* wbinp) +{ + if (wbinp == NULL) { + cf_crash(AS_PREDEXP, + "eval_rec_last_update called outside value context"); + } + + // predexp_eval_rec_last_update_t* dp = + // (predexp_eval_rec_last_update_t *) bp; + + int64_t rec_last_update_ns = + (int64_t) cf_utc_ns_from_clepoch_ms(argsp->md->last_update_time); + + as_bin_state_set_from_type(&wbinp->bin, AS_PARTICLE_TYPE_INTEGER); + as_bin_particle_integer_set(&wbinp->bin, rec_last_update_ns); + return PREDEXP_VALUE; +} + +static bool +build_rec_last_update(predexp_eval_t** stackpp, uint32_t len, uint8_t* pp) +{ + predexp_eval_rec_last_update_t* dp = (predexp_eval_rec_last_update_t *) + cf_malloc(sizeof(predexp_eval_rec_last_update_t)); + + predexp_eval_base_init((predexp_eval_t *) dp, + destroy_rec_last_update, + eval_rec_last_update, + PREDEXP_VALUE_NODE, + AS_PARTICLE_TYPE_INTEGER); + + uint8_t* endp = pp + len; + + if (pp != endp) { + cf_warning(AS_PREDEXP, "build_rec_last_update: msg unaligned"); + goto Failed; + } + + // Success, push 
ourself onto the stack. + dp->base.next = *stackpp; // We point next at the old top. + *stackpp = (predexp_eval_t *) dp; // We're the new top + + cf_debug(AS_PREDEXP, "%p: predexp_rec_last_update()", stackpp); + + return true; + + Failed: + (*dp->base.dtor_fn)((predexp_eval_t *) dp); + return false; +} + +// ---------------------------------------------------------------- +// AS_PREDEXP_REC_VOID_TIME +// ---------------------------------------------------------------- + +typedef struct predexp_eval_rec_void_time_s { + predexp_eval_t base; + as_bin bin; +} predexp_eval_rec_void_time_t; + +static void +destroy_rec_void_time(predexp_eval_t* bp) +{ + predexp_eval_rec_void_time_t* dp = (predexp_eval_rec_void_time_t *) bp; + cf_free(dp); +} + +static predexp_retval_t +eval_rec_void_time(predexp_eval_t* bp, + predexp_args_t* argsp, + wrapped_as_bin_t* wbinp) +{ + if (wbinp == NULL) { + cf_crash(AS_PREDEXP, "eval_rec_void_time called outside value context"); + } + + // predexp_eval_rec_void_time_t* dp = (predexp_eval_rec_void_time_t *) bp; + + int64_t rec_void_time_ns = + (int64_t) cf_utc_ns_from_clepoch_sec(argsp->md->void_time); + + // SPECIAL CASE - if the argsp->md->rec_void_time == 0 set the + // rec_void_time_ns to 0 as well. 
+ // + if (argsp->md->void_time == 0) { + rec_void_time_ns = 0; + } + + as_bin_state_set_from_type(&wbinp->bin, AS_PARTICLE_TYPE_INTEGER); + as_bin_particle_integer_set(&wbinp->bin, rec_void_time_ns); + return PREDEXP_VALUE; +} + +static bool +build_rec_void_time(predexp_eval_t** stackpp, uint32_t len, uint8_t* pp) +{ + predexp_eval_rec_void_time_t* dp = (predexp_eval_rec_void_time_t *) + cf_malloc(sizeof(predexp_eval_rec_void_time_t)); + + predexp_eval_base_init((predexp_eval_t *) dp, + destroy_rec_void_time, + eval_rec_void_time, + PREDEXP_VALUE_NODE, + AS_PARTICLE_TYPE_INTEGER); + + uint8_t* endp = pp + len; + + if (pp != endp) { + cf_warning(AS_PREDEXP, "build_rec_void_time: msg unaligned"); + goto Failed; + } + + // Success, push ourself onto the stack. + dp->base.next = *stackpp; // We point next at the old top. + *stackpp = (predexp_eval_t *) dp; // We're the new top + + cf_debug(AS_PREDEXP, "%p: predexp_rec_void_time()", stackpp); + + return true; + + Failed: + (*dp->base.dtor_fn)((predexp_eval_t *) dp); + return false; +} + +// ---------------------------------------------------------------- +// AS_PREDEXP_REC_DIGEST_MODULO +// ---------------------------------------------------------------- + +typedef struct predexp_eval_rec_digest_modulo_s { + predexp_eval_t base; + int32_t mod; +} predexp_eval_rec_digest_modulo_t; + +static void +destroy_rec_digest_modulo(predexp_eval_t* bp) +{ + predexp_eval_rec_digest_modulo_t* dp = + (predexp_eval_rec_digest_modulo_t *) bp; + cf_free(dp); +} + +static predexp_retval_t +eval_rec_digest_modulo(predexp_eval_t* bp, + predexp_args_t* argsp, + wrapped_as_bin_t* wbinp) +{ + if (wbinp == NULL) { + cf_crash(AS_PREDEXP, + "eval_rec_digest_modulo called outside value context"); + } + + predexp_eval_rec_digest_modulo_t* dp = + (predexp_eval_rec_digest_modulo_t *) bp; + + // We point at the last 4 bytes of the digest. 
+ uint32_t* valp = (uint32_t*) &argsp->md->keyd.digest[16]; + int64_t digest_modulo = *valp % dp->mod; + + as_bin_state_set_from_type(&wbinp->bin, AS_PARTICLE_TYPE_INTEGER); + as_bin_particle_integer_set(&wbinp->bin, digest_modulo); + return PREDEXP_VALUE; +} + +static bool +build_rec_digest_modulo(predexp_eval_t** stackpp, uint32_t len, uint8_t* pp) +{ + predexp_eval_rec_digest_modulo_t* dp = (predexp_eval_rec_digest_modulo_t *) + cf_malloc(sizeof(predexp_eval_rec_digest_modulo_t)); + + predexp_eval_base_init((predexp_eval_t *) dp, + destroy_rec_digest_modulo, + eval_rec_digest_modulo, + PREDEXP_VALUE_NODE, + AS_PARTICLE_TYPE_INTEGER); + + uint8_t* endp = pp + len; + + if (pp + sizeof(int32_t) > endp) { + cf_warning(AS_PREDEXP, "build_rec_digest_modulo: msg too short"); + goto Failed; + } + + dp->mod = cf_swap_from_be32(* (int32_t*) pp); + pp += sizeof(int32_t); + + if (pp != endp) { + cf_warning(AS_PREDEXP, "build_rec_digest_modulo: msg unaligned"); + goto Failed; + } + + if (dp->mod == 0) { + cf_warning(AS_PREDEXP, "build_rec_digest_modulo: zero modulo invalid"); + goto Failed; + } + + // Success, push ourself onto the stack. + dp->base.next = *stackpp; // We point next at the old top. 
+ *stackpp = (predexp_eval_t *) dp; // We're the new top + + cf_debug(AS_PREDEXP, "%p: predexp_rec_digest_modulo(%d)", stackpp, dp->mod); + + return true; + + Failed: + (*dp->base.dtor_fn)((predexp_eval_t *) dp); + return false; +} + +// ---------------------------------------------------------------- +// AS_PREDEXP_*_ITERATE_* +// ---------------------------------------------------------------- + +typedef struct predexp_eval_iter_s { + predexp_eval_t base; + uint16_t tag; + uint8_t type; + predexp_eval_t* lchild; // per-element expr + predexp_eval_t* rchild; // collection + char vname[AS_ID_BIN_SZ]; +} predexp_eval_iter_t; + +static void +destroy_iter(predexp_eval_t* bp) +{ + predexp_eval_iter_t* dp = (predexp_eval_iter_t *) bp; + cf_free(dp); +} + +static predexp_retval_t +eval_list_iter(predexp_eval_t* bp, predexp_args_t* argsp, wrapped_as_bin_t* wbinp) +{ + predexp_eval_iter_t* dp = (predexp_eval_iter_t *) bp; + + predexp_retval_t retval = PREDEXP_UNKNOWN; // init makes compiler happy + switch (dp->tag) { + case AS_PREDEXP_LIST_ITERATE_OR: + // Start pessimistically. + retval = PREDEXP_FALSE; + break; + case AS_PREDEXP_LIST_ITERATE_AND: + // Start optimistically. + retval = PREDEXP_TRUE; + break; + default: + cf_crash(AS_PREDEXP, + "eval_list_iter called with bogus tag: %d", dp->tag); + } + + wrapped_as_bin_t lwbin; + lwbin.must_free = false; + if ((*dp->rchild->eval_fn)(dp->rchild, argsp, &lwbin) == + PREDEXP_NOVALUE) { + return argsp->rd ? PREDEXP_FALSE : PREDEXP_UNKNOWN; + } + + as_predexp_var_t var; + memcpy(var.vname, dp->vname, sizeof(var.vname)); + + // Make sure our var starts out empty. + as_bin_set_empty(&var.bin); + var.bin.particle = NULL; + + // Prepend our var to the list. + var.next = argsp->vl; + argsp->vl = &var; + + // Traverse the collection. 
+ as_val* lval = as_bin_particle_to_asval(&lwbin.bin); + as_arraylist* list = (as_arraylist*) as_list_fromval(lval); + as_arraylist_iterator it; + as_arraylist_iterator_init(&it, list); + while (as_arraylist_iterator_has_next(&it)) { + // Set our var to the element's value. + as_val* val = (as_val*) as_arraylist_iterator_next(&it); + int old_arena = cf_alloc_clear_ns_arena(); + int rv = as_bin_particle_replace_from_asval(&var.bin, val); + cf_alloc_restore_ns_arena(old_arena); + if (rv != 0) { + cf_warning(AS_PREDEXP, + "eval_list_iter: particle from asval failed"); + continue; + } + + switch (dp->tag) { + case AS_PREDEXP_LIST_ITERATE_OR: + switch ((*dp->lchild->eval_fn)(dp->lchild, argsp, NULL)) { + case PREDEXP_TRUE: + // Shortcut, skip remaining children. + retval = PREDEXP_TRUE; + goto Done; + case PREDEXP_UNKNOWN: + // Upgrade our return value, continue scanning children. + retval = PREDEXP_UNKNOWN; + break; + case PREDEXP_FALSE: + // Continue scanning children. + break; + case PREDEXP_VALUE: + case PREDEXP_NOVALUE: + // Child can't be value node; shouldn't ever happen. + cf_crash(AS_PREDEXP, "eval_list_iter child was value node"); + } + break; + case AS_PREDEXP_LIST_ITERATE_AND: + switch ((*dp->lchild->eval_fn)(dp->lchild, argsp, NULL)) { + case PREDEXP_FALSE: + // Shortcut, skip remaining children. + retval = PREDEXP_FALSE; + goto Done; + case PREDEXP_UNKNOWN: + // Downgrade our return value, continue scanning children. + retval = PREDEXP_UNKNOWN; + break; + case PREDEXP_TRUE: + // Continue scanning children. + break; + case PREDEXP_VALUE: + case PREDEXP_NOVALUE: + // Child can't be value node; shouldn't ever happen. 
+ cf_crash(AS_PREDEXP, "eval_list_iter child was value node"); + } + break; + default: + cf_crash(AS_PREDEXP, "eval_list_iter called with bogus tag: %d", + dp->tag); + } + + } + + Done: + as_bin_particle_destroy(&var.bin, true); + as_bin_set_empty(&var.bin); + var.bin.particle = NULL; + + as_arraylist_iterator_destroy(&it); + + as_val_destroy(lval); + + // Remove our var from the list. + argsp->vl = var.next; + + if (lwbin.must_free) { + cf_crash(AS_PREDEXP, "eval_list_iter need bin cleanup, didn't before"); + } + + return retval; +} + +static predexp_retval_t +eval_map_iter(predexp_eval_t* bp, predexp_args_t* argsp, wrapped_as_bin_t* wbinp) +{ + predexp_eval_iter_t* dp = (predexp_eval_iter_t *) bp; + + predexp_retval_t retval = PREDEXP_UNKNOWN; // init makes compiler happy + switch (dp->tag) { + case AS_PREDEXP_MAPKEY_ITERATE_OR: + case AS_PREDEXP_MAPVAL_ITERATE_OR: + // Start pessimistically. + retval = PREDEXP_FALSE; + break; + case AS_PREDEXP_MAPKEY_ITERATE_AND: + case AS_PREDEXP_MAPVAL_ITERATE_AND: + // Start optimistically. + retval = PREDEXP_TRUE; + break; + default: + cf_crash(AS_PREDEXP, "eval_map_iter called with bogus tag: %d", + dp->tag); + } + + wrapped_as_bin_t lwbin; + lwbin.must_free = false; + if ((*dp->rchild->eval_fn)(dp->rchild, argsp, &lwbin) == + PREDEXP_NOVALUE) { + return argsp->rd ? PREDEXP_FALSE : PREDEXP_UNKNOWN; + } + + as_predexp_var_t var; + memcpy(var.vname, dp->vname, sizeof(var.vname)); + + // Make sure our var starts out empty. + as_bin_set_empty(&var.bin); + var.bin.particle = NULL; + + // Prepend our var to the list. + var.next = argsp->vl; + argsp->vl = &var; + + // Traverse the collection. + as_val* mval = as_bin_particle_to_asval(&lwbin.bin); + as_hashmap* map = (as_hashmap*) as_map_fromval(mval); + as_hashmap_iterator it; + as_hashmap_iterator_init(&it, map); + while (as_hashmap_iterator_has_next(&it)) { + // Set our var to the element's value. 
+ as_pair* pair = (as_pair*) as_hashmap_iterator_next(&it); + as_val* val = NULL; // init makes compiler happy + switch (dp->tag) { + case AS_PREDEXP_MAPKEY_ITERATE_OR: + case AS_PREDEXP_MAPKEY_ITERATE_AND: + val = as_pair_1(pair); + break; + case AS_PREDEXP_MAPVAL_ITERATE_OR: + case AS_PREDEXP_MAPVAL_ITERATE_AND: + val = as_pair_2(pair); + break; + default: + cf_crash(AS_PREDEXP, "eval_map_iter called with bogus tag (2): %d", + dp->tag); + } + + int old_arena = cf_alloc_clear_ns_arena(); + int rv = as_bin_particle_replace_from_asval(&var.bin, val); + cf_alloc_restore_ns_arena(old_arena); + if (rv != 0) { + cf_warning(AS_PREDEXP, "eval_map_iter: particle from asval failed"); + continue; + } + + switch (dp->tag) { + case AS_PREDEXP_MAPKEY_ITERATE_OR: + case AS_PREDEXP_MAPVAL_ITERATE_OR: + switch ((*dp->lchild->eval_fn)(dp->lchild, argsp, NULL)) { + case PREDEXP_TRUE: + // Shortcut, skip remaining children. + retval = PREDEXP_TRUE; + goto Done; + case PREDEXP_UNKNOWN: + // Upgrade our return value, continue scanning children. + retval = PREDEXP_UNKNOWN; + break; + case PREDEXP_FALSE: + // Continue scanning children. + break; + case PREDEXP_VALUE: + case PREDEXP_NOVALUE: + // Child can't be value node; shouldn't ever happen. + cf_crash(AS_PREDEXP, "eval_map_iter child was value node"); + } + break; + case AS_PREDEXP_MAPKEY_ITERATE_AND: + case AS_PREDEXP_MAPVAL_ITERATE_AND: + switch ((*dp->lchild->eval_fn)(dp->lchild, argsp, NULL)) { + case PREDEXP_FALSE: + // Shortcut, skip remaining children. + retval = PREDEXP_FALSE; + goto Done; + case PREDEXP_UNKNOWN: + // Downgrade our return value, continue scanning children. + retval = PREDEXP_UNKNOWN; + break; + case PREDEXP_TRUE: + // Continue scanning children. + break; + case PREDEXP_VALUE: + case PREDEXP_NOVALUE: + // Child can't be value node; shouldn't ever happen. 
+ cf_crash(AS_PREDEXP, "eval_map_iter child was value node"); + } + break; + default: + cf_crash(AS_PREDEXP, "eval_map_iter called with bogus tag: %d", + dp->tag); + } + + } + + Done: + as_bin_particle_destroy(&var.bin, true); + as_bin_set_empty(&var.bin); + var.bin.particle = NULL; + + as_hashmap_iterator_destroy(&it); + + as_val_destroy(mval); + + // Remove our var from the list. + argsp->vl = var.next; + + if (lwbin.must_free) { + cf_crash(AS_PREDEXP, "eval_map_iter need bin cleanup, didn't before"); + } + return retval; +} + +static bool +build_iter(predexp_eval_t** stackpp, uint32_t len, uint8_t* pp, uint16_t tag) +{ + predexp_eval_iter_t* dp = (predexp_eval_iter_t *) + cf_malloc(sizeof(predexp_eval_iter_t)); + + switch (tag) { + case AS_PREDEXP_LIST_ITERATE_OR: + case AS_PREDEXP_LIST_ITERATE_AND: + predexp_eval_base_init((predexp_eval_t *) dp, + destroy_iter, + eval_list_iter, + 0, + AS_PARTICLE_TYPE_NULL); + dp->type = AS_PARTICLE_TYPE_LIST; + break; + case AS_PREDEXP_MAPKEY_ITERATE_OR: + case AS_PREDEXP_MAPVAL_ITERATE_OR: + case AS_PREDEXP_MAPKEY_ITERATE_AND: + case AS_PREDEXP_MAPVAL_ITERATE_AND: + predexp_eval_base_init((predexp_eval_t *) dp, + destroy_iter, + eval_map_iter, + 0, + AS_PARTICLE_TYPE_NULL); + dp->type = AS_PARTICLE_TYPE_MAP; + break; + default: + cf_crash(AS_PREDEXP, "build_iter called with bogus tag: %d", tag); + } + + dp->tag = tag; + dp->lchild = NULL; + dp->rchild = NULL; + + uint8_t* endp = pp + len; + + if (len >= sizeof(dp->vname)) { + cf_warning(AS_PREDEXP, "build_iter: varname too long"); + goto Failed; + } + uint8_t vnlen = (uint8_t) len; + memcpy(dp->vname, pp, vnlen); + dp->vname[vnlen] = '\0'; + pp += vnlen; + + // ---- Pop the right child (collection) off the stack. + + if (! 
*stackpp) { + cf_warning(AS_PREDEXP, "predexp_iterate: missing right child"); + (*dp->base.dtor_fn)((predexp_eval_t *) dp); + return false; + } + + dp->rchild = *stackpp; + *stackpp = dp->rchild->next; + dp->rchild->next = NULL; + + if ((dp->rchild->flags & PREDEXP_VALUE_NODE) == 0) { + cf_warning(AS_PREDEXP, + "predexp iterate: right child is not value node"); + (*dp->base.dtor_fn)((predexp_eval_t *) dp); + return false; + } + + if (dp->rchild->type != dp->type) { + cf_warning(AS_PREDEXP, "predexp iterate: right child is wrong type"); + (*dp->base.dtor_fn)((predexp_eval_t *) dp); + return false; + } + + // ---- Pop the left child (per-element expr) off the stack. + + if (! *stackpp) { + cf_warning(AS_PREDEXP, "predexp_iterate: missing left child"); + (*dp->base.dtor_fn)((predexp_eval_t *) dp); + return false; + } + + dp->lchild = *stackpp; + *stackpp = dp->lchild->next; + dp->lchild->next = NULL; + + if ((dp->lchild->flags & PREDEXP_VALUE_NODE) == 1) { + cf_warning(AS_PREDEXP, "predexp iterate: left child is value node"); + (*dp->base.dtor_fn)((predexp_eval_t *) dp); + return false; + } + + if (dp->lchild->type != AS_PARTICLE_TYPE_NULL) { + cf_warning(AS_PREDEXP, "predexp iterate: left child is wrong type"); + (*dp->base.dtor_fn)((predexp_eval_t *) dp); + return false; + } + + if (pp != endp) { + cf_warning(AS_PREDEXP, "build_iter: msg unaligned"); + goto Failed; + } + + // Success, push ourself onto the stack. + dp->base.next = *stackpp; // We point next at the old top. 
+ *stackpp = (predexp_eval_t *) dp; // We're the new top + + switch (tag) { + case AS_PREDEXP_LIST_ITERATE_OR: + cf_debug(AS_PREDEXP, "%p: predexp_list_iterate_or()", stackpp); + break; + case AS_PREDEXP_LIST_ITERATE_AND: + cf_debug(AS_PREDEXP, "%p: predexp_list_iterate_and()", stackpp); + break; + case AS_PREDEXP_MAPKEY_ITERATE_OR: + cf_debug(AS_PREDEXP, "%p: predexp_mapkey_iterate_or()", stackpp); + break; + case AS_PREDEXP_MAPVAL_ITERATE_OR: + cf_debug(AS_PREDEXP, "%p: predexp_mapval_iterate_or()", stackpp); + break; + case AS_PREDEXP_MAPKEY_ITERATE_AND: + cf_debug(AS_PREDEXP, "%p: predexp_mapkey_iterate_and()", stackpp); + break; + case AS_PREDEXP_MAPVAL_ITERATE_AND: + cf_debug(AS_PREDEXP, "%p: predexp_mapval_iterate_and()", stackpp); + break; + default: + cf_crash(AS_PREDEXP, "build_iter called with bogus tag: %d", tag); + } + + return true; + + Failed: + (*dp->base.dtor_fn)((predexp_eval_t *) dp); + return false; +} + +// ---------------------------------------------------------------- +// External Interface +// ---------------------------------------------------------------- + + +static bool +build(predexp_eval_t** stackpp, uint16_t tag, uint32_t len, uint8_t* pp) +{ + switch (tag) { + case AS_PREDEXP_AND: + return build_and(stackpp, len, pp); + case AS_PREDEXP_OR: + return build_or(stackpp, len, pp); + case AS_PREDEXP_NOT: + return build_not(stackpp, len, pp); + case AS_PREDEXP_INTEGER_EQUAL: + case AS_PREDEXP_INTEGER_UNEQUAL: + case AS_PREDEXP_INTEGER_GREATER: + case AS_PREDEXP_INTEGER_GREATEREQ: + case AS_PREDEXP_INTEGER_LESS: + case AS_PREDEXP_INTEGER_LESSEQ: + case AS_PREDEXP_STRING_EQUAL: + case AS_PREDEXP_STRING_UNEQUAL: + case AS_PREDEXP_STRING_REGEX: + case AS_PREDEXP_GEOJSON_WITHIN: + case AS_PREDEXP_GEOJSON_CONTAINS: + return build_compare(stackpp, len, pp, tag); + case AS_PREDEXP_INTEGER_VALUE: + case AS_PREDEXP_STRING_VALUE: + case AS_PREDEXP_GEOJSON_VALUE: + return build_value(stackpp, len, pp, tag); + case AS_PREDEXP_INTEGER_BIN: + case 
AS_PREDEXP_STRING_BIN: + case AS_PREDEXP_GEOJSON_BIN: + case AS_PREDEXP_LIST_BIN: + case AS_PREDEXP_MAP_BIN: + return build_bin(stackpp, len, pp, tag); + case AS_PREDEXP_INTEGER_VAR: + case AS_PREDEXP_STRING_VAR: + case AS_PREDEXP_GEOJSON_VAR: + return build_var(stackpp, len, pp, tag); + case AS_PREDEXP_REC_DEVICE_SIZE: + return build_rec_device_size(stackpp, len, pp); + case AS_PREDEXP_REC_LAST_UPDATE: + return build_rec_last_update(stackpp, len, pp); + case AS_PREDEXP_REC_VOID_TIME: + return build_rec_void_time(stackpp, len, pp); + case AS_PREDEXP_REC_DIGEST_MODULO: + return build_rec_digest_modulo(stackpp, len, pp); + case AS_PREDEXP_LIST_ITERATE_OR: + case AS_PREDEXP_LIST_ITERATE_AND: + case AS_PREDEXP_MAPKEY_ITERATE_OR: + case AS_PREDEXP_MAPKEY_ITERATE_AND: + case AS_PREDEXP_MAPVAL_ITERATE_OR: + case AS_PREDEXP_MAPVAL_ITERATE_AND: + return build_iter(stackpp, len, pp, tag); + default: + cf_warning(AS_PREDEXP, "unexpected predexp tag: %d", tag); + return false; + } +} + +predexp_eval_t* +predexp_build(as_msg_field* pfp) +{ + predexp_eval_t* stackp = NULL; + + cf_debug(AS_PREDEXP, "%p: predexp_build starting", &stackp); + + uint8_t* pp = pfp->data; + uint32_t pdsize = as_msg_field_get_value_sz(pfp); + uint8_t* endp = pp + pdsize; + + // Minumum possible TLV token is 6 bytes. + while (pp + 6 <= endp) { + uint16_t tag = cf_swap_from_be16(* (uint16_t *) pp); + pp += sizeof(uint16_t); + + uint32_t len = cf_swap_from_be32(* (uint32_t *) pp); + pp += sizeof(uint32_t); + + if (pp + len > endp) { + cf_warning(AS_PREDEXP, "malformed predexp field"); + goto FAILED; + } + + if (!build(&stackp, tag, len, pp)) { + // Warning should already have happened + goto FAILED; + } + pp += len; + } + + // The cursor needs to neatly point at the end pointer. + if (pp != endp) { + cf_warning(AS_PREDEXP, "malformed predexp field"); + goto FAILED; + } + + // We'd better have exactly one node on the stack now. 
+ if (!stackp) { + cf_warning(AS_PREDEXP, "no top level predexp"); + goto FAILED; + } + if (stackp->next) { + cf_warning(AS_PREDEXP, "multiple top-level predexp"); + goto FAILED; + } + + // The top node needs to be a matching node, not a value node. + if (stackp->flags & PREDEXP_VALUE_NODE) { + cf_warning(AS_PREDEXP, "top-level predexp is value node"); + goto FAILED; + } + + cf_debug(AS_PREDEXP, "%p: predexp_build finished", &stackp); + + // Return the root of the predicate expression tree. + return stackp; + + FAILED: + cf_debug(AS_PREDEXP, "%p: predexp_build failed", &stackp); + destroy_list(stackp); + return NULL; +} + +bool +predexp_matches_metadata(predexp_eval_t* bp, predexp_args_t* argsp) +{ + if (! bp) { + return true; + } + + return ((*bp->eval_fn)(bp, argsp, NULL) != PREDEXP_FALSE); +} + +bool +predexp_matches_record(predexp_eval_t* bp, predexp_args_t* argsp) +{ + if (! bp) { + return true; + } + + switch ((*bp->eval_fn)(bp, argsp, NULL)) { + case PREDEXP_TRUE: + return true; + case PREDEXP_FALSE: + return false; + default: + cf_crash(AS_PREDEXP, "predexp eval returned other then true/false " + "with record data present"); + return false; // makes compiler happy + } +} + +void +predexp_destroy(predexp_eval_t* bp) +{ + (*bp->dtor_fn)(bp); +} diff --git a/as/src/base/probes.d b/as/src/base/probes.d new file mode 100644 index 00000000..87b3f6d5 --- /dev/null +++ b/as/src/base/probes.d @@ -0,0 +1,25 @@ +provider asd { + probe trans__demarshal(uint64_t, uint64_t, uint64_t); + probe query__starting(uint64_t, uint64_t); + probe query__qtrsetup_starting(uint64_t, uint64_t); + probe query__qtrsetup_finished(uint64_t, uint64_t); + probe query__init(uint64_t, uint64_t); + probe query__done(uint64_t, uint64_t, uint64_t); + probe query__trans_done(uint64_t, uint64_t, uint64_t); + probe query__qtr_alloc(uint64_t, uint64_t, uint64_t); + probe query__qtr_free(uint64_t, uint64_t, uint64_t); + probe query__ioreq_starting(uint64_t, uint64_t); + probe 
query__ioreq_finished(uint64_t, uint64_t); + probe query__io_starting(uint64_t, uint64_t); + probe query__io_notmatch(uint64_t, uint64_t); + probe query__io_error(uint64_t, uint64_t); + probe query__io_finished(uint64_t, uint64_t); + probe query__netio_starting(uint64_t, uint64_t); + probe query__netio_finished(uint64_t, uint64_t); + probe query__addfin(uint64_t, uint64_t); + probe query__sendpacket_starting(uint64_t, uint32_t, uint32_t); + probe query__sendpacket_continue(uint64_t, uint32_t); + probe query__sendpacket_finished(uint64_t); + probe sindex__msgrange_starting(uint64_t, uint64_t); + probe sindex__msgrange_finished(uint64_t, uint64_t); +}; diff --git a/as/src/base/proto.c b/as/src/base/proto.c new file mode 100644 index 00000000..65db1709 --- /dev/null +++ b/as/src/base/proto.c @@ -0,0 +1,885 @@ +/* + * proto.c + * + * Copyright (C) 2008-2015 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +//========================================================== +// Includes. 
+// + +#include "base/proto.h" + +#include +#include +#include +#include +#include +#include + +#include "aerospike/as_val.h" +#include "citrusleaf/alloc.h" +#include "citrusleaf/cf_byte_order.h" +#include "citrusleaf/cf_digest.h" +#include "citrusleaf/cf_queue.h" +#include "citrusleaf/cf_vector.h" + +#include "dynbuf.h" +#include "fault.h" +#include "socket.h" + +#include "base/as_stap.h" +#include "base/datamodel.h" +#include "base/index.h" +#include "base/thr_tsvc.h" +#include "base/transaction.h" +#include "storage/storage.h" + + +//========================================================== +// Typedefs & constants. +// + +#define MSG_STACK_BUFFER_SZ (1024 * 16) +#define NETIO_MAX_IO_RETRY 5 + +static const char SUCCESS_BIN_NAME[] = "SUCCESS"; +static const char FAILURE_BIN_NAME[] = "FAILURE"; + + +//========================================================== +// Globals. +// + +static cf_queue g_netio_queue; +static cf_queue g_netio_slow_queue; + + +//========================================================== +// Forward declarations. +// + +static int send_reply_buf(as_file_handle *fd_h, uint8_t *msgp, size_t msg_sz); +static void *run_netio(void *q_to_wait_on); +static int netio_send_packet(as_file_handle *fd_h, cf_buf_builder *bb_r, uint32_t *offset, bool blocking); + + +//========================================================== +// Public API - byte swapping. 
+// + +void +as_proto_swap(as_proto *proto) +{ + uint8_t version = proto->version; + uint8_t type = proto->type; + + proto->version = proto->type = 0; + proto->sz = cf_swap_from_be64(*(uint64_t *)proto); + proto->version = version; + proto->type = type; +} + +void +as_msg_swap_header(as_msg *m) +{ + m->generation = cf_swap_from_be32(m->generation); + m->record_ttl = cf_swap_from_be32(m->record_ttl); + m->transaction_ttl = cf_swap_from_be32(m->transaction_ttl); + m->n_fields = cf_swap_from_be16(m->n_fields); + m->n_ops = cf_swap_from_be16(m->n_ops); +} + +void +as_msg_swap_field(as_msg_field *mf) +{ + mf->field_sz = cf_swap_from_be32(mf->field_sz); +} + +void +as_msg_swap_op(as_msg_op *op) +{ + op->op_sz = cf_swap_from_be32(op->op_sz); +} + + +//========================================================== +// Public API - generating internal transactions. +// + +// Allocates cl_msg returned - caller must free it. Everything is host-ordered. +// Will add more parameters (e.g. for set name) only as they become necessary. 
+cl_msg * +as_msg_create_internal(const char *ns_name, const cf_digest *keyd, + uint8_t info1, uint8_t info2, uint8_t info3) +{ + size_t ns_name_len = strlen(ns_name); + + size_t msg_sz = sizeof(cl_msg) + + sizeof(as_msg_field) + ns_name_len + + sizeof(as_msg_field) + sizeof(cf_digest); + + cl_msg *msgp = (cl_msg *)cf_malloc(msg_sz); + + msgp->proto.version = PROTO_VERSION; + msgp->proto.type = PROTO_TYPE_AS_MSG; + msgp->proto.sz = msg_sz - sizeof(as_proto); + + as_msg *m = &msgp->msg; + + m->header_sz = sizeof(as_msg); + m->info1 = info1; + m->info2 = info2; + m->info3 = info3; + m->unused = 0; + m->result_code = 0; + m->generation = 0; + m->record_ttl = 0; + m->transaction_ttl = 0; + m->n_fields = 2; + m->n_ops = 0; + + as_msg_field *mf = (as_msg_field *)(m->data); + + mf->type = AS_MSG_FIELD_TYPE_NAMESPACE; + mf->field_sz = (uint32_t)ns_name_len + 1; + memcpy(mf->data, ns_name, ns_name_len); + + mf = as_msg_field_get_next(mf); + + mf->type = AS_MSG_FIELD_TYPE_DIGEST_RIPE; + mf->field_sz = sizeof(cf_digest) + 1; + *(cf_digest *)mf->data = *keyd; + + return msgp; +} + + +//========================================================== +// Public API - packing responses. +// + +// Allocates cl_msg returned - caller must free it. +cl_msg * +as_msg_make_response_msg(uint32_t result_code, uint32_t generation, + uint32_t void_time, as_msg_op **ops, as_bin **bins, uint16_t bin_count, + as_namespace *ns, cl_msg *msgp_in, size_t *msg_sz_in, uint64_t trid) +{ + uint16_t n_fields = 0; + size_t msg_sz = sizeof(cl_msg); + + if (trid != 0) { + n_fields++; + msg_sz += sizeof(as_msg_field) + sizeof(trid); + } + + msg_sz += sizeof(as_msg_op) * bin_count; + + for (uint16_t i = 0; i < bin_count; i++) { + if (ops) { + msg_sz += ops[i]->name_sz; + } + else if (bins[i]) { + msg_sz += ns->single_bin ? 
+ 0 : strlen(as_bin_get_name_from_id(ns, bins[i]->id)); + } + else { + cf_crash(AS_PROTO, "making response message with null bin and op"); + } + + if (bins[i]) { + msg_sz += as_bin_particle_client_value_size(bins[i]); + } + } + + uint8_t *buf; + + if (! msgp_in || *msg_sz_in < msg_sz) { + buf = cf_malloc(msg_sz); + } + else { + buf = (uint8_t *)msgp_in; + } + + *msg_sz_in = msg_sz; + + cl_msg *msgp = (cl_msg *)buf; + + msgp->proto.version = PROTO_VERSION; + msgp->proto.type = PROTO_TYPE_AS_MSG; + msgp->proto.sz = msg_sz - sizeof(as_proto); + + as_proto_swap(&msgp->proto); + + as_msg *m = &msgp->msg; + + m->header_sz = sizeof(as_msg); + m->info1 = 0; + m->info2 = 0; + m->info3 = 0; + m->unused = 0; + m->result_code = result_code; + m->generation = generation == 0 ? 0 : plain_generation(generation, ns); + m->record_ttl = void_time; + m->transaction_ttl = 0; + m->n_fields = n_fields; + m->n_ops = bin_count; + + as_msg_swap_header(m); + + buf = m->data; + + if (trid != 0) { + as_msg_field *mf = (as_msg_field *)buf; + + mf->field_sz = 1 + sizeof(uint64_t); + mf->type = AS_MSG_FIELD_TYPE_TRID; + *(uint64_t *)mf->data = cf_swap_to_be64(trid); + as_msg_swap_field(mf); + buf += sizeof(as_msg_field) + sizeof(uint64_t); + } + + for (uint16_t i = 0; i < bin_count; i++) { + as_msg_op *op = (as_msg_op *)buf; + + op->version = 0; + + if (ops) { + op->op = ops[i]->op; + memcpy(op->name, ops[i]->name, ops[i]->name_sz); + op->name_sz = ops[i]->name_sz; + } + else { + op->op = AS_MSG_OP_READ; + op->name_sz = as_bin_memcpy_name(ns, op->name, bins[i]); + } + + op->op_sz = 4 + op->name_sz; + + buf += sizeof(as_msg_op) + op->name_sz; + buf += as_bin_particle_to_client(bins[i], op); + + as_msg_swap_op(op); + } + + return msgp; +} + +// FIXME - only old batch sets include_key false - remove parameter ??? +// FIXME - only old batch sets skip_empty_records false - remove parameter ??? +// Pass NULL bb_r for sizing only. Return value is size if >= 0, error if < 0. 
+int32_t +as_msg_make_response_bufbuilder(cf_buf_builder **bb_r, as_storage_rd *rd, + bool no_bin_data, bool include_key, bool skip_empty_records, + cf_vector *select_bins) +{ + as_namespace *ns = rd->ns; + as_record *r = rd->r; + + size_t ns_len = strlen(ns->name); + const char *set_name = as_index_get_set_name(r, ns); + size_t set_name_len = set_name ? strlen(set_name) : 0; + + uint8_t* key = NULL; + uint32_t key_size = 0; + + if (include_key && r->key_stored == 1) { + if (! as_storage_record_get_key(rd)) { + cf_warning(AS_PROTO, "can't get key - skipping record"); + return -1; + } + + key = rd->key; + key_size = rd->key_size; + } + + uint16_t n_fields = 2; // always add namespace and digest + size_t msg_sz = sizeof(as_msg) + + sizeof(as_msg_field) + ns_len + + sizeof(as_msg_field) + sizeof(cf_digest); + + if (set_name) { + n_fields++; + msg_sz += sizeof(as_msg_field) + set_name_len; + } + + if (key) { + n_fields++; + msg_sz += sizeof(as_msg_field) + key_size; + } + + uint32_t n_select_bins = 0; + uint16_t n_bins_matched = 0; + uint16_t n_record_bins = 0; + + if (! no_bin_data) { + if (select_bins) { + n_select_bins = cf_vector_size(select_bins); + + for (uint32_t i = 0; i < n_select_bins; i++) { + char bin_name[AS_ID_BIN_SZ]; + + cf_vector_get(select_bins, i, (void*)&bin_name); + + as_bin *b = as_bin_get(rd, bin_name); + + if (! b) { + continue; + } + + msg_sz += sizeof(as_msg_op); + msg_sz += ns->single_bin ? 0 : strlen(bin_name); + msg_sz += as_bin_particle_client_value_size(b); + + n_bins_matched++; + } + + // Don't return an empty record. + if (skip_empty_records && n_bins_matched == 0) { + return 0; + } + } + else { + n_record_bins = as_bin_inuse_count(rd); + + msg_sz += sizeof(as_msg_op) * n_record_bins; + + for (uint16_t i = 0; i < n_record_bins; i++) { + as_bin *b = &rd->bins[i]; + + msg_sz += ns->single_bin ? 
+ 0 : strlen(as_bin_get_name_from_id(ns, b->id)); + msg_sz += (int)as_bin_particle_client_value_size(b); + } + } + } + + // NULL buf-builder means just return size. + if (! bb_r) { + return (int32_t)msg_sz; + } + + uint8_t *buf; + + cf_buf_builder_reserve(bb_r, (int)msg_sz, &buf); + + as_msg *m = (as_msg *)buf; + + m->header_sz = sizeof(as_msg); + m->info1 = no_bin_data ? AS_MSG_INFO1_GET_NO_BINS : 0; + m->info2 = 0; + m->info3 = 0; + m->unused = 0; + m->result_code = AS_PROTO_RESULT_OK; + m->generation = plain_generation(r->generation, ns); + m->record_ttl = r->void_time; + m->transaction_ttl = 0; + m->n_fields = n_fields; + + if (no_bin_data) { + m->n_ops = 0; + } + else { + m->n_ops = select_bins ? n_bins_matched : n_record_bins; + } + + as_msg_swap_header(m); + + buf = m->data; + + as_msg_field *mf = (as_msg_field *)buf; + + mf->field_sz = ns_len + 1; + mf->type = AS_MSG_FIELD_TYPE_NAMESPACE; + memcpy(mf->data, ns->name, ns_len); + as_msg_swap_field(mf); + buf += sizeof(as_msg_field) + ns_len; + + mf = (as_msg_field *)buf; + mf->field_sz = sizeof(cf_digest) + 1; + mf->type = AS_MSG_FIELD_TYPE_DIGEST_RIPE; + memcpy(mf->data, &r->keyd, sizeof(cf_digest)); + as_msg_swap_field(mf); + buf += sizeof(as_msg_field) + sizeof(cf_digest); + + if (set_name) { + mf = (as_msg_field *)buf; + mf->field_sz = set_name_len + 1; + mf->type = AS_MSG_FIELD_TYPE_SET; + memcpy(mf->data, set_name, set_name_len); + as_msg_swap_field(mf); + buf += sizeof(as_msg_field) + set_name_len; + } + + if (key) { + mf = (as_msg_field *)buf; + mf->field_sz = key_size + 1; + mf->type = AS_MSG_FIELD_TYPE_KEY; + memcpy(mf->data, key, key_size); + as_msg_swap_field(mf); + buf += sizeof(as_msg_field) + key_size; + } + + if (no_bin_data) { + return (int32_t)msg_sz; + } + + if (select_bins) { + for (uint32_t i = 0; i < n_select_bins; i++) { + char bin_name[AS_ID_BIN_SZ]; + + cf_vector_get(select_bins, i, (void*)&bin_name); + + as_bin *b = as_bin_get(rd, bin_name); + + if (! 
b) { + continue; + } + + as_msg_op *op = (as_msg_op *)buf; + + op->op = AS_MSG_OP_READ; + op->version = 0; + op->name_sz = as_bin_memcpy_name(ns, op->name, b); + op->op_sz = 4 + op->name_sz; + + buf += sizeof(as_msg_op) + op->name_sz; + buf += as_bin_particle_to_client(b, op); + + as_msg_swap_op(op); + } + } + else { + for (uint16_t i = 0; i < n_record_bins; i++) { + as_msg_op *op = (as_msg_op *)buf; + + op->op = AS_MSG_OP_READ; + op->version = 0; + op->name_sz = as_bin_memcpy_name(ns, op->name, &rd->bins[i]); + op->op_sz = 4 + op->name_sz; + + buf += sizeof(as_msg_op) + op->name_sz; + buf += as_bin_particle_to_client(&rd->bins[i], op); + + as_msg_swap_op(op); + } + } + + return (int32_t)msg_sz; +} + +cl_msg * +as_msg_make_val_response(bool success, const as_val *val, uint32_t result_code, + uint32_t generation, uint32_t void_time, uint64_t trid, + size_t *p_msg_sz) +{ + const char *bin_name; + size_t bin_name_len; + + if (success) { + bin_name = SUCCESS_BIN_NAME; + bin_name_len = sizeof(SUCCESS_BIN_NAME) - 1; + } + else { + bin_name = FAILURE_BIN_NAME; + bin_name_len = sizeof(FAILURE_BIN_NAME) - 1; + } + + uint16_t n_fields = 0; + size_t msg_sz = sizeof(cl_msg); + + if (trid != 0) { + n_fields++; + msg_sz += sizeof(as_msg_field) + sizeof(trid); + } + + msg_sz += sizeof(as_msg_op) + bin_name_len + + as_particle_asval_client_value_size(val); + + uint8_t *buf = cf_malloc(msg_sz); + cl_msg *msgp = (cl_msg *)buf; + + msgp->proto.version = PROTO_VERSION; + msgp->proto.type = PROTO_TYPE_AS_MSG; + msgp->proto.sz = msg_sz - sizeof(as_proto); + + as_proto_swap(&msgp->proto); + + as_msg *m = &msgp->msg; + + m->header_sz = sizeof(as_msg); + m->info1 = 0; + m->info2 = 0; + m->info3 = 0; + m->unused = 0; + m->result_code = result_code; + m->generation = generation; + m->record_ttl = void_time; + m->transaction_ttl = 0; + m->n_fields = n_fields; + m->n_ops = 1; // only the one special bin + + as_msg_swap_header(m); + + buf = m->data; + + if (trid != 0) { + as_msg_field *mf = 
(as_msg_field *)buf; + + mf->field_sz = 1 + sizeof(uint64_t); + mf->type = AS_MSG_FIELD_TYPE_TRID; + *(uint64_t *)mf->data = cf_swap_to_be64(trid); + as_msg_swap_field(mf); + buf += sizeof(as_msg_field) + sizeof(uint64_t); + } + + as_msg_op *op = (as_msg_op *)buf; + + op->op = AS_MSG_OP_READ; + op->name_sz = (uint8_t)bin_name_len; + memcpy(op->name, bin_name, op->name_sz); + op->op_sz = 4 + op->name_sz; + op->version = 0; + + as_particle_asval_to_client(val, op); + + as_msg_swap_op(op); + + *p_msg_sz = msg_sz; + + return msgp; +} + +// Caller-provided val_sz must be the result of calling +// as_particle_asval_client_value_size() for same val. +void +as_msg_make_val_response_bufbuilder(const as_val *val, cf_buf_builder **bb_r, + uint32_t val_sz, bool success) +{ + const char *bin_name; + size_t bin_name_len; + + if (success) { + bin_name = SUCCESS_BIN_NAME; + bin_name_len = sizeof(SUCCESS_BIN_NAME) - 1; + } + else { + bin_name = FAILURE_BIN_NAME; + bin_name_len = sizeof(FAILURE_BIN_NAME) - 1; + } + + size_t msg_sz = sizeof(as_msg) + sizeof(as_msg_op) + bin_name_len + val_sz; + + uint8_t *buf; + + cf_buf_builder_reserve(bb_r, (int)msg_sz, &buf); + + as_msg *m = (as_msg *)buf; + + m->header_sz = sizeof(as_msg); + m->info1 = 0; + m->info2 = 0; + m->info3 = 0; + m->unused = 0; + m->result_code = AS_PROTO_RESULT_OK; + m->generation = 0; + m->record_ttl = 0; + m->transaction_ttl = 0; + m->n_fields = 0; + m->n_ops = 1; // only the one special bin + + as_msg_swap_header(m); + + as_msg_op *op = (as_msg_op *)m->data; + + op->op = AS_MSG_OP_READ; + op->name_sz = (uint8_t)bin_name_len; + memcpy(op->name, bin_name, op->name_sz); + op->op_sz = 4 + op->name_sz; + op->version = 0; + + as_particle_asval_to_client(val, op); + + as_msg_swap_op(op); +} + + +//========================================================== +// Public API - sending responses to client. +// + +// Make an individual transaction response and send it. 
+int +as_msg_send_reply(as_file_handle *fd_h, uint32_t result_code, + uint32_t generation, uint32_t void_time, as_msg_op **ops, as_bin **bins, + uint16_t bin_count, as_namespace *ns, uint64_t trid) +{ + uint8_t stack_buf[MSG_STACK_BUFFER_SZ]; + size_t msg_sz = sizeof(stack_buf); + uint8_t *msgp = (uint8_t *)as_msg_make_response_msg(result_code, generation, + void_time, ops, bins, bin_count, ns, (cl_msg *)stack_buf, &msg_sz, + trid); + + int rv = send_reply_buf(fd_h, msgp, msg_sz); + + if (msgp != stack_buf) { + cf_free(msgp); + } + + return rv; +} + +// Send a pre-made response saved in a dyn-buf. +int +as_msg_send_ops_reply(as_file_handle *fd_h, cf_dyn_buf *db) +{ + return send_reply_buf(fd_h, db->buf, db->used_sz); +} + +// Send a blocking "fin" message with default timeout. +bool +as_msg_send_fin(cf_socket *sock, uint32_t result_code) +{ + return as_msg_send_fin_timeout(sock, result_code, CF_SOCKET_TIMEOUT) != 0; +} + +// Send a blocking "fin" message with a specified timeout. +size_t +as_msg_send_fin_timeout(cf_socket *sock, uint32_t result_code, int32_t timeout) +{ + cl_msg msgp; + + msgp.proto.version = PROTO_VERSION; + msgp.proto.type = PROTO_TYPE_AS_MSG; + msgp.proto.sz = sizeof(as_msg); + + as_proto_swap(&msgp.proto); + + as_msg *m = &msgp.msg; + + m->header_sz = sizeof(as_msg); + m->info1 = 0; + m->info2 = 0; + m->info3 = AS_MSG_INFO3_LAST; + m->unused = 0; + m->result_code = result_code; + m->generation = 0; + m->record_ttl = 0; + m->transaction_ttl = 0; + m->n_fields = 0; + m->n_ops = 0; + + as_msg_swap_header(m); + + if (cf_socket_send_all(sock, (uint8_t*)&msgp, sizeof(msgp), MSG_NOSIGNAL, + timeout) < 0) { + cf_warning(AS_PROTO, "send error - fd %d %s", CSFD(sock), + cf_strerror(errno)); + return 0; + } + + return sizeof(cl_msg); +} + + +//========================================================== +// Public API - query "net-IO" responses. 
+// + +void +as_netio_init() +{ + cf_queue_init(&g_netio_queue, sizeof(as_netio), 64, true); + cf_queue_init(&g_netio_slow_queue, sizeof(as_netio), 64, true); + + pthread_t thread; + pthread_attr_t attrs; + + pthread_attr_init(&attrs); + pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED); + + if (pthread_create(&thread, &attrs, run_netio, + (void *)&g_netio_queue) != 0) { + cf_crash(AS_PROTO, "failed to create netio thread"); + } + + if (pthread_create(&thread, &attrs, run_netio, + (void *)&g_netio_slow_queue) != 0) { + cf_crash(AS_PROTO, "failed to create netio slow thread"); + } +} + +// Based on io object, send buffer to the network, or queue for retry. +// +// start_cb: Callback to the module before the real IO is started. Returns: +// AS_NETIO_OK: Everything ok, go ahead with IO. +// AS_NETIO_ERR: If there was issue like abort/err/timeout etc. +// +// finish_cb: Callback to module with status code of the IO call. Returns: +// AS_NETIO_OK: Everything ok. +// AS_NETIO_CONTINUE: The IO was requeued. +// AS_NETIO_ERR: IO erred out due to some issue. +// +// finish_cb should do the needful like release ref to user data etc. +// +// Returns: +// AS_NETIO_OK: Everything is fine, both start_cb & finish_cb were called. +// AS_NETIO_ERR: Something failed either calling start_cb or while doing +// network IO, finish_cb is called. +// +// This function consumes qtr reference. It calls finish_cb which releases ref +// to qtr. In case of AS_NETIO_CONTINUE: this function also consumes bb_r and +// ref for fd_h. The background thread is responsible for freeing up bb_r and +// releasing ref to fd_h. +int +as_netio_send(as_netio *io, bool slow, bool blocking) +{ + int ret = io->start_cb(io, io->seq); + + if (ret == AS_NETIO_OK) { + ret = io->finish_cb(io, netio_send_packet(io->fd_h, io->bb_r, + &io->offset, blocking)); + } + else { + ret = io->finish_cb(io, ret); + } + + // If needs requeue then requeue it. 
+ switch (ret) { + case AS_NETIO_CONTINUE: + if (slow) { + io->slow = true; + cf_queue_push(&g_netio_slow_queue, io); + } + else { + cf_queue_push(&g_netio_queue, io); + } + break; + default: + ret = AS_NETIO_OK; + break; + } + + return ret; +} + + +//========================================================== +// Local helpers. +// + +static int +send_reply_buf(as_file_handle *fd_h, uint8_t *msgp, size_t msg_sz) +{ + cf_assert(cf_socket_exists(&fd_h->sock), AS_PROTO, "fd is invalid"); + + if (cf_socket_send_all(&fd_h->sock, msgp, msg_sz, MSG_NOSIGNAL, + CF_SOCKET_TIMEOUT) < 0) { + // Common when a client aborts. + cf_debug(AS_PROTO, "protocol write fail: fd %d sz %zu errno %d", + CSFD(&fd_h->sock), msg_sz, errno); + + as_end_of_transaction_force_close(fd_h); + return -1; + } + + as_end_of_transaction_ok(fd_h); + return 0; +} + +static void * +run_netio(void *q_to_wait_on) +{ + cf_queue *q = (cf_queue*)q_to_wait_on; + + while (true) { + as_netio io; + + if (cf_queue_pop(q, &io, CF_QUEUE_FOREVER) != 0) { + cf_crash(AS_PROTO, "failed to pop from IO worker queue."); + } + + if (io.slow) { + usleep(g_config.proto_slow_netio_sleep_ms * 1000); + } + + as_netio_send(&io, true, false); + } + + return NULL; +} + +static int +netio_send_packet(as_file_handle *fd_h, cf_buf_builder *bb_r, uint32_t *offset, + bool blocking) +{ +#if defined(USE_SYSTEMTAP) + uint64_t nodeid = g_config.self_node; +#endif + + uint32_t len = bb_r->used_sz; + uint8_t *buf = bb_r->buf; + + as_proto proto; + + proto.version = PROTO_VERSION; + proto.type = PROTO_TYPE_AS_MSG; + proto.sz = len - 8; + as_proto_swap(&proto); + + memcpy(bb_r->buf, &proto, 8); + + uint32_t pos = *offset; + + ASD_QUERY_SENDPACKET_STARTING(nodeid, pos, len); + + int retry = 0; + + cf_detail(AS_PROTO," start at %p %d %d", buf, pos, len); + + while (pos < len) { + int rv = cf_socket_send(&fd_h->sock, buf + pos, len - pos, + MSG_NOSIGNAL); + + if (rv <= 0) { + if (errno != EAGAIN) { + cf_debug(AS_PROTO, "packet send response error 
returned %d errno %d fd %d", + rv, errno, CSFD(&fd_h->sock)); + return AS_NETIO_IO_ERR; + } + + if (! blocking && (retry > NETIO_MAX_IO_RETRY)) { + *offset = pos; + cf_detail(AS_PROTO," end at %p %d %d", buf, pos, len); + ASD_QUERY_SENDPACKET_CONTINUE(nodeid, pos); + return AS_NETIO_CONTINUE; + } + + retry++; + // bigger packets so try few extra times + usleep(100); + } + else { + pos += rv; + } + } + + ASD_QUERY_SENDPACKET_FINISHED(nodeid); + return AS_NETIO_OK; +} diff --git a/as/src/base/rec_props.c b/as/src/base/rec_props.c new file mode 100644 index 00000000..e26016f7 --- /dev/null +++ b/as/src/base/rec_props.c @@ -0,0 +1,230 @@ +/* + * rec_props.c + * + * Copyright (C) 2012-2014 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +/* + * A list of record properties. 
+ * + */ + +//========================================================== +// Includes +// + +#include +#include +#include + +#include "citrusleaf/alloc.h" + +#include "base/rec_props.h" + + +//========================================================== +// Private "Class Members" +// + +//------------------------------------------------ +// Function Declarations +// + +//------------------------------------------------ +// Data +// + +//------------------------------------------------ +// Constants +// + + +//========================================================== +// Typedefs +// + +typedef struct as_rec_prop_field_s { + as_rec_props_field_id id; + uint32_t value_size; + uint8_t value[]; +} __attribute__ ((__packed__)) as_rec_prop_field; + + +//========================================================== +// Public API +// + +//------------------------------------------------ +// Clear the object. +// +void +as_rec_props_clear(as_rec_props *this) +{ + this->p_data = NULL; + this->size = 0; +} + +//------------------------------------------------ +// Parse a specific field. +// +int +as_rec_props_get_value(const as_rec_props *this, + as_rec_props_field_id id, uint32_t *p_value_size, uint8_t **pp_value) +{ + const uint8_t *p_read = this->p_data; + const uint8_t *p_end = p_read + this->size - sizeof(as_rec_prop_field); + + while (p_read < p_end) { + as_rec_prop_field* p_field = (as_rec_prop_field*)p_read; + + if (p_field->id == id) { + if (p_value_size) { + *p_value_size = p_field->value_size; + } + + if (pp_value) { + *pp_value = p_field->value; + } + + return 0; + } + + p_read += sizeof(as_rec_prop_field) + p_field->value_size; + } + + return -1; +} + +//------------------------------------------------ +// Get packed size of field, given value size. +// +uint32_t +as_rec_props_sizeof_field(uint32_t value_size) +{ + return sizeof(as_rec_prop_field) + value_size; +} + +//------------------------------------------------ +// Set p_data member to external buffer. 
(The size +// member will be used like a write pointer in add +// methods, so it starts at 0 here.) +// +void +as_rec_props_init(as_rec_props *this, uint8_t *p_data) +{ + this->p_data = p_data; + this->size = 0; +} + +//------------------------------------------------ +// Allocate memory for data. (The size member will +// be used like a write pointer in add methods, so +// it starts at 0 here.) +// +void +as_rec_props_init_malloc(as_rec_props *this, uint32_t malloc_size) +{ + this->p_data = cf_malloc(malloc_size); + this->size = 0; +} + +//------------------------------------------------ +// Append a field, trusting that: +// - this->p_data has been allocated big enough +// - this->size is the size added so far +// +void +as_rec_props_add_field(as_rec_props *this, + as_rec_props_field_id id, uint32_t value_size, const uint8_t *p_value) +{ + as_rec_prop_field* p_field = + (as_rec_prop_field*)(this->p_data + this->size); + + p_field->id = id; + p_field->value_size = value_size; + memcpy(p_field->value, p_value, value_size); + + this->size += as_rec_props_sizeof_field(value_size); +} + +//------------------------------------------------ +// Same as as_rec_props_add_field(), but where +// p_value is to be a null-terminated string. +// +void +as_rec_props_add_field_null_terminate(as_rec_props *this, + as_rec_props_field_id id, uint32_t value_len, const uint8_t *p_value) +{ + as_rec_prop_field* p_field = + (as_rec_prop_field*)(this->p_data + this->size); + + p_field->id = id; + p_field->value_size = value_len + 1; + memcpy(p_field->value, p_value, value_len); + p_field->value[value_len] = 0; + + this->size += as_rec_props_sizeof_field(p_field->value_size); +} + +//------------------------------------------------ +// Returns size required for as_rec_props p_data +// buffer for specified fields. 
+// +size_t +as_rec_props_size_all(const uint8_t *set_name, size_t set_name_len, + const uint8_t *key, size_t key_size) +{ + size_t rec_props_data_size = 0; + + if (set_name) { + rec_props_data_size += as_rec_props_sizeof_field(set_name_len + 1); + } + + if (key) { + rec_props_data_size += as_rec_props_sizeof_field(key_size); + } + + return rec_props_data_size; +} + +//------------------------------------------------ +// Add all specified fields, trusting that: +// - this->p_data has been allocated big enough +// +void +as_rec_props_fill_all(as_rec_props *this, uint8_t *p_data, + const uint8_t *set_name, size_t set_name_len, const uint8_t *key, + size_t key_size) +{ + as_rec_props_init(this, p_data); + + if (set_name) { + as_rec_props_add_field_null_terminate(this, CL_REC_PROPS_FIELD_SET_NAME, + set_name_len, set_name); + } + + if (key) { + as_rec_props_add_field(this, CL_REC_PROPS_FIELD_KEY, key_size, key); + } +} + + +//========================================================== +// Private Functions +// diff --git a/as/src/base/record.c b/as/src/base/record.c new file mode 100644 index 00000000..4366bfad --- /dev/null +++ b/as/src/base/record.c @@ -0,0 +1,958 @@ +/* + * record.c + * + * Copyright (C) 2012-2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. 
If not, see http://www.gnu.org/licenses/ + */ + +//========================================================== +// Includes. +// + +#include +#include +#include +#include +#include + +#include "citrusleaf/alloc.h" +#include "citrusleaf/cf_atomic.h" +#include "citrusleaf/cf_byte_order.h" +#include "citrusleaf/cf_digest.h" + +#include "arenax.h" +#include "dynbuf.h" +#include "fault.h" + +#include "base/cfg.h" +#include "base/datamodel.h" +#include "base/index.h" +#include "base/proto.h" +#include "base/rec_props.h" +#include "base/secondary_index.h" +#include "base/truncate.h" +#include "base/xdr_serverside.h" +#include "storage/storage.h" +#include "transaction/rw_utils.h" + + +//========================================================== +// Typedefs & constants. +// + +#define STACK_PARTICLES_SIZE (1024 * 1024) + + +//========================================================== +// Forward declarations. +// + +void record_replace_failed(as_remote_record *rr, as_index_ref* r_ref, as_storage_rd* rd, bool is_create); + +int record_apply_dim_single_bin(as_remote_record *rr, as_storage_rd *rd, bool *is_delete); +int record_apply_dim(as_remote_record *rr, as_storage_rd *rd, bool skip_sindex, bool *is_delete); +int record_apply_ssd_single_bin(as_remote_record *rr, as_storage_rd *rd, bool *is_delete); +int record_apply_ssd(as_remote_record *rr, as_storage_rd *rd, bool skip_sindex, bool *is_delete); + +void update_index_metadata(as_remote_record *rr, index_metadata *old, as_record *r); +void unwind_index_metadata(const index_metadata *old, as_record *r); +void unwind_dim_single_bin(as_bin* old_bin, as_bin* new_bin); + +int unpickle_bins(as_remote_record *rr, as_storage_rd *rd, cf_ll_buf *particles_llb); + +void xdr_write_replica(as_remote_record *rr, bool is_delete, uint32_t set_id); + + +//========================================================== +// Inlines & macros. 
+// + +static inline int +resolve_generation_direct(uint16_t left, uint16_t right) +{ + return left == right ? 0 : (right > left ? 1 : -1); +} + +static inline int +resolve_generation(uint16_t left, uint16_t right) +{ + return left == right ? 0 : (as_gen_less_than(left, right) ? 1 : -1); +} + +// Assumes remote generation is not 0. (Local may be 0 if creating record.) +static inline bool +next_generation(uint16_t local, uint16_t remote, as_namespace* ns) +{ + local = plain_generation(local, ns); + remote = plain_generation(remote, ns); + + return local == 0xFFFF ? remote == 1 : remote - local == 1; +} + + +//========================================================== +// Public API - record lock lifecycle. +// + +// Returns: +// 1 - created new record +// 0 - found existing record +// -1 - failure - could not allocate arena stage +int +as_record_get_create(as_index_tree *tree, cf_digest *keyd, as_index_ref *r_ref, + as_namespace *ns) +{ + int rv; + + while ((rv = as_index_get_insert_vlock(tree, keyd, r_ref)) == -2) { + // rv = -2 - found "half created" or deleted record, wait for other + // thread to finish, and try again. + usleep(50); + } + + if (rv == 1) { + cf_atomic64_incr(&ns->n_objects); + } + + return rv; +} + + +// Returns: +// 0 - found +// -1 - not found +int +as_record_get(as_index_tree *tree, cf_digest *keyd, as_index_ref *r_ref) +{ + return as_index_get_vlock(tree, keyd, r_ref); +} + + +// Done with record - unlock, release, and if ref-count hits 0, destroy record +// and free arena element. +void +as_record_done(as_index_ref *r_ref, as_namespace *ns) +{ + if (! r_ref->skip_lock) { + cf_mutex_unlock(r_ref->olock); + } + + int rc = as_index_release(r_ref->r); + + if (rc > 0) { + return; + } + + cf_assert(rc == 0, AS_RECORD, "index ref-count %d", rc); + + as_record_destroy(r_ref->r, ns); + cf_arenax_free(ns->arena, r_ref->r_h); +} + + +//========================================================== +// Public API - record lifecycle utilities. 
+// + +// Returns: +// 0 - found +// -1 - not found +int +as_record_exists(as_index_tree *tree, cf_digest *keyd) +{ + return as_index_exists(tree, keyd); +} + + +// TODO - inline this, if/when we unravel header files. +bool +as_record_is_expired(const as_record *r) +{ + return r->void_time != 0 && r->void_time < as_record_void_time_get(); +} + + +// Called when writes encounter a "doomed" record, to delete the doomed record +// and create a new one in place without giving up the record lock. +void +as_record_rescue(as_index_ref *r_ref, as_namespace *ns) +{ + record_delete_adjust_sindex(r_ref->r, ns); + as_record_destroy(r_ref->r, ns); + as_index_clear_record_info(r_ref->r); + cf_atomic64_incr(&ns->n_objects); +} + + +// Called only after last reference is released. Called by as_record_done(), +// also given to index trees to be called when tree releases record reference. +void +as_record_destroy(as_record *r, as_namespace *ns) +{ + if (ns->storage_data_in_memory) { + // Note - rd is a limited container here - not calling + // as_storage_record_create(), _open(), _close(). + as_storage_rd rd; + + rd.r = r; + rd.ns = ns; + as_storage_rd_load_n_bins(&rd); + as_storage_rd_load_bins(&rd, NULL); + + as_storage_record_drop_from_mem_stats(&rd); + + as_record_destroy_bins(&rd); + + if (! ns->single_bin) { + as_record_free_bin_space(r); + + if (r->dim) { + cf_free(r->dim); // frees the key + } + } + } + + as_record_drop_stats(r, ns); + + // Dereference record's storage used-size. + as_storage_record_destroy(ns, r); + + return; +} + + +// Called only if data-in-memory, and not single-bin. +void +as_record_free_bin_space(as_record *r) +{ + as_bin_space *bin_space = as_index_get_bin_space(r); + + if (bin_space) { + cf_free((void*)bin_space); + as_index_set_bin_space(r, NULL); + } +} + + +// Destroy all particles in all bins. +void +as_record_destroy_bins(as_storage_rd *rd) +{ + as_record_destroy_bins_from(rd, 0); +} + + +// Destroy particles in specified bins. 
+void +as_record_destroy_bins_from(as_storage_rd *rd, uint16_t from) +{ + for (uint16_t i = from; i < rd->n_bins; i++) { + as_bin *b = &rd->bins[i]; + + if (! as_bin_inuse(b)) { + return; // no more used bins - there are never unused bin gaps + } + + as_bin_particle_destroy(b, rd->ns->storage_data_in_memory); + as_bin_set_empty(b); + } +} + + +// Called only for data-in-memory multi-bin, with no key currently stored. +// Note - have to modify if/when other metadata joins key in as_rec_space. +void +as_record_allocate_key(as_record *r, const uint8_t *key, uint32_t key_size) +{ + as_rec_space *rec_space = (as_rec_space *) + cf_malloc_ns(sizeof(as_rec_space) + key_size); + + rec_space->bin_space = (as_bin_space *)r->dim; + rec_space->key_size = key_size; + memcpy((void*)rec_space->key, (const void*)key, key_size); + + r->dim = (void*)rec_space; +} + + +// Called only for data-in-memory multi-bin, with a key currently stored. +// Note - have to modify if/when other metadata joins key in as_rec_space. +void +as_record_remove_key(as_record *r) +{ + as_bin_space *p_bin_space = ((as_rec_space *)r->dim)->bin_space; + + cf_free(r->dim); + r->dim = (void *)p_bin_space; +} + + +//========================================================== +// Public API - pickled record utilities. +// + +// Flatten record's bins into "pickle" format for fabric. +uint8_t * +as_record_pickle(as_storage_rd *rd, size_t *len_r) +{ + as_namespace *ns = rd->ns; + + uint32_t sz = 2; // always 2 bytes for number of bins + uint16_t n_bins_in_use; + + for (n_bins_in_use = 0; n_bins_in_use < rd->n_bins; n_bins_in_use++) { + as_bin *b = &rd->bins[n_bins_in_use]; + + if (! as_bin_inuse(b)) { + break; + } + + sz += 1; // for bin name length + sz += ns->single_bin ? 
+ 0 : strlen(as_bin_get_name_from_id(ns, b->id)); // for bin name + sz += 1; // was for version - currently not used + + sz += as_bin_particle_pickled_size(b); + } + + uint8_t *pickle = cf_malloc(sz); + uint8_t *buf = pickle; + + (*(uint16_t *)buf) = cf_swap_to_be16(n_bins_in_use); // number of bins + buf += 2; + + for (uint16_t i = 0; i < n_bins_in_use; i++) { + as_bin *b = &rd->bins[i]; + + // Copy bin name, skipping a byte for name length. + uint8_t name_len = (uint8_t)as_bin_memcpy_name(ns, buf + 1, b); + + *buf++ = name_len; // fill in bin name length + buf += name_len; // skip past bin name + *buf++ = 0; // was version - currently not used + + buf += as_bin_particle_to_pickled(b, buf); + } + + *len_r = sz; + + return pickle; +} + + +// If remote record is better than local record, replace local with remote. +int +as_record_replace_if_better(as_remote_record *rr, bool is_repl_write, + bool skip_sindex, bool do_xdr_write) +{ + as_namespace *ns = rr->rsv->ns; + + if (! as_storage_has_space(ns)) { + cf_warning(AS_RECORD, "{%s} record replace: drives full", ns->name); + return AS_PROTO_RESULT_FAIL_OUT_OF_SPACE; + } + + CF_ALLOC_SET_NS_ARENA(ns); + + as_index_tree *tree = rr->rsv->tree; + + as_index_ref r_ref; + r_ref.skip_lock = false; + + int rv = as_record_get_create(tree, rr->keyd, &r_ref, ns); + + if (rv < 0) { + return AS_PROTO_RESULT_FAIL_OUT_OF_SPACE; + } + + bool is_create = rv == 1; + as_index *r = r_ref.r; + + int result; + + conflict_resolution_pol policy = ns->conflict_resolution_policy; + + if (is_repl_write) { + bool from_replica; + + if ((result = as_partition_check_source(ns, rr->rsv->p, rr->src, + &from_replica)) != AS_PROTO_RESULT_OK) { + record_replace_failed(rr, &r_ref, NULL, is_create); + return result; + } + + repl_write_init_repl_state(rr, from_replica); + policy = repl_write_conflict_resolution_policy(ns); + } + + if (! 
is_create && record_replace_check(r, ns) < 0) { + record_replace_failed(rr, &r_ref, NULL, is_create); + return AS_PROTO_RESULT_FAIL_FORBIDDEN; + } + + // If local record is better, no-op or fail. + if (! is_create && (result = as_record_resolve_conflict(policy, + r->generation, r->last_update_time, (uint16_t)rr->generation, + rr->last_update_time)) <= 0) { + record_replace_failed(rr, &r_ref, NULL, is_create); + return result == 0 ? + AS_PROTO_RESULT_FAIL_RECORD_EXISTS : + AS_PROTO_RESULT_FAIL_GENERATION; + } + // else - remote winner - apply it. + + // If creating record, write set-ID into index. + if (is_create) { + if (rr->set_name && (result = as_index_set_set_w_len(r, ns, + rr->set_name, rr->set_name_len, false)) < 0) { + record_replace_failed(rr, &r_ref, NULL, is_create); + return -result; + } + + r->last_update_time = rr->last_update_time; + + // Don't write record if it would be truncated. + if (as_truncate_record_is_truncated(r, ns)) { + record_replace_failed(rr, &r_ref, NULL, is_create); + return AS_PROTO_RESULT_OK; + } + } + // else - not bothering to check that sets match. + + as_storage_rd rd; + + if (is_create) { + as_storage_record_create(ns, r, &rd); + } + else { + as_storage_record_open(ns, r, &rd); + } + + // Assemble rec-props. + size_t rec_props_data_size = as_rec_props_size_all( + (const uint8_t *)rr->set_name, rr->set_name_len, rr->key, + rr->key_size); + uint8_t rec_props_data[rec_props_data_size]; + + as_rec_props_fill_all(&rd.rec_props, rec_props_data, + (const uint8_t *)rr->set_name, rr->set_name_len, rr->key, + rr->key_size); + + // Split according to configuration to replace local record. 
+ bool is_delete = false; + + if (ns->storage_data_in_memory) { + if (ns->single_bin) { + result = record_apply_dim_single_bin(rr, &rd, &is_delete); + } + else { + result = record_apply_dim(rr, &rd, skip_sindex, &is_delete); + } + } + else { + if (ns->single_bin) { + result = record_apply_ssd_single_bin(rr, &rd, &is_delete); + } + else { + result = record_apply_ssd(rr, &rd, skip_sindex, &is_delete); + } + } + + if (result != 0) { + record_replace_failed(rr, &r_ref, &rd, is_create); + return result; + } + + uint16_t set_id = as_index_get_set_id(r); // save for XDR write + + record_replaced(r, rr); + + as_storage_record_close(&rd); + as_record_done(&r_ref, ns); + + if (do_xdr_write) { + xdr_write_replica(rr, is_delete, set_id); + } + + return AS_PROTO_RESULT_OK; +} + + +//========================================================== +// Public API - conflict resolution. +// + +// Returns -1 if left wins, 1 if right wins, and 0 for tie. +int +as_record_resolve_conflict(conflict_resolution_pol policy, uint16_t left_gen, + uint64_t left_lut, uint16_t right_gen, uint64_t right_lut) +{ + int result = 0; + + switch (policy) { + case AS_NAMESPACE_CONFLICT_RESOLUTION_POLICY_GENERATION: + // Doesn't use resolve_generation() - direct comparison gives much + // better odds of picking the record with more history after a split + // brain where one side starts the record from scratch. 
+ result = resolve_generation_direct(left_gen, right_gen); + if (result == 0) { + result = resolve_last_update_time(left_lut, right_lut); + } + break; + case AS_NAMESPACE_CONFLICT_RESOLUTION_POLICY_LAST_UPDATE_TIME: + result = resolve_last_update_time(left_lut, right_lut); + if (result == 0) { + result = resolve_generation(left_gen, right_gen); + } + break; + case AS_NAMESPACE_CONFLICT_RESOLUTION_POLICY_CP: + result = record_resolve_conflict_cp(left_gen, left_lut, right_gen, + right_lut); + break; + default: + cf_crash(AS_RECORD, "invalid conflict resolution policy"); + break; + } + + return result; +} + + +//========================================================== +// Local helpers. +// + +void +record_replace_failed(as_remote_record *rr, as_index_ref* r_ref, + as_storage_rd* rd, bool is_create) +{ + if (is_create) { + as_index_delete(rr->rsv->tree, rr->keyd); + } + + if (rd) { + as_storage_record_close(rd); + } + + as_record_done(r_ref, rr->rsv->ns); +} + + +// TODO - as_storage_record_get_n_bytes_memory() could check bins in use. +int +record_apply_dim_single_bin(as_remote_record *rr, as_storage_rd *rd, + bool *is_delete) +{ + as_namespace* ns = rr->rsv->ns; + as_record* r = rd->r; + + rd->n_bins = 1; + + // Set rd->bins! + as_storage_rd_load_bins(rd, NULL); + + // For memory accounting, note current usage. + uint64_t memory_bytes = 0; + + if (as_bin_inuse(rd->bins)) { + memory_bytes = as_storage_record_get_n_bytes_memory(rd); + } + + uint16_t n_new_bins = cf_swap_from_be16(*(uint16_t *)rr->record_buf); + + if (n_new_bins > 1) { + cf_warning_digest(AS_RECORD, rr->keyd, "{%s} record replace: single-bin got %u bins ", ns->name, n_new_bins); + return AS_PROTO_RESULT_FAIL_UNKNOWN; + } + + // Keep old bin intact for unwinding, clear record bin for incoming. + as_bin old_bin; + + as_single_bin_copy(&old_bin, rd->bins); + as_bin_set_empty(rd->bins); + + int result; + + // Fill the new bins and particles. 
+ if (n_new_bins == 1 && + (result = unpickle_bins(rr, rd, NULL)) != 0) { + cf_warning_digest(AS_RECORD, rr->keyd, "{%s} record replace: failed unpickle bin ", ns->name); + unwind_dim_single_bin(&old_bin, rd->bins); + return result; + } + + // Apply changes to metadata in as_index needed for and writing. + index_metadata old_metadata; + + update_index_metadata(rr, &old_metadata, r); + + // Write the record to storage. + if ((result = as_record_write_from_pickle(rd)) < 0) { + cf_warning_digest(AS_RECORD, rr->keyd, "{%s} record replace: failed write ", ns->name); + unwind_index_metadata(&old_metadata, r); + unwind_dim_single_bin(&old_bin, rd->bins); + return -result; + } + + // Cleanup - destroy old bin, can't unwind after. + as_bin_particle_destroy(&old_bin, true); + + as_storage_record_adjust_mem_stats(rd, memory_bytes); + *is_delete = n_new_bins == 0; + + return AS_PROTO_RESULT_OK; +} + + +int +record_apply_dim(as_remote_record *rr, as_storage_rd *rd, bool skip_sindex, + bool *is_delete) +{ + as_namespace* ns = rr->rsv->ns; + as_record* r = rd->r; + + // Set rd->n_bins! + as_storage_rd_load_n_bins(rd); + + // Set rd->bins! + as_storage_rd_load_bins(rd, NULL); + + // For memory accounting, note current usage. + uint64_t memory_bytes = as_storage_record_get_n_bytes_memory(rd); + + // Keep old bins intact for sindex adjustment and unwinding. + uint16_t n_old_bins = rd->n_bins; + as_bin* old_bins = rd->bins; + + uint16_t n_new_bins = cf_swap_from_be16(*(uint16_t *)rr->record_buf); + as_bin new_bins[n_new_bins]; + + memset(new_bins, 0, sizeof(new_bins)); + rd->n_bins = n_new_bins; + rd->bins = new_bins; + + // Fill the new bins and particles. + int result = unpickle_bins(rr, rd, NULL); + + if (result != 0) { + cf_warning_digest(AS_RECORD, rr->keyd, "{%s} record replace: failed unpickle bins ", ns->name); + destroy_stack_bins(new_bins, n_new_bins); + return result; + } + + // Apply changes to metadata in as_index needed for and writing. 
+ index_metadata old_metadata; + + update_index_metadata(rr, &old_metadata, r); + + // Write the record to storage. + if ((result = as_record_write_from_pickle(rd)) < 0) { + cf_warning_digest(AS_RECORD, rr->keyd, "{%s} record replace: failed write ", ns->name); + unwind_index_metadata(&old_metadata, r); + destroy_stack_bins(new_bins, n_new_bins); + return -result; + } + + // Success - adjust sindex, looking at old and new bins. + if (! (skip_sindex && + next_generation(r->generation, (uint16_t)rr->generation, ns)) && + record_has_sindex(r, ns)) { + write_sindex_update(ns, as_index_get_set_name(r, ns), rr->keyd, + old_bins, n_old_bins, new_bins, n_new_bins); + } + + // Cleanup - destroy relevant bins, can't unwind after. + destroy_stack_bins(old_bins, n_old_bins); + + // Fill out new_bin_space. + as_bin_space* new_bin_space = NULL; + + if (n_new_bins != 0) { + new_bin_space = (as_bin_space*) + cf_malloc_ns(sizeof(as_bin_space) + sizeof(new_bins)); + + new_bin_space->n_bins = rd->n_bins; + memcpy((void*)new_bin_space->bins, new_bins, sizeof(new_bins)); + } + + // Swizzle the index element's as_bin_space pointer. + as_bin_space* old_bin_space = as_index_get_bin_space(r); + + if (old_bin_space) { + cf_free(old_bin_space); + } + + as_index_set_bin_space(r, new_bin_space); + + // Accommodate a new stored key - wasn't needed for pickling and writing. 
+ if (r->key_stored == 0 && rd->key) { + as_record_allocate_key(r, rd->key, rd->key_size); + r->key_stored = 1; + } + + as_storage_record_adjust_mem_stats(rd, memory_bytes); + *is_delete = n_new_bins == 0; + + return AS_PROTO_RESULT_OK; +} + + +int +record_apply_ssd_single_bin(as_remote_record *rr, as_storage_rd *rd, + bool *is_delete) +{ + as_namespace* ns = rr->rsv->ns; + as_record* r = rd->r; + + uint16_t n_new_bins = cf_swap_from_be16(*(uint16_t *)rr->record_buf); + + if (n_new_bins > 1) { + cf_warning_digest(AS_RECORD, rr->keyd, "{%s} record replace: single-bin got %u bins ", ns->name, n_new_bins); + return AS_PROTO_RESULT_FAIL_UNKNOWN; + } + + as_bin stack_bin = { { 0 } }; + + rd->n_bins = 1; + rd->bins = &stack_bin; + + // Fill the new bin and particle. + cf_ll_buf_define(particles_llb, STACK_PARTICLES_SIZE); + + int result; + + if (n_new_bins == 1 && + (result = unpickle_bins(rr, rd, &particles_llb)) != 0) { + cf_warning_digest(AS_RECORD, rr->keyd, "{%s} record replace: failed unpickle bin ", ns->name); + cf_ll_buf_free(&particles_llb); + return result; + } + + // Apply changes to metadata in as_index needed for and writing. + index_metadata old_metadata; + + update_index_metadata(rr, &old_metadata, r); + + // Write the record to storage. + if ((result = as_record_write_from_pickle(rd)) < 0) { + cf_warning_digest(AS_RECORD, rr->keyd, "{%s} write_master: failed write ", ns->name); + unwind_index_metadata(&old_metadata, r); + cf_ll_buf_free(&particles_llb); + return -result; + } + + // Accommodate a new stored key - wasn't needed for writing. + if (r->key_stored == 0 && rr->key) { + r->key_stored = 1; + } + + *is_delete = n_new_bins == 0; + + return AS_PROTO_RESULT_OK; +} + + +int +record_apply_ssd(as_remote_record *rr, as_storage_rd *rd, bool skip_sindex, + bool *is_delete) +{ + as_namespace* ns = rr->rsv->ns; + as_record* r = rd->r; + bool has_sindex = ! 
(skip_sindex && + next_generation(r->generation, (uint16_t)rr->generation, ns)) && + record_has_sindex(r, ns); + + uint16_t n_old_bins = 0; + int result; + + if (has_sindex) { + // Set rd->n_bins! + if ((result = as_storage_rd_load_n_bins(rd)) < 0) { + cf_warning_digest(AS_RECORD, rr->keyd, "{%s} record replace: failed load n-bins ", ns->name); + return -result; + } + + n_old_bins = rd->n_bins; + } + + as_bin old_bins[n_old_bins]; + + if (has_sindex) { + // Set rd->bins! + if ((result = as_storage_rd_load_bins(rd, old_bins)) < 0) { + cf_warning_digest(AS_RECORD, rr->keyd, "{%s} record replace: failed load bins ", ns->name); + return -result; + } + } + + // Stack space for resulting record's bins. + uint16_t n_new_bins = cf_swap_from_be16(*(uint16_t *)rr->record_buf); + as_bin new_bins[n_new_bins]; + + memset(new_bins, 0, sizeof(new_bins)); + rd->n_bins = n_new_bins; + rd->bins = new_bins; + + // Fill the new bins and particles. + cf_ll_buf_define(particles_llb, STACK_PARTICLES_SIZE); + + if ((result = unpickle_bins(rr, rd, &particles_llb)) != 0) { + cf_warning_digest(AS_RECORD, rr->keyd, "{%s} record replace: failed unpickle bins ", ns->name); + cf_ll_buf_free(&particles_llb); + return result; + } + + // Apply changes to metadata in as_index needed for and writing. + index_metadata old_metadata; + + update_index_metadata(rr, &old_metadata, r); + + // Write the record to storage. + if ((result = as_record_write_from_pickle(rd)) < 0) { + cf_warning_digest(AS_RECORD, rr->keyd, "{%s} record replace: failed write ", ns->name); + unwind_index_metadata(&old_metadata, r); + cf_ll_buf_free(&particles_llb); + return -result; + } + + // Success - adjust sindex, looking at old and new bins. + if (has_sindex) { + write_sindex_update(ns, as_index_get_set_name(r, ns), rr->keyd, + old_bins, n_old_bins, new_bins, n_new_bins); + } + + // Accommodate a new stored key - wasn't needed for writing. 
+ if (r->key_stored == 0 && rr->key) { + r->key_stored = 1; + } + + *is_delete = n_new_bins == 0; + + return 0; +} + + +void +update_index_metadata(as_remote_record *rr, index_metadata *old, as_record *r) +{ + old->void_time = r->void_time; + old->last_update_time = r->last_update_time; + old->generation = r->generation; + + r->generation = (uint16_t)rr->generation; + r->void_time = truncate_void_time(rr->rsv->ns, rr->void_time); + r->last_update_time = rr->last_update_time; +} + + +void +unwind_index_metadata(const index_metadata *old, as_record *r) +{ + r->void_time = old->void_time; + r->last_update_time = old->last_update_time; + r->generation = old->generation; +} + + +void +unwind_dim_single_bin(as_bin* old_bin, as_bin* new_bin) +{ + if (as_bin_inuse(new_bin)) { + as_bin_particle_destroy(new_bin, true); + } + + as_single_bin_copy(new_bin, old_bin); +} + + +int +unpickle_bins(as_remote_record *rr, as_storage_rd *rd, cf_ll_buf *particles_llb) +{ + as_namespace *ns = rd->ns; + + const uint8_t *end = rr->record_buf + rr->record_buf_sz; + const uint8_t *buf = rr->record_buf + 2; + + for (uint16_t i = 0; i < rd->n_bins; i++) { + if (buf >= end) { + cf_warning(AS_RECORD, "incomplete pickled record"); + return AS_PROTO_RESULT_FAIL_UNKNOWN; + } + + uint8_t name_sz = *buf++; + const uint8_t *name = buf; + + buf += name_sz; + buf++; // skipped byte was version + + if (buf > end) { + cf_warning(AS_RECORD, "incomplete pickled record"); + return AS_PROTO_RESULT_FAIL_UNKNOWN; + } + + int result; + as_bin *b = as_bin_create_from_buf(rd, name, name_sz, &result); + + if (! 
b) { + return result; + } + + if (ns->storage_data_in_memory) { + if ((result = as_bin_particle_alloc_from_pickled(b, + &buf, end)) < 0) { + return -result; + } + } + else { + if ((result = as_bin_particle_stack_from_pickled(b, particles_llb, + &buf, end)) < 0) { + return -result; + } + } + } + + if (buf != end) { + cf_warning(AS_RECORD, "extra bytes on pickled record"); + return AS_PROTO_RESULT_FAIL_UNKNOWN; + } + + return AS_PROTO_RESULT_OK; +} + + +void +xdr_write_replica(as_remote_record *rr, bool is_delete, uint32_t set_id) +{ + uint16_t generation = (uint16_t)rr->generation; + xdr_op_type op_type = XDR_OP_TYPE_WRITE; + + // Note - in this code path, only durable deletes get here. + if (is_delete) { + generation = 0; + op_type = XDR_OP_TYPE_DURABLE_DELETE; + } + + // Don't send an XDR delete if it's disallowed. + if (is_delete && ! is_xdr_delete_shipping_enabled()) { + // TODO - should we also not ship if there was no record here before? + return; + } + + xdr_write(rr->rsv->ns, rr->keyd, generation, rr->src, op_type, set_id, + NULL); +} diff --git a/as/src/base/record_ce.c b/as/src/base/record_ce.c new file mode 100644 index 00000000..f0f1f5f1 --- /dev/null +++ b/as/src/base/record_ce.c @@ -0,0 +1,136 @@ +/* + * record_ce.c + * + * Copyright (C) 2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. 
+ * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +//========================================================== +// Includes. +// + +#include +#include + +#include "citrusleaf/cf_atomic.h" +#include "citrusleaf/cf_digest.h" + +#include "fault.h" + +#include "base/datamodel.h" +#include "base/index.h" +#include "storage/storage.h" + + +//========================================================== +// Public API. +// + +uint32_t +clock_skew_stop_writes_sec() +{ + return 0; +} + + +void +handle_clock_skew(as_namespace* ns, uint64_t skew_ms) +{ +} + + +uint16_t +plain_generation(uint16_t regime_generation, const as_namespace* ns) +{ + return regime_generation; +} + + +void +as_record_set_lut(as_record *r, uint32_t regime, uint64_t now_ms, + const as_namespace* ns) +{ + // Note - last-update-time is not allowed to go backwards! + if (r->last_update_time < now_ms) { + r->last_update_time = now_ms; + } +} + + +void +as_record_increment_generation(as_record *r, const as_namespace* ns) +{ + // The generation might wrap - 0 is reserved as "uninitialized". 
+ if (++r->generation == 0) { + r->generation = 1; + } +} + + +bool +as_record_is_live(const as_record* r) +{ + return true; +} + + +int +as_record_get_live(as_index_tree* tree, cf_digest* keyd, as_index_ref* r_ref, + as_namespace* ns) +{ + return as_index_get_vlock(tree, keyd, r_ref); +} + + +int +as_record_exists_live(as_index_tree* tree, cf_digest* keyd, as_namespace* ns) +{ + return as_record_exists(tree, keyd); +} + + +void +as_record_drop_stats(as_record* r, as_namespace* ns) +{ + as_namespace_release_set_id(ns, as_index_get_set_id(r)); + + cf_atomic64_decr(&ns->n_objects); +} + + +int +as_record_write_from_pickle(as_storage_rd* rd) +{ + cf_assert(as_bin_inuse_has(rd), AS_RECORD, "unexpected binless pickle"); + + return as_storage_record_write(rd); +} + + +//========================================================== +// Private API - for enterprise separation only. +// + +int +record_resolve_conflict_cp(uint16_t left_gen, uint64_t left_lut, + uint16_t right_gen, uint64_t right_lut) +{ + cf_crash(AS_RECORD, "CE code called record_resolve_conflict_cp()"); + + return 0; +} diff --git a/as/src/base/scan.c b/as/src/base/scan.c new file mode 100644 index 00000000..0602d7b3 --- /dev/null +++ b/as/src/base/scan.c @@ -0,0 +1,1409 @@ +/* + * scan.c + * + * Copyright (C) 2015 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. 
+ * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +//============================================================================== +// Includes. +// + +#include "base/scan.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "aerospike/as_list.h" +#include "aerospike/as_module.h" +#include "aerospike/as_string.h" +#include "aerospike/as_val.h" +#include "citrusleaf/alloc.h" +#include "citrusleaf/cf_atomic.h" +#include "citrusleaf/cf_clock.h" +#include "citrusleaf/cf_digest.h" +#include "citrusleaf/cf_ll.h" +#include "citrusleaf/cf_vector.h" + +#include "dynbuf.h" +#include "fault.h" +#include "socket.h" + +#include "base/aggr.h" +#include "base/cfg.h" +#include "base/datamodel.h" +#include "base/index.h" +#include "base/job_manager.h" +#include "base/monitor.h" +#include "base/predexp.h" +#include "base/proto.h" +#include "base/secondary_index.h" +#include "base/thr_tsvc.h" +#include "base/transaction.h" +#include "base/udf_memtracker.h" +#include "fabric/exchange.h" +#include "fabric/partition.h" +#include "transaction/udf.h" + + + +//============================================================================== +// Typedefs and forward declarations. +// + +//---------------------------------------------------------- +// Scan types. +// + +typedef enum { + SCAN_TYPE_BASIC = 0, + SCAN_TYPE_AGGR = 1, + SCAN_TYPE_UDF_BG = 2, + + SCAN_TYPE_UNKNOWN = -1 +} scan_type; + +static inline const char* +scan_type_str(scan_type type) +{ + switch (type) { + case SCAN_TYPE_BASIC: + return "basic"; + case SCAN_TYPE_AGGR: + return "aggregation"; + case SCAN_TYPE_UDF_BG: + return "background-udf"; + default: + return "?"; + } +} + +//---------------------------------------------------------- +// scan_job - derived classes' public methods. 
+// + +int basic_scan_job_start(as_transaction* tr, as_namespace* ns, uint16_t set_id); +int aggr_scan_job_start(as_transaction* tr, as_namespace* ns, uint16_t set_id); +int udf_bg_scan_job_start(as_transaction* tr, as_namespace* ns, + uint16_t set_id); + +//---------------------------------------------------------- +// Non-class-specific utilities. +// + +typedef struct scan_options_s { + int priority; + bool fail_on_cluster_change; + uint32_t sample_pct; +} scan_options; + +int get_scan_set_id(as_transaction* tr, as_namespace* ns, uint16_t* p_set_id); +scan_type get_scan_type(as_transaction* tr); +bool get_scan_options(as_transaction* tr, scan_options* options); +bool get_scan_socket_timeout(as_transaction* tr, uint32_t* timeout); +bool get_scan_predexp(as_transaction* tr, predexp_eval_t** p_predexp); +size_t send_blocking_response_chunk(as_file_handle* fd_h, uint8_t* buf, size_t size, int32_t timeout); +static inline bool excluded_set(as_index* r, uint16_t set_id); + + + +//============================================================================== +// Constants. +// + +const size_t INIT_BUF_BUILDER_SIZE = 1024 * 1024 * 2; +const size_t SCAN_CHUNK_LIMIT = 1024 * 1024; + + + +//============================================================================== +// Globals. +// + +static as_job_manager g_scan_manager; + + + +//============================================================================== +// Public API. 
+// + +void +as_scan_init() +{ + as_job_manager_init(&g_scan_manager, g_config.scan_max_active, + g_config.scan_max_done, g_config.scan_threads); +} + +int +as_scan(as_transaction* tr, as_namespace* ns) +{ + int result; + uint16_t set_id = INVALID_SET_ID; + + if ((result = get_scan_set_id(tr, ns, &set_id)) != AS_PROTO_RESULT_OK) { + return result; + } + + switch (get_scan_type(tr)) { + case SCAN_TYPE_BASIC: + result = basic_scan_job_start(tr, ns, set_id); + break; + case SCAN_TYPE_AGGR: + result = aggr_scan_job_start(tr, ns, set_id); + break; + case SCAN_TYPE_UDF_BG: + result = udf_bg_scan_job_start(tr, ns, set_id); + break; + default: + cf_warning(AS_SCAN, "can't identify scan type"); + result = AS_PROTO_RESULT_FAIL_PARAMETER; + break; + } + + return result; +} + +void +as_scan_limit_active_jobs(uint32_t max_active) +{ + as_job_manager_limit_active_jobs(&g_scan_manager, max_active); +} + +void +as_scan_limit_finished_jobs(uint32_t max_done) +{ + as_job_manager_limit_finished_jobs(&g_scan_manager, max_done); +} + +void +as_scan_resize_thread_pool(uint32_t n_threads) +{ + as_job_manager_resize_thread_pool(&g_scan_manager, n_threads); +} + +int +as_scan_get_active_job_count() +{ + return as_job_manager_get_active_job_count(&g_scan_manager); +} + +int +as_scan_list(char* name, cf_dyn_buf* db) +{ + as_mon_info_cmd(AS_MON_MODULES[SCAN_MOD], NULL, 0, 0, db); + return 0; +} + +as_mon_jobstat* +as_scan_get_jobstat(uint64_t trid) +{ + return as_job_manager_get_job_info(&g_scan_manager, trid); +} + +as_mon_jobstat* +as_scan_get_jobstat_all(int* size) +{ + return as_job_manager_get_info(&g_scan_manager, size); +} + +int +as_scan_abort(uint64_t trid) +{ + return as_job_manager_abort_job(&g_scan_manager, trid) ? 0 : -1; +} + +int +as_scan_abort_all() +{ + return as_job_manager_abort_all_jobs(&g_scan_manager); +} + +int +as_scan_change_job_priority(uint64_t trid, uint32_t priority) +{ + return as_job_manager_change_job_priority(&g_scan_manager, trid, + (int)priority) ? 
0 : -1; +} + + +//============================================================================== +// Non-class-specific utilities. +// + +int +get_scan_set_id(as_transaction* tr, as_namespace* ns, uint16_t* p_set_id) +{ + uint16_t set_id = INVALID_SET_ID; + as_msg_field* f = as_transaction_has_set(tr) ? + as_msg_field_get(&tr->msgp->msg, AS_MSG_FIELD_TYPE_SET) : NULL; + + if (f && as_msg_field_get_value_sz(f) != 0) { + uint32_t set_name_len = as_msg_field_get_value_sz(f); + char set_name[set_name_len + 1]; + + memcpy(set_name, f->data, set_name_len); + set_name[set_name_len] = '\0'; + set_id = as_namespace_get_set_id(ns, set_name); + + if (set_id == INVALID_SET_ID) { + cf_warning(AS_SCAN, "scan msg from %s has unrecognized set %s", + tr->from.proto_fd_h->client, set_name); + return AS_PROTO_RESULT_FAIL_NOT_FOUND; + } + } + + *p_set_id = set_id; + + return AS_PROTO_RESULT_OK; +} + +scan_type +get_scan_type(as_transaction* tr) +{ + if (! as_transaction_is_udf(tr)) { + return SCAN_TYPE_BASIC; + } + + as_msg_field* udf_op_f = as_msg_field_get(&tr->msgp->msg, + AS_MSG_FIELD_TYPE_UDF_OP); + + if (udf_op_f && *udf_op_f->data == (uint8_t)AS_UDF_OP_AGGREGATE) { + return SCAN_TYPE_AGGR; + } + + if (udf_op_f && *udf_op_f->data == (uint8_t)AS_UDF_OP_BACKGROUND) { + return SCAN_TYPE_UDF_BG; + } + + return SCAN_TYPE_UNKNOWN; +} + +bool +get_scan_options(as_transaction* tr, scan_options* options) +{ + if (! as_transaction_has_scan_options(tr)) { + return true; + } + + as_msg_field* f = as_msg_field_get(&tr->msgp->msg, + AS_MSG_FIELD_TYPE_SCAN_OPTIONS); + + if (as_msg_field_get_value_sz(f) != 2) { + cf_warning(AS_SCAN, "scan msg options field size not 2"); + return false; + } + + options->priority = AS_MSG_FIELD_SCAN_PRIORITY(f->data[0]); + options->fail_on_cluster_change = + (AS_MSG_FIELD_SCAN_FAIL_ON_CLUSTER_CHANGE & f->data[0]) != 0; + options->sample_pct = f->data[1]; + + return true; +} + +bool +get_scan_socket_timeout(as_transaction* tr, uint32_t* timeout) +{ + if (! 
as_transaction_has_socket_timeout(tr)) { + return true; + } + + as_msg_field* f = as_msg_field_get(&tr->msgp->msg, + AS_MSG_FIELD_TYPE_SOCKET_TIMEOUT); + + if (as_msg_field_get_value_sz(f) != 4) { + cf_warning(AS_SCAN, "scan socket timeout field size not 4"); + return false; + } + + *timeout = cf_swap_from_be32(*(uint32_t*)f->data); + + return true; +} + +bool +get_scan_predexp(as_transaction* tr, predexp_eval_t** p_predexp) +{ + if (! as_transaction_has_predexp(tr)) { + return true; + } + + as_msg_field* f = as_msg_field_get(&tr->msgp->msg, + AS_MSG_FIELD_TYPE_PREDEXP); + + *p_predexp = predexp_build(f); + + return *p_predexp != NULL; +} + +size_t +send_blocking_response_chunk(as_file_handle* fd_h, uint8_t* buf, size_t size, + int32_t timeout) +{ + cf_socket* sock = &fd_h->sock; + as_proto proto; + + proto.version = PROTO_VERSION; + proto.type = PROTO_TYPE_AS_MSG; + proto.sz = size; + as_proto_swap(&proto); + + if (cf_socket_send_all(sock, (uint8_t*)&proto, sizeof(as_proto), + MSG_NOSIGNAL | MSG_MORE, timeout) < 0) { + cf_warning(AS_SCAN, "error sending to %s - fd %d %s", fd_h->client, + CSFD(sock), cf_strerror(errno)); + return 0; + } + + if (cf_socket_send_all(sock, buf, size, MSG_NOSIGNAL, timeout) < 0) { + cf_warning(AS_SCAN, "error sending to %s - fd %d sz %lu %s", + fd_h->client, CSFD(sock), size, cf_strerror(errno)); + return 0; + } + + return sizeof(as_proto) + size; +} + +static inline bool +excluded_set(as_index* r, uint16_t set_id) +{ + return set_id != INVALID_SET_ID && set_id != as_index_get_set_id(r); +} + + + +//============================================================================== +// conn_scan_job derived class implementation - not final class. +// + +//---------------------------------------------------------- +// conn_scan_job typedefs and forward declarations. 
+// + +typedef struct conn_scan_job_s { + // Base object must be first: + as_job _base; + + // Derived class data: + pthread_mutex_t fd_lock; + as_file_handle* fd_h; + int32_t fd_timeout; + + uint64_t net_io_bytes; +} conn_scan_job; + +void conn_scan_job_own_fd(conn_scan_job* job, as_file_handle* fd_h, uint32_t timeout); +void conn_scan_job_disown_fd(conn_scan_job* job); +void conn_scan_job_finish(conn_scan_job* job); +bool conn_scan_job_send_response(conn_scan_job* job, uint8_t* buf, size_t size); +void conn_scan_job_release_fd(conn_scan_job* job, bool force_close); +void conn_scan_job_info(conn_scan_job* job, as_mon_jobstat* stat); + +//---------------------------------------------------------- +// conn_scan_job API. +// + +void +conn_scan_job_own_fd(conn_scan_job* job, as_file_handle* fd_h, uint32_t timeout) +{ + pthread_mutex_init(&job->fd_lock, NULL); + + job->fd_h = fd_h; + job->fd_h->fh_info |= FH_INFO_DONOT_REAP; + job->fd_timeout = timeout == 0 ? -1 : (int32_t)timeout; + + job->net_io_bytes = 0; +} + +void +conn_scan_job_disown_fd(conn_scan_job* job) +{ + // Just undo conn_scan_job_own_fd(), nothing more. + + job->fd_h->fh_info &= ~FH_INFO_DONOT_REAP; + + pthread_mutex_destroy(&job->fd_lock); +} + +void +conn_scan_job_finish(conn_scan_job* job) +{ + as_job* _job = (as_job*)job; + + if (job->fd_h) { + // TODO - perhaps reflect in monitor if send fails? + size_t size_sent = as_msg_send_fin_timeout(&job->fd_h->sock, + _job->abandoned, job->fd_timeout); + + job->net_io_bytes += size_sent; + conn_scan_job_release_fd(job, size_sent == 0); + } + + pthread_mutex_destroy(&job->fd_lock); +} + +bool +conn_scan_job_send_response(conn_scan_job* job, uint8_t* buf, size_t size) +{ + as_job* _job = (as_job*)job; + + pthread_mutex_lock(&job->fd_lock); + + if (! job->fd_h) { + pthread_mutex_unlock(&job->fd_lock); + // Job already abandoned. 
+ return false; + } + + size_t size_sent = send_blocking_response_chunk(job->fd_h, buf, size, + job->fd_timeout); + + if (size_sent == 0) { + int reason = errno == ETIMEDOUT ? + AS_JOB_FAIL_RESPONSE_TIMEOUT : AS_JOB_FAIL_RESPONSE_ERROR; + + conn_scan_job_release_fd(job, true); + pthread_mutex_unlock(&job->fd_lock); + as_job_manager_abandon_job(_job->mgr, _job, reason); + return false; + } + + job->net_io_bytes += size_sent; + + pthread_mutex_unlock(&job->fd_lock); + return true; +} + +void +conn_scan_job_release_fd(conn_scan_job* job, bool force_close) +{ + job->fd_h->fh_info &= ~FH_INFO_DONOT_REAP; + job->fd_h->last_used = cf_getms(); + as_end_of_transaction(job->fd_h, force_close); + job->fd_h = NULL; +} + +void +conn_scan_job_info(conn_scan_job* job, as_mon_jobstat* stat) +{ + stat->net_io_bytes = job->net_io_bytes; +} + + + +//============================================================================== +// basic_scan_job derived class implementation. +// + +//---------------------------------------------------------- +// basic_scan_job typedefs and forward declarations. 
+//
+
+// Per-job state of a basic (record-returning) scan. The layout starts with the
+// conn_scan_job base so the object can be cast to conn_scan_job* / as_job*.
+typedef struct basic_scan_job_s {
+	// Base object must be first:
+	conn_scan_job	_base;
+
+	// Derived class data:
+	uint64_t		cluster_key;
+	bool			fail_on_cluster_change;
+	bool			no_bin_data;
+	uint32_t		sample_pct;
+	predexp_eval_t*	predexp;
+	cf_vector*		bin_names;
+} basic_scan_job;
+
+void basic_scan_job_slice(as_job* _job, as_partition_reservation* rsv);
+void basic_scan_job_finish(as_job* _job);
+void basic_scan_job_destroy(as_job* _job);
+void basic_scan_job_info(as_job* _job, as_mon_jobstat* stat);
+
+// Virtual method table handed to as_job_init() for basic scan jobs.
+const as_job_vtable basic_scan_job_vtable = {
+		basic_scan_job_slice,
+		basic_scan_job_finish,
+		basic_scan_job_destroy,
+		basic_scan_job_info
+};
+
+// Context passed to the reduce callback for one partition slice.
+typedef struct basic_scan_slice_s {
+	basic_scan_job*		job;
+	cf_buf_builder**	bb_r;
+} basic_scan_slice;
+
+void basic_scan_job_reduce_cb(as_index_ref* r_ref, void* udata);
+cf_vector* bin_names_from_op(as_msg* m, int* result);
+
+//----------------------------------------------------------
+// basic_scan_job public API.
+//
+
+// Build a basic scan job from the client transaction and hand it to the scan
+// job manager. Returns AS_PROTO_RESULT_OK once the job is started, otherwise
+// the AS_PROTO_RESULT_FAIL_* (or manager) code describing why it was not.
+// On success the job owns the transaction's socket.
+int
+basic_scan_job_start(as_transaction* tr, as_namespace* ns, uint16_t set_id)
+{
+	basic_scan_job* scan = cf_malloc(sizeof(basic_scan_job));
+	as_job* base = (as_job*)scan;
+	conn_scan_job* conn = (conn_scan_job*)scan;
+
+	scan_options opts = { .sample_pct = 100 };
+	uint32_t sock_timeout = CF_SOCKET_TIMEOUT;
+	predexp_eval_t* filter = NULL;
+
+	// Parse the message fields in order, stopping at the first failure.
+	bool fields_ok = get_scan_options(tr, &opts);
+
+	fields_ok = fields_ok && get_scan_socket_timeout(tr, &sock_timeout);
+	fields_ok = fields_ok && get_scan_predexp(tr, &filter);
+
+	if (! fields_ok) {
+		cf_warning(AS_SCAN, "basic scan job failed msg field processing");
+		cf_free(scan);
+		return AS_PROTO_RESULT_FAIL_PARAMETER;
+	}
+
+	as_job_init(base, &basic_scan_job_vtable, &g_scan_manager, RSV_WRITE,
+			as_transaction_trid(tr), ns, set_id, opts.priority);
+
+	scan->cluster_key = as_exchange_cluster_key();
+	scan->fail_on_cluster_change = opts.fail_on_cluster_change;
+	scan->no_bin_data = (tr->msgp->msg.info1 & AS_MSG_INFO1_GET_NO_BINS) != 0;
+	scan->sample_pct = opts.sample_pct;
+	scan->predexp = filter;
+
+	int rv;
+
+	scan->bin_names = bin_names_from_op(&tr->msgp->msg, &rv);
+
+	// A NULL vector is only an error when the helper also reported one.
+	if (scan->bin_names == NULL && rv != AS_PROTO_RESULT_OK) {
+		as_job_destroy(base);
+		return rv;
+	}
+
+	if (scan->fail_on_cluster_change) {
+		bool migrating =
+				cf_atomic_int_get(ns->migrate_tx_partitions_remaining) != 0 ||
+				cf_atomic_int_get(ns->migrate_rx_partitions_remaining) != 0;
+
+		if (migrating) {
+			// TODO - was AS_PROTO_RESULT_FAIL_UNAVAILABLE - ok?
+			cf_warning(AS_SCAN, "basic scan job not started - migration");
+			as_job_destroy(base);
+			return AS_PROTO_RESULT_FAIL_CLUSTER_KEY_MISMATCH;
+		}
+	}
+
+	// Take ownership of socket from transaction.
+	conn_scan_job_own_fd(conn, tr->from.proto_fd_h, sock_timeout);
+
+	cf_info(AS_SCAN, "starting basic scan job %lu {%s:%s} priority %u, sample-pct %u%s%s",
+			base->trid, ns->name, as_namespace_get_set_name(ns, set_id),
+			base->priority, scan->sample_pct,
+			scan->no_bin_data ? ", metadata-only" : "",
+			scan->fail_on_cluster_change ? ", fail-on-cluster-change" : "");
+
+	rv = as_job_manager_start_job(base->mgr, base);
+
+	if (rv != 0) {
+		cf_warning(AS_SCAN, "basic scan job %lu failed to start (%d)",
+				base->trid, rv);
+		// Give the socket back before tearing the job down.
+		conn_scan_job_disown_fd(conn);
+		as_job_destroy(base);
+		return rv;
+	}
+
+	return AS_PROTO_RESULT_OK;
+}
+
+//----------------------------------------------------------
+// basic_scan_job mandatory scan_job interface.
+//
+
+// Scan one reserved partition: reduce its index tree (all of it, or a
+// sample_pct-sized portion), accumulating responses in a buf-builder that the
+// reduce callback flushes when it grows past SCAN_CHUNK_LIMIT, then send
+// whatever is left and free the builder.
+void
+basic_scan_job_slice(as_job* _job, as_partition_reservation* rsv)
+{
+	basic_scan_job* job = (basic_scan_job*)_job;
+	as_index_tree* tree = rsv->tree;
+	cf_buf_builder* bb = cf_buf_builder_create_size(INIT_BUF_BUILDER_SIZE);
+
+	if (! bb) {
+		// NOTE(review): other abandon calls in this file pass AS_JOB_FAIL_*
+		// reasons - confirm AS_PROTO_RESULT_FAIL_UNKNOWN is the same value.
+		as_job_manager_abandon_job(_job->mgr, _job,
+				AS_PROTO_RESULT_FAIL_UNKNOWN);
+		return;
+	}
+
+	uint64_t slice_start = cf_getms();
+	// The callback gets &bb so it can see the builder if it is replaced.
+	basic_scan_slice slice = { job, &bb };
+
+	if (job->sample_pct == 100) {
+		as_index_reduce_live(tree, basic_scan_job_reduce_cb, (void*)&slice);
+	}
+	else {
+		// NOTE(review): the product is only overflow-safe if
+		// as_index_tree_size() returns a 64-bit value - confirm.
+		uint64_t sample_count =
+				((as_index_tree_size(tree) * job->sample_pct) / 100);
+
+		as_index_reduce_partial_live(tree, sample_count,
+				basic_scan_job_reduce_cb, (void*)&slice);
+	}
+
+	// Flush the final, partially filled chunk.
+	if (bb->used_sz != 0) {
+		conn_scan_job_send_response((conn_scan_job*)job, bb->buf, bb->used_sz);
+	}
+
+	// TODO - guts don't check buf_builder realloc failures rigorously.
+	cf_buf_builder_free(bb);
+
+	cf_detail(AS_SCAN, "%s:%u basic scan job %lu in thread %lu took %lu ms",
+			rsv->ns->name, rsv->p->id, _job->trid, pthread_self(),
+			cf_getms() - slice_start);
+}
+
+// Job-finished hook: finish the connection part, then bump exactly one of the
+// namespace's basic-scan counters (complete / abort / error) according to
+// _job->abandoned, and log the outcome.
+void
+basic_scan_job_finish(as_job* _job)
+{
+	conn_scan_job_finish((conn_scan_job*)_job);
+
+	switch (_job->abandoned) {
+	case 0:
+		cf_atomic_int_incr(&_job->ns->n_scan_basic_complete);
+		break;
+	case AS_JOB_FAIL_USER_ABORT:
+		cf_atomic_int_incr(&_job->ns->n_scan_basic_abort);
+		break;
+	// Every other reason, listed or not, counts as an error.
+	case AS_JOB_FAIL_UNKNOWN:
+	case AS_JOB_FAIL_CLUSTER_KEY:
+	case AS_JOB_FAIL_RESPONSE_ERROR:
+	case AS_JOB_FAIL_RESPONSE_TIMEOUT:
+	default:
+		cf_atomic_int_incr(&_job->ns->n_scan_basic_error);
+		break;
+	}
+
+	cf_info(AS_SCAN, "finished basic scan job %lu (%d)", _job->trid,
+			_job->abandoned);
+}
+
+// Release the resources owned by the derived part of the job: the optional
+// bin-name vector and the optional predexp filter.
+void
+basic_scan_job_destroy(as_job* _job)
+{
+	basic_scan_job* job = (basic_scan_job*)_job;
+
+	if (job->bin_names) {
+		cf_vector_destroy(job->bin_names);
+	}
+
+	if (job->predexp) {
+		predexp_destroy(job->predexp);
+	}
+}
+
+// Fill in the monitor record: job type string plus connection-level stats.
+void
+basic_scan_job_info(as_job* _job, as_mon_jobstat* stat)
+{
+	// NOTE(review): unbounded copy - relies on job_type being large enough for
+	// every scan_type_str() result.
+	strcpy(stat->job_type, scan_type_str(SCAN_TYPE_BASIC));
+	conn_scan_job_info((conn_scan_job*)_job, stat);
+}
+
+//----------------------------------------------------------
+// basic_scan_job utilities.
+//
+
+// Index-reduce callback, invoked once per record reference with udata pointing
+// at the slice context. Every exit path calls as_record_done() on r_ref
+// exactly once. Records that pass the set / doomed / predexp filters are
+// appended to the slice's buf-builder, which is sent to the client and reset
+// once it exceeds SCAN_CHUNK_LIMIT.
+void
+basic_scan_job_reduce_cb(as_index_ref* r_ref, void* udata)
+{
+	basic_scan_slice* slice = (basic_scan_slice*)udata;
+	basic_scan_job* job = slice->job;
+	as_job* _job = (as_job*)job;
+	as_namespace* ns = _job->ns;
+
+	// Job already abandoned - just release the record and do nothing else.
+	if (_job->abandoned != 0) {
+		as_record_done(r_ref, ns);
+		return;
+	}
+
+	// Abandon if the cluster key changed since the job started (only when the
+	// client asked for fail-on-cluster-change).
+	if (job->fail_on_cluster_change &&
+			job->cluster_key != as_exchange_cluster_key()) {
+		as_record_done(r_ref, ns);
+		as_job_manager_abandon_job(_job->mgr, _job,
+				AS_PROTO_RESULT_FAIL_CLUSTER_KEY_MISMATCH);
+		return;
+	}
+
+	as_index* r = r_ref->r;
+
+	if (excluded_set(r, _job->set_id) || as_record_is_doomed(r, ns)) {
+		as_record_done(r_ref, ns);
+		return;
+	}
+
+	predexp_args_t predargs = { .ns = ns, .md = r, .vl = NULL, .rd = NULL };
+
+	// First filter pass uses record metadata only (rd is still NULL).
+	if (job->predexp && ! predexp_matches_metadata(job->predexp, &predargs)) {
+		as_record_done(r_ref, ns);
+		return;
+	}
+
+	as_storage_rd rd;
+
+	as_storage_record_open(ns, r, &rd);
+
+	if (job->no_bin_data) {
+		// TODO - suppose the predexp needs bin values???
+
+		as_msg_make_response_bufbuilder(slice->bb_r, &rd, true, true, true,
+				NULL);
+	}
+	else {
+		as_storage_rd_load_n_bins(&rd); // TODO - handle error returned
+
+		// Stack space for bins is only sized when data is not in memory.
+		as_bin stack_bins[rd.ns->storage_data_in_memory ? 0 : rd.n_bins];
+
+		as_storage_rd_load_bins(&rd, stack_bins); // TODO - handle error returned
+
+		predargs.rd = &rd;
+
+		// Second filter pass, now with the loaded record available.
+		if (job->predexp && ! predexp_matches_record(job->predexp, &predargs)) {
+			as_storage_record_close(&rd);
+			as_record_done(r_ref, ns);
+			return;
+		}
+
+		as_msg_make_response_bufbuilder(slice->bb_r, &rd, false, true, true,
+				job->bin_names);
+	}
+
+	as_storage_record_close(&rd);
+	as_record_done(r_ref, ns);
+
+	cf_atomic64_incr(&_job->n_records_read);
+
+	// Re-read through bb_r - the builder pointer is passed by reference to the
+	// response-building call above.
+	cf_buf_builder* bb = *slice->bb_r;
+
+	// If we exceed the proto size limit, send accumulated data back to client
+	// and reset the buf-builder to start a new proto.
+	if (bb->used_sz > SCAN_CHUNK_LIMIT) {
+		if (! conn_scan_job_send_response((conn_scan_job*)job, bb->buf,
+				bb->used_sz)) {
+			return;
+		}
+
+		cf_buf_builder_reset(bb);
+	}
+}
+
+// Collect the bin names from the message's ops into a vector of
+// AS_ID_BIN_SZ-sized, null-terminated entries (appended uniquely).
+//
+// Returns NULL with *result == AS_PROTO_RESULT_OK when the message has no ops,
+// NULL with *result == AS_PROTO_RESULT_FAIL_BIN_NAME when a name does not fit,
+// otherwise the vector, which the caller must destroy.
+cf_vector*
+bin_names_from_op(as_msg* m, int* result)
+{
+	*result = AS_PROTO_RESULT_OK;
+
+	if (m->n_ops == 0) {
+		return NULL;
+	}
+
+	cf_vector* v = cf_vector_create(AS_ID_BIN_SZ, m->n_ops, 0);
+
+	as_msg_op* op = NULL;
+	int n = 0;
+
+	while ((op = as_msg_op_iterate(m, op, &n)) != NULL) {
+		// Need room for the terminating null, hence >=.
+		if (op->name_sz >= AS_ID_BIN_SZ) {
+			cf_warning(AS_SCAN, "basic scan job bin name too long");
+			cf_vector_destroy(v);
+			*result = AS_PROTO_RESULT_FAIL_BIN_NAME;
+			return NULL;
+		}
+
+		char bin_name[AS_ID_BIN_SZ];
+
+		memcpy(bin_name, op->name, op->name_sz);
+		bin_name[op->name_sz] = 0;
+		cf_vector_append_unique(v, (void*)bin_name);
+	}
+
+	return v;
+}
+
+
+
+//==============================================================================
+// aggr_scan_job derived class implementation.
+//
+
+//----------------------------------------------------------
+// aggr_scan_job typedefs and forward declarations.
+//
+
+// Per-job state of an aggregation scan: the connection-owning base plus the
+// aggregation call definition.
+typedef struct aggr_scan_job_s {
+	// Base object must be first:
+	conn_scan_job	_base;
+
+	// Derived class data:
+	as_aggr_call	aggr_call;
+} aggr_scan_job;
+
+void aggr_scan_job_slice(as_job* _job, as_partition_reservation* rsv);
+void aggr_scan_job_finish(as_job* _job);
+void aggr_scan_job_destroy(as_job* _job);
+void aggr_scan_job_info(as_job* _job, as_mon_jobstat* stat);
+
+// Virtual method table handed to as_job_init() for aggregation scan jobs.
+const as_job_vtable aggr_scan_job_vtable = {
+		aggr_scan_job_slice,
+		aggr_scan_job_finish,
+		aggr_scan_job_destroy,
+		aggr_scan_job_info
+};
+
+// Context shared by the reduce callback and the aggregation hooks while one
+// partition slice is being processed.
+typedef struct aggr_scan_slice_s {
+	aggr_scan_job*				job;
+	cf_ll*						ll;
+	cf_buf_builder**			bb_r;
+	as_partition_reservation*	rsv;
+} aggr_scan_slice;
+
+bool aggr_scan_init(as_aggr_call* call, const as_transaction* tr);
+void aggr_scan_job_reduce_cb(as_index_ref* r_ref, void* udata);
+bool aggr_scan_add_digest(cf_ll* ll, cf_digest* keyd);
+as_partition_reservation* aggr_scan_ptn_reserve(void* udata, as_namespace* ns,
+		uint32_t pid, as_partition_reservation* rsv);
+as_stream_status aggr_scan_ostream_write(void* udata, as_val* val);
+
+// Hooks given to the aggregation engine; only output-stream write and
+// partition reserve are provided, the rest are left NULL.
+const as_aggr_hooks scan_aggr_hooks = {
+	.ostream_write = aggr_scan_ostream_write,
+	.set_error     = NULL,
+	.ptn_reserve   = aggr_scan_ptn_reserve,
+	.ptn_release   = NULL,
+	.pre_check     = NULL
+};
+
+void aggr_scan_add_val_response(aggr_scan_slice* slice, const as_val* val,
+		bool success);
+
+//----------------------------------------------------------
+// aggr_scan_job public API.
+//
+
+// Build an aggregation scan job from the client transaction and hand it to the
+// scan job manager. Predexp filters are rejected. Returns AS_PROTO_RESULT_OK
+// once started, otherwise the failure code; on success the job owns the
+// transaction's socket.
+int
+aggr_scan_job_start(as_transaction* tr, as_namespace* ns, uint16_t set_id)
+{
+	aggr_scan_job* job = cf_malloc(sizeof(aggr_scan_job));
+	as_job* _job = (as_job*)job;
+
+	scan_options options = { .sample_pct = 100 };
+	uint32_t timeout = CF_SOCKET_TIMEOUT;
+
+	// Before as_job_init() the job is plain memory, so failures just free it.
+	if (! get_scan_options(tr, &options) ||
+			! get_scan_socket_timeout(tr, &timeout)) {
+		cf_warning(AS_SCAN, "aggregation scan job failed msg field processing");
+		cf_free(job);
+		return AS_PROTO_RESULT_FAIL_PARAMETER;
+	}
+
+	if (as_transaction_has_predexp(tr)) {
+		cf_warning(AS_SCAN, "aggregation scans do not support predexp filters");
+		cf_free(job);
+		return AS_PROTO_RESULT_FAIL_UNSUPPORTED_FEATURE;
+	}
+
+	as_job_init(_job, &aggr_scan_job_vtable, &g_scan_manager, RSV_WRITE,
+			as_transaction_trid(tr), ns, set_id, options.priority);
+
+	// From here on failures go through as_job_destroy().
+	if (! aggr_scan_init(&job->aggr_call, tr)) {
+		cf_warning(AS_SCAN, "aggregation scan job failed call init");
+		as_job_destroy(_job);
+		return AS_PROTO_RESULT_FAIL_PARAMETER;
+	}
+
+	// Take ownership of socket from transaction.
+	conn_scan_job_own_fd((conn_scan_job*)job, tr->from.proto_fd_h, timeout);
+
+	cf_info(AS_SCAN, "starting aggregation scan job %lu {%s:%s} priority %u",
+			_job->trid, ns->name, as_namespace_get_set_name(ns, set_id),
+			_job->priority);
+
+	int result = as_job_manager_start_job(_job->mgr, _job);
+
+	if (result != 0) {
+		cf_warning(AS_SCAN, "aggregation scan job %lu failed to start (%d)",
+				_job->trid, result);
+		// Give the socket back before tearing the job down.
+		conn_scan_job_disown_fd((conn_scan_job*)job);
+		as_job_destroy(_job);
+		return result;
+	}
+
+	return AS_PROTO_RESULT_OK;
+}
+
+//----------------------------------------------------------
+// aggr_scan_job mandatory scan_job interface.
+//
+
+// Process one reserved partition: collect the digests of all eligible records
+// into a linked list, run the aggregation over that list (results are
+// streamed into the buf-builder through the hooks), report an aggregation
+// failure to the client as an error value, then flush and free the builder.
+void
+aggr_scan_job_slice(as_job* _job, as_partition_reservation* rsv)
+{
+	aggr_scan_job* job = (aggr_scan_job*)_job;
+	cf_ll ll;
+	cf_buf_builder* bb = cf_buf_builder_create_size(INIT_BUF_BUILDER_SIZE);
+
+	if (! bb) {
+		as_job_manager_abandon_job(_job->mgr, _job,
+				AS_PROTO_RESULT_FAIL_UNKNOWN);
+		return;
+	}
+
+	cf_ll_init(&ll, as_index_keys_ll_destroy_fn, false);
+
+	aggr_scan_slice slice = { job, &ll, &bb, rsv };
+
+	// Pass 1 - gather digests only; no record data is read here.
+	as_index_reduce_live(rsv->tree, aggr_scan_job_reduce_cb, (void*)&slice);
+
+	if (cf_ll_size(&ll) != 0) {
+		as_result result;
+		as_result_init(&result);
+
+		// Pass 2 - run the aggregation over the gathered digests.
+		int ret = as_aggr_process(_job->ns, &job->aggr_call, &ll, (void*)&slice,
+				&result);
+
+		if (ret != 0) {
+			// rs is reallocated and freed below, so it is treated as heap
+			// memory owned by this function.
+			char* rs = as_module_err_string(ret);
+
+			if (result.value) {
+				as_string* lua_s = as_string_fromval(result.value);
+				char* lua_err = (char*)as_string_tostring(lua_s);
+
+				if (lua_err) {
+					int l_rs_len = strlen(rs);
+
+					// +4 covers the " : " separator and the terminating null.
+					rs = cf_realloc(rs, l_rs_len + strlen(lua_err) + 4);
+					sprintf(&rs[l_rs_len], " : %s", lua_err);
+				}
+			}
+
+			// The as_string is created with free == false, so rs is released
+			// separately via cf_free() after the value is destroyed.
+			const as_val* v = (as_val*)as_string_new(rs, false);
+
+			aggr_scan_add_val_response(&slice, v, false);
+			as_val_destroy(v);
+			cf_free(rs);
+			as_job_manager_abandon_job(_job->mgr, _job,
+					AS_PROTO_RESULT_FAIL_UNKNOWN);
+		}
+
+		as_result_destroy(&result);
+	}
+
+	// Walk the digest list with as_index_keys_ll_reduce_fn.
+	// NOTE(review): presumably this empties the list - confirm.
+	cf_ll_reduce(&ll, true, as_index_keys_ll_reduce_fn, NULL);
+
+	if (bb->used_sz != 0) {
+		conn_scan_job_send_response((conn_scan_job*)job, bb->buf, bb->used_sz);
+	}
+
+	// TODO - guts don't check buf_builder realloc failures rigorously.
+	cf_buf_builder_free(bb);
+}
+
+// Job-finished hook: finish the connection part, free the UDF argument list
+// (nulling the pointer so the destroy hook does not free it again), bump one
+// of the namespace's aggregation-scan counters, and log the outcome.
+void
+aggr_scan_job_finish(as_job* _job)
+{
+	aggr_scan_job* job = (aggr_scan_job*)_job;
+
+	conn_scan_job_finish((conn_scan_job*)job);
+
+	if (job->aggr_call.def.arglist) {
+		as_list_destroy(job->aggr_call.def.arglist);
+		job->aggr_call.def.arglist = NULL;
+	}
+
+	switch (_job->abandoned) {
+	case 0:
+		cf_atomic_int_incr(&_job->ns->n_scan_aggr_complete);
+		break;
+	case AS_JOB_FAIL_USER_ABORT:
+		cf_atomic_int_incr(&_job->ns->n_scan_aggr_abort);
+		break;
+	// Every other reason, listed or not, counts as an error.
+	case AS_JOB_FAIL_UNKNOWN:
+	case AS_JOB_FAIL_CLUSTER_KEY:
+	case AS_JOB_FAIL_RESPONSE_ERROR:
+	case AS_JOB_FAIL_RESPONSE_TIMEOUT:
+	default:
+		cf_atomic_int_incr(&_job->ns->n_scan_aggr_error);
+		break;
+	}
+
+	cf_info(AS_SCAN, "finished aggregation scan job %lu (%d)", _job->trid,
+			_job->abandoned);
+}
+
+// Destroy hook: free the UDF argument list if it is still set (covers jobs
+// destroyed without going through the finish hook).
+void
+aggr_scan_job_destroy(as_job* _job)
+{
+	aggr_scan_job* job = (aggr_scan_job*)_job;
+
+	if (job->aggr_call.def.arglist) {
+		as_list_destroy(job->aggr_call.def.arglist);
+	}
+}
+
+// Fill in the monitor record: job type string plus connection-level stats.
+void
+aggr_scan_job_info(as_job* _job, as_mon_jobstat* stat)
+{
+	strcpy(stat->job_type, scan_type_str(SCAN_TYPE_AGGR));
+	conn_scan_job_info((conn_scan_job*)_job, stat);
+}
+
+//----------------------------------------------------------
+// aggr_scan_job utilities.
+//
+
+// Initialize the aggregation call from the transaction's UDF definition and
+// attach the scan hooks. Returns false if the definition cannot be read.
+bool
+aggr_scan_init(as_aggr_call* call, const as_transaction* tr)
+{
+	if (! udf_def_init_from_msg(&call->def, tr)) {
+		return false;
+	}
+
+	call->aggr_hooks = &scan_aggr_hooks;
+
+	return true;
+}
+
+// Index-reduce callback: records the digest of each eligible record in the
+// slice's list. Every exit path calls as_record_done() on r_ref exactly once.
+void
+aggr_scan_job_reduce_cb(as_index_ref* r_ref, void* udata)
+{
+	aggr_scan_slice* slice = (aggr_scan_slice*)udata;
+	aggr_scan_job* job = slice->job;
+	as_job* _job = (as_job*)job;
+	as_namespace* ns = _job->ns;
+
+	if (_job->abandoned != 0) {
+		as_record_done(r_ref, ns);
+		return;
+	}
+
+	as_index* r = r_ref->r;
+
+	if (excluded_set(r, _job->set_id) || as_record_is_doomed(r, ns)) {
+		as_record_done(r_ref, ns);
+		return;
+	}
+
+	// Failing to store the digest abandons the whole job.
+	if (! aggr_scan_add_digest(slice->ll, &r->keyd)) {
+		as_record_done(r_ref, ns);
+		as_job_manager_abandon_job(_job->mgr, _job,
+				AS_PROTO_RESULT_FAIL_UNKNOWN);
+		return;
+	}
+
+	cf_atomic64_incr(&_job->n_records_read);
+	as_record_done(r_ref, ns);
+}
+
+// Append a digest to the list of key arrays. The tail array is reused until it
+// holds AS_INDEX_KEYS_PER_ARR digests; then a new array and list element are
+// appended. Returns false only when a new key array cannot be obtained.
+bool
+aggr_scan_add_digest(cf_ll* ll, cf_digest* keyd)
+{
+	as_index_keys_ll_element* tail_e = (as_index_keys_ll_element*)ll->tail;
+	as_index_keys_arr* keys_arr;
+
+	if (tail_e) {
+		keys_arr = tail_e->keys_arr;
+
+		// Tail array is full - force allocation of a new element below.
+		if (keys_arr->num == AS_INDEX_KEYS_PER_ARR) {
+			tail_e = NULL;
+		}
+	}
+
+	if (! tail_e) {
+		if (! (keys_arr = as_index_get_keys_arr())) {
+			return false;
+		}
+
+		tail_e = cf_malloc(sizeof(as_index_keys_ll_element));
+
+		tail_e->keys_arr = keys_arr;
+		cf_ll_append(ll, (cf_ll_element*)tail_e);
+	}
+
+	keys_arr->pindex_digs[keys_arr->num] = *keyd;
+	keys_arr->num++;
+
+	return true;
+}
+
+// Partition-reserve hook: the slice already holds a reservation, so return it
+// and ignore the ns / pid / rsv arguments.
+as_partition_reservation*
+aggr_scan_ptn_reserve(void* udata, as_namespace* ns, uint32_t pid,
+		as_partition_reservation* rsv)
+{
+	aggr_scan_slice* slice = (aggr_scan_slice*)udata;
+
+	return slice->rsv;
+}
+
+// Output-stream hook: append a non-NULL value to the response as a success
+// value and destroy it. Always returns AS_STREAM_OK.
+as_stream_status
+aggr_scan_ostream_write(void* udata, as_val* val)
+{
+	aggr_scan_slice* slice = (aggr_scan_slice*)udata;
+
+	if (val) {
+		aggr_scan_add_val_response(slice, val, true);
+		as_val_destroy(val);
+	}
+
+	return AS_STREAM_OK;
+}
+
+// Serialize one value (success or error, per the flag) into the slice's
+// buf-builder, sending and resetting the builder once it grows past
+// SCAN_CHUNK_LIMIT.
+void
+aggr_scan_add_val_response(aggr_scan_slice* slice, const as_val* val,
+		bool success)
+{
+	uint32_t size = as_particle_asval_client_value_size(val);
+
+	as_msg_make_val_response_bufbuilder(val, slice->bb_r, size, success);
+
+	// Re-read through bb_r - the builder pointer is passed by reference above.
+	cf_buf_builder* bb = *slice->bb_r;
+	conn_scan_job* conn_job = (conn_scan_job*)slice->job;
+
+	// If we exceed the proto size limit, send accumulated data back to client
+	// and reset the buf-builder to start a new proto.
+	if (bb->used_sz > SCAN_CHUNK_LIMIT) {
+		if (! conn_scan_job_send_response(conn_job, bb->buf, bb->used_sz)) {
+			return;
+		}
+
+		cf_buf_builder_reset(bb);
+	}
+}
+
+
+
+//==============================================================================
+// udf_bg_scan_job derived class implementation.
+//
+
+//----------------------------------------------------------
+// udf_bg_scan_job typedefs and forward declarations.
+//
+
+// Per-job state of a background UDF scan. Unlike the other scan types the base
+// is a plain as_job - this job does not keep the client socket.
+typedef struct udf_bg_scan_job_s {
+	// Base object must be first:
+	as_job			_base;
+
+	// Derived class data:
+	iudf_origin		origin;
+	bool			is_durable_delete; // enterprise only
+	cf_atomic32		n_active_tr;
+
+	cf_atomic64		n_successful_tr;
+	cf_atomic64		n_failed_tr;
+} udf_bg_scan_job;
+
+void udf_bg_scan_job_slice(as_job* _job, as_partition_reservation* rsv);
+void udf_bg_scan_job_finish(as_job* _job);
+void udf_bg_scan_job_destroy(as_job* _job);
+void udf_bg_scan_job_info(as_job* _job, as_mon_jobstat* stat);
+
+// Virtual method table handed to as_job_init() for background UDF scan jobs.
+const as_job_vtable udf_bg_scan_job_vtable = {
+		udf_bg_scan_job_slice,
+		udf_bg_scan_job_finish,
+		udf_bg_scan_job_destroy,
+		udf_bg_scan_job_info
+};
+
+void udf_bg_scan_job_reduce_cb(as_index_ref* r_ref, void* udata);
+int udf_bg_scan_tr_complete(void* udata, int retcode);
+
+//----------------------------------------------------------
+// udf_bg_scan_job public API.
+//
+
+// Build a background UDF scan job, start it, and immediately answer the client
+// with a "fin" message - the scan itself continues without the socket.
+// Returns AS_PROTO_RESULT_OK once the job is started (even if sending the fin
+// fails), otherwise the failure code.
+int
+udf_bg_scan_job_start(as_transaction* tr, as_namespace* ns, uint16_t set_id)
+{
+	udf_bg_scan_job* job = cf_malloc(sizeof(udf_bg_scan_job));
+	as_job* _job = (as_job*)job;
+
+	scan_options options = { .sample_pct = 100 };
+	predexp_eval_t* predexp = NULL;
+
+	if (! get_scan_options(tr, &options) || ! get_scan_predexp(tr, &predexp)) {
+		cf_warning(AS_SCAN, "udf-bg scan job failed msg field processing");
+		cf_free(job);
+		return AS_PROTO_RESULT_FAIL_PARAMETER;
+	}
+
+	as_job_init(_job, &udf_bg_scan_job_vtable, &g_scan_manager, RSV_WRITE,
+			as_transaction_trid(tr), ns, set_id, options.priority);
+
+	// The filter is stored in the origin, which the destroy hook tears down.
+	job->origin.predexp = predexp;
+	job->is_durable_delete = as_transaction_is_durable_delete(tr);
+	job->n_active_tr = 0;
+	job->n_successful_tr = 0;
+	job->n_failed_tr = 0;
+
+	if (! udf_def_init_from_msg(&job->origin.def, tr)) {
+		cf_warning(AS_SCAN, "udf-bg scan job failed def init");
+		as_job_destroy(_job);
+		return AS_PROTO_RESULT_FAIL_PARAMETER;
+	}
+
+	// Completion callback and its context for each internal UDF transaction.
+	job->origin.cb = udf_bg_scan_tr_complete;
+	job->origin.udata = (void*)job;
+
+	cf_info(AS_SCAN, "starting udf-bg scan job %lu {%s:%s} priority %u",
+			_job->trid, ns->name, as_namespace_get_set_name(ns, set_id),
+			_job->priority);
+
+	int result = as_job_manager_start_job(_job->mgr, _job);
+
+	if (result != 0) {
+		cf_warning(AS_SCAN, "udf-bg scan job %lu failed to start (%d)",
+				_job->trid, result);
+		as_job_destroy(_job);
+		return result;
+	}
+
+	// Job is running - acknowledge the client right away.
+	if (as_msg_send_fin(&tr->from.proto_fd_h->sock, AS_PROTO_RESULT_OK)) {
+		tr->from.proto_fd_h->last_used = cf_getms();
+		as_end_of_transaction_ok(tr->from.proto_fd_h);
+	}
+	else {
+		cf_warning(AS_SCAN, "udf-bg scan job error sending fin");
+		as_end_of_transaction_force_close(tr->from.proto_fd_h);
+		// No point returning an error - it can't be reported on this socket.
+	}
+
+	// The handle has been released either way - detach it from the transaction.
+	tr->from.proto_fd_h = NULL;
+
+	return AS_PROTO_RESULT_OK;
+}
+
+//----------------------------------------------------------
+// udf_bg_scan_job mandatory scan_job interface.
+//
+
+// Slice hook: walk the partition's index tree, passing the job itself as the
+// callback context.
+void
+udf_bg_scan_job_slice(as_job* _job, as_partition_reservation* rsv)
+{
+	void* cb_udata = (void*)_job;
+
+	as_index_reduce_live(rsv->tree, udf_bg_scan_job_reduce_cb, cb_udata);
+}
+
+// Finish hook: wait (polling every 100 microseconds) until no internal UDF
+// transaction is still active, then count the job as complete, aborted or
+// failed on the namespace and log the outcome.
+void
+udf_bg_scan_job_finish(as_job* _job)
+{
+	udf_bg_scan_job* udf_job = (udf_bg_scan_job*)_job;
+
+	for (;;) {
+		if (cf_atomic32_get(udf_job->n_active_tr) == 0) {
+			break;
+		}
+
+		usleep(100);
+	}
+
+	if (_job->abandoned == 0) {
+		cf_atomic_int_incr(&_job->ns->n_scan_udf_bg_complete);
+	}
+	else if (_job->abandoned == AS_JOB_FAIL_USER_ABORT) {
+		cf_atomic_int_incr(&_job->ns->n_scan_udf_bg_abort);
+	}
+	else {
+		// AS_JOB_FAIL_UNKNOWN, AS_JOB_FAIL_CLUSTER_KEY and anything else.
+		cf_atomic_int_incr(&_job->ns->n_scan_udf_bg_error);
+	}
+
+	cf_info(AS_SCAN, "finished udf-bg scan job %lu (%d)", _job->trid,
+			_job->abandoned);
+}
+
+// Destroy hook: tear down the internal-UDF origin owned by the job.
+void
+udf_bg_scan_job_destroy(as_job* _job)
+{
+	iudf_origin_destroy(&((udf_bg_scan_job*)_job)->origin);
+}
+
+// Info hook: report the job type, the size of the one synchronous fin sent to
+// the client, and the UDF identity / progress counters appended to whatever
+// stat->jdata already contains.
+void
+udf_bg_scan_job_info(as_job* _job, as_mon_jobstat* stat)
+{
+	udf_bg_scan_job* udf_job = (udf_bg_scan_job*)_job;
+
+	strcpy(stat->job_type, scan_type_str(SCAN_TYPE_UDF_BG));
+	stat->net_io_bytes = sizeof(cl_msg); // size of original synchronous fin
+
+	// NOTE(review): unbounded append - relies on jdata having room for the
+	// filename, function name and counters after its existing content.
+	size_t used = strlen(stat->jdata);
+
+	sprintf(&stat->jdata[used], ":udf-filename=%s:udf-function=%s:udf-active=%u:udf-success=%lu:udf-failed=%lu",
+			udf_job->origin.def.filename, udf_job->origin.def.function,
+			cf_atomic32_get(udf_job->n_active_tr),
+			cf_atomic64_get(udf_job->n_successful_tr),
+			cf_atomic64_get(udf_job->n_failed_tr));
+}
+
+//----------------------------------------------------------
+// udf_bg_scan_job utilities.
+//
+
+// Index-reduce callback: for each eligible record, enqueue an internal UDF
+// transaction keyed by the record's digest. The record is always released via
+// as_record_done() before any waiting or enqueuing happens.
+void
+udf_bg_scan_job_reduce_cb(as_index_ref* r_ref, void* udata)
+{
+	as_job* _job = (as_job*)udata;
+	udf_bg_scan_job* job = (udf_bg_scan_job*)_job;
+	as_namespace* ns = _job->ns;
+
+	if (_job->abandoned != 0) {
+		as_record_done(r_ref, ns);
+		return;
+	}
+
+	as_index* r = r_ref->r;
+
+	if (excluded_set(r, _job->set_id) || as_record_is_doomed(r, ns)) {
+		as_record_done(r_ref, ns);
+		return;
+	}
+
+	predexp_args_t predargs = { .ns = ns, .md = r, .vl = NULL, .rd = NULL };
+
+	// Only the metadata part of the filter is evaluated in this callback.
+	if (job->origin.predexp &&
+			! predexp_matches_metadata(job->origin.predexp, &predargs)) {
+		as_record_done(r_ref, ns);
+		return;
+	}
+
+	// Save this before releasing record.
+	cf_digest d = r->keyd;
+
+	// Release record lock before enqueuing transaction.
+	as_record_done(r_ref, ns);
+
+	// Throttle: poll every 50 microseconds while the number of in-flight
+	// transactions exceeds the configured maximum.
+	// TODO - replace this mechanism with signal-based counter?
+	while (cf_atomic32_get(job->n_active_tr) >
+			g_config.scan_max_udf_transactions) {
+		usleep(50);
+	}
+
+	as_transaction tr;
+
+	as_transaction_init_iudf(&tr, ns, &d, &job->origin, job->is_durable_delete);
+
+	// Count the transaction as active before it is handed off.
+	cf_atomic64_incr(&_job->n_records_read);
+	cf_atomic32_incr(&job->n_active_tr);
+
+	as_tsvc_enqueue(&tr);
+}
+
+// Completion callback installed in the job's iudf_origin: udata is the job.
+// Decrements the active count and bumps the success counter when retcode is
+// 0, otherwise the failure counter. Always returns 0.
+int
+udf_bg_scan_tr_complete(void* udata, int retcode)
+{
+	udf_bg_scan_job* job = (udf_bg_scan_job*)udata;
+
+	cf_atomic32_decr(&job->n_active_tr);
+	cf_atomic64_incr(retcode == 0 ? &job->n_successful_tr : &job->n_failed_tr);
+
+	return 0;
+}
diff --git a/as/src/base/secondary_index.c b/as/src/base/secondary_index.c
new file mode 100644
index 00000000..d698702c
--- /dev/null
+++ b/as/src/base/secondary_index.c
@@ -0,0 +1,4539 @@
+/*
+ * secondary_index.c
+ *
+ * Copyright (C) 2012-2016 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ +/* + * SYNOPSIS + * Abstraction to support secondary indexes with multiple implementations. + * Currently there are two variants of secondary indexes supported. + * + * - Aerospike Index B-tree, this is full fledged index implementation and + * maintains its own metadata and data structure for list of those indexes. + * + * - Citrusleaf foundation indexes which are bare bone tree implementation + * with ability to insert delete update indexes. For these the current code + * manage all the data structure to manage different trees. [Will be + * implemented when required] + * + * This file implements all the translation function which can be called from + * citrusleaf to prepare to do the operations on secondary index. Also + * implements locking to make Aerospike Index (single threaded) code multi threaded. 
+ * + */ + +/* Code flow -- + * + * DDLs + * + * as_sindex_create --> ai_btree_create + * + * as_sindex_destroy --> Releases the si and changes the state to AS_SINDEX_DESTROY + * + * BOOT INDEX + * + * as_sindex_boot_populateall --> If fast restart or data in memory and load at start up --> as_sbld_build_all + * + * SBIN creation + * + * as_sindex_sbins_from_rd --> (For every bin in the record) as_sindex_sbins_from_bin + * + * as_sindex_sbins_from_bin --> as_sindex_sbins_from_bin_buf + * + * as_sindex_sbins_from_bin_buf --> (For every matching sindex) --> as_sindex_sbin_from_sindex + * + * as_sindex_sbin_from_sindex --> (If bin value matches with sindex defn) --> as_sindex_add_asval_to_itype_sindex + * + * SBIN updates + * + * as_sindex_update_by_sbin --> For every sbin --> as_sindex__op_by_sbin + * + * as_sindex__op_by_sbin --> If op == AS_SINDEX_OP_INSERT --> ai_btree_put + * | + * --> If op == AS_SINDEX_OP_DELETE --> ai_btree_delete + * + * DMLs using RECORD + * + * as_sindex_put_rd --> For each bin in the record --> as_sindex_sbin_from_sindex + * + * as_sindex_putall_rd --> For each sindex --> as_sindex_put_rd + * + */ + +#include "base/secondary_index.h" + +#include +#include +#include +#include + +#include "citrusleaf/cf_atomic.h" +#include "citrusleaf/cf_clock.h" +#include "citrusleaf/cf_queue.h" + +#include "aerospike/as_arraylist.h" +#include "aerospike/as_arraylist_iterator.h" +#include "aerospike/as_buffer.h" +#include "aerospike/as_hashmap.h" +#include "aerospike/as_hashmap_iterator.h" +#include "aerospike/as_msgpack.h" +#include "aerospike/as_pair.h" +#include "aerospike/as_serializer.h" +#include "aerospike/as_val.h" + +#include "ai_btree.h" +#include "bt_iterator.h" +#include "cf_str.h" +#include "fault.h" +#include "shash.h" + +#include "base/cdt.h" +#include "base/cfg.h" +#include "base/datamodel.h" +#include "base/index.h" +#include "base/proto.h" +#include "base/stats.h" +#include "base/system_metadata.h" +#include "base/thr_sindex.h" +#include 
"base/thr_info.h"
+#include "fabric/partition.h"
+#include "geospatial/geospatial.h"
+#include "transaction/udf.h"
+
+
+#define SINDEX_CRASH(str, ...) \
+	cf_crash(AS_SINDEX, "SINDEX_ASSERT: "str, ##__VA_ARGS__);
+
+#define AS_SINDEX_PROP_KEY_SIZE (AS_SET_NAME_MAX_SIZE + 20) // setname_binid_typeid
+
+
+// ************************************************************************************************
+// BINID HAS SINDEX
+// Maintains a bit array where the binid'th bit represents the existence of at least one index
+// over the bin with bin id as binid.
+// Set, reset should be called under SINDEX_GWLOCK
+// get should be called under SINDEX_GRLOCK
+
+// Mark bin id 'binid' as having at least one secondary index.
+// The bit array is made of 32-bit words: word binid / 32, bit binid % 32.
+void
+as_sindex_set_binid_has_sindex(as_namespace *ns, int binid)
+{
+	int index = binid / 32;
+	uint32_t temp = ns->binid_has_sindex[index];
+	// Unsigned constant: shifting a signed 1 into bit 31 is undefined behavior.
+	temp |= (1U << (binid % 32));
+	ns->binid_has_sindex[index] = temp;
+}
+
+// Clear the "has sindex" bit for 'binid', unless an active index on that bin
+// still exists - in which case the bit is left set.
+void
+as_sindex_reset_binid_has_sindex(as_namespace *ns, int binid)
+{
+	int i = 0;
+	int j = 0;
+	as_sindex * si = NULL;
+
+	// i walks the slots, j counts active indexes seen, so the walk can stop
+	// once every index counted in sindex_cnt has been checked.
+	while (i < AS_SINDEX_MAX && j < ns->sindex_cnt) {
+		si = &ns->sindex[i];
+		if (si != NULL && si->state == AS_SINDEX_ACTIVE) {
+			j++;
+			if (si->imd->binid == binid) {
+				return;
+			}
+		}
+		i++;
+	}
+
+	int index = binid / 32;
+	uint32_t temp = ns->binid_has_sindex[index];
+	// Unsigned constant keeps the mask well-defined for bit 31 as well.
+	temp &= ~(1U << (binid % 32));
+	ns->binid_has_sindex[index] = temp;
+}
+
+// Returns true if the "has sindex" bit for 'binid' is set.
+bool
+as_sindex_binid_has_sindex(as_namespace *ns, int binid)
+{
+	int index = binid / 32;
+	uint32_t temp = ns->binid_has_sindex[index];
+	return (temp & (1U << (binid % 32))) != 0;
+}
+// END - BINID HAS SINDEX
+// ************************************************************************************************
+// ************************************************************************************************
+// UTILITY
+// Translation from sindex error code to string. 
In alphabetic order
+const char *as_sindex_err_str(int op_code) {
+	switch (op_code) {
+		case AS_SINDEX_ERR:                     return "ERR GENERIC";
+		case AS_SINDEX_ERR_BIN_NOTFOUND:        return "BIN NOT FOUND";
+		case AS_SINDEX_ERR_FOUND:               return "INDEX FOUND";
+		case AS_SINDEX_ERR_INAME_MAXLEN:        return "INDEX NAME EXCEED MAX LIMIT";
+		case AS_SINDEX_ERR_MAXCOUNT:            return "INDEX COUNT EXCEEDS MAX LIMIT";
+		case AS_SINDEX_ERR_NOTFOUND:            return "NO INDEX";
+		case AS_SINDEX_ERR_NOT_READABLE:        return "INDEX NOT READABLE";
+		case AS_SINDEX_ERR_NO_MEMORY:           return "NO MEMORY";
+		case AS_SINDEX_ERR_PARAM:               return "ERR PARAM";
+		case AS_SINDEX_ERR_SET_MISMATCH:        return "SET MISMATCH";
+		case AS_SINDEX_ERR_TYPE_MISMATCH:       return "KEY TYPE MISMATCH";
+		case AS_SINDEX_ERR_UNKNOWN_KEYTYPE:     return "UNKNOWN KEYTYPE";
+		case AS_SINDEX_OK:                      return "OK";
+		default:                                return "Unknown Code";
+	}
+}
+
+// Returns true if the index is in the AS_SINDEX_ACTIVE state. A NULL si is
+// logged as a warning and reported as not active.
+inline bool as_sindex_isactive(as_sindex *si)
+{
+	if (! si) {
+		cf_warning(AS_SINDEX, "si is null in as_sindex_isactive");
+		return false;
+	}
+
+	return si->state == AS_SINDEX_ACTIVE;
+}
+
+// Translation from sindex internal error code to generic client visible Aerospike error code.
+// Codes with no dedicated client code are logged (with the caller-supplied
+// file name and line number) and mapped to AS_PROTO_RESULT_FAIL_INDEX_GENERIC.
+uint8_t as_sindex_err_to_clienterr(int err, char *fname, int lineno) {
+	switch (err) {
+		case AS_SINDEX_ERR_FOUND:        return AS_PROTO_RESULT_FAIL_INDEX_FOUND;
+		case AS_SINDEX_ERR_INAME_MAXLEN: return AS_PROTO_RESULT_FAIL_INDEX_NAME_MAXLEN;
+		case AS_SINDEX_ERR_MAXCOUNT:     return AS_PROTO_RESULT_FAIL_INDEX_MAXCOUNT;
+		case AS_SINDEX_ERR_NOTFOUND:     return AS_PROTO_RESULT_FAIL_INDEX_NOTFOUND;
+		case AS_SINDEX_ERR_NOT_READABLE: return AS_PROTO_RESULT_FAIL_INDEX_NOTREADABLE;
+		case AS_SINDEX_ERR_NO_MEMORY:    return AS_PROTO_RESULT_FAIL_INDEX_OOM;
+		case AS_SINDEX_ERR_PARAM:        return AS_PROTO_RESULT_FAIL_PARAMETER;
+		case AS_SINDEX_OK:               return AS_PROTO_RESULT_OK;
+
+		// Defensive internal error
+		case AS_SINDEX_ERR:
+		case AS_SINDEX_ERR_BIN_NOTFOUND:
+		case AS_SINDEX_ERR_SET_MISMATCH:
+		case AS_SINDEX_ERR_TYPE_MISMATCH:
+		case AS_SINDEX_ERR_UNKNOWN_KEYTYPE:
+		default: cf_warning(AS_SINDEX, "%s %d Error at %s,%d",
+					as_sindex_err_str(err), err, fname, lineno);
+				return AS_PROTO_RESULT_FAIL_INDEX_GENERIC;
+	}
+}
+
+// Returns true if the index metadata's set matches 'setname'. The NULL set is
+// a valid set, so both must be NULL together or be equal strings. A mismatch
+// is logged at debug level.
+bool
+as_sindex__setname_match(as_sindex_metadata *imd, const char *setname)
+{
+	bool match;
+
+	if (setname) {
+		match = imd->set && strcmp(imd->set, setname) == 0;
+	}
+	else {
+		match = ! imd->set;
+	}
+
+	if (! match) {
+		// Either name may be NULL here; passing NULL to %s is undefined, so
+		// substitute a printable placeholder.
+		cf_debug(AS_SINDEX, "Index Mismatch %s %s",
+				imd->set ? imd->set : "(null)",
+				setname ? setname : "(null)");
+	}
+
+	return match;
+}
+
+/* Returns
+ * AS_SINDEX_GC_ERROR if cannot defrag
+ * AS_SINDEX_GC_OK if can defrag
+ * AS_SINDEX_GC_SKIP_ITERATION if partition lock timed out
+ */
+as_sindex_gc_status
+as_sindex_can_defrag_record(as_namespace *ns, cf_digest *keyd)
+{
+	as_partition_reservation rsv;
+	uint32_t pid = as_partition_getid(keyd);
+
+	// Short reservation timeout; a miss is counted and this iteration skipped.
+	int timeout_ms = 2;
+	if (as_partition_reserve_timeout(ns, pid, &rsv, timeout_ms) != 0 ) {
+		cf_atomic64_incr(&g_stats.sindex_gc_timedout);
+		return AS_SINDEX_GC_SKIP_ITERATION;
+	}
+
+	// A non-zero result from as_record_exists_live() means the entry may be
+	// defragged.
+	int rv = AS_SINDEX_GC_ERROR;
+	if (as_record_exists_live(rsv.tree, keyd, rsv.ns) != 0) {
+		rv = AS_SINDEX_GC_OK;
+	}
+	as_partition_release(&rsv);
+	return rv;
+
+}
+
+/*
+ * Function as_sindex_pktype
+ * 		Returns the type of particle indexed
+ *
+ * 	Returns -
+ * 		On failure - AS_SINDEX_ERR_UNKNOWN_KEYTYPE
+ */
+as_particle_type
+as_sindex_pktype(as_sindex_metadata * imd)
+{
+	switch (imd->sktype) {
+		case COL_TYPE_LONG: {
+			return AS_PARTICLE_TYPE_INTEGER;
+		}
+		case COL_TYPE_DIGEST: {
+			return AS_PARTICLE_TYPE_STRING;
+		}
+		case COL_TYPE_GEOJSON: {
+			return AS_PARTICLE_TYPE_GEOJSON;
+		}
+		default: {
+			cf_warning(AS_SINDEX, "UNKNOWN KEY TYPE FOUND. VERY BAD STATE");
+		}
+	}
+	// NOTE(review): an sindex error code is returned through the particle
+	// type - callers must compare against AS_SINDEX_ERR_UNKNOWN_KEYTYPE.
+	return AS_SINDEX_ERR_UNKNOWN_KEYTYPE;
+}
+
+/*
+ * Function as_sindex_key_str
+ * 		Returns a static string representing the key type
+ *
+ */
+char const *
+as_sindex_ktype_str(as_sindex_ktype type)
+{
+	switch (type) {
+	case COL_TYPE_LONG:	return "NUMERIC";
+	case COL_TYPE_DIGEST:	return "STRING";
+	case COL_TYPE_GEOJSON:	return "GEOJSON";
+	default:
+		cf_warning(AS_SINDEX, "UNSUPPORTED KEY TYPE %d", type);
+		return "??????";
+	}
+}
+
+// Parse a key type name (case-insensitive) into its column type. Only the
+// leading characters are compared, so longer strings starting with a known
+// name are accepted as well. NULL or unknown names are logged and yield
+// COL_TYPE_INVALID.
+as_sindex_ktype
+as_sindex_ktype_from_string(char const * type_str)
+{
+	if (! type_str) {
+		cf_warning(AS_SINDEX, "missing secondary index key type");
+		return COL_TYPE_INVALID;
+	}
+	else if (strncasecmp(type_str, "string", 6) == 0) {
+		return COL_TYPE_DIGEST;
+	}
+	else if (strncasecmp(type_str, "numeric", 7) == 0) {
+		return COL_TYPE_LONG;
+	}
+	else if (strncasecmp(type_str, "geo2dsphere", 11) == 0) {
+		return COL_TYPE_GEOJSON;
+	}
+	else {
+		cf_warning(AS_SINDEX, "UNRECOGNIZED KEY TYPE %s", type_str);
+		return COL_TYPE_INVALID;
+	}
+}
+
+// Map a particle type to the column type used to index it; particle types
+// that cannot be indexed map to COL_TYPE_INVALID.
+as_sindex_ktype
+as_sindex_sktype_from_pktype(as_particle_type t)
+{
+	switch (t) {
+		case AS_PARTICLE_TYPE_INTEGER :     return COL_TYPE_LONG;
+		case AS_PARTICLE_TYPE_STRING  :     return COL_TYPE_DIGEST;
+		case AS_PARTICLE_TYPE_GEOJSON :     return COL_TYPE_GEOJSON;
+		default                       :     return COL_TYPE_INVALID;
+	}
+}
+
+/*
+ * Client API to check if there is secondary index on given namespace
+ */
+int
+as_sindex_ns_has_sindex(as_namespace *ns)
+{
+	return (ns->sindex_cnt > 0);
+}
+
+char *as_sindex_type_defs[] =
+{	"NONE", "LIST", "MAPKEYS", "MAPVALUES"
+};
+
+// Returns true once the index's read-active flag is set.
+bool
+as_sindex_can_query(as_sindex *si)
+{
+	// Still building. Do not allow reads
+	return (si->flag & AS_SINDEX_FLAG_RACTIVE) != 0;
+}
+
+/*
+ * Create duplicate copy of sindex metadata. 
New lock is created + * used by index create by user at runtime or index creation at the boot time + */ +void +as_sindex__dup_meta(as_sindex_metadata *imd, as_sindex_metadata **qimd) +{ + if (!imd) return; + + as_sindex_metadata *qimdp = cf_rc_alloc(sizeof(as_sindex_metadata)); + + memset(qimdp, 0, sizeof(as_sindex_metadata)); + + qimdp->ns_name = cf_strdup(imd->ns_name); + + // Set name is optional for create + if (imd->set) { + qimdp->set = cf_strdup(imd->set); + } else { + qimdp->set = NULL; + } + + qimdp->iname = cf_strdup(imd->iname); + qimdp->itype = imd->itype; + qimdp->nprts = imd->nprts; + qimdp->path_str = cf_strdup(imd->path_str); + qimdp->path_length = imd->path_length; + memcpy(qimdp->path, imd->path, AS_SINDEX_MAX_DEPTH*sizeof(as_sindex_path)); + qimdp->bname = cf_strdup(imd->bname); + qimdp->sktype = imd->sktype; + qimdp->binid = imd->binid; + + *qimd = qimdp; +} + +/* + * Function to perform validation check on the return type and increment + * decrement all the statistics. 
+ */ +void +as_sindex__process_ret(as_sindex *si, int ret, as_sindex_op op, + uint64_t starttime, int pos) +{ + switch (op) { + case AS_SINDEX_OP_INSERT: + if (ret && ret != AS_SINDEX_KEY_FOUND) { + cf_debug(AS_SINDEX, + "SINDEX_FAIL: Insert into %s failed at %d with %d", + si->imd->iname, pos, ret); + cf_atomic64_incr(&si->stats.write_errs); + } else if (!ret) { + cf_atomic64_incr(&si->stats.n_objects); + } + cf_atomic64_incr(&si->stats.n_writes); + SINDEX_HIST_INSERT_DATA_POINT(si, write_hist, starttime); + break; + case AS_SINDEX_OP_DELETE: + if (ret && ret != AS_SINDEX_KEY_NOTFOUND) { + cf_debug(AS_SINDEX, + "SINDEX_FAIL: Delete from %s failed at %d with %d", + si->imd->iname, pos, ret); + cf_atomic64_incr(&si->stats.delete_errs); + } else if (!ret) { + cf_atomic64_decr(&si->stats.n_objects); + } + cf_atomic64_incr(&si->stats.n_deletes); + SINDEX_HIST_INSERT_DATA_POINT(si, delete_hist, starttime); + break; + case AS_SINDEX_OP_READ: + if (ret < 0) { // AS_SINDEX_CONTINUE(1) also OK + cf_debug(AS_SINDEX, + "SINDEX_FAIL: Read from %s failed at %d with %d", + si->imd->iname, pos, ret); + cf_atomic64_incr(&si->stats.read_errs); + } + cf_atomic64_incr(&si->stats.n_reads); + break; + default: + cf_crash(AS_SINDEX, "Invalid op"); + } +} + +// Bin id should be around +// if not create it +// TODO is it not needed +int +as_sindex__populate_binid(as_namespace *ns, as_sindex_metadata *imd) +{ + int len = strlen(imd->bname); + if (len >= AS_ID_BIN_SZ) { + cf_warning(AS_SINDEX, "bin name %s of size %d too big. Max size allowed is %d", + imd->bname, len, AS_ID_BIN_SZ-1); + return AS_SINDEX_ERR; + } + + if(!as_bin_name_within_quota(ns, imd->bname)) { + cf_warning(AS_SINDEX, "Bin %s not added. 
Quota is full", imd->bname); + return AS_SINDEX_ERR; + } + + // An extra strncpy to remove valgrind warning + char bname[AS_ID_BIN_SZ]; + strncpy(bname, imd->bname, AS_ID_BIN_SZ); + imd->binid = as_bin_get_or_assign_id(ns, bname); + cf_debug(AS_SINDEX, " Assigned %d for %s", imd->binid, imd->bname); + + return AS_SINDEX_OK; +} + +// Free if IMD has allocated the info in it +int +as_sindex_imd_free(as_sindex_metadata *imd) +{ + if (!imd) { + cf_warning(AS_SINDEX, "imd is null in as_sindex_imd_free"); + return AS_SINDEX_ERR; + } + + if (imd->ns_name) { + cf_free(imd->ns_name); + imd->ns_name = NULL; + } + + if (imd->iname) { + cf_free(imd->iname); + imd->iname = NULL; + } + + if (imd->set) { + cf_free(imd->set); + imd->set = NULL; + } + + if (imd->path_str) { + cf_free(imd->path_str); + imd->path_str = NULL; + } + + if (imd->bname) { + cf_free(imd->bname); + imd->bname = NULL; + } + + return AS_SINDEX_OK; +} +// END - UTILITY +// ************************************************************************************************ +// ************************************************************************************************ +// METADATA +typedef struct sindex_set_binid_hash_ele_s { + cf_ll_element ele; + int simatch; +} sindex_set_binid_hash_ele; + +void +as_sindex__set_binid_hash_destroy(cf_ll_element * ele) { + cf_free((sindex_set_binid_hash_ele * ) ele); +} + +/* + * Should happen under SINDEX_GWLOCK + */ +as_sindex_status +as_sindex__put_in_set_binid_hash(as_namespace * ns, char * set, int binid, int chosen_id) +{ + // Create fixed size key for hash + // Get the linked list from the hash + // If linked list does not exist then make one and put it in the hash + // Append the chosen id in the linked list + + if (chosen_id < 0 || chosen_id > AS_SINDEX_MAX) { + cf_debug(AS_SINDEX, "Put in set_binid hash got invalid simatch %d", chosen_id); + return AS_SINDEX_ERR; + } + cf_ll * simatch_ll = NULL; + // Create fixed size key for hash + char 
si_prop[AS_SINDEX_PROP_KEY_SIZE]; + memset(si_prop, 0, AS_SINDEX_PROP_KEY_SIZE); + + if (set == NULL ) { + sprintf(si_prop, "_%d", binid); + } + else { + sprintf(si_prop, "%s_%d", set, binid); + } + + // Get the linked list from the hash + int rv = cf_shash_get(ns->sindex_set_binid_hash, (void *)si_prop, (void *)&simatch_ll); + + // If linked list does not exist then make one and put it in the hash + if (rv && rv != CF_SHASH_ERR_NOT_FOUND) { + cf_debug(AS_SINDEX, "shash get failed with error %d", rv); + return AS_SINDEX_ERR; + }; + if (rv == CF_SHASH_ERR_NOT_FOUND) { + simatch_ll = cf_malloc(sizeof(cf_ll)); + cf_ll_init(simatch_ll, as_sindex__set_binid_hash_destroy, false); + cf_shash_put(ns->sindex_set_binid_hash, (void *)si_prop, (void *)&simatch_ll); + } + if (!simatch_ll) { + return AS_SINDEX_ERR; + } + + // Append the chosen id in the linked list + sindex_set_binid_hash_ele * ele = cf_malloc(sizeof(sindex_set_binid_hash_ele)); + ele->simatch = chosen_id; + cf_ll_append(simatch_ll, (cf_ll_element*)ele); + return AS_SINDEX_OK; +} + +/* + * Should happen under SINDEX_GWLOCK + */ +as_sindex_status +as_sindex__delete_from_set_binid_hash(as_namespace * ns, as_sindex_metadata * imd) +{ + // Make a key + // Get the sindex list corresponding to key + // If the list does not exist, return does not exist + // If the list exist + // match the path and type of incoming si to the existing sindexes in the list + // If any element matches + // Delete from the list + // If the list size becomes 0 + // Delete the entry from the hash + // If none of the element matches, return does not exist. 
+ // + + // Make a key + char si_prop[AS_SINDEX_PROP_KEY_SIZE]; + memset(si_prop, 0, AS_SINDEX_PROP_KEY_SIZE); + if (imd->set == NULL ) { + sprintf(si_prop, "_%d", imd->binid); + } + else { + sprintf(si_prop, "%s_%d", imd->set, imd->binid); + } + + // Get the sindex list corresponding to key + cf_ll * simatch_ll = NULL; + int rv = cf_shash_get(ns->sindex_set_binid_hash, (void *)si_prop, (void *)&simatch_ll); + + // If the list does not exist, return does not exist + if (rv && rv != CF_SHASH_ERR_NOT_FOUND) { + cf_debug(AS_SINDEX, "shash get failed with error %d", rv); + return AS_SINDEX_ERR_NOTFOUND; + }; + if (rv == CF_SHASH_ERR_NOT_FOUND) { + return AS_SINDEX_ERR_NOTFOUND; + } + + // If the list exist + // match the path and type of incoming si to the existing sindexes in the list + bool to_delete = false; + cf_ll_element * ele = NULL; + sindex_set_binid_hash_ele * prop_ele = NULL; + if (simatch_ll) { + ele = cf_ll_get_head(simatch_ll); + while (ele) { + prop_ele = ( sindex_set_binid_hash_ele * ) ele; + as_sindex * si = &(ns->sindex[prop_ele->simatch]); + if (strcmp(si->imd->path_str, imd->path_str) == 0 && + si->imd->sktype == imd->sktype && si->imd->itype == imd->itype) { + to_delete = true; + break; + } + ele = ele->next; + } + } + else { + return AS_SINDEX_ERR_NOTFOUND; + } + + // If any element matches + // Delete from the list + if (to_delete && ele) { + cf_ll_delete(simatch_ll, ele); + } + + // If the list size becomes 0 + // Delete the entry from the hash + if (cf_ll_size(simatch_ll) == 0) { + rv = cf_shash_delete(ns->sindex_set_binid_hash, si_prop); + if (rv) { + cf_debug(AS_SINDEX, "shash_delete fails with error %d", rv); + } + } + + // If none of the element matches, return does not exist. 
+ if (!to_delete) { + return AS_SINDEX_ERR_NOTFOUND; + } + return AS_SINDEX_OK; +} + + +// END - METADATA +// ************************************************************************************************ +// ************************************************************************************************ +// LOOKUP +/* + * Should happen under SINDEX_GRLOCK if called directly. + */ +as_sindex_status +as_sindex__simatch_list_by_set_binid(as_namespace * ns, const char *set, int binid, cf_ll ** simatch_ll) +{ + // Make the fixed size key (set_binid) + // Look for the key in set_binid_hash + // If found return the value (list of simatches) + // Else return NULL + + // Make the fixed size key (set_binid) + char si_prop[AS_SINDEX_PROP_KEY_SIZE]; + memset(si_prop, 0, AS_SINDEX_PROP_KEY_SIZE); + if (!set) { + sprintf(si_prop, "_%d", binid); + } + else { + sprintf(si_prop, "%s_%d", set, binid); + } + + // Look for the key in set_binid_hash + int rv = cf_shash_get(ns->sindex_set_binid_hash, (void *)si_prop, (void *)simatch_ll); + + // If not found return NULL + if (rv || !(*simatch_ll)) { + cf_debug(AS_SINDEX, "shash get failed with error %d", rv); + return AS_SINDEX_ERR_NOTFOUND; + }; + + // Else return simatch_ll + return AS_SINDEX_OK; +} + +/* + * Should happen under SINDEX_GRLOCK + */ +int +as_sindex__simatch_by_set_binid(as_namespace *ns, char * set, int binid, as_sindex_ktype type, as_sindex_type itype, char * path) +{ + // get the list corresponding to the list from the hash + // if list does not exist return -1 + // If list exist + // Iterate through all the elements in the list and match the path and type + // If matches + // return the simatch + // If none of the si matches + // return -1 + + cf_ll * simatch_ll = NULL; + as_sindex__simatch_list_by_set_binid(ns, set, binid, &simatch_ll); + + // If list exist + // Iterate through all the elements in the list and match the path and type + int simatch = -1; + sindex_set_binid_hash_ele * prop_ele = NULL; + 
cf_ll_element * ele = NULL; + if (simatch_ll) { + ele = cf_ll_get_head(simatch_ll); + while (ele) { + prop_ele = ( sindex_set_binid_hash_ele * ) ele; + as_sindex * si = &(ns->sindex[prop_ele->simatch]); + if (strcmp(si->imd->path_str, path) == 0 && + si->imd->sktype == type && si->imd->itype == itype) { + simatch = prop_ele->simatch; + break; + } + ele = ele->next; + } + } + else { + return -1; + } + + // If matches + // return the simatch + // If none of the si matches + // return -1 + return simatch; +} + +// Populates the si_arr with all the sindexes which matches set and binid +// Each sindex is reserved as well. Enough space is provided by caller in si_arr +// Currently only 8 sindexes can be create on one combination of set and binid +// i.e number_of_sindex_types * number_of_sindex_data_type (4 * 2) +int +as_sindex_arr_lookup_by_set_binid_lockfree(as_namespace * ns, const char *set, int binid, as_sindex ** si_arr) +{ + cf_ll * simatch_ll=NULL; + + int sindex_count = 0; + if (!as_sindex_binid_has_sindex(ns, binid) ) { + return sindex_count; + } + + as_sindex__simatch_list_by_set_binid(ns, set, binid, &simatch_ll); + if (!simatch_ll) { + return sindex_count; + } + + cf_ll_element * ele = cf_ll_get_head(simatch_ll); + sindex_set_binid_hash_ele * si_ele = NULL; + int simatch = -1; + as_sindex * si = NULL; + while (ele) { + si_ele = (sindex_set_binid_hash_ele *) ele; + simatch = si_ele->simatch; + + if (simatch == -1) { + cf_warning(AS_SINDEX, "A matching simatch comes out to be -1."); + ele = ele->next; + continue; + } + + si = &ns->sindex[simatch]; + // Reserve only active sindexes. 
+ // Do not break this rule + if (!as_sindex_isactive(si)) { + ele = ele->next; + continue; + } + + if (simatch != si->simatch) { + cf_warning(AS_SINDEX, "Inconsistent simatch reference between simatch stored in" + "si and simatch stored in hash"); + ele = ele->next; + continue; + } + + AS_SINDEX_RESERVE(si); + + si_arr[sindex_count++] = si; + ele = ele->next; + } + return sindex_count; +} + +// Populates the si_arr with all the sindexes which matches setname +// Each sindex is reserved as well. Enough space is provided by caller in si_arr +int +as_sindex_arr_lookup_by_setname_lockfree(as_namespace * ns, const char *setname, as_sindex ** si_arr) +{ + int sindex_count = 0; + as_sindex * si = NULL; + + for (int i=0; i= ns->sindex_cnt) { + break; + } + si = &ns->sindex[i]; + // Reserve only active sindexes. + // Do not break this rule + if (!as_sindex_isactive(si)) { + continue; + } + + if (!as_sindex__setname_match(si->imd, setname)) { + continue; + } + + AS_SINDEX_RESERVE(si); + + si_arr[sindex_count++] = si; + } + + return sindex_count; +} +int +as_sindex__simatch_by_iname(as_namespace *ns, char *idx_name) +{ + if (strlen(idx_name) >= AS_ID_INAME_SZ) { + return -1; + } + + char iname[AS_ID_INAME_SZ] = { 0 }; // must pad key + strcpy(iname, idx_name); + + int simatch = -1; + int rv = cf_shash_get(ns->sindex_iname_hash, (void *)iname, (void *)&simatch); + cf_detail(AS_SINDEX, "Found iname simatch %s->%d rv=%d", iname, simatch, rv); + + if (rv) { + return -1; + } + return simatch; +} +/* + * Single cluttered interface for lookup. 
iname precedes binid + * i.e if both are specified search is done with iname + */ +#define AS_SINDEX_LOOKUP_FLAG_SETCHECK 0x01 +#define AS_SINDEX_LOOKUP_FLAG_ISACTIVE 0x02 +#define AS_SINDEX_LOOKUP_FLAG_NORESERVE 0x04 +as_sindex * +as_sindex__lookup_lockfree(as_namespace *ns, char *iname, char *set, int binid, + as_sindex_ktype type, as_sindex_type itype, char * path, char flag) +{ + + // If iname is not null then search in iname hash and store the simatch + // Else then + // Check the possible existence of sindex over bin in the bit array + // If no possibility return NULL + // Search in the set_binid hash using setname, binid, itype and binid + // If found store simatch + // If not found return NULL + // Get the sindex corresponding to the simatch. + // Apply the flags applied by caller. + // Validate the simatch + + int simatch = -1; + as_sindex *si = NULL; + // If iname is not null then search in iname hash and store the simatch + if (iname) { + simatch = as_sindex__simatch_by_iname(ns, iname); + } + // Else then + // Check the possible existence of sindex over bin in the bit array + else { + if (!as_sindex_binid_has_sindex(ns, binid) ) { + // If no possibility return NULL + goto END; + } + // Search in the set_binid hash using setname, binid, itype and binid + // If found store simatch + simatch = as_sindex__simatch_by_set_binid(ns, set, binid, type, itype, path); + } + // If not found return NULL + // Get the sindex corresponding to the simatch. + if (simatch != -1) { + si = &ns->sindex[simatch]; + // Apply the flags applied by caller. 
+ if ((flag & AS_SINDEX_LOOKUP_FLAG_ISACTIVE) + && !as_sindex_isactive(si)) { + si = NULL; + goto END; + } + // Validate the simatch + if (simatch != si->simatch) { + cf_warning(AS_SINDEX, "Inconsistent simatch reference between simatch stored in" + "si and simatch stored in hash"); + } + if (!(flag & AS_SINDEX_LOOKUP_FLAG_NORESERVE)) + AS_SINDEX_RESERVE(si); + } +END: + return si; +} + +as_sindex * +as_sindex__lookup(as_namespace *ns, char *iname, char *set, int binid, as_sindex_ktype type, + as_sindex_type itype, char * path, char flag) +{ + SINDEX_GRLOCK(); + as_sindex *si = as_sindex__lookup_lockfree(ns, iname, set, binid, type, itype, path, flag); + SINDEX_GRUNLOCK(); + return si; +} + +as_sindex * +as_sindex_lookup_by_iname(as_namespace *ns, char * iname, char flag) +{ + return as_sindex__lookup(ns, iname, NULL, -1, 0, 0, NULL, flag); +} + +as_sindex * +as_sindex_lookup_by_defns(as_namespace *ns, char *set, int binid, as_sindex_ktype type, as_sindex_type itype, char * path, char flag) +{ + return as_sindex__lookup(ns, NULL, set, binid, type, itype, path, flag); +} + +as_sindex * +as_sindex_lookup_by_iname_lockfree(as_namespace *ns, char * iname, char flag) +{ + return as_sindex__lookup_lockfree(ns, iname, NULL, -1, 0, 0, NULL, flag); +} + +as_sindex * +as_sindex_lookup_by_defns_lockfree(as_namespace *ns, char *set, int binid, as_sindex_ktype type, as_sindex_type itype, char * path, char flag) +{ + return as_sindex__lookup_lockfree(ns, NULL, set, binid, type, itype, path, flag); +} + + +// END LOOKUP +// ************************************************************************************************ +// ************************************************************************************************ +// STAT/CONFIG/HISTOGRAM +void +as_sindex__stats_clear(as_sindex *si) { + as_sindex_stat *s = &si->stats; + + s->n_objects = 0; + + s->n_reads = 0; + s->read_errs = 0; + + s->n_writes = 0; + s->write_errs = 0; + + s->n_deletes = 0; + s->delete_errs = 0; + + 
s->loadtime = 0; + s->recs_pending = 0; + + s->n_defrag_records = 0; + s->defrag_time = 0; + + // Aggregation stat + s->n_aggregation = 0; + s->agg_response_size = 0; + s->agg_num_records = 0; + s->agg_errs = 0; + // Lookup stats + s->n_lookup = 0; + s->lookup_response_size = 0; + s->lookup_num_records = 0; + s->lookup_errs = 0; + + si->enable_histogram = false; + if (s->_write_hist) { + histogram_clear(s->_write_hist); + } + if (s->_si_prep_hist) { + histogram_clear(s->_si_prep_hist); + } + if (s->_delete_hist) { + histogram_clear(s->_delete_hist); + } + if (s->_query_hist) { + histogram_clear(s->_query_hist); + } + if (s->_query_batch_io) { + histogram_clear(s->_query_batch_io); + } + if (s->_query_batch_lookup) { + histogram_clear(s->_query_batch_lookup); + } + if (s->_query_rcnt_hist) { + histogram_clear(s->_query_rcnt_hist); + } + if (s->_query_diff_hist) { + histogram_clear(s->_query_diff_hist); + } +} + +void +as_sindex_gconfig_default(as_config *c) +{ + c->sindex_builder_threads = 4; + c->sindex_gc_max_rate = 50000; // 50,000 per second + c->sindex_gc_period = 10; // every 10 seconds +} + +void +as_sindex__config_default(as_sindex *si) +{ + si->config.flag = AS_SINDEX_FLAG_WACTIVE; +} + +void +as_sindex__setup_histogram(as_sindex *si) +{ + char hist_name[AS_ID_INAME_SZ + 64]; + + sprintf(hist_name, "%s_write_us", si->imd->iname); + si->stats._write_hist = histogram_create(hist_name, HIST_MICROSECONDS); + + sprintf(hist_name, "%s_si_prep_us", si->imd->iname); + si->stats._si_prep_hist = histogram_create(hist_name, HIST_MICROSECONDS); + + sprintf(hist_name, "%s_delete_us", si->imd->iname); + si->stats._delete_hist = histogram_create(hist_name, HIST_MICROSECONDS); + + sprintf(hist_name, "%s_query", si->imd->iname); + si->stats._query_hist = histogram_create(hist_name, HIST_MILLISECONDS); + + sprintf(hist_name, "%s_query_batch_lookup_us", si->imd->iname); + si->stats._query_batch_lookup = histogram_create(hist_name, HIST_MICROSECONDS); + + sprintf(hist_name, 
"%s_query_batch_io_us", si->imd->iname); + si->stats._query_batch_io = histogram_create(hist_name, HIST_MICROSECONDS); + + sprintf(hist_name, "%s_query_row_count", si->imd->iname); + si->stats._query_rcnt_hist = histogram_create(hist_name, HIST_COUNT); + + sprintf(hist_name, "%s_query_diff_count", si->imd->iname); + si->stats._query_diff_hist = histogram_create(hist_name, HIST_COUNT); +} + +int +as_sindex__destroy_histogram(as_sindex *si) +{ + if (si->stats._write_hist) cf_free(si->stats._write_hist); + if (si->stats._si_prep_hist) cf_free(si->stats._si_prep_hist); + if (si->stats._delete_hist) cf_free(si->stats._delete_hist); + if (si->stats._query_hist) cf_free(si->stats._query_hist); + if (si->stats._query_batch_lookup) cf_free(si->stats._query_batch_lookup); + if (si->stats._query_batch_io) cf_free(si->stats._query_batch_io); + if (si->stats._query_rcnt_hist) cf_free(si->stats._query_rcnt_hist); + if (si->stats._query_diff_hist) cf_free(si->stats._query_diff_hist); + return 0; +} + +int +as_sindex_stats_str(as_namespace *ns, char * iname, cf_dyn_buf *db) +{ + as_sindex *si = as_sindex_lookup_by_iname(ns, iname, AS_SINDEX_LOOKUP_FLAG_ISACTIVE); + + if (!si) { + cf_warning(AS_SINDEX, "SINDEX STAT : sindex %s not found", iname); + return AS_SINDEX_ERR_NOTFOUND; + } + + // A good thing to cache the stats first. 
+ uint64_t ns_objects = ns->n_objects; + uint64_t si_objects = cf_atomic64_get(si->stats.n_objects); + uint64_t pending = cf_atomic64_get(si->stats.recs_pending); + + uint64_t n_keys = ai_btree_get_numkeys(si->imd); + uint64_t i_size = ai_btree_get_isize(si->imd); + uint64_t n_size = ai_btree_get_nsize(si->imd); + + info_append_uint64(db, "keys", n_keys); + info_append_uint64(db, "entries", si_objects); + info_append_uint64(db, "ibtr_memory_used", i_size); + info_append_uint64(db, "nbtr_memory_used", n_size); + info_append_uint64(db, "si_accounted_memory", i_size + n_size); + if (si->flag & AS_SINDEX_FLAG_RACTIVE) { + info_append_string(db, "load_pct", "100"); + } else { + if (pending > ns_objects) { + info_append_uint64(db, "load_pct", 100); + } else { + info_append_uint64(db, "load_pct", (ns_objects == 0) ? 100 : 100 - ((100 * pending) / ns_objects)); + } + } + + info_append_uint64(db, "loadtime", cf_atomic64_get(si->stats.loadtime)); + // writes + info_append_uint64(db, "write_success", cf_atomic64_get(si->stats.n_writes) - cf_atomic64_get(si->stats.write_errs)); + info_append_uint64(db, "write_error", cf_atomic64_get(si->stats.write_errs)); + // delete + info_append_uint64(db, "delete_success", cf_atomic64_get(si->stats.n_deletes) - cf_atomic64_get(si->stats.delete_errs)); + info_append_uint64(db, "delete_error", cf_atomic64_get(si->stats.delete_errs)); + // defrag + info_append_uint64(db, "stat_gc_recs", cf_atomic64_get(si->stats.n_defrag_records)); + info_append_uint64(db, "stat_gc_time", cf_atomic64_get(si->stats.defrag_time)); + + // Cache values + uint64_t agg = cf_atomic64_get(si->stats.n_aggregation); + uint64_t agg_rec = cf_atomic64_get(si->stats.agg_num_records); + uint64_t agg_size = cf_atomic64_get(si->stats.agg_response_size); + uint64_t lkup = cf_atomic64_get(si->stats.n_lookup); + uint64_t lkup_rec = cf_atomic64_get(si->stats.lookup_num_records); + uint64_t lkup_size = cf_atomic64_get(si->stats.lookup_response_size); + uint64_t query = agg + lkup; 
+ uint64_t query_rec = agg_rec + lkup_rec; + uint64_t query_size = agg_size + lkup_size; + + // Query + info_append_uint64(db, "query_reqs", query); + info_append_uint64(db, "query_avg_rec_count", query ? query_rec / query : 0); + info_append_uint64(db, "query_avg_record_size", query_rec ? query_size / query_rec : 0); + // Aggregation + info_append_uint64(db, "query_agg", agg); + info_append_uint64(db, "query_agg_avg_rec_count", agg ? agg_rec / agg : 0); + info_append_uint64(db, "query_agg_avg_record_size", agg_rec ? agg_size / agg_rec : 0); + //Lookup + info_append_uint64(db, "query_lookups", lkup); + info_append_uint64(db, "query_lookup_avg_rec_count", lkup ? lkup_rec / lkup : 0); + info_append_uint64(db, "query_lookup_avg_record_size", lkup_rec ? lkup_size / lkup_rec : 0); + + info_append_bool(db, "histogram", si->enable_histogram); + + cf_dyn_buf_chomp(db); + + AS_SINDEX_RELEASE(si); + // Release reference + return AS_SINDEX_OK; +} + +int +as_sindex_histogram_dumpall(as_namespace *ns) +{ + if (!ns) + return AS_SINDEX_ERR_PARAM; + SINDEX_GRLOCK(); + + for (int i = 0; i < ns->sindex_cnt; i++) { + if (ns->sindex[i].state != AS_SINDEX_ACTIVE) continue; + if (!ns->sindex[i].enable_histogram) continue; + as_sindex *si = &ns->sindex[i]; + if (si->stats._write_hist) + histogram_dump(si->stats._write_hist); + if (si->stats._si_prep_hist) + histogram_dump(si->stats._si_prep_hist); + if (si->stats._delete_hist) + histogram_dump(si->stats._delete_hist); + if (si->stats._query_hist) + histogram_dump(si->stats._query_hist); + if (si->stats._query_batch_lookup) + histogram_dump(si->stats._query_batch_lookup); + if (si->stats._query_batch_io) + histogram_dump(si->stats._query_batch_io); + if (si->stats._query_rcnt_hist) + histogram_dump(si->stats._query_rcnt_hist); + if (si->stats._query_diff_hist) + histogram_dump(si->stats._query_diff_hist); + } + SINDEX_GRUNLOCK(); + return AS_SINDEX_OK; +} + +int +as_sindex_histogram_enable(as_namespace *ns, char * iname, bool enable) +{ + 
as_sindex *si = as_sindex_lookup_by_iname(ns, iname, AS_SINDEX_LOOKUP_FLAG_ISACTIVE); + if (!si) { + cf_warning(AS_SINDEX, "SINDEX HISTOGRAM : sindex %s not found", iname); + return AS_SINDEX_ERR_NOTFOUND; + } + + si->enable_histogram = enable; + AS_SINDEX_RELEASE(si); + return AS_SINDEX_OK; +} + +/* + * Client API to list all the indexes in a namespace, returns list of imd with + * index information, Caller should free it up + */ +int +as_sindex_list_str(as_namespace *ns, cf_dyn_buf *db) +{ + SINDEX_GRLOCK(); + for (int i = 0; i < AS_SINDEX_MAX; i++) { + if (&(ns->sindex[i]) && (ns->sindex[i].imd)) { + as_sindex si = ns->sindex[i]; + + cf_dyn_buf_append_string(db, "ns="); + cf_dyn_buf_append_string(db, ns->name); + cf_dyn_buf_append_string(db, ":set="); + cf_dyn_buf_append_string(db, (si.imd->set) ? si.imd->set : "NULL"); + cf_dyn_buf_append_string(db, ":indexname="); + cf_dyn_buf_append_string(db, si.imd->iname); + cf_dyn_buf_append_string(db, ":bin="); + cf_dyn_buf_append_buf(db, (uint8_t *)si.imd->bname, strlen(si.imd->bname)); + cf_dyn_buf_append_string(db, ":type="); + cf_dyn_buf_append_string(db, as_sindex_ktype_str(si.imd->sktype)); + cf_dyn_buf_append_string(db, ":indextype="); + cf_dyn_buf_append_string(db, as_sindex_type_defs[si.imd->itype]); + + cf_dyn_buf_append_string(db, ":path="); + cf_dyn_buf_append_string(db, si.imd->path_str); + + // Index State + if (si.state == AS_SINDEX_ACTIVE) { + if (si.flag & AS_SINDEX_FLAG_RACTIVE) { + cf_dyn_buf_append_string(db, ":state=RW;"); + } + else if (si.flag & AS_SINDEX_FLAG_WACTIVE) { + cf_dyn_buf_append_string(db, ":state=WO;"); + } + else { + // should never come here. 
+ cf_dyn_buf_append_string(db, ":state=A;"); + } + } + else if (si.state == AS_SINDEX_INACTIVE) { + cf_dyn_buf_append_string(db, ":state=I;"); + } + else { + cf_dyn_buf_append_string(db, ":state=D;"); + } + } + } + SINDEX_GRUNLOCK(); + return AS_SINDEX_OK; +} +// END - STAT/CONFIG/HISTOGRAM +// ************************************************************************************************ +// ************************************************************************************************ +// SI REFERENCE +// Reserve the sindex so it does not get deleted under the hood +int +as_sindex_reserve(as_sindex *si, char *fname, int lineno) +{ + if (! as_sindex_isactive(si)) { + cf_warning(AS_SINDEX, "Trying to reserve sindex %s in a state other than active. State is %d", + si->imd->iname, si->state); + } + + if (si->imd) { + cf_rc_reserve(si->imd); + } + + return AS_SINDEX_OK; +} + +/* + * Release, queue up the request for the destroy to clean up Aerospike Index thread, + * Not done inline because main write thread could release the last reference. + */ +void +as_sindex_release(as_sindex *si, char *fname, int lineno) +{ + if (! 
si) { + return; + } + + uint64_t val = cf_rc_release(si->imd); + + if (val == 0) { + si->flag |= AS_SINDEX_FLAG_DESTROY_CLEANUP; + cf_queue_push(g_sindex_destroy_q, &si); + } +} + +as_sindex_status +as_sindex_populator_reserve_all(as_namespace * ns) +{ + if (!ns) { + cf_warning(AS_SINDEX, "namespace found NULL"); + return AS_SINDEX_ERR; + } + + int count = 0 ; + int valid = 0; + SINDEX_GRLOCK(); + while (valid < ns->sindex_cnt && count < AS_SINDEX_MAX) { + as_sindex * si = &ns->sindex[count]; + if (as_sindex_isactive(si)) { + AS_SINDEX_RESERVE(si); + valid++; + } + count++; + } + SINDEX_GRUNLOCK(); + return AS_SINDEX_OK; +} + +as_sindex_status +as_sindex_populator_release_all(as_namespace * ns) +{ + if (!ns) { + cf_warning(AS_SINDEX, "namespace found NULL"); + return AS_SINDEX_ERR; + } + + int count = 0 ; + int valid = 0; + SINDEX_GRLOCK(); + while (valid < ns->sindex_cnt && count < AS_SINDEX_MAX) { + as_sindex * si = &ns->sindex[count]; + if (as_sindex_isactive(si)) { + AS_SINDEX_RELEASE(si); + valid++; + } + count++; + } + SINDEX_GRUNLOCK(); + return AS_SINDEX_OK; +} + +// Complementary function of as_sindex_arr_lookup_by_set_binid +void +as_sindex_release_arr(as_sindex *si_arr[], int si_arr_sz) +{ + for (int i=0; iimd->pimd = cf_malloc(nptr * sizeof(as_sindex_pmetadata)); + memset(si->imd->pimd, 0, nptr*sizeof(as_sindex_pmetadata)); + + pthread_rwlockattr_t rwattr; + if (pthread_rwlockattr_init(&rwattr)) + cf_crash(AS_AS, + "pthread_rwlockattr_init: %s", cf_strerror(errno)); + if (pthread_rwlockattr_setkind_np(&rwattr, + PTHREAD_RWLOCK_PREFER_WRITER_NONRECURSIVE_NP)) + cf_crash(AS_TSVC, + "pthread_rwlockattr_setkind_np: %s",cf_strerror(errno)); + + for (int i = 0; i < nptr; i++) { + as_sindex_pmetadata *pimd = &si->imd->pimd[i]; + if (pthread_rwlock_init(&pimd->slock, &rwattr)) { + cf_crash(AS_SINDEX, + "Could not create secondary index dml mutex "); + } + } +} + +/* + * Description : + * Checks the parameters passed to as_sindex_create function + * + * 
Parameters: + * namespace, index metadata + * + * Returns: + * AS_SINDEX_OK - for valid parameters. + * Appropriate error codes - otherwise + * + * Synchronization: + * This function does not explicitly acquire any lock. + * TODO : Check if exits_by_defn can be used instead of this + */ +int +as_sindex_create_check_params(as_namespace* ns, as_sindex_metadata* imd) +{ + SINDEX_GRLOCK(); + + int ret = AS_SINDEX_OK; + if (ns->sindex_cnt >= AS_SINDEX_MAX) { + ret = AS_SINDEX_ERR_MAXCOUNT; + goto END; + } + + int simatch = as_sindex__simatch_by_iname(ns, imd->iname); + if (simatch != -1) { + ret = AS_SINDEX_ERR_FOUND; + } else { + int16_t binid = as_bin_get_id(ns, imd->bname); + if (binid != -1) + { + int simatch = as_sindex__simatch_by_set_binid(ns, imd->set, binid, imd->sktype, imd->itype, imd->path_str); + if (simatch != -1) { + ret = AS_SINDEX_ERR_FOUND; + goto END; + } + } + } + +END: + SINDEX_GRUNLOCK(); + return ret; +} + +static int +sindex_create_lockless(as_namespace *ns, as_sindex_metadata *imd) +{ + int chosen_id = AS_SINDEX_MAX; + as_sindex *si = NULL; + for (int i = 0; i < AS_SINDEX_MAX; i++) { + if (ns->sindex[i].state == AS_SINDEX_INACTIVE) { + si = &ns->sindex[i]; + chosen_id = i; + break; + } + } + + if (! 
si || (chosen_id == AS_SINDEX_MAX)) { + cf_warning(AS_SINDEX, "SINDEX CREATE : Maxed out secondary index limit no more indexes allowed"); + return AS_SINDEX_ERR; + } + + as_set *p_set = NULL; + + if (imd->set) { + if (as_namespace_get_create_set_w_len(ns, imd->set, strlen(imd->set), &p_set, NULL) != 0) { + cf_warning(AS_SINDEX, "SINDEX CREATE : failed get-create set %s", imd->set); + return AS_SINDEX_ERR; + } + } + + imd->nprts = ns->sindex_num_partitions; + int id = chosen_id; + si = &ns->sindex[id]; + as_sindex_metadata *qimd; + + if (as_sindex__populate_binid(ns, imd)) { + cf_warning(AS_SINDEX, "SINDEX CREATE : Popluating bin id failed"); + return AS_SINDEX_ERR_PARAM; + } + + as_sindex_status rv = as_sindex__put_in_set_binid_hash(ns, imd->set, imd->binid, id); + if (rv != AS_SINDEX_OK) { + cf_warning(AS_SINDEX, "SINDEX CREATE : Put in set_binid hash fails with error %d", rv); + return AS_SINDEX_ERR; + } + + cf_detail(AS_SINDEX, "Put binid simatch %d->%d", imd->binid, chosen_id); + + char iname[AS_ID_INAME_SZ]; + memset(iname, 0, AS_ID_INAME_SZ); + snprintf(iname, strlen(imd->iname)+1, "%s", imd->iname); + cf_shash_put(ns->sindex_iname_hash, (void *)iname, (void *)&chosen_id); + cf_detail(AS_SINDEX, "Put iname simatch %s:%zu->%d", iname, strlen(imd->iname), chosen_id); + + // Init SI + si->ns = ns; + si->simatch = chosen_id; + si->state = AS_SINDEX_ACTIVE; + si->flag = AS_SINDEX_FLAG_WACTIVE; + si->recreate_imd = NULL; + as_sindex__config_default(si); + + // Init IMD + as_sindex__dup_meta(imd, &qimd); + si->imd = qimd; + qimd->si = si; + + // Init PIMD + as_sindex__create_pmeta(si, id, imd->nprts); + ai_btree_create(si->imd); + as_sindex_set_binid_has_sindex(ns, si->imd->binid); + + + // Update Counter + as_sindex__setup_histogram(si); + as_sindex__stats_clear(si); + ns->sindex_cnt++; + if (p_set) { + p_set->n_sindexes++; + } else { + ns->n_setless_sindexes++; + } + cf_atomic64_add(&ns->n_bytes_sindex_memory, ai_btree_get_isize(si->imd)); + + // Queue this for 
secondary index builder if create is done after boot. + // At the boot time single builder request is queued for entire namespace. + if (g_sindex_boot_done) { + // Reserve for ref in queue + AS_SINDEX_RESERVE(si); + cf_queue_push(g_sindex_populate_q, &si); + } + + return AS_SINDEX_OK; +} + +int +as_sindex_create(as_namespace *ns, as_sindex_metadata *imd) +{ + // Ideally there should be one lock per namespace, but because the + // Aerospike Index metadata is single global structure we need a overriding + // lock for that. NB if it becomes per namespace have a file lock + SINDEX_GWLOCK(); + if (as_sindex_lookup_by_iname_lockfree(ns, imd->iname, AS_SINDEX_LOOKUP_FLAG_NORESERVE)) { + cf_detail(AS_SINDEX,"Index %s already exists", imd->iname); + SINDEX_GWUNLOCK(); + return AS_SINDEX_ERR_FOUND; + } + + int rv = sindex_create_lockless(ns, imd); + SINDEX_GWUNLOCK(); + return rv; +} + +void +as_sindex_smd_create(as_namespace *ns, as_sindex_metadata *imd) +{ + SINDEX_GWLOCK(); + + // FIXME - wrong place for check + // If one node cannot have > AS_SINDEX_MAX then neither + // can majority in cluster. + // if (ns->sindex_cnt >= AS_SINDEX_MAX) { + // cf_warning(AS_SINDEX, "Failed to SMD create index '%s' on namespace '%s', maximum allowed number of indexes %d reached !!", + // imd->ns_name, imd->iname, ns->sindex_cnt); + // SINDEX_GWUNLOCK(); + // return; + // } + + bool found_exact_defn = false; // ns:iname ns:binid / set / sktype / itype / path_str + bool found_defn = false; // ns:binid / set / sktype / itype / path_str + bool found_iname = false; // ns:iname + + int simatch_defn = -1; + int16_t binid = as_bin_get_id(ns, imd->bname); + if (binid != -1) { + simatch_defn = as_sindex__simatch_by_set_binid(ns, imd->set, binid, + imd->sktype, imd->itype, imd->path_str); + if (simatch_defn != -1) { + as_sindex *si = &ns->sindex[simatch_defn]; + if (! 
strcmp(si->imd->iname, imd->iname)) { + found_exact_defn = true; + } else { + found_defn = true; + } + } + } + + int simatch_iname = as_sindex__simatch_by_iname(ns, imd->iname); + if (simatch_iname != -1) { + found_iname = true; + } + + if (found_exact_defn) { + as_sindex *si = &ns->sindex[simatch_defn]; + if (si->state == AS_SINDEX_ACTIVE) { + SINDEX_GWUNLOCK(); + return; + } + } + + if (found_defn) { + as_sindex *si = &ns->sindex[simatch_defn]; + if (si->state == AS_SINDEX_ACTIVE) { + si->state = AS_SINDEX_DESTROY; + as_sindex_reset_binid_has_sindex(ns, si->imd->binid); + AS_SINDEX_RELEASE(si); + } + } + + if (found_iname) { + as_sindex *si = &ns->sindex[simatch_iname]; + if (si->state == AS_SINDEX_ACTIVE) { + si->state = AS_SINDEX_DESTROY; + as_sindex_reset_binid_has_sindex(ns, si->imd->binid); + AS_SINDEX_RELEASE(si); + } + } + + // If found set setop; Use si found with same definition to set op. + if (found_defn || found_exact_defn || found_iname) { + if (simatch_defn != -1) { + as_sindex *si = &ns->sindex[simatch_defn]; + as_sindex__dup_meta(imd, &si->recreate_imd); + SINDEX_GWUNLOCK(); + return; + } + + as_sindex *si = &ns->sindex[simatch_iname]; + as_sindex__dup_meta(imd, &si->recreate_imd); + SINDEX_GWUNLOCK(); + return; + } + + // Not found. + sindex_create_lockless(ns, imd); + SINDEX_GWUNLOCK(); + return; +} + +/* + * Description : When a index has to be dropped and recreated during cluster state change + * this function is called. + * Parameters : imd, which is constructed from the final index defn given by paxos principal. + * + * Returns : 0 on all cases. Check log for errors. 
+ * + * Synchronization : Does not explicitly take any locks + */ +int +as_sindex_recreate(as_sindex_metadata* imd) +{ + as_namespace *ns = as_namespace_get_byname(imd->ns_name); + int ret = as_sindex_create(ns, imd); + if (ret != 0) { + cf_warning(AS_SINDEX,"Index %s creation failed at the accept callback", imd->iname); + } + return 0; +} +// END - SINDEX CREATE +// ************************************************************************************************ +// ************************************************************************************************ +// SINDEX DELETE + +void +as_sindex_destroy_pmetadata(as_sindex *si) +{ + for (int i = 0; i < si->imd->nprts; i++) { + as_sindex_pmetadata *pimd = &si->imd->pimd[i]; + pthread_rwlock_destroy(&pimd->slock); + } + as_sindex__destroy_histogram(si); + cf_free(si->imd->pimd); + si->imd->pimd = NULL; +} + +// TODO : Will not harm if it reserves and releases the sindex +// Keep it simple +bool +as_sindex_delete_checker(as_namespace *ns, as_sindex_metadata *imd) +{ + if (as_sindex_lookup_by_iname_lockfree(ns, imd->iname, + AS_SINDEX_LOOKUP_FLAG_NORESERVE | AS_SINDEX_LOOKUP_FLAG_ISACTIVE)) { + return true; + } else { + return false; + } +} + +/* + * Client API to destroy secondary index, mark destroy + * Deletes via smd or info-command user-delete requests. 
+ */ +int +as_sindex_destroy(as_namespace *ns, as_sindex_metadata *imd) +{ + SINDEX_GWLOCK(); + as_sindex *si = NULL; + + if (imd->iname) { + si = as_sindex_lookup_by_iname_lockfree(ns, imd->iname, + AS_SINDEX_LOOKUP_FLAG_NORESERVE | AS_SINDEX_LOOKUP_FLAG_ISACTIVE); + } + else { + int16_t bin_id = as_bin_get_id(ns, imd->bname); + + if (bin_id == -1) { + SINDEX_GWUNLOCK(); + return AS_SINDEX_ERR_NOTFOUND; + } + + si = as_sindex_lookup_by_defns_lockfree(ns, imd->set, (int)bin_id, + imd->sktype, imd->itype, imd->path_str, + AS_SINDEX_LOOKUP_FLAG_NORESERVE | AS_SINDEX_LOOKUP_FLAG_ISACTIVE); + } + + if (si) { + si->state = AS_SINDEX_DESTROY; + as_sindex_reset_binid_has_sindex(ns, si->imd->binid); + AS_SINDEX_RELEASE(si); + SINDEX_GWUNLOCK(); + return AS_SINDEX_OK; + } + + SINDEX_GWUNLOCK(); + return AS_SINDEX_ERR_NOTFOUND; +} + +// On emptying a index +// reset objects and keys +// reset memory used +// add previous number of objects as deletes +void +as_sindex_clear_stats_on_empty_index(as_sindex *si) +{ + cf_atomic64_add(&si->stats.n_deletes, cf_atomic64_get(si->stats.n_objects)); + cf_atomic64_set(&si->stats.n_keys, 0); + cf_atomic64_set(&si->stats.n_objects, 0); +} + +void +as_sindex_empty_index(as_sindex_metadata * imd) +{ + as_sindex_pmetadata * pimd; + cf_atomic64_sub(&imd->si->ns->n_bytes_sindex_memory, + ai_btree_get_isize(imd) + ai_btree_get_nsize(imd)); + for (int i=0; inprts; i++) { + pimd = &imd->pimd[i]; + PIMD_WLOCK(&pimd->slock); + struct btree * ibtr = pimd->ibtr; + ai_btree_reinit_pimd(pimd, imd->sktype); + PIMD_WUNLOCK(&pimd->slock); + ai_btree_delete_ibtr(ibtr); + } + cf_atomic64_add(&imd->si->ns->n_bytes_sindex_memory, + ai_btree_get_isize(imd)); + as_sindex_clear_stats_on_empty_index(imd->si); +} + +// TODO - formerly used during set deletion - leaving it for now, but if nothing +// needs it going forward, we'll remove it. 
+void +as_sindex_delete_set(as_namespace * ns, char * set_name) +{ + SINDEX_GRLOCK(); + as_sindex * si_arr[ns->sindex_cnt]; + int sindex_count = as_sindex_arr_lookup_by_setname_lockfree(ns, set_name, si_arr); + + for (int i=0; iimd->iname, set_name); + as_sindex_empty_index(si_arr[i]->imd); + cf_info(AS_SINDEX, "Finished si set delete for index %s in set %s", si_arr[i]->imd->iname, set_name); + } + SINDEX_GRUNLOCK(); + as_sindex_release_arr(si_arr, sindex_count); +} +// END - SINDEX DELETE +// ************************************************************************************************ +// ************************************************************************************************ +// SINDEX POPULATE +/* + * Client API to mark index population finished, tick it ready for read + */ +int +as_sindex_populate_done(as_sindex *si) +{ + // Setting flag is atomic: meta lockless + si->flag |= AS_SINDEX_FLAG_RACTIVE; + si->flag &= ~AS_SINDEX_FLAG_POPULATING; + return AS_SINDEX_OK; +} +/* + * Client API to start namespace scan to populate secondary index. The scan + * is only performed in the namespace is warm start or if its data is not in + * memory and data is loaded from. For cold start with data in memory the indexes + * are populate upfront. + * + * This call is only made at the boot time. + */ +int +as_sindex_boot_populateall() +{ + // Initialize the secondary index builder. The thread pool is initialized + // with maximum threads to go full throttle, then down-sized to the + // configured number after the startup population job is done. + as_sbld_init(); + + int ns_cnt = 0; + + // Trigger namespace scan to populate all secondary indexes + // mark all secondary index for a namespace as populated + for (int i = 0; i < g_config.n_namespaces; i++) { + as_namespace *ns = g_config.namespaces[i]; + if (!ns || (ns->sindex_cnt == 0)) { + continue; + } + + if (! 
ns->storage_data_in_memory) { + // Data-not-in-memory (cold or warm restart) - have not yet built + // sindex, build it now. + as_sindex_populator_reserve_all(ns); + as_sbld_build_all(ns); + cf_info(AS_SINDEX, "Queuing namespace %s for sindex population ", ns->name); + } else { + // Data-in-memory (cold or cool restart) - already built sindex. + as_sindex_boot_populateall_done(ns); + } + ns_cnt++; + } + for (int i = 0; i < ns_cnt; i++) { + int ret; + // blocking call, wait till an item is popped out of Q : + cf_queue_pop(g_sindex_populateall_done_q, &ret, CF_QUEUE_FOREVER); + // TODO: Check for failure .. is generally fatal if it fails + } + + for (int i = 0; i < g_config.n_namespaces; i++) { + as_namespace *ns = g_config.namespaces[i]; + if (!ns || (ns->sindex_cnt == 0)) { + continue; + } + + if (! ns->storage_data_in_memory) { + // Data-not-in-memory - finished sindex building job. + as_sindex_populator_release_all(ns); + } + } + + // Down-size builder thread pool to configured value. 
+ as_sbld_resize_thread_pool(g_config.sindex_builder_threads); + + g_sindex_boot_done = true; + + return AS_SINDEX_OK; +} + +/* + * Client API to mark all the indexes in namespace populated and ready for read + */ +int +as_sindex_boot_populateall_done(as_namespace *ns) +{ + SINDEX_GWLOCK(); + int ret = AS_SINDEX_OK; + + for (int i = 0; i < AS_SINDEX_MAX; i++) { + as_sindex *si = &ns->sindex[i]; + if (!as_sindex_isactive(si)) continue; + // This sindex is getting populating by it self scan + if (si->flag & AS_SINDEX_FLAG_POPULATING) continue; + si->flag |= AS_SINDEX_FLAG_RACTIVE; + } + SINDEX_GWUNLOCK(); + cf_queue_push(g_sindex_populateall_done_q, &ret); + cf_info(AS_SINDEX, "Namespace %s sindex population done", ns->name); + return ret; +} + +// END - SINDEX POPULATE +// ************************************************************************************************ +// ************************************************************************************************ +// SINDEX BIN PATH +as_sindex_status +as_sindex_add_mapkey_in_path(as_sindex_metadata * imd, char * path_str, int start, int end) +{ + if (end < start) { + return AS_SINDEX_ERR; + } + + int path_length = imd->path_length; + char int_str[20]; + strncpy(int_str, path_str+start, end-start+1); + int_str[end-start+1] = '\0'; + char * str_part; + imd->path[path_length-1].value.key_int = strtol(int_str, &str_part, 10); + if (str_part == int_str || (*str_part != '\0')) { + imd->path[path_length-1].value.key_str = cf_strndup(int_str, strlen(int_str)+1); + imd->path[path_length-1].mapkey_type = AS_PARTICLE_TYPE_STRING; + } + else { + imd->path[path_length-1].mapkey_type = AS_PARTICLE_TYPE_INTEGER; + } + return AS_SINDEX_OK; +} + +as_sindex_status +as_sindex_add_listelement_in_path(as_sindex_metadata * imd, char * path_str, int start, int end) +{ + if (end < start) { + return AS_SINDEX_ERR; + } + int path_length = imd->path_length; + char int_str[10]; + strncpy(int_str, path_str+start, end-start+1); + 
int_str[end-start+1] = '\0'; + char * str_part; + imd->path[path_length-1].value.index = strtol(int_str, &str_part, 10); + if (str_part == int_str || (*str_part != '\0')) { + return AS_SINDEX_ERR; + } + return AS_SINDEX_OK; +} + +as_sindex_status +as_sindex_parse_subpath(as_sindex_metadata * imd, char * path_str, int start, int end) +{ + int path_len = strlen(path_str); + bool overflow = end >= path_len ? true : false; + + if (start == 0 ) { + if (overflow) { + imd->bname = cf_strndup(path_str+start, end-start); + } + else if (path_str[end] == '.') { + imd->bname = cf_strndup(path_str+start, end-start); + imd->path_length++; + imd->path[imd->path_length-1].type = AS_PARTICLE_TYPE_MAP; + } + else if (path_str[end] == '[') { + imd->bname = cf_strndup(path_str+start, end-start); + imd->path_length++; + imd->path[imd->path_length-1].type = AS_PARTICLE_TYPE_LIST; + } + else { + return AS_SINDEX_ERR; + } + } + else if (path_str[start] == '.') { + if (overflow) { + if (as_sindex_add_mapkey_in_path(imd, path_str, start+1, end-1) != AS_SINDEX_OK) { + return AS_SINDEX_ERR; + } + } + else if (path_str[end] == '.') { + // take map value + if (as_sindex_add_mapkey_in_path(imd, path_str, start+1, end-1) != AS_SINDEX_OK) { + return AS_SINDEX_ERR; + } + // add type for next node in path + imd->path_length++; + imd->path[imd->path_length-1].type = AS_PARTICLE_TYPE_MAP; + } + else if (path_str[end] == '[') { + // value + if (as_sindex_add_mapkey_in_path(imd, path_str, start+1, end-1) != AS_SINDEX_OK) { + return AS_SINDEX_ERR; + } + // add type for next node in path + imd->path_length++; + imd->path[imd->path_length-1].type = AS_PARTICLE_TYPE_LIST; + } + else { + return AS_SINDEX_ERR; + } + } + else if (path_str[start] == '[') { + if (!overflow && path_str[end] == ']') { + //take list value + if (as_sindex_add_listelement_in_path(imd, path_str, start+1, end-1) != AS_SINDEX_OK) { + return AS_SINDEX_ERR; + } + } + else { + return AS_SINDEX_ERR; + } + } + else if (path_str[start] == 
']') { + if (end - start != 1) { + return AS_SINDEX_ERR; + } + else if (overflow) { + return AS_SINDEX_OK; + } + if (path_str[end] == '.') { + imd->path_length++; + imd->path[imd->path_length-1].type = AS_PARTICLE_TYPE_MAP; + } + else if (path_str[end] == '[') { + imd->path_length++; + imd->path[imd->path_length-1].type = AS_PARTICLE_TYPE_LIST; + } + else { + return AS_SINDEX_ERR; + } + } + else { + return AS_SINDEX_ERR; + } + return AS_SINDEX_OK; +} +/* + * This function parses the path_str and populate array of path structure in + * imd. + * Each element of the path is the way to reach the the next path. + * For e.g + * bin.k1[1][0] + * array of the path structure would be like - + * path[0].type = AS_PARTICLE_TYPE_MAP . path[0].value.key_str = k1 path[0].value.ke + * path[1].type = AS_PARTICLE_TYPE_LIST . path[1].value.index = 1 + * path[2].type = AS_PARTICLE_TYPE_LIST . path[2].value.index = 0 +*/ +as_sindex_status +as_sindex_extract_bin_path(as_sindex_metadata * imd, char * path_str) +{ + int path_len = strlen(path_str); + int start = 0; + int end = 0; + if (path_len > AS_SINDEX_MAX_PATH_LENGTH) { + cf_warning(AS_SINDEX, "Bin path length exceeds the maximum allowed."); + return AS_SINDEX_ERR; + } + // Iterate through the path_str and search for character (., [, ]) + // which leads to sublevels in maps and lists + while (end < path_len) { + if (path_str[end] == '.' 
|| path_str[end] == '[' || path_str[end] == ']') { + if (as_sindex_parse_subpath(imd, path_str, start, end)!=AS_SINDEX_OK) { + return AS_SINDEX_ERR; + } + start = end; + if (imd->path_length >= AS_SINDEX_MAX_DEPTH) { + cf_warning(AS_SINDEX, "Bin position depth level exceeds the max depth allowed %d", AS_SINDEX_MAX_DEPTH); + return AS_SINDEX_ERR; + } + } + end++; + } + if (as_sindex_parse_subpath(imd, path_str, start, end)!=AS_SINDEX_OK) { + return AS_SINDEX_ERR; + } +/* +// For debugging + cf_info(AS_SINDEX, "After parsing : bin name: %s", imd->bname); + for (int i=0; ipath_length; i++) { + if(imd->path[i].type == AS_PARTICLE_TYPE_MAP ) { + if (imd->path[i].key_type == AS_PARTICLE_TYPE_INTEGER) { + cf_info(AS_SINDEX, "map key_int %d", imd->path[i].value.key_int); + } + else if (imd->path[i].key_type == AS_PARTICLE_TYPE_STRING){ + cf_info(AS_SINDEX, "map key_str %s", imd->path[i].value.key_str); + } + else { + cf_info(AS_SINDEX, "ERROR EEROR EERROR ERRROR REERROR"); + } + } + else{ + cf_info(AS_SINDEX, "list index %d", imd->path[i].value.index); + } + } +*/ + return AS_SINDEX_OK; +} + +as_sindex_status +as_sindex_extract_bin_from_path(char * path_str, char *bin) +{ + int path_len = strlen(path_str); + int end = 0; + if (path_len > AS_SINDEX_MAX_PATH_LENGTH) { + cf_warning(AS_SINDEX, "Bin path length exceeds the maximum allowed."); + return AS_SINDEX_ERR; + } + + while (end < path_len && path_str[end] != '.' 
&& path_str[end] != '[' && path_str[end] != ']') { + end++; + } + + if (end > 0 && end < AS_ID_BIN_SZ) { + strncpy(bin, path_str, end); + bin[end] = '\0'; + } + else { + return AS_SINDEX_ERR; + } + + return AS_SINDEX_OK; +} + +as_sindex_status +as_sindex_destroy_value_path(as_sindex_metadata * imd) +{ + for (int i=0; ipath_length; i++) { + if (imd->path[i].type == AS_PARTICLE_TYPE_MAP && + imd->path[i].mapkey_type == AS_PARTICLE_TYPE_STRING) { + cf_free(imd->path[i].value.key_str); + } + } + return AS_SINDEX_OK; +} + +/* + * This function checks the existence of path stored in the sindex metadata + * in a bin + */ +as_val * +as_sindex_extract_val_from_path(as_sindex_metadata * imd, as_val * v) +{ + if (!v) { + return NULL; + } + + as_val * val = v; + + as_particle_type imd_sktype = as_sindex_pktype(imd); + if (imd->path_length == 0) { + goto END; + } + as_sindex_path *path = imd->path; + for (int i=0; ipath_length; i++) { + switch (val->type) { + case AS_STRING: + case AS_INTEGER: + return NULL; + case AS_LIST: { + if (path[i].type != AS_PARTICLE_TYPE_LIST) { + return NULL; + } + int index = path[i].value.index; + as_arraylist* list = (as_arraylist*) as_list_fromval(val); + as_arraylist_iterator it; + as_arraylist_iterator_init( &it, list); + int j = 0; + while( as_arraylist_iterator_has_next( &it) && j<=index) { + val = (as_val*) as_arraylist_iterator_next( &it); + j++; + } + if (j-1 != index ) { + return NULL; + } + break; + } + case AS_MAP: { + if (path[i].type != AS_PARTICLE_TYPE_MAP) { + return NULL; + } + as_map * map = as_map_fromval(val); + as_val * key; + if (path[i].mapkey_type == AS_PARTICLE_TYPE_STRING) { + key = (as_val *)as_string_new(path[i].value.key_str, false); + } + else if (path[i].mapkey_type == AS_PARTICLE_TYPE_INTEGER) { + key = (as_val *)as_integer_new(path[i].value.key_int); + } + else { + cf_warning(AS_SINDEX, "Possible false data in sindex metadata"); + return NULL; + } + val = as_map_get(map, key); + if (key) { + as_val_destroy(key); + } 
+ if ( !val ) { + return NULL; + } + break; + } + default: + return NULL; + } + } + +END: + if (imd->itype == AS_SINDEX_ITYPE_DEFAULT) { + if (val->type == AS_INTEGER && imd_sktype == AS_PARTICLE_TYPE_INTEGER) { + return val; + } + else if (val->type == AS_STRING && imd_sktype == AS_PARTICLE_TYPE_STRING) { + return val; + } + } + else if (imd->itype == AS_SINDEX_ITYPE_MAPKEYS || imd->itype == AS_SINDEX_ITYPE_MAPVALUES) { + if (val->type == AS_MAP) { + return val; + } + } + else if (imd->itype == AS_SINDEX_ITYPE_LIST) { + if (val->type == AS_LIST) { + return val; + } + } + return NULL; +} +// END - SINDEX BIN PATH +// ************************************************************************************************ +// ************************************************************************************************ +// SINDEX QUERY +/* + * Returns - + * NULL - On failure + * si - On success. + * Notes - + * Reserves the si if found in the srange + * Releases the si if imd is null or bin type is mis matched. 
+ * + */ +as_sindex * +as_sindex_from_range(as_namespace *ns, char *set, as_sindex_range *srange) +{ + cf_debug(AS_SINDEX, "as_sindex_from_range"); + if (ns->single_bin) { + cf_warning(AS_SINDEX, "Secondary index query not allowed on single bin namespace %s", ns->name); + return NULL; + } + as_sindex *si = as_sindex_lookup_by_defns(ns, set, srange->start.id, + as_sindex_sktype_from_pktype(srange->start.type), srange->itype, srange->bin_path, + AS_SINDEX_LOOKUP_FLAG_ISACTIVE); + if (si && si->imd) { + // Do the type check + as_sindex_metadata *imd = si->imd; + if ((imd->binid == srange->start.id) && (srange->start.type != as_sindex_pktype(imd))) { + cf_warning(AS_SINDEX, "Query and Index Bin Type Mismatch: " + "[binid %d : Index Bin type %d : Query Bin Type %d]", + imd->binid, as_sindex_pktype(imd), srange->start.type ); + AS_SINDEX_RELEASE(si); + return NULL; + } + } + return si; +} + +/* + * The way to filter out imd information from the as_msg which is primarily + * query with all the details. For the normal operations the imd is formed out + * of the as_op. + */ +/* + * Returns - + * NULL - On failure. + * as_sindex - On success. + * + * Description - + * Firstly obtains the simatch using ns name and set name. + * Then returns the corresponding slot from sindex array. 
+ * + * TODO + * log messages + */ +as_sindex * +as_sindex_from_msg(as_namespace *ns, as_msg *msgp) +{ + cf_debug(AS_SINDEX, "as_sindex_from_msg"); + as_msg_field *ifp = as_msg_field_get(msgp, AS_MSG_FIELD_TYPE_INDEX_NAME); + + if (!ifp) { + cf_debug(AS_SINDEX, "Index name not found in the query request"); + return NULL; + } + + uint32_t iname_len = as_msg_field_get_value_sz(ifp); + + if (iname_len >= AS_ID_INAME_SZ) { + cf_warning(AS_SINDEX, "index name too long"); + return NULL; + } + + char iname[AS_ID_INAME_SZ]; + + memcpy(iname, ifp->data, iname_len); + iname[iname_len] = 0; + + as_sindex *si = as_sindex_lookup_by_iname(ns, iname, AS_SINDEX_LOOKUP_FLAG_ISACTIVE); + if (!si) { + cf_detail(AS_SINDEX, "Search did not find index "); + } + + return si; +} + + +/* + * Internal Function - as_sindex_range_free + * frees the sindex range + * + * Returns + * AS_SINDEX_OK - In every case + */ +int +as_sindex_range_free(as_sindex_range **range) +{ + cf_debug(AS_SINDEX, "as_sindex_range_free"); + as_sindex_range *sk = (*range); + if (sk->region) { + geo_region_destroy(sk->region); + } + cf_free(sk); + return AS_SINDEX_OK; +} + +/* + * Extract out range information from the as_msg and create the irange structure + * if required allocates the memory. 
+ * NB: It is responsibility of caller to call the cleanup routine to clean the + * range structure up and free up its memory + * + * query range field layout: contains - numranges, binname, start, end + * + * generic field header + * 0 4 size = size of data only + * 4 1 field_type = CL_MSG_FIELD_TYPE_INDEX_RANGE + * + * numranges + * 5 1 numranges (max 255 ranges) + * + * binname + * 6 1 binnamelen b + * 7 b binname + * + * particle (start & end) + * +b 1 particle_type + * +b+1 4 start_particle_size x + * +b+5 x start_particle_data + * +b+5+x 4 end_particle_size y + * +b+5+x+y+4 y end_particle_data + * + * repeat "numranges" times from "binname" + */ + +/* + * Function as_sindex_binlist_from_msg + * + * Returns - + * binlist - On success + * NULL - On failure + * + */ +cf_vector * +as_sindex_binlist_from_msg(as_namespace *ns, as_msg *msgp, int * num_bins) +{ + cf_debug(AS_SINDEX, "as_sindex_binlist_from_msg"); + as_msg_field *bfp = as_msg_field_get(msgp, AS_MSG_FIELD_TYPE_QUERY_BINLIST); + if (!bfp) { + return NULL; + } + const uint8_t *data = bfp->data; + int numbins = *data++; + *num_bins = numbins; + + cf_vector *binlist = cf_vector_create(AS_ID_BIN_SZ, numbins, 0); + + for (int i = 0; i < numbins; i++) { + int binnamesz = *data++; + if (binnamesz <= 0 || binnamesz > AS_ID_BIN_SZ - 1) { + cf_warning(AS_SINDEX, "Size of the bin name in bin list of sindex query is out of bounds. Size %d", binnamesz); + cf_vector_destroy(binlist); + return NULL; + } + char binname[AS_ID_BIN_SZ]; + memcpy(&binname, data, binnamesz); + binname[binnamesz] = 0; + cf_vector_set(binlist, i, (void *)binname); + data += binnamesz; + } + + cf_debug(AS_SINDEX, "Queried Bin List %d ", numbins); + for (int i = 0; i < cf_vector_size(binlist); i++) { + char binname[AS_ID_BIN_SZ]; + cf_vector_get(binlist, i, (void*)&binname); + cf_debug(AS_SINDEX, " String Queried is |%s| \n", binname); + } + + return binlist; +} + +/* + * Returns - + * AS_SINDEX_OK - On success. 
+ * AS_SINDEX_ERR_PARAM - On failure. + * AS_SINDEX_ERR_BIN_NOTFOUND - On failure. + * + * Description - + * Frames a sane as_sindex_range from msg. + * + * We are not supporting multiranges right now. So numrange is always expected to be 1. + */ +int +as_sindex_range_from_msg(as_namespace *ns, as_msg *msgp, as_sindex_range *srange) +{ + cf_debug(AS_SINDEX, "as_sindex_range_from_msg"); + srange->num_binval = 0; + // Ensure region is initialized in case we need to return an error code early. + srange->region = NULL; + + // getting ranges + as_msg_field *itype_fp = as_msg_field_get(msgp, AS_MSG_FIELD_TYPE_INDEX_TYPE); + as_msg_field *rfp = as_msg_field_get(msgp, AS_MSG_FIELD_TYPE_INDEX_RANGE); + if (!rfp) { + cf_warning(AS_SINDEX, "Required Index Range Not Found"); + return AS_SINDEX_ERR_PARAM; + } + const uint8_t *data = rfp->data; + int numrange = *data++; + + if (numrange != 1) { + cf_warning(AS_SINDEX, + "can't handle multiple ranges right now %d", rfp->data[0]); + return AS_SINDEX_ERR_PARAM; + } + // NOTE - to support geospatial queries the srange object is actually a vector + // of MAX_REGION_CELLS elements. Normal queries only use the first element. + // Geospatial queries use multiple elements. 
+ // + memset(srange, 0, sizeof(as_sindex_range) * MAX_REGION_CELLS); + if (itype_fp) { + srange->itype = *itype_fp->data; + } + else { + srange->itype = AS_SINDEX_ITYPE_DEFAULT; + } + for (int i = 0; i < numrange; i++) { + as_sindex_bin_data *start = &(srange->start); + as_sindex_bin_data *end = &(srange->end); + // Populate Bin id + uint8_t bin_path_len = *data++; + if (bin_path_len >= AS_SINDEX_MAX_PATH_LENGTH) { + cf_warning(AS_SINDEX, "Index position size %d exceeds the max length %d", bin_path_len, AS_SINDEX_MAX_PATH_LENGTH); + return AS_SINDEX_ERR_PARAM; + } + + strncpy(srange->bin_path, (char *)data, bin_path_len); + srange->bin_path[bin_path_len] = '\0'; + + char binname[AS_ID_BIN_SZ]; + if (as_sindex_extract_bin_from_path(srange->bin_path, binname) == AS_SINDEX_OK) { + int16_t id = as_bin_get_id(ns, binname); + if (id != -1) { + start->id = id; + end->id = id; + } else { + return AS_SINDEX_ERR_BIN_NOTFOUND; + } + } + else { + return AS_SINDEX_ERR_PARAM; + } + + data += bin_path_len; + + // Populate type + int type = *data++; + start->type = type; + end->type = start->type; + + // TODO - Refactor these into generic conversion from + // buffer to as_sindex_bin_data functions. Can be used + // by write code path as well. 
+ if ((type == AS_PARTICLE_TYPE_INTEGER)) { + // get start point + uint32_t startl = ntohl(*((uint32_t *)data)); + data += sizeof(uint32_t); + if (startl != 8) { + cf_warning(AS_SINDEX, + "Can only handle 8 byte numerics right now %u", startl); + goto Cleanup; + } + start->u.i64 = __cpu_to_be64(*((uint64_t *)data)); + data += sizeof(uint64_t); + + // get end point + uint32_t endl = ntohl(*((uint32_t *)data)); + data += sizeof(uint32_t); + if (endl != 8) { + cf_warning(AS_SINDEX, + "can only handle 8 byte numerics right now %u", endl); + goto Cleanup; + } + end->u.i64 = __cpu_to_be64(*((uint64_t *)data)); + data += sizeof(uint64_t); + if (start->u.i64 > end->u.i64) { + cf_warning(AS_SINDEX, + "Invalid range from %ld to %ld", start->u.i64, end->u.i64); + goto Cleanup; + } else { + srange->isrange = start->u.i64 != end->u.i64; + } + cf_debug(AS_SINDEX, "Range is equal %"PRId64", %"PRId64"", + start->u.i64, end->u.i64); + } else if (type == AS_PARTICLE_TYPE_STRING) { + // get start point + uint32_t startl = ntohl(*((uint32_t *)data)); + data += sizeof(uint32_t); + char* start_binval = (char *)data; + data += startl; + srange->isrange = false; + + if (startl >= AS_SINDEX_MAX_STRING_KSIZE) { + cf_warning(AS_SINDEX, "Query on bin %s fails. 
Value length %u too long.", binname, startl); + goto Cleanup; + } + uint32_t endl = ntohl(*((uint32_t *)data)); + data += sizeof(uint32_t); + char * end_binval = (char *)data; + if (startl != endl && strncmp(start_binval, end_binval, startl)) { + cf_warning(AS_SINDEX, + "Only Equality Query Supported in Strings %s-%s", + start_binval, end_binval); + goto Cleanup; + } + cf_digest_compute(start_binval, startl, &(start->digest)); + cf_debug(AS_SINDEX, "Range is equal %s ,%s", + start_binval, end_binval); + } else if (type == AS_PARTICLE_TYPE_GEOJSON) { + // get start point + uint32_t startl = ntohl(*((uint32_t *)data)); + data += sizeof(uint32_t); + char* start_binval = (char *)data; + data += startl; + + if ((startl == 0) || (startl >= AS_SINDEX_MAX_GEOJSON_KSIZE)) { + cf_warning(AS_SINDEX, "Out of bound query key size %u", startl); + goto Cleanup; + } + uint32_t endl = ntohl(*((uint32_t *)data)); + data += sizeof(uint32_t); + char * end_binval = (char *)data; + if (startl != endl && strncmp(start_binval, end_binval, startl)) { + cf_warning(AS_SINDEX, + "Only Geospatial Query Supported on GeoJSON %s-%s", + start_binval, end_binval); + goto Cleanup; + } + + srange->cellid = 0; + if (!geo_parse(ns, start_binval, startl, + &srange->cellid, &srange->region)) { + cf_warning(AS_GEO, "failed to parse query GeoJSON"); + goto Cleanup; + } + + if (srange->cellid && srange->region) { + geo_region_destroy(srange->region); + srange->region = NULL; + cf_warning(AS_GEO, "query geo_parse: both point and region"); + goto Cleanup; + } + + if (!srange->cellid && !srange->region) { + cf_warning(AS_GEO, "query geo_parse: neither point nor region"); + goto Cleanup; + } + + if (srange->cellid) { + // REGIONS-CONTAINING-POINT QUERY + + uint64_t center[MAX_REGION_LEVELS]; + int numcenters; + if (!geo_point_centers(ns, srange->cellid, MAX_REGION_LEVELS, + center, &numcenters)) { + cf_warning(AS_GEO, "Query point invalid"); + goto Cleanup; + } + + // Geospatial queries use multiple srange 
elements. Many + // of the fields are copied from the first cell because + // they were filled in above. + for (int ii = 0; ii < numcenters; ++ii) { + srange[ii].num_binval = 1; + srange[ii].isrange = true; + srange[ii].start.id = srange[0].start.id; + srange[ii].start.type = srange[0].start.type; + srange[ii].start.u.i64 = center[ii]; + srange[ii].end.id = srange[0].end.id; + srange[ii].end.type = srange[0].end.type; + srange[ii].end.u.i64 = center[ii]; + srange[ii].itype = srange[0].itype; + } + } else { + // POINTS-INSIDE-REGION QUERY + + uint64_t cellmin[MAX_REGION_CELLS]; + uint64_t cellmax[MAX_REGION_CELLS]; + int numcells; + if (!geo_region_cover(ns, srange->region, MAX_REGION_CELLS, + NULL, cellmin, cellmax, &numcells)) { + cf_warning(AS_GEO, "Query region invalid."); + goto Cleanup; + } + + cf_atomic64_incr(&ns->geo_region_query_count); + cf_atomic64_add(&ns->geo_region_query_cells, numcells); + + // Geospatial queries use multiple srange elements. Many + // of the fields are copied from the first cell because + // they were filled in above. 
+ for (int ii = 0; ii < numcells; ++ii) { + srange[ii].num_binval = 1; + srange[ii].isrange = true; + srange[ii].start.id = srange[0].start.id; + srange[ii].start.type = srange[0].start.type; + srange[ii].start.u.i64 = cellmin[ii]; + srange[ii].end.id = srange[0].end.id; + srange[ii].end.type = srange[0].end.type; + srange[ii].end.u.i64 = cellmax[ii]; + srange[ii].itype = srange[0].itype; + } + } + } else { + cf_warning(AS_SINDEX, "Only handle String, Numeric and GeoJSON type"); + goto Cleanup; + } + srange->num_binval = numrange; + } + return AS_SINDEX_OK; + +Cleanup: + return AS_SINDEX_ERR_PARAM; +} + +/* + * Function as_sindex_rangep_from_msg + * + * Arguments + * ns - the namespace on which srange has to be build + * msgp - the msgp from which sent + * srange - it builds this srange + * + * Returns + * AS_SINDEX_OK - On success + * else the return value of as_sindex_range_from_msg + * + * Description + * Allocating space for srange and then calling as_sindex_range_from_msg. + */ +int +as_sindex_rangep_from_msg(as_namespace *ns, as_msg *msgp, as_sindex_range **srange) +{ + cf_debug(AS_SINDEX, "as_sindex_rangep_from_msg"); + + // NOTE - to support geospatial queries we allocate an array of + // MAX_REGION_CELLS length. Nongeospatial queries use only the + // first element. Geospatial queries use one element per region + // cell, up to MAX_REGION_CELLS. + *srange = cf_malloc(sizeof(as_sindex_range) * MAX_REGION_CELLS); + + int ret = as_sindex_range_from_msg(ns, msgp, *srange); + if (AS_SINDEX_OK != ret) { + as_sindex_range_free(srange); + *srange = NULL; + return ret; + } + return AS_SINDEX_OK; +} + +/* + * Returns - + * AS_SINDEX_ERR_PARAM + * o/w return value from ai_btree_query + * + * Notes - + * Client API to do range get from index based on passed in range key, returns + * digest list + * + * Synchronization - + * + */ +int +as_sindex_query(as_sindex *si, as_sindex_range *srange, as_sindex_qctx *qctx) +{ + if (! si || ! 
srange) { + return AS_SINDEX_ERR_PARAM; + } + + as_sindex_metadata *imd = si->imd; + as_sindex_pmetadata *pimd = &imd->pimd[qctx->pimd_idx]; + + if (! as_sindex_can_query(si)) { + return AS_SINDEX_ERR_NOT_READABLE; + } + + PIMD_RLOCK(&pimd->slock); + int ret = ai_btree_query(imd, srange, qctx); + PIMD_RUNLOCK(&pimd->slock); + + as_sindex__process_ret(si, ret, AS_SINDEX_OP_READ, + 0 /* No histogram for query per call */, __LINE__); + + return ret; +} +// END - SINDEX QUERY +// ************************************************************************************************ +// ************************************************************************************************ +// SBIN UTILITY +void +as_sindex_init_sbin(as_sindex_bin * sbin, as_sindex_op op, as_particle_type type, as_sindex * si) +{ + sbin->si = si; + sbin->to_free = false; + sbin->num_values = 0; + sbin->op = op; + sbin->heap_capacity = 0; + sbin->type = type; + sbin->values = NULL; +} + +int +as_sindex_sbin_free(as_sindex_bin *sbin) +{ + if (sbin->to_free) { + if (sbin->values) { + cf_free(sbin->values); + } + } + return AS_SINDEX_OK; +} + +int +as_sindex_sbin_freeall(as_sindex_bin *sbin, int numbins) +{ + for (int i = 0; i < numbins; i++) { + as_sindex_sbin_free(&sbin[i]); + } + return AS_SINDEX_OK; +} + +as_sindex_status +as_sindex__op_by_sbin(as_namespace *ns, const char *set, int numbins, as_sindex_bin *start_sbin, cf_digest * pkey) +{ + // If numbins == 0 return AS_SINDEX_OK + // Iterate through sbins + // Reserve the SI. + // Take the read lock on imd + // Get a value from sbin + // Get the related pimd + // Get the pimd write lock + // If op is DELETE delete the values from sbin from sindex + // If op is INSERT put all the values from bin in sindex. + // Release the pimd lock + // Release the imd lock. + // Release the SI. 
+ + as_sindex_status retval = AS_SINDEX_OK; + if (!ns || !start_sbin) { + return AS_SINDEX_ERR; + } + + // If numbins != 1 return AS_SINDEX_OK + if (numbins != 1 ) { + return AS_SINDEX_OK; + } + + as_sindex * si = NULL; + as_sindex_bin * sbin = NULL; + as_sindex_metadata * imd = NULL; + as_sindex_pmetadata * pimd = NULL; + as_sindex_op op; + // Iterate through sbins + for (int i=0; isi; + if (!si) { + cf_warning(AS_SINDEX, "as_sindex_op_by_sbin : si is null in sbin"); + return AS_SINDEX_ERR; + } + imd = si->imd; + op = sbin->op; + // Take the read lock on imd + for (int j=0; jnum_values; j++) { + + // Get a value from sbin + void * skey; + switch (sbin->type) { + case AS_PARTICLE_TYPE_INTEGER: + case AS_PARTICLE_TYPE_GEOJSON: + if (j==0) { + skey = (void *)&(sbin->value.int_val); + } + else { + skey = (void *)((uint64_t *)(sbin->values) + j); + } + break; + case AS_PARTICLE_TYPE_STRING: + if (j==0) { + skey = (void *)&(sbin->value.str_val); + } + else { + skey = (void *)((cf_digest *)(sbin->values) + j); + } + break; + default: + retval = AS_SINDEX_ERR; + goto Cleanup; + } + // Get the related pimd + pimd = &imd->pimd[ai_btree_key_hash(imd, skey)]; + uint64_t starttime = 0; + if (si->enable_histogram) { + starttime = cf_getns(); + } + + // Get the pimd write lock + PIMD_WLOCK(&pimd->slock); + + // If op is DELETE delete the value from sindex + int ret = AS_SINDEX_OK; + if (op == AS_SINDEX_OP_DELETE) { + ret = ai_btree_delete(imd, pimd, skey, pkey); + } + else if (op == AS_SINDEX_OP_INSERT) { + // If op is INSERT put the value in sindex. + ret = ai_btree_put(imd, pimd, skey, pkey); + } + + // Release the pimd lock + PIMD_WUNLOCK(&pimd->slock); + as_sindex__process_ret(si, ret, op, starttime, __LINE__); + } + cf_debug(AS_SINDEX, " Secondary Index Op Finish------------- "); + + // Release the imd lock. + // Release the SI. 
+ + } +Cleanup: + return retval; +} +// END - SBIN UTILITY +// ************************************************************************************************ +// ************************************************************************************************ +// ADD TO SBIN + + +as_sindex_status +as_sindex_add_sbin_value_in_heap(as_sindex_bin * sbin, void * val) +{ + // Get the size of the data we are going to store + // If to_free = false, this means this is the first + // time we are storing value for this sbin to heap + // Check if there is need to copy the existing data from stack_buf + // init_storage(num_values) + // If num_values != 0 + // Copy the existing data from stack to heap + // reduce the used stack_buf size + // to_free = true; + // Else + // If (num_values == heap_capacity) + // extend the allocation and capacity + // Copy the value to the appropriate position. + + uint32_t size = 0; + bool to_copy = false; + uint8_t data_sz = 0; + void * tmp_value = NULL; + sbin_value_pool * stack_buf = sbin->stack_buf; + + // Get the size of the data we are going to store + if (sbin->type == AS_PARTICLE_TYPE_INTEGER || + sbin->type == AS_PARTICLE_TYPE_GEOJSON) { + data_sz = sizeof(uint64_t); + } + else if (sbin->type == AS_PARTICLE_TYPE_STRING) { + data_sz = sizeof(cf_digest); + } + else { + cf_warning(AS_SINDEX, "Bad type of data to index %d", sbin->type); + return AS_SINDEX_ERR; + } + + // If to_free = false, this means this is the first + // time we are storing value for this sbin to heap + // Check if there is need to copy the existing data from stack_buf + if (!sbin->to_free) { + if (sbin->num_values == 0) { + size = 2; + } + else if (sbin->num_values == 1) { + to_copy = true; + size = 2; + tmp_value = &sbin->value; + } + else if (sbin->num_values > 1) { + to_copy = true; + size = 2 * sbin->num_values; + tmp_value = sbin->values; + } + else { + cf_warning(AS_SINDEX, "num_values in sbin is less than 0 %"PRIu64"", sbin->num_values); + return AS_SINDEX_ERR; + 
} + + sbin->values = cf_malloc(data_sz * size); + sbin->to_free = true; + sbin->heap_capacity = size; + + // Copy the existing data from stack to heap + // reduce the used stack_buf size + if (to_copy) { + if (!memcpy(sbin->values, tmp_value, data_sz * sbin->num_values)) { + cf_warning(AS_SINDEX, "memcpy failed"); + return AS_SINDEX_ERR; + } + if (sbin->num_values != 1) { + stack_buf->used_sz -= (sbin->num_values * data_sz); + } + } + } + else + { + // Else + // If (num_values == heap_capacity) + // extend the allocation and capacity + if (sbin->heap_capacity == sbin->num_values) { + sbin->heap_capacity = 2 * sbin->heap_capacity; + sbin->values = cf_realloc(sbin->values, sbin->heap_capacity * data_sz); + } + } + + // Copy the value to the appropriate position. + if (sbin->type == AS_PARTICLE_TYPE_INTEGER || + sbin->type == AS_PARTICLE_TYPE_GEOJSON) { + if (!memcpy((void *)((uint64_t *)sbin->values + sbin->num_values), (void *)val, data_sz)) { + cf_warning(AS_SINDEX, "memcpy failed"); + return AS_SINDEX_ERR; + } + } + else if (sbin->type == AS_PARTICLE_TYPE_STRING) { + if (!memcpy((void *)((cf_digest *)sbin->values + sbin->num_values), (void *)val, data_sz)) { + cf_warning(AS_SINDEX, "memcpy failed"); + return AS_SINDEX_ERR; + } + } + else { + cf_warning(AS_SINDEX, "Bad type of data to index %d", sbin->type); + return AS_SINDEX_ERR; + } + + sbin->num_values++; + return AS_SINDEX_OK; +} + +as_sindex_status +as_sindex_add_value_to_sbin(as_sindex_bin * sbin, uint8_t * val) +{ + // If this is the first value coming to the sbin + // assign the value to the local variable of struct. 
+ // Else + // If to_free is true or stack_buf is full + // add value to the heap + // else + // If needed copy the values stored in sbin to stack_buf + // add the value to end of stack buf + + int data_sz = 0; + if (sbin->type == AS_PARTICLE_TYPE_STRING) { + data_sz = sizeof(cf_digest); + } + else if (sbin->type == AS_PARTICLE_TYPE_INTEGER || + sbin->type == AS_PARTICLE_TYPE_GEOJSON) { + data_sz = sizeof(uint64_t); + } + else { + cf_warning(AS_SINDEX, "sbin type is invalid %d", sbin->type); + return AS_SINDEX_ERR; + } + + sbin_value_pool * stack_buf = sbin->stack_buf; + if (sbin->num_values == 0 ) { + if (sbin->type == AS_PARTICLE_TYPE_STRING) { + sbin->value.str_val = *(cf_digest *)val; + } + else if (sbin->type == AS_PARTICLE_TYPE_INTEGER || + sbin->type == AS_PARTICLE_TYPE_GEOJSON) { + sbin->value.int_val = *(int64_t *)val; + } + sbin->num_values++; + } + else if (sbin->num_values == 1) { + if ((stack_buf->used_sz + data_sz + data_sz) > AS_SINDEX_VALUESZ_ON_STACK ) { + if (as_sindex_add_sbin_value_in_heap(sbin, (void *)val)) { + cf_warning(AS_SINDEX, "Adding value in sbin failed."); + return AS_SINDEX_ERR; + } + } + else { + // sbin->values gets initiated here + sbin->values = stack_buf->value + stack_buf->used_sz; + + if (!memcpy(sbin->values, (void *)&sbin->value, data_sz)) { + cf_warning(AS_SINDEX, "Memcpy failed"); + return AS_SINDEX_ERR; + } + stack_buf->used_sz += data_sz; + + if (!memcpy((void *)((uint8_t *)sbin->values + data_sz * sbin->num_values), (void *)val, data_sz)) { + cf_warning(AS_SINDEX, "Memcpy failed"); + return AS_SINDEX_ERR; + } + sbin->num_values++; + stack_buf->used_sz += data_sz; + } + } + else if (sbin->num_values > 1) { + if (sbin->to_free || (stack_buf->used_sz + data_sz ) > AS_SINDEX_VALUESZ_ON_STACK ) { + if (as_sindex_add_sbin_value_in_heap(sbin, (void *)val)) { + cf_warning(AS_SINDEX, "Adding value in sbin failed."); + return AS_SINDEX_ERR; + } + } + else { + if (!memcpy((void *)((uint8_t *)sbin->values + data_sz * 
sbin->num_values), (void *)val, data_sz)) { + cf_warning(AS_SINDEX, "Memcpy failed"); + return AS_SINDEX_ERR; + } + sbin->num_values++; + stack_buf->used_sz += data_sz; + } + } + else { + cf_warning(AS_SINDEX, "numvalues is coming as negative. Possible memory corruption in sbin."); + return AS_SINDEX_ERR; + } + return AS_SINDEX_OK; +} + +as_sindex_status +as_sindex_add_integer_to_sbin(as_sindex_bin * sbin, uint64_t val) +{ + return as_sindex_add_value_to_sbin(sbin, (uint8_t * )&val); +} + +as_sindex_status +as_sindex_add_digest_to_sbin(as_sindex_bin * sbin, cf_digest val_dig) +{ + return as_sindex_add_value_to_sbin(sbin, (uint8_t * )&val_dig); +} + +as_sindex_status +as_sindex_add_string_to_sbin(as_sindex_bin * sbin, char * val) +{ + if (!val) { + return AS_SINDEX_ERR; + } + // Calculate digest and cal add_digest_to_sbin + cf_digest val_dig; + cf_digest_compute(val, strlen(val), &val_dig); + return as_sindex_add_digest_to_sbin(sbin, val_dig); +} +// END - ADD TO SBIN +// ************************************************************************************************ +// ************************************************************************************************ +// ADD KEYTYPE FROM BASIC TYPE ASVAL +as_sindex_status +as_sindex_add_long_from_asval(as_val *val, as_sindex_bin *sbin) +{ + if (!val) { + return AS_SINDEX_ERR; + } + if (sbin->type != AS_PARTICLE_TYPE_INTEGER) { + return AS_SINDEX_ERR; + } + + as_integer *i = as_integer_fromval(val); + if (!i) { + return AS_SINDEX_ERR; + } + uint64_t int_val = (uint64_t)as_integer_get(i); + return as_sindex_add_integer_to_sbin(sbin, int_val); +} + +as_sindex_status +as_sindex_add_digest_from_asval(as_val *val, as_sindex_bin *sbin) +{ + if (!val) { + return AS_SINDEX_ERR; + } + if (sbin->type != AS_PARTICLE_TYPE_STRING) { + return AS_SINDEX_ERR; + } + + as_string *s = as_string_fromval(val); + if (!s) { + return AS_SINDEX_ERR; + } + char * str_val = as_string_get(s); + return as_sindex_add_string_to_sbin(sbin, str_val); 
+} + +as_sindex_status +as_sindex_add_geo2dsphere_from_as_val(as_val *val, as_sindex_bin *sbin) +{ + if (!val) { + return AS_SINDEX_ERR; + } + if (sbin->type != AS_PARTICLE_TYPE_GEOJSON) { + return AS_SINDEX_ERR; + } + + as_geojson *g = as_geojson_fromval(val); + if (!g) { + return AS_SINDEX_ERR; + } + + const char *s = as_geojson_get(g); + size_t jsonsz = as_geojson_len(g); + uint64_t parsed_cellid = 0; + geo_region_t parsed_region = NULL; + + if (! geo_parse(NULL, s, jsonsz, &parsed_cellid, &parsed_region)) { + cf_warning(AS_PARTICLE, "geo_parse() failed - unexpected"); + geo_region_destroy(parsed_region); + return AS_SINDEX_ERR; + } + + if (parsed_cellid) { + if (parsed_region) { + geo_region_destroy(parsed_region); + cf_warning(AS_PARTICLE, "geo_parse found both point and region"); + return AS_SINDEX_ERR; + } + + // POINT + if (as_sindex_add_integer_to_sbin(sbin, parsed_cellid) != AS_SINDEX_OK) { + cf_warning(AS_PARTICLE, "as_sindex_add_integer_to_sbin() failed - unexpected"); + return AS_SINDEX_ERR; + } + } + else if (parsed_region) { + // REGION + int numcells; + uint64_t outcells[MAX_REGION_CELLS]; + + if (! 
geo_region_cover(NULL, parsed_region, MAX_REGION_CELLS, outcells, NULL, NULL, &numcells)) { + geo_region_destroy(parsed_region); + cf_warning(AS_PARTICLE, "geo_region_cover failed"); + return AS_SINDEX_ERR; + } + + geo_region_destroy(parsed_region); + + int added = 0; + for (size_t i = 0; i < numcells; i++) { + if (as_sindex_add_integer_to_sbin(sbin, outcells[i]) == AS_SINDEX_OK) { + added++; + } + else { + cf_warning(AS_PARTICLE, "as_sindex_add_integer_to_sbin() failed - unexpected"); + } + } + + if (added == 0 && numcells > 0) { + return AS_SINDEX_ERR; + } + } + else { + cf_warning(AS_PARTICLE, "geo_parse found neither point nor region"); + return AS_SINDEX_ERR; + } + + return AS_SINDEX_OK; +} + +typedef as_sindex_status (*as_sindex_add_keytype_from_asval_fn) +(as_val *val, as_sindex_bin * sbin); +static const as_sindex_add_keytype_from_asval_fn + as_sindex_add_keytype_from_asval[COL_TYPE_MAX] = { + NULL, + as_sindex_add_long_from_asval, + as_sindex_add_digest_from_asval, + as_sindex_add_geo2dsphere_from_as_val // 3 +}; + +// END - ADD KEYTYPE FROM BASIC TYPE ASVAL +// ************************************************************************************************ +// ************************************************************************************************ +// ADD ASVAL TO SINDEX TYPE +as_sindex_status +as_sindex_add_asval_to_default_sindex(as_val *val, as_sindex_bin * sbin) +{ + return as_sindex_add_keytype_from_asval[as_sindex_sktype_from_pktype(sbin->type)](val, sbin); +} + +static bool as_sindex_add_listvalues_foreach(as_val * element, void * udata) +{ + as_sindex_bin * sbin = (as_sindex_bin *)udata; + as_sindex_add_keytype_from_asval[as_sindex_sktype_from_pktype(sbin->type)](element, sbin); + return true; +} + +as_sindex_status +as_sindex_add_asval_to_list_sindex(as_val *val, as_sindex_bin * sbin) +{ + // If val type is not AS_LIST + // return AS_SINDEX_ERR + // Else iterate through all values of list + // If type == AS_PARTICLE_TYPE_STRING + // add 
all string type values to the sbin + // If type == AS_PARTICLE_TYPE_INTEGER + // add all integer type values to the sbin + + // If val type is not AS_LIST + // return AS_SINDEX_ERR + if (!val) { + return AS_SINDEX_ERR; + } + if (val->type != AS_LIST) { + return AS_SINDEX_ERR; + } + // Else iterate through all elements of map + as_list * list = as_list_fromval(val); + if (as_list_foreach(list, as_sindex_add_listvalues_foreach, sbin)) { + return AS_SINDEX_OK; + } + return AS_SINDEX_ERR; +} + +static bool as_sindex_add_mapkeys_foreach(const as_val * key, const as_val * val, void * udata) +{ + as_sindex_bin * sbin = (as_sindex_bin *)udata; + as_sindex_add_keytype_from_asval[as_sindex_sktype_from_pktype(sbin->type)]((as_val *)key, sbin); + return true; +} + +static bool as_sindex_add_mapvalues_foreach(const as_val * key, const as_val * val, void * udata) +{ + as_sindex_bin * sbin = (as_sindex_bin *)udata; + as_sindex_add_keytype_from_asval[as_sindex_sktype_from_pktype(sbin->type)]((as_val *)val, sbin); + return true; +} + +as_sindex_status +as_sindex_add_asval_to_mapkeys_sindex(as_val *val, as_sindex_bin * sbin) +{ + // If val type is not AS_MAP + // return AS_SINDEX_ERR + // Defensive check. Should not happen. 
+ if (!val) { + return AS_SINDEX_ERR; + } + if (val->type != AS_MAP) { + cf_warning(AS_SINDEX, "Unexpected wrong type %d", val->type); + return AS_SINDEX_ERR; + } + + // Else iterate through all keys of map + as_map * map = as_map_fromval(val); + if (as_map_foreach(map, as_sindex_add_mapkeys_foreach, sbin)) { + return AS_SINDEX_OK; + } + return AS_SINDEX_ERR; +} + +as_sindex_status +as_sindex_add_asval_to_mapvalues_sindex(as_val *val, as_sindex_bin * sbin) +{ + // If val type is not AS_MAP + // return AS_SINDEX_ERR + // Else iterate through all values of all keys of the map + // If type == AS_PARTICLE_TYPE_STRING + // add all string type values to the sbin + // If type == AS_PARTICLE_TYPE_INTEGER + // add all integer type values to the sbin + + // If val type is not AS_MAP + // return AS_SINDEX_ERR + if (!val) { + return AS_SINDEX_ERR; + } + if (val->type != AS_MAP) { + return AS_SINDEX_ERR; + } + // Else iterate through all keys, values of map + as_map * map = as_map_fromval(val); + if (as_map_foreach(map, as_sindex_add_mapvalues_foreach, sbin)) { + return AS_SINDEX_OK; + } + return AS_SINDEX_ERR; +} + +typedef as_sindex_status (*as_sindex_add_asval_to_itype_sindex_fn) +(as_val *val, as_sindex_bin * sbin); +static const as_sindex_add_asval_to_itype_sindex_fn + as_sindex_add_asval_to_itype_sindex[AS_SINDEX_ITYPE_MAX] = { + as_sindex_add_asval_to_default_sindex, + as_sindex_add_asval_to_list_sindex, + as_sindex_add_asval_to_mapkeys_sindex, + as_sindex_add_asval_to_mapvalues_sindex +}; +// END - ADD ASVAL TO SINDEX TYPE +// ************************************************************************************************ +// ************************************************************************************************ +// DIFF FROM BIN TO SINDEX + +static bool +as_sindex_bin_add_skey(as_sindex_bin *sbin, const void *skey, as_val_t type) +{ + if (type == AS_STRING) { + if (as_sindex_add_digest_to_sbin(sbin, *((cf_digest *)skey)) == AS_SINDEX_OK) { + return true; + } + 
} + else if (type == AS_INTEGER) { + if (as_sindex_add_integer_to_sbin(sbin, *((uint64_t *)skey)) == AS_SINDEX_OK) { + return true; + } + } + + return false; +} + +static void +packed_val_init_unpacker(const cdt_payload *val, as_unpacker *pk) +{ + pk->buffer = val->ptr; + pk->length = val->sz; + pk->offset = 0; +} + +static bool +packed_val_make_skey(const cdt_payload *val, as_val_t type, void *skey) +{ + as_unpacker pk; + packed_val_init_unpacker(val, &pk); + + as_val_t packed_type = as_unpack_peek_type(&pk); + + if (packed_type != type) { + return false; + } + + if (type == AS_STRING) { + int32_t size = as_unpack_blob_size(&pk); + + if (size < 0) { + return false; + } + + if (pk.buffer[pk.offset++] != AS_BYTES_STRING) { + return false; + } + + cf_digest_compute(pk.buffer + pk.offset, pk.length - pk.offset, (cf_digest *)skey); + } + else if (type == AS_INTEGER) { + if (as_unpack_int64(&pk, (int64_t *)skey) < 0) { + return false; + } + } + else { + return false; + } + + return true; +} + +static bool +packed_val_add_sbin_or_update_shash(cdt_payload *val, as_sindex_bin *sbin, cf_shash *hash, as_val_t type) +{ + uint8_t skey[sizeof(cf_digest)]; + + if (! packed_val_make_skey(val, type, skey)) { + // packed_vals that aren't of type are ignored. + return true; + } + + bool found = false; + + if (cf_shash_get(hash, skey, &found) != CF_SHASH_OK) { + // Item not in hash, add to sbin. + return as_sindex_bin_add_skey(sbin, skey, type); + } + else { + // Item is in hash, set it to true. + found = true; + cf_shash_put(hash, skey, &found); + + return true; + } + + return false; +} + +static void +shash_add_packed_val(cf_shash *h, const cdt_payload *val, as_val_t type, bool value) +{ + uint8_t skey[sizeof(cf_digest)]; + + if (! packed_val_make_skey(val, type, skey)) { + // packed_vals that aren't of type are ignored. 
+ return; + } + + cf_shash_put(h, skey, &value); +} + +static int +shash_diff_reduce_fn(const void *skey, void *data, void *udata) +{ + bool value = *(bool *)data; + as_sindex_bin *sbin = (as_sindex_bin *)udata; + + if (! sbin) { + cf_debug(AS_SINDEX, "SBIN sent as NULL"); + return -1; + } + + if (! value) { + // Add in the sbin. + if (sbin->type == AS_PARTICLE_TYPE_STRING) { + as_sindex_add_digest_to_sbin(sbin, *(const cf_digest*)skey); + } + else if (sbin->type == AS_PARTICLE_TYPE_INTEGER) { + as_sindex_add_integer_to_sbin(sbin, *(const uint64_t*)skey); + } + } + + return 0; +} + +// Find delta list elements and put them into sbins. +// Currently supports only string/integer index types. +static int32_t +as_sindex_sbins_sindex_list_diff_populate(as_sindex_bin *sbins, as_sindex *si, const as_bin *b_old, const as_bin *b_new) +{ + // Algorithm + // Add elements of short_list into hash with value = false + // Iterate through all the values in the long_list + // For all elements of long_list in hash, set value = true + // For all elements of long_list not in hash, add to sbin (insert or delete) + // Iterate through all the elements of hash + // For all elements where value == false, add to sbin (insert or delete) + + as_particle_type type = as_sindex_pktype(si->imd); + int data_size; + as_val_t expected_type; + + if (type == AS_PARTICLE_TYPE_STRING) { + data_size = 20; + expected_type = AS_STRING; + } + else if (type == AS_PARTICLE_TYPE_INTEGER) { + data_size = 8; + expected_type = AS_INTEGER; + } + else { + cf_debug(AS_SINDEX, "Invalid data type %d", type); + return -1; + } + + cdt_payload old_val; + cdt_payload new_val; + + as_bin_particle_list_get_packed_val(b_old, &old_val); + as_bin_particle_list_get_packed_val(b_new, &new_val); + + as_unpacker pk_old; + as_unpacker pk_new; + + packed_val_init_unpacker(&old_val, &pk_old); + packed_val_init_unpacker(&new_val, &pk_new); + + int64_t old_list_count = as_unpack_list_header_element_count(&pk_old); + int64_t 
new_list_count = as_unpack_list_header_element_count(&pk_new); + + if (old_list_count < 0 || new_list_count < 0) { + return -1; + } + + // Skip msgpack ext if it exist as the first element. + if (old_list_count != 0 && as_unpack_peek_is_ext(&pk_old)) { + if (as_unpack_size(&pk_old) < 0) { + return -1; + } + + old_list_count--; + } + + if (new_list_count != 0 && as_unpack_peek_is_ext(&pk_new)) { + if (as_unpack_size(&pk_new) < 0) { + return -1; + } + + new_list_count--; + } + + bool old_list_is_short = old_list_count < new_list_count; + + uint32_t short_list_count; + uint32_t long_list_count; + as_unpacker *pk_short; + as_unpacker *pk_long; + + if (old_list_is_short) { + short_list_count = (uint32_t)old_list_count; + long_list_count = (uint32_t)new_list_count; + pk_short = &pk_old; + pk_long = &pk_new; + } + else { + short_list_count = (uint32_t)new_list_count; + long_list_count = (uint32_t)old_list_count; + pk_short = &pk_new; + pk_long = &pk_old; + } + + if (short_list_count == 0) { + if (long_list_count == 0) { + return 0; + } + + as_sindex_init_sbin(sbins, old_list_is_short ? AS_SINDEX_OP_INSERT : AS_SINDEX_OP_DELETE, type, si); + + for (uint32_t i = 0; i < long_list_count; i++) { + cdt_payload ele; + + ele.ptr = pk_long->buffer + pk_long->offset; + + // Check the unpacked size before using it, exactly as the + // short-list loop below does; a negative value means the packed + // list is malformed and must not be stored in ele.sz. + int size = as_unpack_size(pk_long); + + if (size < 0) { + cf_warning(AS_SINDEX, "as_sindex_sbins_sindex_list_diff_populate() list unpack failed"); + as_sindex_sbin_free(sbins); + return -1; + } + + ele.sz = size; + + // sizeof(cf_digest) is big enough for all key types we support so far. + uint8_t skey[sizeof(cf_digest)]; + + if (! packed_val_make_skey(&ele, expected_type, skey)) { + // packed_vals that aren't of type are ignored. + continue; + } + + if (! as_sindex_bin_add_skey(sbins, skey, expected_type)) { + cf_warning(AS_SINDEX, "as_sindex_sbins_sindex_list_diff_populate() as_sindex_bin_add_skey failed"); + as_sindex_sbin_free(sbins); + return -1; + } + } + + return sbins->num_values == 0 ? 0 : 1; + } + + cf_shash *hash = cf_shash_create(cf_shash_fn_u32, data_size, 1, short_list_count, 0); + + // Add elements of shorter list into hash with value = false. 
+ for (uint32_t i = 0; i < short_list_count; i++) { + cdt_payload ele = { + .ptr = pk_short->buffer + pk_short->offset + }; + + int size = as_unpack_size(pk_short); + + if (size < 0) { + cf_warning(AS_SINDEX, "as_sindex_sbins_sindex_list_diff_populate() list unpack failed"); + cf_shash_destroy(hash); + return -1; + } + + ele.sz = size; + shash_add_packed_val(hash, &ele, expected_type, false); + } + + as_sindex_init_sbin(sbins, old_list_is_short ? AS_SINDEX_OP_INSERT : AS_SINDEX_OP_DELETE, type, si); + + for (uint32_t i = 0; i < long_list_count; i++) { + cdt_payload ele; + + ele.ptr = pk_long->buffer + pk_long->offset; + + // Same size check as the short-list loop above; on failure release + // both the sbin and the hash before reporting the error. + int size = as_unpack_size(pk_long); + + if (size < 0) { + cf_warning(AS_SINDEX, "as_sindex_sbins_sindex_list_diff_populate() list unpack failed"); + as_sindex_sbin_free(sbins); + cf_shash_destroy(hash); + return -1; + } + + ele.sz = size; + + if (! packed_val_add_sbin_or_update_shash(&ele, sbins, hash, expected_type)) { + cf_warning(AS_SINDEX, "as_sindex_sbins_sindex_list_diff_populate() hash update failed"); + as_sindex_sbin_free(sbins); + cf_shash_destroy(hash); + return -1; + } + } + + // Need to keep track of start for unwinding on error. + as_sindex_bin *start_sbin = sbins; + int found = 0; + + if (sbins->num_values > 0) { + sbins++; + found++; + } + + as_sindex_init_sbin(sbins, old_list_is_short ? AS_SINDEX_OP_DELETE : AS_SINDEX_OP_INSERT, type, si); + + // Iterate through all the elements of hash. 
+ if (cf_shash_reduce(hash, shash_diff_reduce_fn, sbins) != 0) { + as_sindex_sbin_freeall(start_sbin, found + 1); + cf_shash_destroy(hash); + return -1; + } + + if (sbins->num_values > 0) { + found++; + } + + cf_shash_destroy(hash); + + return found; +} + +void +as_sindex_sbins_debug_print(as_sindex_bin *sbins, uint32_t count) +{ + cf_warning( AS_SINDEX, "as_sindex_sbins_list_update_diff() found=%d", count); + for (uint32_t i = 0; i < count; i++) { + as_sindex_bin *p = sbins + i; + + cf_warning( AS_SINDEX, " %d: values= %"PRIu64" type=%d op=%d", + i, p->num_values, p->type, p->op); + + if (p->type == AS_PARTICLE_TYPE_INTEGER) { + int64_t *values = (int64_t *)p->values; + + if (p->num_values == 1) { + cf_warning( AS_SINDEX, " %ld", p->value.int_val); + } + else { + for (uint64_t j = 0; j < p->num_values; j++) { + cf_warning( AS_SINDEX, " %"PRIu64": %"PRId64"", j, values[j]); + } + } + } + } +} + +// Assumes b_old and b_new are AS_PARTICLE_TYPE_LIST bins. +// Assumes b_old and b_new have the same id. +static int32_t +as_sindex_sbins_list_diff_populate(as_sindex_bin *sbins, as_namespace *ns, const char *set_name, const as_bin *b_old, const as_bin *b_new) +{ + uint16_t id = b_new->id; + + if (! as_sindex_binid_has_sindex(ns, id)) { + return 0; + } + + cf_ll *simatch_ll = NULL; + as_sindex__simatch_list_by_set_binid(ns, set_name, id, &simatch_ll); + + if (! simatch_ll) { + return 0; + } + + uint32_t populated = 0; + + for (cf_ll_element *ele = cf_ll_get_head(simatch_ll); ele; ele = ele->next) { + sindex_set_binid_hash_ele *si_ele = (sindex_set_binid_hash_ele *)ele; + int simatch = si_ele->simatch; + as_sindex *si = &ns->sindex[simatch]; + + if (! 
as_sindex_isactive(si)) { + // Skip inactive indexes. The for-loop header already advances ele; + // advancing it here as well skipped the entry after an inactive index + // and dereferenced NULL when the inactive entry was the last one. + continue; + } + + int32_t delta = as_sindex_sbins_sindex_list_diff_populate(&sbins[populated], si, b_old, b_new); + + if (delta < 0) { + // Release the sbins filled for earlier indexes: the caller reuses + // the array from slot 0 on failure, which would otherwise leak any + // heap-allocated values they hold. + as_sindex_sbin_freeall(sbins, (int)populated); + return -1; + } + + populated += delta; + } + + return populated; +} + +uint32_t +as_sindex_sbins_populate(as_sindex_bin *sbins, as_namespace *ns, const char *set_name, const as_bin *b_old, const as_bin *b_new) +{ + if (as_bin_get_particle_type(b_old) == AS_PARTICLE_TYPE_LIST && as_bin_get_particle_type(b_new) == AS_PARTICLE_TYPE_LIST) { + int32_t ret = as_sindex_sbins_list_diff_populate(sbins, ns, set_name, b_old, b_new); + + if (ret >= 0) { + return (uint32_t)ret; + } + } + + uint32_t populated = 0; + + // TODO - might want an optimization that detects the (rare) case when a + // particle was rewritten with the exact old value. + populated += as_sindex_sbins_from_bin(ns, set_name, b_old, &sbins[populated], AS_SINDEX_OP_DELETE); + populated += as_sindex_sbins_from_bin(ns, set_name, b_new, &sbins[populated], AS_SINDEX_OP_INSERT); + + return populated; +} +// DIFF FROM BIN TO SINDEX +// ************************************************************************************************ +// ************************************************************************************************ +// SBIN INTERFACE FUNCTIONS +int +as_sindex_sbin_from_sindex(as_sindex * si, const as_bin *b, as_sindex_bin * sbin, as_val ** cdt_asval) +{ + as_sindex_metadata * imd = si->imd; + as_particle_type imd_sktype = as_sindex_pktype(imd); + as_val * cdt_val = * cdt_asval; + uint32_t valsz = 0; + int sindex_found = 0; + as_particle_type bin_type = 0; + bool found = false; + + bin_type = as_bin_get_particle_type(b); + + // Prepare si + // If path_length == 0 + if (imd->path_length == 0) { + // If itype == AS_SINDEX_ITYPE_DEFAULT and bin_type == STRING OR INTEGER + // Add the value to the sbin. 
+ if (imd->itype == AS_SINDEX_ITYPE_DEFAULT && bin_type == imd_sktype) { + if (bin_type == AS_PARTICLE_TYPE_INTEGER) { + found = true; + sbin->value.int_val = as_bin_particle_integer_value(b); + + if (as_sindex_add_integer_to_sbin(sbin, (uint64_t)sbin->value.int_val) == AS_SINDEX_OK) { + if (sbin->num_values) { + sindex_found++; + } + } + } + else if (bin_type == AS_PARTICLE_TYPE_STRING) { + found = true; + char* bin_val; + valsz = as_bin_particle_string_ptr(b, &bin_val); + + if (valsz > AS_SINDEX_MAX_STRING_KSIZE) { + cf_warning( AS_SINDEX, "sindex key size out of bounds %d ", valsz); + cf_warning(AS_SINDEX, "Sindex on bin %s fails. Value length %u too long.", imd->bname, valsz); + } + else { + cf_digest buf_dig; + cf_digest_compute(bin_val, valsz, &buf_dig); + + if (as_sindex_add_digest_to_sbin(sbin, buf_dig) == AS_SINDEX_OK) { + if (sbin->num_values) { + sindex_found++; + } + } + } + } + else if (bin_type == AS_PARTICLE_TYPE_GEOJSON) { + // GeoJSON is like AS_PARTICLE_TYPE_STRING when + // reading the value and AS_PARTICLE_TYPE_INTEGER for + // adding the result to the index. + found = true; + bool added = false; + uint64_t * cells; + size_t ncells = as_bin_particle_geojson_cellids(b, &cells); + for (size_t ndx = 0; ndx < ncells; ++ndx) { + if (as_sindex_add_integer_to_sbin(sbin, cells[ndx]) == AS_SINDEX_OK) { + added = true; + } + } + if (added && sbin->num_values) { + sindex_found++; + } + } + } + } + // Else if path_length > 0 OR type == MAP or LIST + // Deserialize the bin if have not deserialized it yet. + // Extract as_val from path within the bin. + // Add the values to the sbin. + if (!found) { + if (bin_type == AS_PARTICLE_TYPE_MAP || bin_type == AS_PARTICLE_TYPE_LIST) { + if (! 
cdt_val) { + cdt_val = as_bin_particle_to_asval(b); + } + as_val * res_val = as_sindex_extract_val_from_path(imd, cdt_val); + if (!res_val) { + goto END; + } + if (as_sindex_add_asval_to_itype_sindex[imd->itype](res_val, sbin) == AS_SINDEX_OK) { + if (sbin->num_values) { + sindex_found++; + } + } + } + } +END: + *cdt_asval = cdt_val; + return sindex_found; +} + +// Returns the number of sindex found +// TODO - deprecate and conflate body with as_sindex_sbins_from_bin() below. +int +as_sindex_sbins_from_bin_buf(as_namespace *ns, const char *set, const as_bin *b, as_sindex_bin * start_sbin, + as_sindex_op op) +{ + // Check the sindex bit array. + // If there is not sindex present on this bin return 0 + // Get the simatch_ll from set_binid_hash + // If simatch_ll is NULL return 0 + // Iterate through simatch_ll + // If path_length == 0 + // If itype == AS_SINDEX_ITYPE_DEFAULT and bin_type == STRING OR INTEGER + // Add the value to the sbin. + // If itype == AS_SINDEX_ITYPE_MAP or AS_SINDEX_ITYPE_INVMAP and type = MAP + // Deserialize the bin if have not deserialized it yet. + // Extract as_val from path within the bin + // Add them to the sbin. + // If itype == AS_SINDEX_ITYPE_LIST and type = LIST + // Deserialize the bin if have not deserialized it yet. + // Extract as_val from path within the bin. + // Add the values to the sbin. + // Else if path_length > 0 and type == MAP or LIST + // Deserialize the bin if have not deserialized it yet. + // Extract as_val from path within the bin. + // Add the values to the sbin. + // Return the number of sbins found. + + int sindex_found = 0; + if (!b) { + cf_warning(AS_SINDEX, "Null Bin Passed, No sbin created"); + return sindex_found; + } + if (!ns) { + cf_warning(AS_SINDEX, "NULL Namespace Passed"); + return sindex_found; + } + if (!as_bin_inuse(b)) { + return sindex_found; + } + + // Check the sindex bit array. 
+ // If there is not sindex present on this bin return 0 + if (!as_sindex_binid_has_sindex(ns, b->id) ) { + return sindex_found; + } + + // Get the simatch_ll from set_binid_hash + cf_ll * simatch_ll = NULL; + as_sindex__simatch_list_by_set_binid(ns, set, b->id, &simatch_ll); + + // If simatch_ll is NULL return 0 + if (!simatch_ll) { + return sindex_found; + } + + // Iterate through simatch_ll + cf_ll_element * ele = cf_ll_get_head(simatch_ll); + sindex_set_binid_hash_ele * si_ele = NULL; + int simatch = -1; + as_sindex * si = NULL; + as_val * cdt_val = NULL; + int sbins_in_si = 0; + while (ele) { + si_ele = (sindex_set_binid_hash_ele *) ele; + simatch = si_ele->simatch; + si = &ns->sindex[simatch]; + if (!as_sindex_isactive(si)) { + ele = ele->next; + continue; + } + as_sindex_init_sbin(&start_sbin[sindex_found], op, as_sindex_pktype(si->imd), si); + uint64_t s_time = cf_getns(); + sbins_in_si = as_sindex_sbin_from_sindex(si, b, &start_sbin[sindex_found], &cdt_val); + if (sbins_in_si == 1) { + sindex_found += sbins_in_si; + // sbin free will happen once sbin is updated in sindex tree + SINDEX_HIST_INSERT_DATA_POINT(si, si_prep_hist, s_time); + } + else { + as_sindex_sbin_free(&start_sbin[sindex_found]); + if (sbins_in_si) { + cf_warning(AS_SINDEX, "sbins found in si is neither 1 nor 0. It is %d", sbins_in_si); + } + } + ele = ele->next; + } + + // FREE as_val + if (cdt_val) { + as_val_destroy(cdt_val); + } + // Return the number of sbin found. + return sindex_found; +} + +int +as_sindex_sbins_from_bin(as_namespace *ns, const char *set, const as_bin *b, as_sindex_bin * start_sbin, as_sindex_op op) +{ + return as_sindex_sbins_from_bin_buf(ns, set, b, start_sbin, op); +} + +/* + * returns number of sbins found. 
+ */ +int +as_sindex_sbins_from_rd(as_storage_rd *rd, uint16_t from_bin, uint16_t to_bin, as_sindex_bin sbins[], as_sindex_op op) +{ + uint16_t count = 0; + for (uint16_t i = from_bin; i < to_bin; i++) { + as_bin *b = &rd->bins[i]; + count += as_sindex_sbins_from_bin(rd->ns, as_index_get_set_name(rd->r, rd->ns), b, &sbins[count], op); + } + return count; +} + +// Needs comments +int +as_sindex_update_by_sbin(as_namespace *ns, const char *set, as_sindex_bin *start_sbin, int num_sbins, cf_digest * pkey) +{ + cf_debug(AS_SINDEX, "as_sindex_update_by_sbin"); + + // Need to address sbins which have OP as AS_SINDEX_OP_DELETE before the ones which have + // OP as AS_SINDEX_OP_INSERT. This is because same secondary index key can exist in sbins + // with different OPs + int sindex_ret = AS_SINDEX_OK; + for (int i=0; isindex in parallel. + while (count < AS_SINDEX_MAX && valid < ns->sindex_cnt) { + as_sindex *si = &ns->sindex[count]; + if (! as_sindex_put_rd(si, rd)) { + valid++; + } + count++; + } +} + +as_sindex_status +as_sindex_put_rd(as_sindex *si, as_storage_rd *rd) +{ + // Proceed only if sindex is active + SINDEX_GRLOCK(); + if (! as_sindex_isactive(si)) { + SINDEX_GRUNLOCK(); + return AS_SINDEX_ERR; + } + + as_sindex_metadata *imd = si->imd; + // Validate Set name. Other function do this check while + // performing searching for simatch. 
+ const char *setname = NULL; + if (as_index_has_set(rd->r)) { + setname = as_index_get_set_name(rd->r, si->ns); + } + + if (!as_sindex__setname_match(imd, setname)) { + SINDEX_GRUNLOCK(); + return AS_SINDEX_OK; + } + + // collect sbins + SINDEX_BINS_SETUP(sbins, 1); + + int sbins_populated = 0; + as_val * cdt_val = NULL; + + as_bin *b = as_bin_get(rd, imd->bname); + + if (!b) { + SINDEX_GRUNLOCK(); + return AS_SINDEX_OK; + } + + as_sindex_init_sbin(&sbins[sbins_populated], AS_SINDEX_OP_INSERT, + as_sindex_pktype(si->imd), si); + sbins_populated = as_sindex_sbin_from_sindex(si, b, &sbins[sbins_populated], &cdt_val); + + // Only 1 sbin should be populated here. + // If populated should be freed after sindex update + if (sbins_populated != 1) { + as_sindex_sbin_free(&sbins[sbins_populated]); + if (sbins_populated) { + cf_warning(AS_SINDEX, "Number of sbins found for 1 sindex is neither 1 nor 0. It is %d", + sbins_populated); + } + } + SINDEX_GRUNLOCK(); + + if (cdt_val) { + as_val_destroy(cdt_val); + } + + if (sbins_populated) { + as_sindex_update_by_sbin(rd->ns, setname, sbins, sbins_populated, &rd->r->keyd); + as_sindex_sbin_freeall(sbins, sbins_populated); + } + + return AS_SINDEX_OK; +} +// END - PUT RD IN SINDEX +// ************************************************************************************************ + + +// ************************************************************************************************ +// SMD CALLBACKS +/* + * +------------------+ + * client --> | Secondary Index | + * +------------------+ + * /|\ + * | 4 accept + * +----------+ 2 + * | |<------- +------------------+ 1 request + * | SMD | 3 merge | Secondary Index | <------------| + * | |<-------> | | 5 response | CLIENT + * | | 4 accept | | ------------>| + * | |--------> +------------------+ + * +----------+ + * | 4 accept + * \|/ + * +------------------+ + * client --> | Secondary Index | + * +------------------+ + * + * + * System Metadta module sits in the middle of multiple 
secondary index + * modules on multiple nodes. The changes which are eventually made to the + * secondary index are always triggered from SMD. Here is the flow. + * + * Step1: A client request (possibly from a secondary index thread) triggers a + * create / delete / update related to secondary index metadata. + * + * Step2: The request passes through the secondary index module (some + * node-specific info may be added on the way) to the SMD. + * + * Step3: SMD sends out the request to the paxos master. + * + * Step4: The paxos master requests the relevant metadata info from all the + * nodes in the cluster. Once it has all the data [SMD always + * stores a copy of the data; it is stored the first time a + * create happens], it calls the secondary index merge callback + * function. The function is responsible for resolving the winning + * version. + * + * Step5: Once the winning version is decided for all the registered modules, + * the changes are sent to all the nodes. + * + * Step6: At each node accept_fn is called for each module, which triggers + * the call to the secondary index create/delete/update functions + * that perform the in-memory operation and make it available + * to the system. + * + * There are two types of operations which look at the secondary index + * operations. + * + * a) Normal operations - they all look at the in-memory structure and + * data which is in the sindex and ai_btree layer. + * + * b) DDL operations, which work through the SMD + * layer. Multiple operations from multiple nodes + * come through this layer. Synchronizing them is the responsibility of the + * SMD layer. The sindex / ai_btree code is responsible for + * making sure that when the call from the SMD comes there is proper sync + * between it and the operations in section a. + * + */ + +// Global flag to signal that all secondary index SMD is restored. 
+static bool g_sindex_smd_restored = false; + +void +as_sindex_init_smd() +{ + int retval = as_smd_create_module(SINDEX_MODULE, + as_smd_majority_consensus_merge, NULL, + NULL, NULL, + as_sindex_smd_accept_cb, NULL, + NULL, NULL); + + cf_assert(retval == 0, AS_SINDEX, "failed to create sindex SMD module (rv %d)", retval); + + // Wait for Secondary Index SMD to be completely restored. + while (! g_sindex_smd_restored) { + usleep(1000); + } +} + +/* + * This function is called when the SMD has resolved the correct state of + * metadata. This function needs to, based on the value, looks at the current + * state of the index and trigger requests to secondary index to do the + * needful. At the start of time there is nothing in sindex and this code + * comes and setup indexes + * + * Expectation. SMD is responsible for persisting data and communicating back + * to sindex layer to create in-memory structures + * + * + * Description: To perform sindex operations(ADD,MODIFY,DELETE), through SMD + * This function called on every node, after paxos master decides + * the final version of the sindex to be created. This is the final + * version and the only allowed version in the sindex.Operations coming + * to this function are least expected to fail, ideally they should + * never fail. + * + * Parameters: + * module: SINDEX_MODULE + * as_smd_item_list_t: list of action items, to be performed on sindex. + * udata: ?? 
+ * + * Returns: + * always 0 + * + * Synchronization: + * underlying secondary index all needs to take corresponding lock and + * SMD is today single threaded no sync needed there + */ + +as_sindex_ktype +as_sindex_ktype_from_smd_char(char c) +{ + if (c == 'I') { + return COL_TYPE_LONG; + } + else if (c == 'S') { + return COL_TYPE_DIGEST; + } + else if (c == 'G') { + return COL_TYPE_GEOJSON; + } + else { + cf_warning(AS_SINDEX, "unknown smd ktype %c", c); + return COL_TYPE_INVALID; + } +} + +char +as_sindex_ktype_to_smd_char(as_sindex_ktype ktype) +{ + if (ktype == COL_TYPE_LONG) { + return 'I'; + } + else if (ktype == COL_TYPE_DIGEST) { + return 'S'; + } + else if (ktype == COL_TYPE_GEOJSON) { + return 'G'; + } + else { + cf_crash(AS_SINDEX, "unknown ktype %d", ktype); + return '?'; + } +} + +as_sindex_type +as_sindex_type_from_smd_char(char c) +{ + if (c == '.') { + return AS_SINDEX_ITYPE_DEFAULT; // or - "scalar" + } + else if (c == 'L') { + return AS_SINDEX_ITYPE_LIST; + } + else if (c == 'K') { + return AS_SINDEX_ITYPE_MAPKEYS; + } + else if (c == 'V') { + return AS_SINDEX_ITYPE_MAPVALUES; + } + else { + cf_warning(AS_SINDEX, "unknown smd type %c", c); + return AS_SINDEX_ITYPE_MAX; // since there's no named illegal value + } +} + +char +as_sindex_type_to_smd_char(as_sindex_type itype) +{ + if (itype == AS_SINDEX_ITYPE_DEFAULT) { + return '.'; + } + else if (itype == AS_SINDEX_ITYPE_LIST) { + return 'L'; + } + else if (itype == AS_SINDEX_ITYPE_MAPKEYS) { + return 'K'; + } + else if (itype == AS_SINDEX_ITYPE_MAPVALUES) { + return 'V'; + } + else { + cf_crash(AS_SINDEX, "unknown type %d", itype); + return '?'; + } +} + +#define TOK_CHAR_DELIMITER '|' + +bool +smd_key_to_imd(const char *smd_key, as_sindex_metadata *imd) +{ + // ns-name||path|itype|sktype + // Note - sktype a.k.a. ktype and dtype. + + const char *read = smd_key; + const char *tok = strchr(read, TOK_CHAR_DELIMITER); + + if (! 
tok) { + cf_warning(AS_SINDEX, "smd - namespace name missing delimiter"); + return false; + } + + uint32_t ns_name_len = tok - read; + + imd->ns_name = cf_malloc(ns_name_len + 1); + memcpy(imd->ns_name, read, ns_name_len); + imd->ns_name[ns_name_len] = 0; + + read = tok + 1; + tok = strchr(read, TOK_CHAR_DELIMITER); + + if (! tok) { + cf_warning(AS_SINDEX, "smd - set name missing delimiter"); + return false; + } + + uint32_t set_name_len = tok - read; + + if (set_name_len != 0) { + imd->set = cf_malloc(set_name_len + 1); + memcpy(imd->set, read, set_name_len); + imd->set[set_name_len] = 0; + } + // else - imd->set remains NULL. + + read = tok + 1; + tok = strchr(read, TOK_CHAR_DELIMITER); + + if (! tok) { + cf_warning(AS_SINDEX, "smd - path missing delimiter"); + return false; + } + + uint32_t path_len = tok - read; + + imd->path_str = cf_malloc(path_len + 1); + memcpy(imd->path_str, read, path_len); + imd->path_str[path_len] = 0; + + if (as_sindex_extract_bin_path(imd, imd->path_str) != AS_SINDEX_OK) { + cf_warning(AS_SINDEX, "smd - can't parse path"); + return false; + } + + read = tok + 1; + tok = strchr(read, TOK_CHAR_DELIMITER); + + if (! tok) { + cf_warning(AS_SINDEX, "smd - itype missing delimiter"); + return false; + } + + if ((imd->itype = as_sindex_type_from_smd_char(*read)) == + AS_SINDEX_ITYPE_MAX) { + cf_warning(AS_SINDEX, "smd - bad itype"); + return false; + } + + read = tok + 1; + + if ((imd->sktype = as_sindex_ktype_from_smd_char(*read)) == + COL_TYPE_INVALID) { + cf_warning(AS_SINDEX, "smd - bad sktype"); + return false; + } + + return true; +} + +void +smd_value_to_imd(const char *smd_value, as_sindex_metadata *imd) +{ + // For now, it's only index-name + imd->iname = cf_strdup(smd_value); +} + +void +as_sindex_imd_to_smd_key(const as_sindex_metadata *imd, char *smd_key) +{ + // ns-name||path|itype|sktype + // Note - sktype a.k.a. ktype and dtype. + + sprintf(smd_key, "%s|%s|%s|%c|%c", + imd->ns_name, + imd->set ? 
imd->set : "", + imd->path_str, + as_sindex_type_to_smd_char(imd->itype), + as_sindex_ktype_to_smd_char(imd->sktype)); +} + +bool +as_sindex_delete_imd_to_smd_key(as_namespace *ns, as_sindex_metadata *imd, char *smd_key) +{ + // ns-name||path|sktype| + // Note - sktype a.k.a. ktype and dtype. + + // The imd passed in doesn't have enough to make SMD key - use a full imd + // from the existing sindex, if it's there. + + // TODO - takes lock - is this ok? Flags ok? + as_sindex *si = as_sindex_lookup_by_iname(ns, imd->iname, + AS_SINDEX_LOOKUP_FLAG_NORESERVE | AS_SINDEX_LOOKUP_FLAG_ISACTIVE); + + if (! si) { + return false; + } + + as_sindex_imd_to_smd_key(si->imd, smd_key); + + return true; +} + +int +as_sindex_smd_accept_cb(char *module, as_smd_item_list_t *items, void *udata, uint32_t accept_opt) +{ + if ((accept_opt & AS_SMD_ACCEPT_OPT_CREATE) != 0) { + g_sindex_smd_restored = true; + return 0; + } + + for (int i = 0; i < (int)items->num_items; i++) { + as_smd_item_t *item = items->item[i]; + as_sindex_metadata imd; + + memset(&imd, 0, sizeof(imd)); // TODO - arrange to use { 0 } ??? + + if (! smd_key_to_imd(item->key, &imd)) { + as_sindex_imd_free(&imd); + continue; + } + + as_namespace *ns = as_namespace_get_byname(imd.ns_name); + + if (! 
ns) { + cf_detail(AS_SINDEX, "skipping invalid namespace %s", imd.ns_name); + as_sindex_imd_free(&imd); + continue; + } + + if (item->action == AS_SMD_ACTION_SET) { + smd_value_to_imd(item->value, &imd); // sets index name + as_sindex_smd_create(ns, &imd); + } + else if (item->action == AS_SMD_ACTION_DELETE) { + as_sindex_destroy(ns, &imd); + } + else { + cf_warning(AS_SINDEX, "smd accept cb - unknown action"); + } + + as_sindex_imd_free(&imd); + } + + return 0; +} +// END - SMD CALLBACKS +// ************************************************************************************************ +// ************************************************************************************************ +// SINDEX TICKER +// Sindex ticker start +void +as_sindex_ticker_start(as_namespace * ns, as_sindex * si) +{ + cf_info(AS_SINDEX, "Sindex-ticker start: ns=%s si=%s job=%s", ns->name ? ns->name : "", + si ? si->imd->iname : "", si ? "SINDEX_POPULATE" : "SINDEX_POPULATEALL"); + +} +// Sindex ticker +void +as_sindex_ticker(as_namespace * ns, as_sindex * si, uint64_t n_obj_scanned, uint64_t start_time) +{ + const uint64_t sindex_ticker_obj_count = 500000; + + if (n_obj_scanned % sindex_ticker_obj_count == 0 && n_obj_scanned != 0) { + // Ticker can be dumped from here, we'll be in this place for both + // sindex populate and populate-all. + // si memory gets set from as_sindex_reserve_data_memory() which in turn gets set from : + // ai_btree_put() <- for every single sindex insertion (boot-time/dynamic) + // as_sindex_create() : for dynamic si creation, cluster change, smd on boot-up. + + uint64_t si_memory = 0; + char * si_name = NULL; + + if (si) { + si_memory += ai_btree_get_isize(si->imd); + si_memory += ai_btree_get_nsize(si->imd); + si_name = si->imd->iname; + } + else { + si_memory = (uint64_t)cf_atomic64_get(ns->n_bytes_sindex_memory); + si_name = ""; + } + + uint64_t n_objects = cf_atomic64_get(ns->n_objects); + uint64_t pct_obj_scanned = n_objects == 0 ? 
100 : ((n_obj_scanned * 100) / n_objects); + uint64_t elapsed = (cf_getms() - start_time); + uint64_t est_time = (elapsed * n_objects)/n_obj_scanned - elapsed; + + cf_info(AS_SINDEX, " Sindex-ticker: ns=%s si=%s obj-scanned=%"PRIu64" si-mem-used=%"PRIu64"" + " progress= %"PRIu64"%% est-time=%"PRIu64" ms", + ns->name, si_name, n_obj_scanned, si_memory, pct_obj_scanned, est_time); + } +} + +// Sindex ticker end +void +as_sindex_ticker_done(as_namespace * ns, as_sindex * si, uint64_t start_time) +{ + uint64_t si_memory = 0; + char * si_name = NULL; + + if (si) { + si_memory += ai_btree_get_isize(si->imd); + si_memory += ai_btree_get_nsize(si->imd); + si_name = si->imd->iname; + } + else { + si_memory = (uint64_t)cf_atomic64_get(ns->n_bytes_sindex_memory); + si_name = ""; + } + + cf_info(AS_SINDEX, "Sindex-ticker done: ns=%s si=%s si-mem-used=%"PRIu64" elapsed=%"PRIu64" ms", + ns->name, si_name, si_memory, cf_getms() - start_time); + +} +// END - SINDEX TICKER +// ************************************************************************************************ +// ************************************************************************************************ +// INDEX KEYS ARR +// Functions are not used in this file. 
+static cf_queue *g_q_index_keys_arr = NULL; +int +as_index_keys_ll_reduce_fn(cf_ll_element *ele, void *udata) +{ + return CF_LL_REDUCE_DELETE; +} + +void +as_index_keys_ll_destroy_fn(cf_ll_element *ele) +{ + as_index_keys_ll_element * node = (as_index_keys_ll_element *) ele; + if (node) { + if (node->keys_arr) { + as_index_keys_release_arr_to_queue(node->keys_arr); + node->keys_arr = NULL; + } + cf_free(node); + } +} + +as_index_keys_arr * +as_index_get_keys_arr(void) +{ + as_index_keys_arr *keys_arr; + if (cf_queue_pop(g_q_index_keys_arr, &keys_arr, CF_QUEUE_NOWAIT) == CF_QUEUE_EMPTY) { + keys_arr = cf_malloc(sizeof(as_index_keys_arr)); + } + keys_arr->num = 0; + return keys_arr; +} + +void +as_index_keys_release_arr_to_queue(as_index_keys_arr *v) +{ + as_index_keys_arr * keys_arr = (as_index_keys_arr *)v; + if (cf_queue_sz(g_q_index_keys_arr) < AS_INDEX_KEYS_ARRAY_QUEUE_HIGHWATER) { + cf_queue_push(g_q_index_keys_arr, &keys_arr); + } + else { + cf_free(keys_arr); + } + +} +// END - INDEX KEYS ARR +// ************************************************************************************************ + +/* + * Main initialization function. 
Talks to Aerospike Index to pull up all the indexes + * and populates sindex hanging from namespace + */ +int +as_sindex_init(as_namespace *ns) +{ + ns->sindex = cf_malloc(sizeof(as_sindex) * AS_SINDEX_MAX); + + ns->sindex_cnt = 0; + for (int i = 0; i < AS_SINDEX_MAX; i++) { + as_sindex *si = &ns->sindex[i]; + memset(si, 0, sizeof(as_sindex)); + si->state = AS_SINDEX_INACTIVE; + si->stats._delete_hist = NULL; + si->stats._query_hist = NULL; + si->stats._query_batch_lookup = NULL; + si->stats._query_batch_io = NULL; + si->stats._query_rcnt_hist = NULL; + si->stats._query_diff_hist = NULL; + } + + // binid to simatch lookup + ns->sindex_set_binid_hash = cf_shash_create(cf_shash_fn_zstr, + AS_SINDEX_PROP_KEY_SIZE, sizeof(cf_ll *), AS_SINDEX_MAX, 0); + + // iname to simatch lookup + ns->sindex_iname_hash = cf_shash_create(cf_shash_fn_zstr, AS_ID_INAME_SZ, + sizeof(uint32_t), AS_SINDEX_MAX, 0); + + // Init binid_has_sindex to zero + memset(ns->binid_has_sindex, 0, sizeof(uint32_t)*AS_BINID_HAS_SINDEX_SIZE); + if (!g_q_index_keys_arr) { + g_q_index_keys_arr = cf_queue_create(sizeof(void *), true); + } + return AS_SINDEX_OK; +} + +void +as_sindex_dump(char *nsname, char *iname, char *fname, bool verbose) +{ + as_namespace *ns = as_namespace_get_byname(nsname); + as_sindex *si = as_sindex_lookup_by_iname(ns, iname, AS_SINDEX_LOOKUP_FLAG_ISACTIVE); + ai_btree_dump(si->imd, fname, verbose); + AS_SINDEX_RELEASE(si); +} diff --git a/as/src/base/security_ce.c b/as/src/base/security_ce.c new file mode 100644 index 00000000..49137dbb --- /dev/null +++ b/as/src/base/security_ce.c @@ -0,0 +1,163 @@ +/* + * security_stubs.c + * + * Copyright (C) 2014 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. 
+ * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +//========================================================== +// Includes. +// + +#include "base/security.h" +#include "base/security_config.h" + +#include +#include +#include +#include +#include +#include + +#include "citrusleaf/alloc.h" + +#include "fault.h" +#include "socket.h" + +#include "base/proto.h" +#include "base/transaction.h" + + +//========================================================== +// Public API. +// + +// Security is an enterprise feature - here, do nothing. +void +as_security_init() +{ +} + +// Security is an enterprise feature - here, allow all operations. +uint8_t +as_security_check(const as_file_handle* fd_h, as_sec_perm perm) +{ + return AS_PROTO_RESULT_OK; +} + +// Security is an enterprise feature - here, allow all operations. +bool +as_security_check_data_op(as_transaction* tr, as_namespace* ns, + as_sec_perm perm) +{ + return true; +} + +// Security is an enterprise feature - here, there's no filter. +void* +as_security_filter_create() +{ + return NULL; +} + +// Security is an enterprise feature - here, there's no filter. +void +as_security_filter_destroy(void* pv_filter) +{ +} + +// Security is an enterprise feature - here, do nothing. 
+void
+as_security_log(const as_file_handle* fd_h, uint8_t result, as_sec_perm perm,
+		const char* action, const char* detail)
+{
+}
+
+// Security is an enterprise feature - here, do nothing.
+void
+as_security_refresh(as_file_handle* fd_h)
+{
+}
+
+// Community-edition handling of a security message: discard the request and
+// answer with a field-less as_sec_msg whose result is AS_SEC_ERR_NOT_SUPPORTED.
+// The client may choose to continue using this (unsecured) socket. If the
+// send fails the connection is force-closed. Either way the transaction gives
+// up its file handle.
+void
+as_security_transact(as_transaction* tr)
+{
+	// The request itself is never examined - release it right away.
+	cf_free(tr->msgp);
+	tr->msgp = NULL;
+
+	// Response layout: one as_proto header directly followed by one as_sec_msg.
+	size_t resp_size = sizeof(as_proto) + sizeof(as_sec_msg);
+	uint8_t resp[resp_size];
+
+	as_proto* hdr = (as_proto*)resp;
+	as_sec_msg* sec = (as_sec_msg*)(resp + sizeof(as_proto));
+
+	// Header first, then convert it to network byte order.
+	hdr->version = PROTO_VERSION;
+	hdr->type = PROTO_TYPE_SECURITY;
+	hdr->sz = sizeof(as_sec_msg);
+	as_proto_swap(hdr);
+
+	// Body: everything zero except the scheme and the result code.
+	memset((void*)sec, 0, sizeof(as_sec_msg));
+	sec->scheme = AS_SEC_MSG_SCHEME;
+	sec->result = AS_SEC_ERR_NOT_SUPPORTED;
+
+	cf_socket *sock = &tr->from.proto_fd_h->sock;
+
+	if (cf_socket_send_all(sock, resp, resp_size, MSG_NOSIGNAL,
+			CF_SOCKET_TIMEOUT) >= 0) {
+		as_end_of_transaction_ok(tr->from.proto_fd_h);
+	}
+	else {
+		cf_warning(AS_SECURITY, "fd %d send failed, errno %d",
+				CSFD(sock), errno);
+		as_end_of_transaction_force_close(tr->from.proto_fd_h);
+	}
+
+	tr->from.proto_fd_h = NULL;
+}
+
+
+//==========================================================
+// Public API - security configuration.
+//
+
+// Security is an enterprise feature - here, do nothing.
+void +as_security_config_check() +{ +} + +// Security is an enterprise feature - here, do nothing. +void +as_security_config_log_scope(uint32_t sink, const char* ns_name, + const char* set_name) +{ +} diff --git a/as/src/base/signal.c b/as/src/base/signal.c new file mode 100644 index 00000000..b5eca1da --- /dev/null +++ b/as/src/base/signal.c @@ -0,0 +1,249 @@ +/* + * signal.c + * + * Copyright (C) 2010-2014 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#include +#include +#include +#include +#include +#include + +#include "fault.h" + +#include "base/xdr_serverside.h" + + +//========================================================== +// Constants. +// + +// String constants in version.c, generated by make. +extern const char aerospike_build_type[]; +extern const char aerospike_build_id[]; +extern const char aerospike_build_os[]; + + +//========================================================== +// Globals. +// + +// The mutex that the main function deadlocks on after starting the service. +extern pthread_mutex_t g_main_deadlock; +extern bool g_startup_complete; + + +//========================================================== +// Local helpers. 
+//
+
+// Install handler for sig_num. A failed signal() call goes through cf_crash();
+// a previous handler that is neither NULL nor SIG_IGN is only warned about.
+static inline void
+register_signal_handler(int sig_num, sighandler_t handler)
+{
+	sighandler_t prev = signal(sig_num, handler);
+
+	if (prev != SIG_ERR) {
+		// Occasionally we've seen the value 1 (SIG_IGN) returned, assume it's ok.
+		if (prev != NULL && prev != SIG_IGN) {
+			cf_warning(AS_AS, "found unexpected old signal handler %p for %d",
+					prev, sig_num);
+			// This should never happen, but for now, proceed anyway...
+		}
+
+		return;
+	}
+
+	cf_crash(AS_AS, "could not register signal handler for %d", sig_num);
+}
+
+// Restore the default disposition for sig_num and raise it again. If the
+// handler being replaced is not the expected one, warn and _exit(-1) instead.
+static inline void
+reraise_signal(int sig_num, sighandler_t handler)
+{
+	sighandler_t prev = signal(sig_num, SIG_DFL);
+
+	if (prev == handler) {
+		raise(sig_num);
+		return;
+	}
+
+	cf_warning(AS_AS, "could not register default signal handler for %d",
+			sig_num);
+	_exit(-1);
+}
+
+
+//==========================================================
+// Signal handlers.
+//
+
+// We get here on some crashes.
+void
+as_sig_handle_abort(int sig_num)
+{
+	cf_warning(AS_AS, "SIGABRT received, aborting %s build %s os %s",
+			aerospike_build_type, aerospike_build_id, aerospike_build_os);
+
+	xdr_sig_handler(sig_num);
+
+	PRINT_STACKTRACE();
+	reraise_signal(sig_num, as_sig_handle_abort);
+}
+
+// NOTE(review): unlike the other crash handlers, this message does not include
+// the build OS - confirm whether that is intentional.
+void
+as_sig_handle_bus(int sig_num)
+{
+	cf_warning(AS_AS, "SIGBUS received, aborting %s build %s",
+			aerospike_build_type, aerospike_build_id);
+
+	xdr_sig_handler(sig_num);
+
+	PRINT_STACKTRACE();
+	reraise_signal(sig_num, as_sig_handle_bus);
+}
+
+// Floating point exception.
+void
+as_sig_handle_fpe(int sig_num)
+{
+	cf_warning(AS_AS, "SIGFPE received, aborting %s build %s os %s",
+			aerospike_build_type, aerospike_build_id, aerospike_build_os);
+
+	xdr_sig_handler(sig_num);
+
+	PRINT_STACKTRACE();
+	reraise_signal(sig_num, as_sig_handle_fpe);
+}
+
+// This signal is our cue to roll the log.
+void
+as_sig_handle_hup(int sig_num)
+{
+	cf_info(AS_AS, "SIGHUP received, rolling log");
+
+	cf_fault_sink_logroll();
+}
+
+// We get here on some crashes.
+void
+as_sig_handle_ill(int sig_num)
+{
+	cf_warning(AS_AS, "SIGILL received, aborting %s build %s os %s",
+			aerospike_build_type, aerospike_build_id, aerospike_build_os);
+
+	// Unlike the SIGABRT/SIGSEGV handlers, xdr_sig_handler() is not called here.
+	PRINT_STACKTRACE();
+	reraise_signal(sig_num, as_sig_handle_ill);
+}
+
+// We get here on cf_crash_nostack(), cf_assert_nostack().
+// Before startup completes this exits immediately with status 1; afterwards it
+// notifies XDR and unlocks g_main_deadlock.
+void
+as_sig_handle_int(int sig_num)
+{
+	cf_warning(AS_AS, "SIGINT received, shutting down");
+
+	if (! g_startup_complete) {
+		cf_warning(AS_AS, "startup was not complete, exiting immediately");
+		_exit(1);
+	}
+
+	xdr_sig_handler(sig_num);
+
+	// g_main_deadlock is the mutex the main function blocks on after starting
+	// the service.
+	pthread_mutex_unlock(&g_main_deadlock);
+}
+
+// We get here if we intentionally trigger the signal.
+// Logs the build identity, prints a stack trace and re-raises the signal with
+// its default disposition.
+void
+as_sig_handle_quit(int sig_num)
+{
+	cf_warning(AS_AS, "SIGQUIT received, aborting %s build %s os %s",
+			aerospike_build_type, aerospike_build_id, aerospike_build_os);
+
+	PRINT_STACKTRACE();
+	reraise_signal(sig_num, as_sig_handle_quit);
+}
+
+// We get here on some crashes.
+// Notifies XDR before printing the stack trace and re-raising.
+void
+as_sig_handle_segv(int sig_num)
+{
+	cf_warning(AS_AS, "SIGSEGV received, aborting %s build %s os %s",
+			aerospike_build_type, aerospike_build_id, aerospike_build_os);
+
+	xdr_sig_handler(sig_num);
+
+	PRINT_STACKTRACE();
+	reraise_signal(sig_num, as_sig_handle_segv);
+}
+
+// We get here on normal shutdown.
+// Same flow as the SIGINT handler, except that an early exit (startup not yet
+// complete) uses status 0.
+void
+as_sig_handle_term(int sig_num)
+{
+	cf_info(AS_AS, "SIGTERM received, starting normal shutdown");
+
+	if (! g_startup_complete) {
+		cf_warning(AS_AS, "startup was not complete, exiting immediately");
+		_exit(0);
+	}
+
+	xdr_sig_handler(sig_num);
+
+	pthread_mutex_unlock(&g_main_deadlock);
+}
+
+// We get here on cf_crash() and cf_assert().
+// Prints the call stack, then re-raises as SIGABRT (not SIGUSR1), naming the
+// SIGABRT handler as the one expected to be installed.
+void
+as_sig_handle_usr1(int sig_num)
+{
+	cf_warning(AS_AS, "SIGUSR1 received, aborting %s build %s os %s",
+			aerospike_build_type, aerospike_build_id, aerospike_build_os);
+
+	xdr_sig_handler(sig_num);
+
+	PRINT_CALL_STACK(CF_INFO);
+	reraise_signal(SIGABRT, as_sig_handle_abort);
+}
+
+
+//==========================================================
+// Public API.
+//
+
+// Install the server's handlers for SIGABRT, SIGBUS, SIGFPE, SIGHUP, SIGILL,
+// SIGINT, SIGQUIT, SIGSEGV, SIGTERM and SIGUSR1, and set SIGPIPE to be ignored.
+void
+as_signal_setup()
+{
+	register_signal_handler(SIGABRT, as_sig_handle_abort);
+	register_signal_handler(SIGBUS, as_sig_handle_bus);
+	register_signal_handler(SIGFPE, as_sig_handle_fpe);
+	register_signal_handler(SIGHUP, as_sig_handle_hup);
+	register_signal_handler(SIGILL, as_sig_handle_ill);
+	register_signal_handler(SIGINT, as_sig_handle_int);
+	register_signal_handler(SIGQUIT, as_sig_handle_quit);
+	register_signal_handler(SIGSEGV, as_sig_handle_segv);
+	register_signal_handler(SIGTERM, as_sig_handle_term);
+	register_signal_handler(SIGUSR1, as_sig_handle_usr1);
+
+	// Ignore SIGPIPE (the disposition is set to SIG_IGN - the signal is not
+	// "blocked" in the signal-mask sense). A write to a broken pipe or socket
+	// then returns with a normal error which we can handle.
+	struct sigaction sigact;
+
+	memset(&sigact, 0, sizeof(sigact));
+	sigact.sa_handler = SIG_IGN;
+	sigemptyset(&sigact.sa_mask);
+	sigaddset(&sigact.sa_mask, SIGPIPE);
+
+	// Failure is only logged - setup continues.
+	if (sigaction(SIGPIPE, &sigact, NULL) != 0) {
+		cf_warning(AS_AS, "could not block the SIGPIPE signal");
+	}
+}
diff --git a/as/src/base/system_metadata.c b/as/src/base/system_metadata.c
new file mode 100644
index 00000000..2633731b
--- /dev/null
+++ b/as/src/base/system_metadata.c
@@ -0,0 +1,3471 @@
+/*
+ * system_metadata.c
+ *
+ * Copyright (C) 2012-2014 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +/* + * SYNOPSIS + * The System Metadata module provides a mechanism for synchronizing + * module metadata cluster-wide. While each module is responsible + * for the interpretation of its own metadata, the System Metadata + * module provides persistence and automatic distribution of changes + * to that opaque metadata. + */ + +#include +#include +#include + +#include "aerospike/as_hashmap.h" +#include "aerospike/as_integer.h" +#include "aerospike/as_stringmap.h" +#include "citrusleaf/cf_clock.h" +#include "citrusleaf/cf_queue.h" +#include "citrusleaf/cf_rchash.h" + +#include "msg.h" +#include "shash.h" + +#include "base/cfg.h" +#include "base/secondary_index.h" +#include "base/system_metadata.h" +#include "fabric/exchange.h" +#include "fabric/fabric.h" +#include "fabric/hb.h" +#include "jansson.h" + + +/* +** System Metadata Theory of Operation +** =================================== +** +** Overview: +** --------- +** +** The System Metadata (SMD) module provides the means for an Aerospike cluster to manage and +** automatically and consistently distribute data describing the state of any number of modules +** within each of the cluster nodes. This data is called "system metadata." System metadata +** is managed on a module-by-module basis, where each registered module has a set of zero or +** more SMD items. An SMD item has properties describing the item (module name, key, value +** generation, and modification timestamp.) The contents (value) of an SMD item is opaque to +** the SMD module itself. At creation time, modules may register policy callback functions +** to perform the actions of merging and accepting metadata updates, or else select the system +** default policy for these operations. 
+** +** Initialization: +** --------------- +** +** Prior to use, the System Metadata module must first be initialized by calling "as_smd_init()" +** to create the SMD internal data structures and launch a captive thread to process all +** incoming system metadata events. During this phase, all system metadata operations will be +** handled locally on each node. +** +** Once all server components have been initialized, SMD may be started via "as_smd_start()". +** At this point, SMD will begin handling cluster state change events and begin +** communicating with SMD in other cluster nodes via SMD fabric messages to synchronize +** system metadata cluster-wide. Fabric transactions are used to guarantee message delivery +** succeeds or fails atomically, with re-try handled automatically at fabric level. +** +** The System Metadata module may be terminated using "as_smd_shutdown()", which de-registers +** the SMD fabric message type and causes the captive thread to exit. At this point, it is +** permissible to re-initialize (and then re-start) the System Metadata module again. +** +** Life Cycle of System Metadata: +** ------------------------------ +** +** For a server component to use System Metadata, the component must first create its SMD +** module. The SMD API names modules via a name string which must be unique within the +** server. Calling "as_smd_create_module()" will create a container object in SMD to +** hold the module's metadata and register any supplied policy callback functions provided by +** the component. To release the component's SMD module, call "as_smd_destroy_module()". +** +** After a module has been created, new metadata items may be added, or existing items may +** be modified, using "as_smd_set_metadata()". Existing metadata items may be removed using +** "as_smd_delete_metadata()". 
Metadata may be searched using "as_smd_get_metadata()", which +** can return one or more items for one or more modules, depending upon the item list passed +** in, and sends the search results to a user-supplied callback function. +** +** Each module's metadata is automatically persisted via serialization (in JSON format) to a file +** upon each accepted metadata item change and also when the module is destroyed. When a module +** is created (usually at server start-up time), if an existing SMD file is found for the module, +** its contents will be loaded in as the initial values of the module's metadata. +** +** System Metadata Policy Callback Functions: +** ------------------------------------------ +** +** There are three SMD policy callback functions a module may register. If NULL is passed +** for a callback function pointer in "as_smd_create_module()", the system default policy +** will be selected for that operation. All policy callbacks are executed in the context +** of the SMD thread. +** +** The SMD policy callbacks operate as follows: +** +** 1). The Merge Callback ("as_smd_merge_cb()"): When a cluster state change occurs, +** each module's Merge callback will be executed on the SMD principal to create a new, +** unified view of each module's metadata. The system default merge policy is to simply +** form a union of all nodes' metadata items for the given module, taking the latest +** version of metadata items with duplicate keys, chosen first by highest generation +** and second by highest timestamp. +** +** 2). The Accept Callback ("as_smd_accept_cb()"): When a module's SMD item(s) are changed, +** or when a module is created and fully restored from persistence, the Accept callback +** will be invoked on every node to commit the change, with the originator of the accept +** event passed as the accept option parameter value. 
+** +** This callback will be invoked in three distinct cases: +** +** First, when a module is created and its persisted metadata (if any) has been fully +** restored, this callback will be invoked with the OPT_CREATE accept option and a +** NULL item list. This event is the proper point for synchronizing with any other +** thread(s) who depend upon the given module being fully initialized. +** +** Second, after the SMD principal has determined the merged metadata for a +** module, it will distribute the new metadata to all cluster nodes (including itself) +** for processing via the Accept callback with the OPT_MERGE accept option and an item +** list of length 0 or greater. The system default accept policy is simply to replace +** any preexisting metadata items for the module with the received metadata items. +** Modules will generally, however, define their own Accept callback to take actions +** based upon the changed metadata, such as creating secondary indexes or defining new +** User Defined Functions (UDFs.) +** +** Third, when a metadata item is set or deleted via the SMD API (or at module creation +** time, via restoration from persisted state), the Accept callback will be invoked with +** the OPT_API accept option and an item list of length 1. Note that at system start-up +** time, prior to cluster formation, the metadata change will be handled locally. Once +** a cluster has been joined, however, each metadata change event will be proxied to +** the SMD principal, who will forward it to every cluster node (including itself) +** for acceptance. +** +** 3). The Can Accept Callback ("as_smd_can_accept_cb()"): When the SMD principal +** receives a metadata change request (set or delete), it will first attempt to +** validate the request via any registered Can Accept callback. If the callback +** exists, it must return non-zero for the item to be processed. Otherwise the item +** will be rejected. 
+** +** Threading Structure: +** -------------------- +** +** The System Metadata module relies on a single, captive thread to handle all incoming SMD +** fabric messages, public SMD API operations, and to invoke modules' registered policy +** callbacks. Single-thread access means no locking of SMD data structures is necessary. +** +** The SMD thread waits on a queue for messages from either the local node (created and sent +** via the System Metadata API functions) or from other cluster nodes (via System Metadata +** fabric messages.) +** +** Initially the System Metadata module is inactive until the "as_smd_init()" function launches +** the System Metadata thread. At this point, only node-local SMD commands and events will be +** processed. When "as_smd_start()" is called, a START message will be sent telling the SMD +** thread to also begin receiving SMD events for cluster state change notifications +** and from other cluster nodes via SMD fabric messages. SMD will now perform the full +** policy callback processing as described above. The System Metadata module will be running +** until the "as_smd_shutdown()" function sends a SHUTDOWN message, upon receipt of which the +** System Metadata thread will exit cleanly. +** +** Internal Messaging Structure: +** ----------------------------- +** +** Each public SMD API function invocation corresponds to an event message being sent to the +** System Metadata thread via its message queue for processing. Internal command messages +** (those not generated by API calls) are also sent via the message queue to handle cluster +** state change events, incoming SMD fabric messages, and other internal utility functions. +** +** Each event is defined by an event type, options bits, and a metadata item (which may be +** NULL or partially populated, depending upon the command type.) +** +** The SMD command message types are: +** +** 1). 
INIT / START / SHUTDOWN: These messages correspond to the APIs controlling the +** running of the SMD subsystem itself and its captive thread. +** +** 2). CREATE_MODULE / DESTROY_MODULE: These messages create and destroy module objects +** containing metadata items. +** +** 3). SET_METADATA / DELETE_METADATA / GET_METADATA: The SMD API sends these messages to +** set, delete, and get metadata items. +** +** 4). INTERNAL: This message type is used for non-API "internal" events such as the event +** triggered by a cluster state change notification, incoming SMD fabric +** messages from other nodes, or to dump info. about the state of system metadata to +** the system log. +** +** Debugging Utilities: +** -------------------- +** +** The state of the System Metadata module can be logged using the "dump-smd:" Info command: +** +** dump-smd:[verbose={"true"|"false"}] (Default: "false".) +** +** The optional "verbose" parameter may be set to "true" to log additional detailed +** information about the system metadata, such as information about all modules' metadata items. +** +** System Metadata may be directly manipulated using the "smd:" Info command: +** +** smd:cmd=CMD[;module=MODULE;node=NODE;key=KEY;value=VALUE] +** +** where CMD is one of: {create|destroy|set|delete|get|init|start|shutdown}, and: +** - The "init", "start", and "shutdown" commands take no parameters; +** - The "create" and "destroy" commands require a "module" parameter; +** - The "set" command requires "key" and "value", the "delete" command only requires "key"; +** - The "get" command can take "module", "key" and "node" parameters, which if specified as +** empty, e.g., "module=;key=", will perform a wildcard metadata item retrieval. +** +** Open Issues: +** ------------ +** +** The SMD API currently provides no mechanism for notifying the caller whether (or when) +** the request has succeeded (or failed.) 
The challenge is that in general the asynchronous +** event may be triggered on a remote node, e.g., the SMD principal. Support for an optional +** callback for this purpose (per-module or per-API call) may be added in the future. +** +*/ + + +/* Define constants. */ + + +/* Maximum length for System Metadata persistence files. */ +#define MAX_PATH_LEN (1024) + +/* Time in milliseconds to wait for an incoming message. */ +#define AS_SMD_WAIT_INTERVAL_MS (1000) + +/* Time in milliseconds for System Metadata proxy transactions to the SMD principal. */ +#define AS_SMD_TRANSACT_TIMEOUT_MS (1000) + +#define SMD_MAX_STACK_MODULES 128 +#define SMD_MAX_STACK_NUM_ITEMS (1 << 14) + +/* Declare Private Types */ + + +/* + * Type for System Metadata command option flags. + */ +typedef enum as_smd_cmd_opt_e { + AS_SMD_CMD_OPT_NONE = 0x00, + AS_SMD_CMD_OPT_DUMP_SMD = 0x01, + AS_SMD_CMD_OPT_VERBOSE = 0x02 +} as_smd_cmd_opt_t; + +/* + * Types of API commands sent to the System Metadata module. + */ +typedef enum as_smd_cmd_type_e { + AS_SMD_CMD_INIT, // System Metadata API initialization + AS_SMD_CMD_START, // System Metadata start receiving cluster state changes + AS_SMD_CMD_CREATE_MODULE, // Metadata container creation + AS_SMD_CMD_DESTROY_MODULE, // Metadata container destruction + AS_SMD_CMD_SET_METADATA, // Add new, or modify existing, metadata item + AS_SMD_CMD_DELETE_METADATA, // Existing metadata item deletion + AS_SMD_CMD_GET_METADATA, // Get single metadata item + AS_SMD_CMD_CLUSTER_CHANGED, // Cluster state change + AS_SMD_CMD_INTERNAL, // System Metadata system internal command + AS_SMD_CMD_SHUTDOWN // System Metadata shut down +} as_smd_cmd_type_t; + +/* + * Name of the given System Metadata API command type. + */ +#define AS_SMD_CMD_TYPE_NAME(cmd) (AS_SMD_CMD_INIT == cmd ? "INIT" : \ + (AS_SMD_CMD_START == cmd ? "START" : \ + (AS_SMD_CMD_CREATE_MODULE == cmd ? "CREATE" : \ + (AS_SMD_CMD_DESTROY_MODULE == cmd ? "DESTROY" : \ + (AS_SMD_CMD_SET_METADATA == cmd ? 
"SET" : \ + (AS_SMD_CMD_DELETE_METADATA == cmd ? "DELETE" : \ + (AS_SMD_CMD_GET_METADATA == cmd ? "GET" : \ + (AS_SMD_CMD_CLUSTER_CHANGED == cmd ? "CLUSTER" : \ + (AS_SMD_CMD_INTERNAL == cmd ? "INTERNAL" : \ + (AS_SMD_CMD_SHUTDOWN == cmd ? "SHUTDOWN" : "")))))))))) + +/* + * Type for System Metadata event messages sent via the API. + */ +typedef struct as_smd_cmd_s { + as_smd_cmd_type_t type; // System Metadata command type + uint32_t options; // Bit vector of event options of type "as_smd_cmd_opt_t" + as_smd_item_t *item; // Metadata item associated with this event (only relevant fields are set) + void *a, *b, *c, *d, *e, *f, *g, *h; // Generic storage for command parameters. +} as_smd_cmd_t; + +/* + * Types of operation messages handled by the System Metadata module, received as msg events. + */ +typedef enum as_smd_msg_op_e { + AS_SMD_MSG_OP_SET_ITEM, // Add a new, or modify an existing, metadata item + AS_SMD_MSG_OP_DELETE_ITEM, // Delete an existing metadata item (must already exist) [[Deprecated]] + AS_SMD_MSG_OP_MY_CURRENT_METADATA, // Current metadata sent from a node to the principal + AS_SMD_MSG_OP_ACCEPT_THIS_METADATA, // New blessed metadata sent from the principal to a node + AS_SMD_MSG_OP_SET_FROM_PR // Accept item (OPT_API) from principal. +} as_smd_msg_op_t; + +/* + * Name of the given System Metadata message operation. + */ +#define AS_SMD_MSG_OP_NAME(op) (AS_SMD_MSG_OP_SET_ITEM == op ? "SET_ITEM" : \ + (AS_SMD_MSG_OP_DELETE_ITEM == op ? "DELETE_ITEM" : \ + (AS_SMD_MSG_OP_MY_CURRENT_METADATA == op ? "MY_CURRENT_METADATA" : \ + (AS_SMD_MSG_OP_ACCEPT_THIS_METADATA == op ? "ACCEPT_THIS_METADATA" : \ + (AS_SMD_MSG_OP_SET_FROM_PR == op ? "SET_FROM_PR" : ""))))) + +/* + * Name of the given System Metadata action. + */ +#define AS_SMD_ACTION_NAME(action) (AS_SMD_ACTION_SET == action ? "SET" : \ + (AS_SMD_ACTION_DELETE == action ? "DELETE" : "")) + + +/* Define API Command / Message Type / Callback Action Correspondence Macros. 
*/ + + +/* + * Message operation corresponding to the given API command type. + * (Default to SET_ITEM for the unknown case.) + */ +#define CMD_TYPE2MSG_OP(cmd) (AS_SMD_CMD_SET_METADATA == cmd ? AS_SMD_MSG_OP_SET_ITEM : \ + (AS_SMD_CMD_DELETE_METADATA == cmd ? AS_SMD_MSG_OP_DELETE_ITEM : AS_SMD_MSG_OP_SET_ITEM)) + +/* + * API action corresponding to the given message operation. + * (Default to SET for the unknown case.) + */ +#define MSG_OP2ACTION(op) (AS_SMD_MSG_OP_SET_ITEM == op ? AS_SMD_ACTION_SET : \ + (AS_SMD_MSG_OP_DELETE_ITEM == op ? AS_SMD_ACTION_DELETE : AS_SMD_ACTION_SET)) + +/* + * Type for System Metadata messages transmitted via the fabric. + */ +typedef struct as_smd_msg_s { + as_smd_msg_op_t op; // System Metadata operation + uint64_t cluster_key; // Sending node's cluster key + cf_node node_id; // Sending node's ID + char *module_name; // Name of the module. + uint32_t num_items; // Number of metadata items + as_smd_item_list_t *items; // List of metadata items associated with this message (only relevant fields are set) + uint32_t options; // Message options (originator) +} as_smd_msg_t; + +/* + * Types of events sent to and processed by the System Metadata thread. + */ +typedef enum as_smd_event_type_e { + AS_SMD_CMD, // SMD API command + AS_SMD_MSG, // SMD fabric message +} as_smd_event_type_t; + +/* + * Type for an event object handled by the System Metadata system. + * An event can either be an API command or a message transmitted via the fabric. + */ +typedef struct as_smd_event_s { + as_smd_event_type_t type; // Selector determining event type (command or message) + union { + as_smd_cmd_t cmd; // SMD command event sent via the SMD API + as_smd_msg_t msg; // SMD message event sent via fabric + } u; +} as_smd_event_t; + +/* + * Type for the key for items in the external metadata hash table: node_id, key_len, key (flexible array member, sized by key_len.) 
 */
typedef struct as_smd_external_item_key_s {
	cf_node node_id;              // ID of the source cluster node.
	size_t key_len;               // Length of the key string.
	char key[];                   // Flexible array member for the null-terminated key string.
} as_smd_external_item_key_t;

/*
 *  Field identifiers for System Metadata fabric messages.
 *  Each enumerator identifies one entry of "as_smd_msg_template"; the
 *  template's entry count is compile-time checked against NUM_SMD_FIELDS.
 *  NOTE(review): these values presumably appear on the wire, so enumerators
 *  should only ever be appended, never reordered or removed -- confirm.
 */
typedef enum {
	AS_SMD_MSG_TRID,
	AS_SMD_MSG_ID,
	AS_SMD_MSG_CLUSTER_KEY,
	AS_SMD_MSG_OP,
	AS_SMD_MSG_NUM_ITEMS, // deprecated
	AS_SMD_MSG_ACTION, // deprecated
	AS_SMD_MSG_MODULE, // deprecated
	AS_SMD_MSG_KEY, // deprecated
	AS_SMD_MSG_VALUE, // deprecated
	AS_SMD_MSG_GENERATION, // deprecated
	AS_SMD_MSG_TIMESTAMP,
	AS_SMD_MSG_MODULE_NAME,
	AS_SMD_MSG_OPTIONS, // deprecated

	// Packed (msgpack) lists used when a message carries multiple items.
	AS_SMD_MSG_MODULE_LIST,
	AS_SMD_MSG_MODULE_COUNTS,
	AS_SMD_MSG_KEY_LIST,
	AS_SMD_MSG_VALUE_LIST,
	AS_SMD_MSG_GEN_LIST,

	// Scalar fields used by the single-item optimized packing.
	AS_SMD_MSG_SINGLE_KEY,
	AS_SMD_MSG_SINGLE_VALUE,
	AS_SMD_MSG_SINGLE_GENERATION,
	AS_SMD_MSG_SINGLE_TIMESTAMP,

	NUM_SMD_FIELDS // Count of fields -- must remain last.
} smd_msg_fields;

// Value identifying version 2 of the System Metadata protocol.
#define AS_SMD_MSG_V2_IDENTIFIER  0x123B

/*
 *  Define the template for System Metadata messages.
 *
 *  System Metadata message structure:
 *    0). Transaction ID - UINT64 (Required for Fabric Transact.)
 *    1). System Metadata Protocol Version Identifier - (uint32_t <==> UINT32) [Only V2 for now.]
 *    2). Cluster Key - (uint64_t <==> UINT64)
 *    3). Operation - (uint32_t <==> UINT32)
 *    4). Number of items - (uint32_t <==> UINT32)
 *    5). Action[] - Array of (uint32_t <==> UINT32)
 *    6). Module[] - Array of (char * <==> STR)
 *    7). Key[] - Array of (char * <==> STR)
 *    8). Value[] - Array of (char * <==> STR)
 *    9). Generation[] - Array of (uint32_t <==> UINT32)
 *   10). Timestamp[] - Array of (uint64_t <==> UINT64)
 *   11). Module Name - (char * <==> STR)
 *   12). 
Options - (uint32_t <==> UINT32)
 *   13). Module List - MSGPACK
 *   14). Module Counts - MSGPACK
 *   15). Key List - MSGPACK
 *   16). Value List - MSGPACK
 *   17). Generation List - MSGPACK
 *   18). Single Key - (char * <==> STR)
 *   19). Single Value - (char * <==> STR)
 *   20). Single Generation - (uint32_t <==> UINT32)
 *   21). Single Timestamp - (uint64_t <==> UINT64)
 */
static const msg_template as_smd_msg_template[] = {
	{ AS_SMD_MSG_TRID, M_FT_UINT64 },              // Transaction ID for Fabric Transact
	{ AS_SMD_MSG_ID, M_FT_UINT32 },                // Version of the System Metadata protocol
	{ AS_SMD_MSG_CLUSTER_KEY, M_FT_UINT64 },       // Cluster key corresponding to msg contents
	{ AS_SMD_MSG_OP, M_FT_UINT32 },                // Metadata operation
	{ AS_SMD_MSG_NUM_ITEMS, M_FT_UINT32 },         // Number of metadata items
	{ AS_SMD_MSG_ACTION, M_FT_ARRAY_UINT32 },      // Metadata action array
	{ AS_SMD_MSG_MODULE, M_FT_ARRAY_STR },         // Metadata module array
	{ AS_SMD_MSG_KEY, M_FT_ARRAY_STR },            // Metadata key array
	{ AS_SMD_MSG_VALUE, M_FT_ARRAY_STR },          // Metadata value array
	{ AS_SMD_MSG_GENERATION, M_FT_ARRAY_UINT32 },  // Metadata generation array
	{ AS_SMD_MSG_TIMESTAMP, M_FT_ARRAY_UINT64 },   // Metadata timestamp array
	{ AS_SMD_MSG_MODULE_NAME, M_FT_STR },          // Name of module the message is from or else NULL if from all.
	{ AS_SMD_MSG_OPTIONS, M_FT_UINT32 },           // Option flags specifying the originator of the message (i.e., MERGE/API)

	// Packed lists carrying multiple items.
	{ AS_SMD_MSG_MODULE_LIST, M_FT_MSGPACK },      // Module names
	{ AS_SMD_MSG_MODULE_COUNTS, M_FT_MSGPACK },    // Number of items belonging to each listed module
	{ AS_SMD_MSG_KEY_LIST, M_FT_MSGPACK },         // Item keys
	{ AS_SMD_MSG_VALUE_LIST, M_FT_MSGPACK },       // Item values
	{ AS_SMD_MSG_GEN_LIST, M_FT_MSGPACK },         // Item generations

	// Single-item optimized packing.
	{ AS_SMD_MSG_SINGLE_KEY, M_FT_STR },
	{ AS_SMD_MSG_SINGLE_VALUE, M_FT_STR },
	{ AS_SMD_MSG_SINGLE_GENERATION, M_FT_UINT32 },
	{ AS_SMD_MSG_SINGLE_TIMESTAMP, M_FT_UINT64 },
};

// The template must supply exactly one entry per field identifier.
COMPILER_ASSERT(sizeof(as_smd_msg_template) / sizeof(msg_template) == NUM_SMD_FIELDS);

#define AS_SMD_MSG_SCRATCH_SIZE 64 // accommodate module name

/*
 *  State of operation of the System Metadata module. 
+ */ +typedef enum as_smd_state_e { + AS_SMD_STATE_IDLE, // Not initialized yet + AS_SMD_STATE_INITIALIZED, // Ready to receive API calls + AS_SMD_STATE_RUNNING, // Normal operation: Receiving cluster state changes + AS_SMD_STATE_EXITING // Shutting down +} as_smd_state_t; + +/* + * Name of the given System Metadata state. + */ +#define AS_SMD_STATE_NAME(state) (AS_SMD_STATE_IDLE == state ? "IDLE" : \ + (AS_SMD_STATE_INITIALIZED == state ? "INITIALIZED" : \ + (AS_SMD_STATE_RUNNING == state ? "RUNNING" : \ + (AS_SMD_STATE_EXITING == state ? "EXITING" : "UNKNOWN")))) + +#define SMD_PENDING_MERGE_TIMEOUT_SEC 30 + +typedef struct smd_pending_merge_s { + as_smd_msg_t m; + uint64_t expire; +} smd_pending_merge; + +/* + * Internal representation of the state of the System Metadata module. + */ +struct as_smd_s { + + // System Metadata thread ID. + pthread_t thr_id; + + // System Metadata thread attributes. + pthread_attr_t thr_attr; + + // Is the System Metadata module up and running? + as_smd_state_t state; + + // Hash table mapping module name (char *) ==> module object (as_smd_module_t *). + cf_rchash *modules; + + // Message queue for receiving System Metadata messages. + cf_queue *msgq; + + // Scoreboard of what cluster nodes the SMD principal has received metadata from: cf_node ==> cf_shash *. + cf_shash *scoreboard; + + cf_queue pending_merge_queue; // elements are (smd_pending_merge) +}; + +/* + * Type representing a module and holding all metadata for the module. + */ +typedef struct as_smd_module_s { + + // Name of this module. + char *module; + + // This module's merge metadata callback function (or NULL if none.) + as_smd_merge_cb merge_cb; + + // User data for the merge metadata callback (or NULL if none.) + void *merge_udata; + + // This module's item conflict resolution callback function (or NULL if none.) + as_smd_conflict_cb conflict_cb; + + // User data for the item conflict resolution callback (or NULL if none.) 
+ void *conflict_udata; + + // This module's accept metadata callback function (or NULL if none.) + as_smd_accept_cb accept_cb; + + // User data for the accept metadata callback (or NULL if none.) + void *accept_udata; + + // This module's user_op validation callback (or NULL if none.) + as_smd_can_accept_cb can_accept_cb; + + // User data for the user_op validation callback (or NULL if none.) + void *can_accept_udata; + + // Parsed JSON representation of the module's metadata. + json_t *json; + + // Hash table of metadata registered by this node mapping key (char *) ==> metadata item (as_smd_item_t *). + cf_rchash *my_metadata; + + // Hash table of metadata received from all external nodes mapping key (as_smd_external_item_key_t *) ==> metadata item (as_smd_item_t *). + cf_rchash *external_metadata; + + // Does the module need to be persisted? + bool dirty; +} as_smd_module_t; + + +/* Define macros. */ + + +/* + * Free and set to NULL a pointer if non-NULL. + */ +#define CF_FREE_AND_NULLIFY(ptr) \ + if (ptr) { \ + cf_free(ptr); \ + ptr = NULL; \ + } + +/* + * Free members of a metadata item if non-NULL. + */ +#define RELEASE_ITEM_MEMBERS(ptr) \ + CF_FREE_AND_NULLIFY(ptr->module_name); \ + CF_FREE_AND_NULLIFY(ptr->key); \ + CF_FREE_AND_NULLIFY(ptr->value); + + +/* Function forward references. */ + + +static int as_smd_module_persist(as_smd_module_t *module_obj); +void *as_smd_thr(void *arg); + + +/* Globals. */ + +as_smd_t *g_smd; + +static uint64_t g_cluster_key; +static uint32_t g_cluster_size; +static cf_node g_succession[AS_CLUSTER_SZ]; + +static void as_smd_destroy_event(as_smd_event_t *evt); + +/* Get SMD's principal node */ + + +static inline cf_node as_smd_principal() +{ + return g_succession[0]; +} + + +/* Internal message passing functions. */ + + +/* + * Allocate a System Metadata cmd event object to handle API commands. + * (Note: Using 0 for "node_id" is shorthand for the current node.) + * + * Release using "as_smd_destroy_event()". 
+ */ +static as_smd_event_t *as_smd_create_cmd_event(as_smd_cmd_type_t type, ...) +{ + as_smd_event_t *evt = NULL; + as_smd_item_t *item = NULL; + + // In Commands: Internal + uint32_t options = 0; + + // (Always zero.) + cf_node node_id = 0; + + // In Commands: Create / Destroy / Set / Delete / Get + char *module = NULL; + + // In Commands: Set / Delete / Get + char *key = NULL; + + // In Commands: Set + char *value = NULL; + uint32_t generation = 0; + uint64_t timestamp = 0UL; + + // In Commands: Create + as_smd_merge_cb merge_cb = NULL; + void *merge_udata = NULL; + as_smd_conflict_cb conflict_cb = NULL; + void *conflict_udata = NULL; + as_smd_accept_cb accept_cb = NULL; + void *accept_udata = NULL; + as_smd_can_accept_cb can_accept_cb = NULL; + void *can_accept_udata = NULL; + + // In Commands: Get + as_smd_get_cb get_cb = NULL; + void *get_udata = NULL; + + // In Command: Cluster-changed + uint64_t cluster_key = 0; + uint32_t cluster_size = 0; + cf_node *succession = NULL; + + // Handle variable arguments. + va_list args; + va_start(args, type); + switch (type) { + case AS_SMD_CMD_INIT: + case AS_SMD_CMD_START: + case AS_SMD_CMD_SHUTDOWN: + // (No additional arguments.) 
+ break; + + case AS_SMD_CMD_CREATE_MODULE: + module = va_arg(args, char *); + merge_cb = va_arg(args, as_smd_merge_cb); + merge_udata = va_arg(args, void *); + conflict_cb = va_arg(args, as_smd_conflict_cb); + conflict_udata = va_arg(args, void *); + accept_cb = va_arg(args, as_smd_accept_cb); + accept_udata = va_arg(args, void *); + can_accept_cb = va_arg(args, as_smd_can_accept_cb); + can_accept_udata = va_arg(args, void *); + break; + + case AS_SMD_CMD_DESTROY_MODULE: + module = va_arg(args, char *); + break; + + case AS_SMD_CMD_SET_METADATA: + module = va_arg(args, char *); + key = va_arg(args, char *); + value = va_arg(args, char *); + generation = va_arg(args, uint32_t); + timestamp = va_arg(args, uint64_t); + break; + + case AS_SMD_CMD_DELETE_METADATA: + module = va_arg(args, char *); + key = va_arg(args, char *); + break; + + case AS_SMD_CMD_GET_METADATA: + module = va_arg(args, char *); + key = va_arg(args, char *); + get_cb = va_arg(args, as_smd_get_cb); + get_udata = va_arg(args, void *); + break; + + case AS_SMD_CMD_CLUSTER_CHANGED: + cf_debug(AS_SMD, "At event creation for cluster state change"); + cluster_key = va_arg(args, uint64_t); + cluster_size = va_arg(args, uint32_t); + succession = va_arg(args, cf_node *); + break; + + case AS_SMD_CMD_INTERNAL: + options = va_arg(args, uint32_t); + break; + } + va_end(args); + + // Allocate an event object and initialize it as a command. + evt = (as_smd_event_t *) cf_calloc(1, sizeof(as_smd_event_t)); + evt->type = AS_SMD_CMD; + as_smd_cmd_t *cmd = &(evt->u.cmd); + cmd->type = type; + cmd->options = options; + + // Only events with the module specified will create a cmd containing a metadata item. + if (module) { + // Create the metadata item. + // [NB: Reference-counted for insertion in metadata "rchash" table.] + item = (as_smd_item_t *) cf_rc_alloc(sizeof(as_smd_item_t)); + memset(item, 0, sizeof(as_smd_item_t)); + + cmd->item = item; + + // Set the originating node ID. 
+ // (Note: Using 0 for "node_id" is shorthand for the current node.) + item->node_id = (!node_id ? g_config.self_node : node_id); + + item->action = MSG_OP2ACTION(CMD_TYPE2MSG_OP(type)); + + // Populate the item with duplicated metadata + // (Note: The caller is responsible for releasing any dynamically-allocated values passed in.) + + if (module) { + item->module_name = cf_strdup(module); + } + + if (key) { + item->key = cf_strdup(key); + } + + if (value) { + size_t value_len = strlen(value) + 1; + item->value = (char *) cf_malloc(value_len); + strncpy(item->value, value, value_len); + } + + item->generation = generation; + + item->timestamp = timestamp; + } + + // Store the policy callback information generically. + if (AS_SMD_CMD_CREATE_MODULE == type) { + cmd->a = merge_cb; + cmd->b = merge_udata; + cmd->c = conflict_cb; + cmd->d = conflict_udata; + cmd->e = accept_cb; + cmd->f = accept_udata; + cmd->g = can_accept_cb; + cmd->h = can_accept_udata; + } else if (AS_SMD_CMD_GET_METADATA == type) { + cmd->a = get_cb; + cmd->b = get_udata; + } else if (AS_SMD_CMD_CLUSTER_CHANGED == type) { + cmd->a = (void *)cluster_key; + cmd->b = (void *)(uint64_t)cluster_size; + cmd->c = succession; + } + + return evt; +} + +static bool +smd_msg_read_items(as_smd_msg_t *sm, const msg *m, const cf_vector *mod_vec, + const uint32_t *counts, cf_vector *key_vec, cf_vector *value_vec, + uint32_t *gen_list) +{ + if (! msg_msgpack_list_get_buf_array_presized(m, AS_SMD_MSG_KEY_LIST, + key_vec)) { + cf_warning(AS_SMD, "KEY_LIST invalid"); + return false; + } + + msg_msgpack_list_get_buf_array_presized(m, AS_SMD_MSG_VALUE_LIST, + value_vec); + + uint32_t check = sm->num_items; + + if (! 
msg_msgpack_list_get_uint32_array(m, AS_SMD_MSG_GEN_LIST, gen_list, + &check) || check != sm->num_items) { + cf_warning(AS_SMD, "GEN_LIST invalid with count %u num_items %u", check, sm->num_items); + return false; + } + + if (msg_get_uint64_array_count(m, AS_SMD_MSG_TIMESTAMP, &check) != 0 || + check != sm->num_items) { + cf_warning(AS_SMD, "TIMESTAMP invalid with count %u num_items %u", check, sm->num_items); + return false; + } + + sm->items = as_smd_item_list_create(sm->num_items); + + uint32_t msg_idx = 0; + + for (uint32_t i = 0; i < cf_vector_size(mod_vec); i++) { + const msg_buf_ele *p_mod = cf_vector_getp((cf_vector *)mod_vec, i); + + for (uint32_t j = 0; j < counts[i]; j++) { + as_smd_item_t *item = sm->items->item[msg_idx]; + + item->node_id = sm->node_id; + item->module_name = cf_strndup((const char *)p_mod->ptr, p_mod->sz); + + const msg_buf_ele *p_key = cf_vector_getp(key_vec, msg_idx); + const msg_buf_ele *p_value = (msg_idx < cf_vector_size(value_vec)) ? + cf_vector_getp(value_vec, msg_idx) : NULL; + + if (! p_key->ptr) { + cf_warning(AS_SMD, "invalid packed key at %u/%u", msg_idx, sm->num_items); + return false; + } + + item->key = cf_strndup((const char *)p_key->ptr, p_key->sz); + item->value = (p_value && p_value->ptr) ? + cf_strndup((const char *)p_value->ptr, p_value->sz) : NULL; + + item->generation = gen_list[msg_idx]; + msg_get_uint64_array(m, AS_SMD_MSG_TIMESTAMP, msg_idx, + &item->timestamp); + + item->action = item->value ? + AS_SMD_ACTION_SET : AS_SMD_ACTION_DELETE; + + msg_idx++; + } + } + + return true; +} + +// New message protocol. 
+static bool +smd_new_create_msg_event(as_smd_msg_t *sm, cf_node node_id, msg *m) +{ + uint32_t counts[SMD_MAX_STACK_MODULES]; + cf_vector_define(mod_vec, sizeof(msg_buf_ele), SMD_MAX_STACK_MODULES, 0); + + if (sm->op == AS_SMD_MSG_OP_ACCEPT_THIS_METADATA) { + sm->options = AS_SMD_ACCEPT_OPT_MERGE; + } + else if (sm->op == AS_SMD_MSG_OP_SET_FROM_PR) { + sm->op = AS_SMD_MSG_OP_ACCEPT_THIS_METADATA; + sm->options = AS_SMD_ACCEPT_OPT_API; + } + + if (sm->module_name) { + // Check single item optimized packing. + char *key; + + if (msg_get_str(m, AS_SMD_MSG_SINGLE_KEY, &key, NULL, + MSG_GET_DIRECT) == 0) { + sm->num_items = 1; + + sm->items = as_smd_item_list_create(1); + + as_smd_item_t *item = sm->items->item[0]; + + item->node_id = node_id; + item->module_name = cf_strdup(sm->module_name); + item->key = cf_strdup(key); + msg_get_str(m, AS_SMD_MSG_SINGLE_VALUE, &item->value, NULL, + MSG_GET_COPY_MALLOC); + msg_get_uint32(m, AS_SMD_MSG_SINGLE_GENERATION, &item->generation); + msg_get_uint64(m, AS_SMD_MSG_SINGLE_TIMESTAMP, &item->timestamp); + item->action = item->value ? + AS_SMD_ACTION_SET : AS_SMD_ACTION_DELETE; + + return true; + } + + if (! msg_msgpack_container_get_count(m, AS_SMD_MSG_KEY_LIST, + &sm->num_items) || sm->num_items == 0) { + sm->items = as_smd_item_list_create(0); + return true; + } + + msg_buf_ele ele = { + .sz = (uint32_t)strlen(sm->module_name), + .ptr = (uint8_t *)sm->module_name + }; + + cf_vector_append(&mod_vec, &ele); + counts[0] = sm->num_items; + } + else { + if (! msg_msgpack_container_get_count(m, AS_SMD_MSG_KEY_LIST, + &sm->num_items) || sm->num_items == 0) { + sm->items = as_smd_item_list_create(0); + return true; + } + + if (! 
msg_msgpack_list_get_buf_array_presized(m, AS_SMD_MSG_MODULE_LIST, + &mod_vec)) { + cf_warning(AS_SMD, "MODULE_LIST invalid"); + return false; + } + + if (cf_vector_size(&mod_vec) == 0) { + cf_warning(AS_SMD, "MODULE_LIST zero module names with num_items %u", sm->num_items); + return false; + } + + uint32_t check = SMD_MAX_STACK_MODULES; + + if (! msg_msgpack_list_get_uint32_array(m, AS_SMD_MSG_MODULE_COUNTS, + counts, &check) || + check != cf_vector_size(&mod_vec)) { + cf_warning(AS_SMD, "MODULE_COUNTS invalid with counts %u vector_size(mod_vec) %u", check, cf_vector_size(&mod_vec)); + return false; + } + + uint32_t total_check = 0; + + for (uint32_t i = 0; i < cf_vector_size(&mod_vec); i++) { + total_check += counts[i]; + } + + if (total_check != sm->num_items) { + cf_warning(AS_SMD, "MODULE_COUNTS total %u does not match num_items %u", total_check, sm->num_items); + return false; + } + } + + if (sm->num_items < SMD_MAX_STACK_NUM_ITEMS) { + uint32_t gen_list[sm->num_items]; + cf_vector_define(key_vec, sizeof(msg_buf_ele), sm->num_items, 0); + cf_vector_define(value_vec, sizeof(msg_buf_ele), sm->num_items, 0); + + return smd_msg_read_items(sm, m, &mod_vec, counts, &key_vec, &value_vec, + gen_list); + } + + cf_vector key_vec; + cf_vector value_vec; + uint32_t *gen_list = cf_malloc(sizeof(uint32_t) * sm->num_items); + + cf_vector_init(&key_vec, sizeof(msg_buf_ele), sm->num_items, 0); + cf_vector_init(&value_vec, sizeof(msg_buf_ele), sm->num_items, 0); + + bool ret = smd_msg_read_items(sm, m, &mod_vec, counts, &key_vec, &value_vec, + gen_list); + + cf_vector_destroy(&key_vec); + cf_vector_destroy(&value_vec); + cf_free(gen_list); + + return ret; +} + +/* + * Allocate a System Metadata msg event object to handle an incoming SMD fabric msg. + * + * Release using "as_smd_destroy_event()". 
+ */ +static as_smd_event_t * +as_smd_old_create_msg_event(as_smd_msg_op_t op, cf_node node_id, msg *msg) +{ + as_smd_event_t *evt = NULL; + int e = 0; + + // Allocate an event object and initialize it as a msg. + evt = (as_smd_event_t *) cf_calloc(1, sizeof(as_smd_event_t)); + evt->type = AS_SMD_MSG; + as_smd_msg_t *smd_msg = &(evt->u.msg); + + smd_msg->op = op; + smd_msg->node_id = node_id; + + if ((e = msg_get_uint64(msg, AS_SMD_MSG_CLUSTER_KEY, &(smd_msg->cluster_key)))) { + cf_warning(AS_SMD, "failed to get cluster key from System Metadata fabric msg (err %d)", e); + cf_free(evt); + return 0; + } + + if ((e = msg_get_str(msg, AS_SMD_MSG_MODULE_NAME, &(smd_msg->module_name), 0, MSG_GET_COPY_MALLOC))) { + cf_debug(AS_SMD, "failed to get module name from System Metadata fabric msg (err %d)", e); + } + + if (msg_get_uint32(msg, AS_SMD_MSG_NUM_ITEMS, &smd_msg->num_items) != 0) { + if (! smd_new_create_msg_event(smd_msg, node_id, msg)) { + as_smd_destroy_event(evt); + return NULL; + } + + return evt; + } + + as_smd_destroy_event(evt); + return NULL; +} + + +/* Memory release functions for object types passed to the callback functions. */ + + +/* + * Release a reference-counted metadata item. + * (Note: This is *not* a public API.) + */ +static void as_smd_item_destroy(as_smd_item_t *item) +{ + if (item) { + if (!cf_rc_release(item)) { + RELEASE_ITEM_MEMBERS(item); + cf_rc_free(item); + } + } +} + +/* + * Allocate an empty list of to contain metadata items. + * (Note: This is *not* a public API.) + */ +static as_smd_item_list_t *as_smd_item_list_alloc(size_t num_items) +{ + as_smd_item_list_t *item_list = (as_smd_item_list_t *) + cf_malloc(sizeof(as_smd_item_list_t) + num_items * sizeof(as_smd_item_t *)); + + item_list->num_items = num_items; + memset(item_list->item, 0, num_items * sizeof(as_smd_item_t *)); + + return item_list; +} + +/* + * Create an empty list of reference-counted metadata items. 
+ * (Note: This is a public API for creating merge callback function arguments.) + */ +as_smd_item_list_t *as_smd_item_list_create(size_t num_items) +{ + as_smd_item_list_t *item_list = as_smd_item_list_alloc(num_items); + + // Use num_items to count the number of successfully allocated items. + item_list->num_items = 0; + for (int i = 0; i < num_items; i++) { + item_list->item[i] = (as_smd_item_t *) cf_rc_alloc(sizeof(as_smd_item_t)); + memset(item_list->item[i], 0, sizeof(as_smd_item_t)); + item_list->num_items++; + } + + return item_list; +} + +/* + * Release a list of reference-counted metadata items. + * (Note: This is a public API for releasing merge callback function arguments.) + */ +void as_smd_item_list_destroy(as_smd_item_list_t *items) +{ + if (items) { + for (int i = 0; i < items->num_items; i++) { + as_smd_item_destroy(items->item[i]); + items->item[i] = NULL; + } + cf_free(items); + } +} + +/* + * Release a System Metadata event object (either a cmd or a msg.) + */ +static void as_smd_destroy_event(as_smd_event_t *evt) +{ + if (evt) { + if (AS_SMD_CMD == evt->type) { + as_smd_cmd_t *cmd = &(evt->u.cmd); + + // Give back the item reference if necessary. + as_smd_item_destroy(cmd->item); + cmd->item = NULL; + } else if (AS_SMD_MSG == evt->type) { + as_smd_msg_t *msg = &(evt->u.msg); + + // Release the module name. + if (msg->module_name) { + cf_free(msg->module_name); + msg->module_name = NULL; + } + + // Release the msg item list. + as_smd_item_list_destroy(msg->items); + msg->num_items = 0; + msg->items = NULL; + } else { + cf_warning(AS_SMD, "not destroying unknown type of System Metadata event (%d)", evt->type); + return; + } + + // Release the event itself. + cf_free(evt); + } else { + cf_warning(AS_SMD, "not freeing NULL System Metadata event"); + } +} + +/* + * Send an event to the System Metadata thread via the message queue. 
+ */ +static int as_smd_send_event(as_smd_t *smd, as_smd_event_t *evt) +{ + if (!smd) { + cf_warning(AS_SMD, "System Metadata is not initialized ~~ Not sending event!"); + as_smd_destroy_event(evt); + return -1; + } + + cf_queue_push(smd->msgq, &evt); + + return 0; +} + + +/* System Metadata Module Init / Start / Shutdown API */ + + +/* + * Free a module object from the modules rchash table. + */ +static void modules_rchash_destructor_fn(void *object) +{ + as_smd_module_t *module_obj = (as_smd_module_t *) object; + + cf_debug(AS_SMD, "mrdf(%p) [module \"%s\"] called!", object, module_obj->module); + + // Ensure that the module's callbacks cannot be called again. + module_obj->merge_cb = module_obj->merge_udata = NULL; + module_obj->conflict_cb = module_obj->conflict_udata = NULL; + module_obj->accept_cb = module_obj->accept_udata = NULL; + module_obj->can_accept_cb = module_obj->can_accept_udata = NULL; + + // Release the module's JSON if necessary. + json_decref(module_obj->json); + module_obj->json = NULL; + + // Free the module's name. + CF_FREE_AND_NULLIFY(module_obj->module); + + // Free both of the module's metadata hash tables. + cf_rchash_destroy(module_obj->my_metadata); + cf_rchash_destroy(module_obj->external_metadata); +} + +/* + * Free a metadata item from the metadata rchash table. + */ +static void metadata_rchash_destructor_fn(void *object) +{ + as_smd_item_t *item = (as_smd_item_t *) object; + + cf_debug(AS_SMD, "mdrdf(%p) [key \"%s\"] called!", object, item->key); + + // Free up the members of the item. + RELEASE_ITEM_MEMBERS(item); +} + +/* + * Handle a cluster state change event notification from as_exchange. 
+ */ +static void as_smd_cluster_state_changed_fn(const as_exchange_cluster_changed_event *event, void *udata) +{ + as_smd_t *smd = (as_smd_t *) udata; + + cf_debug(AS_SMD, "Received cluster state changed event!"); + + size_t succession_size = event->cluster_size * sizeof(cf_node); + cf_node *succession = cf_malloc(succession_size); + + memcpy(succession, event->succession, succession_size); + + // Send a Cluster Changed command to the System Metadata thread. + as_smd_send_event(smd, as_smd_create_cmd_event(AS_SMD_CMD_CLUSTER_CHANGED, event->cluster_key, event->cluster_size, succession)); +} + +/* + * Create and initialize a System Metadata module. (Local method for now.) + */ +static as_smd_t *as_smd_create(void) +{ + as_smd_t *smd = (as_smd_t *) cf_calloc(1, sizeof(as_smd_t)); + + // Go to the not yet initialized state. + smd->state = AS_SMD_STATE_IDLE; + + // Create the System Metadata modules hash table. + cf_rchash_create(&(smd->modules), cf_rchash_fn_fnv32, modules_rchash_destructor_fn, 0, 127, CF_RCHASH_BIG_LOCK); + + // Create the scoreboard hash table. + smd->scoreboard = cf_shash_create(cf_shash_fn_ptr, sizeof(cf_node), sizeof(cf_shash *), 127, CF_SHASH_BIG_LOCK); + + // Create the System Metadata message queue. + smd->msgq = cf_queue_create(sizeof(as_smd_event_t *), true); + + cf_queue_init(&smd->pending_merge_queue, sizeof(smd_pending_merge), 128, false); + + // Create the System Metadata thread. + + if (pthread_attr_init(&(smd->thr_attr))) { + cf_crash(AS_SMD, "failed to initialize the System Metadata thread attributes"); + } + + if (pthread_create(&(smd->thr_id), &(smd->thr_attr), as_smd_thr, smd)) { + cf_crash(AS_SMD, "failed to create the System Metadata thread"); + } + + // Send an INIT message to the System Metadata thread. 
+ if (as_smd_send_event(smd, as_smd_create_cmd_event(AS_SMD_CMD_INIT))) { + cf_crash(AS_SMD, "failed to send INIT message to System Metadata thread"); + } + + return smd; +} + +/* + * Initialize the single global System Metadata module. + */ +as_smd_t *as_smd_init(void) +{ + // This is here only because we happen to use the absence of the old + // sindex SMD files as proof of a proper live jump from v3 to v5. We'll + // need to keep this around for a long time - perhaps move it to a + // better place when SMD is overhauled. + + char smd_path[MAX_PATH_LEN]; + char smd_save_path[MAX_PATH_LEN]; + + snprintf(smd_path, MAX_PATH_LEN, "%s/smd/%s.smd", g_config.work_directory, OLD_SINDEX_MODULE); + snprintf(smd_save_path, MAX_PATH_LEN, "%s.save", smd_path); + + struct stat buf; + bool both_gone = + stat(smd_path, &buf) != 0 && errno == ENOENT && + stat(smd_save_path, &buf) != 0 && errno == ENOENT; + + if (! both_gone) { + cf_crash_nostack(AS_SMD, + "Aerospike server was not properly switched to paxos-protocol v5 - " + "see Aerospike documentation http://www.aerospike.com/docs/operations/upgrade/cluster_to_3_13"); + } + + if (! g_smd) { + g_smd = as_smd_create(); + } else { + cf_warning(AS_SMD, "System Metadata is already initialized"); + } + + return g_smd; +} + +/* + * Convert an incoming fabric message into the corresponding msg event and post it to the System Metadata message queue. + */ +static int as_smd_msgq_push(cf_node node_id, msg *msg, void *udata) +{ + as_smd_t *smd = (as_smd_t *) udata; + + cf_debug(AS_SMD, "asmp(): Receiving a System Metadata message from node %016lX", node_id); + + // Make sure System Metadata is running before processing msg. + if (smd && smd->state != AS_SMD_STATE_RUNNING) { + cf_warning(AS_SMD, "System Metadata not initialized ~~ Ignoring incoming fabric msg!"); + return -1; + } + + // Verify the System Metadata fabric protocol version. 
+ uint32_t version; + int e = msg_get_uint32(msg, AS_SMD_MSG_ID, &version); + if (0 > e) { + cf_warning(AS_SMD, "failed to get protocol version from System Metadata fabric msg"); + return -1; + } else if (AS_SMD_MSG_V2_IDENTIFIER != version) { + cf_warning(AS_SMD, "received System Metadata fabric msg for unknown protocol version (read: %d ; expected: %d) ~~ Ignoring message!", + version, AS_SMD_MSG_V2_IDENTIFIER); + return -1; + } + + // Extract the operation from the incoming fabric msg. + uint32_t op = 0; + msg_get_uint32(msg, AS_SMD_MSG_OP, &op); + + cf_debug(AS_SMD, "Operation received %s", AS_SMD_MSG_OP_NAME(op)); + + // Create a System Metadata msg event object and populate it from the fabric msg. + as_smd_event_t *evt = as_smd_old_create_msg_event(op, node_id, msg); + + cf_assert(evt, AS_SMD, "failed to create a System Metadata msg event"); + + // Send the msg event to the System Metadata thread. + return as_smd_send_event(smd, evt); +} + +/* + * Receiver function for System Metadata fabric transactions. + */ +static int as_smd_transact_recv_fn(cf_node node_id, msg *msg, void *transact_data, void *udata) +{ + as_smd_t *smd = (as_smd_t *) udata; + int retval = 0; + + cf_debug(AS_SMD, "astrf(): node %016lX (%s) received SMD transaction from node %016lX (%s)", + g_config.self_node, (as_smd_principal() == g_config.self_node ? "SMD principal" : "regular node"), + node_id, (as_smd_principal() == node_id ? "SMD principal" : "regular node")); + + // Send the received msg to the System Metadata thread. + if ((retval = as_smd_msgq_push(node_id, msg, smd))) { + cf_warning(AS_SMD, "failed to push received transact msg (retval %d)", retval); + } + + // Complete the transaction by replying to the received msg. + msg_reset(msg); + as_fabric_transact_reply(msg, transact_data); + + return retval; +} + +/* + * Start the System Metadata module to begin receiving cluster state change events. 
+ */ +int as_smd_start(as_smd_t *smd) +{ + // Register System Metadata fabric transact message type. + if (as_fabric_transact_register(M_TYPE_SMD, as_smd_msg_template, + sizeof(as_smd_msg_template), AS_SMD_MSG_SCRATCH_SIZE, + as_smd_transact_recv_fn, smd)) { + cf_crash(AS_SMD, "Failed to register System Metadata fabric transact msg type!"); + } + + // Register to receive cluster state changed events. + as_exchange_register_listener(as_smd_cluster_state_changed_fn, (void *)smd); + + // Send a START message to the System Metadata thread. + int retval = 0; + if ((retval = as_smd_send_event(smd, as_smd_create_cmd_event(AS_SMD_CMD_START)))) { + cf_crash(AS_SMD, "failed to send START message to System Metadata thread"); + } + + return retval; +} + +/* + * Terminate the System Metadata module. + */ +int as_smd_shutdown(as_smd_t *smd) +{ + // Send a SHUTDOWN message to the System Metadata thread. + return as_smd_send_event(smd, as_smd_create_cmd_event(AS_SMD_CMD_SHUTDOWN)); +} + + +/* + * Public System Metadata Manipulation API Functions: + * These functions are executed in the context of a module using System Metadata. + */ + + +/* + * Create a container for the named module's metadata and register the policy callback functions. + * (Pass a NULL callback function pointer to select the default policy.) + */ +int as_smd_create_module(char *module, + as_smd_merge_cb merge_cb, void *merge_udata, + as_smd_conflict_cb conflict_cb, void *conflict_udata, + as_smd_accept_cb accept_cb, void *accept_udata, + as_smd_can_accept_cb can_accept_cb, void *can_accept_udata) +{ + // Send a CREATE command to the System Metadata thread. + return as_smd_send_event(g_smd, as_smd_create_cmd_event(AS_SMD_CMD_CREATE_MODULE, module, + merge_cb, merge_udata, conflict_cb, conflict_udata, + accept_cb, accept_udata, can_accept_cb, can_accept_udata)); +} + +/* + * Destroy the container for the named module's metadata, releasing all of its metadata. 
+ */ +int as_smd_destroy_module(char *module) +{ + // Send a DESTROY command to the System Metadata thread. + return as_smd_send_event(g_smd, as_smd_create_cmd_event(AS_SMD_CMD_DESTROY_MODULE, module)); +} + +/* + * Add a new, or modify an existing, metadata item in an existing module. + */ +int as_smd_set_metadata(char *module, char *key, char *value) +{ + // Send an SET command to the System Metadata thread. + return as_smd_send_event(g_smd, as_smd_create_cmd_event(AS_SMD_CMD_SET_METADATA, module, key, value, 0, 0UL)); +} + +/* + * Add a new, or modify an existing, metadata item (with generation and timestamp) in an existing module. + * (Note: This is an internal-only function, not available via the public SMD API.) + */ +int as_smd_set_metadata_gen_ts(char *module, char *key, char *value, uint32_t generation, uint64_t timestamp) +{ + // Send an SET command to the System Metadata thread. + return as_smd_send_event(g_smd, as_smd_create_cmd_event(AS_SMD_CMD_SET_METADATA, module, key, value, generation, timestamp)); +} + +/* + * Delete an existing metadata item from an existing module. + */ +int as_smd_delete_metadata(char *module, char *key) +{ + // Send a DELETE command to the System Metadata thread. + return as_smd_send_event(g_smd, as_smd_create_cmd_event(AS_SMD_CMD_DELETE_METADATA, module, key)); +} + +/* + * Retrieve metadata item(s.) (Pass NULL for module and/or key for "all".) + */ +int as_smd_get_metadata(char *module, char *key, as_smd_get_cb cb, void *udata) +{ + // Send a GET command to the System Metadata thread. + return as_smd_send_event(g_smd, as_smd_create_cmd_event(AS_SMD_CMD_GET_METADATA, module, key, cb, udata)); +} + + +/* + * Info Command Functions: + * These functions are executed in the context of the Info system. + */ + + +/* + * Reduce function to print a single metadata item. + */ +static int as_smd_metadata_reduce_fn(const void *key, uint32_t keylen, void *object, void *udata) +{ +// char *smd_key = (char *) key; // (Not used.) 
+ as_smd_item_t *item = (as_smd_item_t *) object; + + cf_info(AS_SMD, "%016lX\t\"%s\"\t\"%s\"\t\"%s\"\t%u\t\t%lu", item->node_id, item->module_name, item->key, item->value, item->generation, item->timestamp); + + return 0; +} + +/* + * Reduce function to print info. about a single System Metadata module. + */ +static int as_smd_dump_reduce_fn(const void *key, uint32_t keylen, void *object, void *udata) +{ + const char *module = (const char *) key; + as_smd_module_t *module_obj = (as_smd_module_t *) object; + int *module_num = (int *) udata; + int num_items = 0; + + cf_info(AS_SMD, "Module %d: \"%s\" [\"%s\"]: ", *module_num++, module, module_obj->module); + cf_info(AS_SMD, "merge cb: %p", module_obj->merge_cb); + cf_info(AS_SMD, "merge udata: %p", module_obj->merge_udata); + cf_info(AS_SMD, "conflict cb: %p", module_obj->conflict_cb); + cf_info(AS_SMD, "conflict udata: %p", module_obj->conflict_udata); + cf_info(AS_SMD, "accept cb: %p", module_obj->accept_cb); + cf_info(AS_SMD, "accept udata: %p", module_obj->accept_udata); + cf_info(AS_SMD, "can accept cb: %p", module_obj->can_accept_cb); + cf_info(AS_SMD, "can accept udata: %p", module_obj->can_accept_udata); + + cf_info(AS_SMD, "My Metadata:"); + cf_info(AS_SMD, "number of metadata items: %d", num_items = cf_rchash_get_size(module_obj->my_metadata)); + if (num_items) { + cf_info(AS_SMD, "Node ID\t\tModule\tKey\tValue\t\tGeneration\tTimestamp"); + cf_rchash_reduce(module_obj->my_metadata, as_smd_metadata_reduce_fn, NULL); + } + + cf_info(AS_SMD, "External Metadata:"); + cf_info(AS_SMD, "number of metadata items: %d", num_items = cf_rchash_get_size(module_obj->external_metadata)); + if (num_items) { + cf_info(AS_SMD, "Node ID\t\tModule\tKey\tValue\t\tGeneration\tTimestamp"); + cf_rchash_reduce(module_obj->external_metadata, as_smd_metadata_reduce_fn, NULL); + } + + return 0; +} + +/* + * Print info. about the System Metadata state to the log. + * (Verbose event option prints detailed info. 
about the metadata values.) + */ +void as_smd_dump_metadata(as_smd_t *smd, as_smd_cmd_t *cmd) +{ + // Print info. about the System Metadata system. + cf_info(AS_SMD, "System Metadata Status:"); + cf_info(AS_SMD, "-----------------------"); + cf_info(AS_SMD, "thr_id: 0x%lx", smd->thr_id); + cf_info(AS_SMD, "thr_attr: %p", &smd->thr_attr); + cf_info(AS_SMD, "state: %s", AS_SMD_STATE_NAME(smd->state)); + cf_info(AS_SMD, "number of modules: %d", cf_rchash_get_size(smd->modules)); + cf_info(AS_SMD, "number of pending messages in queue: %d", cf_queue_sz(smd->msgq)); + + // If verbose, dump info. about the metadata itself. + if (cmd->options & AS_SMD_CMD_OPT_VERBOSE) { + int module_num = 0; + cf_rchash_reduce(smd->modules, as_smd_dump_reduce_fn, &module_num); + } +} + +/* + * Print info. about the System Metadata state to the log. + * (Verbose true prints detailed info. about the metadata values.) + */ +void as_smd_dump(bool verbose) +{ + // Send an INTERNAL + DUMP_SMD + verbosity command to the System Metadata thread. + as_smd_send_event(g_smd, as_smd_create_cmd_event(AS_SMD_CMD_INTERNAL, + (AS_SMD_CMD_OPT_DUMP_SMD | (verbose ? AS_SMD_CMD_OPT_VERBOSE : 0)))); +} + +/* + * Callback used to receive System Metadata items requested via the Info SMD "get" command. + */ +static int as_smd_info_get_fn(char *module, as_smd_item_list_t *items, void *udata) +{ + for (int i = 0; i < items->num_items; i++) { + as_smd_item_t *item = items->item[i]; + cf_info(AS_SMD, "SMD Info get metadata item[%d]: module \"%s\" ; key \"%s\" ; value \"%s\" ; generation %u ; timestamp %lu", + i, item->module_name, item->key, item->value, item->generation, item->timestamp); + } + + return 0; +} + +/* + * Manipulate the System Metadata and log the result. + */ +void as_smd_info_cmd(char *cmd, cf_node node_id, char *module, char *key, char *value) +{ + int retval = 0; + + // Invoke the appropriate System Metadata API function. 
+ + if (!strcmp(cmd, "create")) { + if ((retval = as_smd_create_module(module, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL))) { + cf_warning(AS_SMD, "System Metadata create module \"%s\" failed (retval %d)", module, retval); + } + } else if (!strcmp(cmd, "destroy")) { + if ((retval = as_smd_destroy_module(module))) { + cf_warning(AS_SMD, "System Metadata destroy module \"%s\" failed (retval %d)", module, retval); + } + } else if (!strcmp(cmd, "set")) { + if (((retval = as_smd_set_metadata(module, key, value)))) { + cf_warning(AS_SMD, "System Metadata set item: module: \"%s\" key: \"%s\" value: \"%s\" failed (retval %d)", module, key, value, retval); + } + } else if (!strcmp(cmd, "delete")) { + if (((retval = as_smd_delete_metadata(module, key)))) { + cf_warning(AS_SMD, "System Metadata delete item: module: \"%s\" key: \"%s\" failed (retval %d)", module, key, retval); + } + } else if (!strcmp(cmd, "get")) { + if ((retval = as_smd_get_metadata(module, key, as_smd_info_get_fn, NULL))) { + cf_warning(AS_SMD, "System Metadata get node: %016lX module: \"%s\" key: \"%s\" failed (retval %d)", node_id, module, key, retval); + } + } else if (!strcmp(cmd, "init")) { + as_smd_init(); + } else if (!strcmp(cmd, "start")) { + if (g_smd) { + if ((retval = as_smd_start(g_smd))) { + cf_warning(AS_SMD, "System Metadata start up failed (retval %d)", retval); + } + } else { + cf_warning(AS_SMD, "System Metadata is not initialized"); + } + } else if (!strcmp(cmd, "shutdown")) { + if (g_smd) { + as_smd_shutdown(g_smd); + } else { + cf_warning(AS_SMD, "System Metadata is not initialized"); + } + } else { + cf_warning(AS_SMD, "unknown System Metadata command: \"%s\"", cmd); + } +} + + +/* + * System Metadata Internals: + * These functions are executed in the context of the System Metadata thread, + * except for the fabric callbacks. + */ + + +/* Metadata persistence functions. */ + + +/* + * Read in metadata for the given module from the standard location. 
+ * Return: 0 if successful, -1 otherwise. + */ +static int as_smd_read(char *module, json_t **module_smd) +{ + int retval = 0; + json_t *root = NULL; + + char smd_path[MAX_PATH_LEN]; + size_t load_flags = JSON_REJECT_DUPLICATES; + json_error_t json_error; + + snprintf(smd_path, MAX_PATH_LEN, "%s/smd/%s.smd", g_config.work_directory, module); + + // Check if the persisted metadata file exists before attempting to read it. + struct stat buf; + if (!stat(smd_path, &buf)) { + if (!(root = json_load_file(smd_path, load_flags, &json_error))) { + cf_warning(AS_SMD, "failed to load System Metadata for module \"%s\" from file \"%s\" with JSON error: %s ; source: %s ; line: %d ; column: %d ; position: %d", + module, smd_path, json_error.text, json_error.source, json_error.line, json_error.column, json_error.position); + retval = -1; + } + } else { + cf_debug(AS_SMD, "failed to read persisted System Metadata file \"%s\" for module \"%s\": %s (%d)", smd_path, module, cf_strerror(errno), errno); + } + + if (module_smd) { + *module_smd = root; + } + + return retval; +} + +/* + * Write out metadata for the given module to the the standard location. + * Return: 0 if successful, -1 otherwise. + * + * Note: Any pre-existing file will be saved prior to write for + * manual recovery in case of system failure. 
+ */ +static int as_smd_write(char *module, json_t *module_smd) +{ + int retval = 0; + + char smd_path[MAX_PATH_LEN]; + char smd_save_path[MAX_PATH_LEN]; + size_t dump_flags = JSON_INDENT(3) | JSON_ENSURE_ASCII | JSON_PRESERVE_ORDER; + + snprintf(smd_path, MAX_PATH_LEN, "%s/smd/%s.smd", g_config.work_directory, module); + snprintf(smd_save_path, MAX_PATH_LEN, "%s.save", smd_path); + + if (json_dump_file(module_smd, smd_save_path, dump_flags) < 0) { + cf_warning(AS_SMD, "failed to dump System Metadata for module \"%s\" to file \"%s\": %s (%d)", module, smd_path, cf_strerror(errno), errno); + return -1; + } + + if (rename(smd_save_path, smd_path) != 0) { + cf_warning(AS_SMD, "error on renaming existing metadata file \"%s\": %s (%d)", smd_save_path, cf_strerror(errno), errno); + return -1; + } + + return retval; +} + +/* + * Load persisted System Metadata for the given module: + * Read the module's JSON file (if it exists) and add each metadata found therein. + * Return: The number of metadata items restored (which may be 0) if reading + * the metadata file was successful, -1 otherwise. + */ +static int as_smd_module_restore(as_smd_module_t *module_obj) +{ + int retval = 0; + + // Load the module's metadata (if persisted.) + if ((retval = as_smd_read(module_obj->module, &(module_obj->json)))) { + cf_warning(AS_SMD, "failed to read persisted System Metadata for module \"%s\"", module_obj->module); + return -1; + } + + size_t num_items = json_array_size(module_obj->json); + for (int i = 0; i < num_items; i++) { + json_t *json_item = json_array_get(module_obj->json, i); + + if (!json_is_object(json_item)) { + // Warn and skip the bad item. + cf_warning(AS_SMD, "non-JSON object %d of type %d in persisted System Metadata for module \"%s\" ~~ Skipping!", i, json_typeof(json_item), module_obj->module); + continue; + } + + size_t num_fields = json_object_size(json_item); + if (5 != num_fields) { + // Warn if the item doesn't have the right number of fields. 
+ cf_warning(AS_SMD, "wrong number of fields %zu (expected 5) for object %d in persisted System Metadata for module \"%s\"", num_fields, i, module_obj->module); + } + + char *module = (char *) json_string_value(json_object_get(json_item, "module")); + if (!module) { + cf_warning(AS_SMD, "missing \"module\" for object %d in persisted System Metadata for module \"%s\" ~~ Skipping!", i, module_obj->module); + continue; + } else if (strcmp(module_obj->module, module)) { + cf_warning(AS_SMD, "incorrect module \"%s\" for object %d in persisted System Metadata for module \"%s\" ~~ Skipping!", module, i, module_obj->module); + continue; + } + + char *key = (char *) json_string_value(json_object_get(json_item, "key")); + if (!key) { + cf_warning(AS_SMD, "missing \"key\" for object %d in persisted System Metadata for module \"%s\" ~~ Skipping!", i, module_obj->module); + continue; + } + + char *value = (char *) json_string_value(json_object_get(json_item, "value")); + if (!value) { + cf_warning(AS_SMD, "missing \"value\" for object %d in persisted System Metadata for module \"%s\" ~~ Skipping!", i, module_obj->module); + continue; + } + + // [Note: Should really use uint32_t, but Jansson integers are longs.] 
+ uint64_t generation = 1; + json_t *generation_obj = json_object_get(json_item, "generation"); + if (!generation_obj) { + cf_warning(AS_SMD, "missing \"generation\" for object %d in persisted System Metadata for module \"%s\" ~~ Using 1!", i, module_obj->module); + } else { + if (0 == (generation = json_integer_value(generation_obj))) { + cf_warning(AS_SMD, "bad \"generation\" for object %d in persisted System Metadata for module \"%s\" ~~ Using 1!", i, module_obj->module); + generation = 1; + } + } + + uint64_t timestamp = cf_getms(); + json_t *timestamp_obj = json_object_get(json_item, "timestamp"); + if (!timestamp_obj) { + cf_warning(AS_SMD, "missing \"timestamp\" for object %d in persisted System Metadata for module \"%s\" ~~ Using now!", i, module_obj->module); + } else { + if (0 == (timestamp = json_integer_value(timestamp_obj))) { + cf_warning(AS_SMD, "bad \"timestamp\" for object %d in persisted System Metadata for module \"%s\" ~~ Using now!", i, module_obj->module); + timestamp = cf_getms(); + } + } + + // Send the item metadata add command. + as_smd_set_metadata_gen_ts(module, key, value, generation, timestamp); + + // Another metadata item was successfully restored. + retval++; + } + + // Release the module's JSON if necessary. + json_decref(module_obj->json); + module_obj->json = NULL; + + return retval; +} + +/* + * Serialize a single metadata item into a JSON object and add it to the array passed in via "udata". + */ +static int as_smd_serialize_into_json_reduce_fn(const void *key, uint32_t keylen, void *object, void *udata) +{ +// char *smd_key = (char *) key; // (Not used.) 
+ as_smd_item_t *item = (as_smd_item_t *) object; + json_t *array = (json_t *) udata; + json_t *metadata_obj = NULL; + + // Create an empty JSON object to hold the + if (!(metadata_obj = json_object())) { + cf_warning(AS_SMD, "failed to create JSON object to serialize metadata item: module \"%s\" ; key \"%s\"", item->module_name, item->key); + return 0; + } + + // Add each of the item's properties to the JSON object. + int e = 0; + e += json_object_set_new(metadata_obj, "module", json_string(item->module_name)); + e += json_object_set_new(metadata_obj, "key", json_string(item->key)); + e += json_object_set_new(metadata_obj, "value", json_string(item->value)); + e += json_object_set_new(metadata_obj, "generation", json_integer(item->generation)); + e += json_object_set_new(metadata_obj, "timestamp", json_integer(item->timestamp)); + + if (e) { + cf_warning(AS_SMD, "failed to serialize fields of metadata item: module \"%s\" ; key \"%s\"", item->module_name, item->key); + } else { + if (json_array_append_new(array, metadata_obj)) { + cf_warning(AS_SMD, "failed to add to array metadata item: module \"%s\" ; key \"%s\"", item->module_name, item->key); + } + } + + return 0; +} + +/* + * Store persistently System Metadata for the given module: + * Convert each of the module's metadata items into a JSON object and write an array of the results to the module's JSON file. + */ +static int as_smd_module_persist(as_smd_module_t *module_obj) +{ + int retval = 0; + + // Avoid unnecessary writes. + if (!module_obj->dirty) { + return retval; + } + + if (module_obj->json) { + cf_warning(AS_SMD, "module \"%s\" JSON is unexpectedly non-NULL (rc %zu) ~~ Nulling!", module_obj->module, module_obj->json->refcount); + json_decref(module_obj->json); + module_obj->json = NULL; + } + + // Create an empty JSON array. 
+ if (!(module_obj->json = json_array())) { + cf_warning(AS_SMD, "failed to create JSON array for persisting module \"%s\"", module_obj->module); + return -1; + } + + // Walk the module's metadata hash table and create a JSON array of objects, one for each item. + cf_rchash_reduce(module_obj->my_metadata, as_smd_serialize_into_json_reduce_fn, module_obj->json); + + // Store the module's metadata persistently if necessary. + if (module_obj->json && (retval = as_smd_write(module_obj->module, module_obj->json))) { + cf_warning(AS_SMD, "failed to write persisted System Metadata file for module \"%s\"", module_obj->module); + retval = -1; + } else { + // The module's SMD has been persisted. + module_obj->dirty = false; + } + + // Release the module's JSON if necessary. + json_decref(module_obj->json); + module_obj->json = NULL; + + return retval; +} + +/* + * Create a metadata container for the given module. + */ +static int as_smd_module_create(as_smd_t *smd, as_smd_cmd_t *cmd) +{ + as_smd_item_t *item = cmd->item; + as_smd_module_t *module_obj; + int retval = 0; + + cf_debug(AS_SMD, "System Metadata thread - creating module \"%s\"", item->module_name); + + // Verify the module does not yet exist. + if (CF_RCHASH_OK == (retval = cf_rchash_get(smd->modules, item->module_name, strlen(item->module_name) + 1, (void **) &module_obj))) { + // (Note: This is not a problem ~~ May have come over the wire.) + cf_detail(AS_SMD, "System Metadata module \"%s\" already exists", item->module_name); + + // Give back the reference. + cf_rc_release(module_obj); + + return retval; + } + + // Create the module object. + // [NB: Reference-counted for insertion in modules "rchash" table.] + module_obj = (as_smd_module_t *) cf_rc_alloc(sizeof(as_smd_module_t)); + memset(module_obj, 0, sizeof(as_smd_module_t)); + + // Set the module's name. + module_obj->module = cf_strdup(item->module_name); + + // Create the module's local metadata hash table. 
+ cf_rchash_create(&(module_obj->my_metadata), cf_rchash_fn_fnv32, metadata_rchash_destructor_fn, 0, 127, CF_RCHASH_BIG_LOCK); + + // Create the module's external metadata hash table. + cf_rchash_create(&(module_obj->external_metadata), cf_rchash_fn_fnv32, metadata_rchash_destructor_fn, 0, 127, CF_RCHASH_BIG_LOCK); + + // Add the module to the modules hash table. + if (CF_RCHASH_OK != (retval = cf_rchash_put_unique(smd->modules, item->module_name, strlen(item->module_name) + 1, module_obj))) { + cf_crash(AS_SMD, "failed to add System Metadata module \"%s\" to modules table (retval %d)", item->module_name, retval); + } + + // Set the callback functions and their respective user data. + module_obj->merge_cb = cmd->a; + module_obj->merge_udata = cmd->b; + module_obj->conflict_cb = cmd->c; + module_obj->conflict_udata = cmd->d; + module_obj->accept_cb = cmd->e; + module_obj->accept_udata = cmd->f; + module_obj->can_accept_cb = cmd->g; + module_obj->can_accept_udata = cmd->h; + + int num_items = as_smd_module_restore(module_obj); + if (0 > num_items) { + cf_warning(AS_SMD, "failed to restore persisted System Metadata for module \"%s\"", item->module_name); + } + + // Set an empty metadata item, signifying the completion of module creation, + // including the restoration of zero or more persisted metadata items. + // (Will trigger an Accept callback with the OPT_CREATE accept option.) + if ((retval = as_smd_set_metadata(module_obj->module, NULL, NULL))) { + cf_warning(AS_SMD, "failed to send SMD module \"%s\" creation complete event", module_obj->module); + } + + return retval; +} + +/* + * Find or create a System Metadata module object. + * The name if the module can be at two places + * 1. With each item + * 2. At the item_list level + * + * First preference is to get the information from the specific item + * If the item is NULL, get the information from the item_list. 
+ */ +static as_smd_module_t * +as_smd_module_get(as_smd_t *smd, as_smd_item_t *item, as_smd_msg_t *msg) +{ + as_smd_module_t *module_obj = NULL; + int retval = 0; + + char *module_name = NULL; + + // First check for a given message with the module name set. + if (msg && msg->module_name) { + cf_debug(AS_SMD, "asmg(): Name of module from message: \"%s\"", module_name); + module_name = msg->module_name; + } + else if (item && item->module_name) { + // Next, see if an item is passed and it has module name set. This takes precedence. + module_name = item->module_name; + cf_debug(AS_SMD, "asmg(): Name of module from the item: \"%s\"", module_name); + } + else { + // If the message, item, and item_list are NULL, we cannot do anything. + cf_debug(AS_SMD, "asmg(): No module name found!"); + return NULL; + } + + if (CF_RCHASH_OK != (retval = cf_rchash_get(smd->modules, module_name, strlen(module_name) + 1, (void **) &module_obj))) { + as_smd_cmd_t cmd; + as_smd_item_t fakeitem; + // Could not find the module object corresponding to the module name. Create one. + // Note: No policy callback will be set if the module is created on-the-fly. + // + // Ideally, we should not land into this situation at all. + // All the legal module objects should get created upfront + // TODO : Should we not throw a warning/crash here and not create a new module ??? + memset(&cmd, 0, sizeof(as_smd_cmd_t)); + fakeitem.module_name = module_name; // Only the module name is used. All the callback pointers will be NULL. 
+ cmd.type = AS_SMD_CMD_CREATE_MODULE; + cmd.item = &fakeitem; + if ((retval = as_smd_module_create(smd, &cmd))) { + cf_warning(AS_SMD, "failed to create System Metadata module \"%s\" (rv %d)", module_name, retval); + } else { + cf_debug(AS_SMD, "created System Metadata module \"%s\" on-the-fly", module_name); + + if (CF_RCHASH_OK != (retval = cf_rchash_get(smd->modules, module_name, strlen(module_name) + 1, (void **) &module_obj))) { + cf_crash(AS_SMD, "failed to get System Metadata module \"%s\" after creation (rv %d)", module_name, retval); + } + } + } + + return module_obj; +} + +/* + * Destroy a metadata container for the given module after releasing all contained metadata. + */ +static int as_smd_module_destroy(as_smd_t *smd, as_smd_cmd_t *cmd) +{ + as_smd_item_t *item = cmd->item; + int retval = 0; + + cf_debug(AS_SMD, "System Metadata thread - destroying module \"%s\"", item->module_name); + + // Remove the module's object from the hash table. + if (CF_RCHASH_OK != (retval = cf_rchash_delete(smd->modules, item->module_name, strlen(item->module_name) + 1))) { + cf_warning(AS_SMD, "failed to delete System Metadata module \"%s\" (retval %d)", item->module_name, retval); + return retval; + } + + return retval; +} + +static void +smd_msg_fill_items(msg *m, as_smd_item_t **items, uint32_t num_items, + cf_vector *key_vec, cf_vector *value_vec, uint32_t *gen_list) +{ + uint32_t value_count = 0; + + msg_set_uint64_array_size(m, AS_SMD_MSG_TIMESTAMP, num_items); + + for (uint32_t i = 0; i < num_items; i++) { + msg_buf_ele key_ele = { + .sz = (uint32_t)strlen(items[i]->key), + .ptr = (uint8_t *)items[i]->key + }; + + cf_vector_append(key_vec, &key_ele); + + msg_buf_ele value_ele = { + .ptr = (uint8_t *)items[i]->value + }; + + if (items[i]->value) { + value_ele.sz = (uint32_t)strlen(items[i]->value); + value_count++; + } + + cf_vector_append(value_vec, &value_ele); + + gen_list[i] = items[i]->generation; + msg_set_uint64_array(m, AS_SMD_MSG_TIMESTAMP, i, 
				items[i]->timestamp);
	}

	// Keys are always packed.
	msg_msgpack_list_set_buf(m, AS_SMD_MSG_KEY_LIST, key_vec);

	// The value list is only packed when at least one item has a value.
	if (value_count != 0) {
		msg_msgpack_list_set_buf(m, AS_SMD_MSG_VALUE_LIST, value_vec);
	}

	msg_msgpack_list_set_uint32(m, AS_SMD_MSG_GEN_LIST, gen_list, num_items);
}

// New message protocol.
//
// Build an M_TYPE_SMD fabric msg for the given operation and items.
//   op          - requested operation; remapped below in two cases.
//   items       - array of num_items item pointers.
//   module_name - module shared by all items, or NULL when items may belong
//                 to several modules (a module list + counts is packed then).
//   accept_opt  - accept option flags; only AS_SMD_ACCEPT_OPT_API is tested.
// Returns the msg obtained from as_fabric_msg_get().
static msg *
smd_create_msg(as_smd_msg_op_t op, as_smd_item_t **items, uint32_t num_items,
		const char *module_name, uint32_t accept_opt)
{
	msg *m = as_fabric_msg_get(M_TYPE_SMD);

	msg_set_uint32(m, AS_SMD_MSG_ID, AS_SMD_MSG_V2_IDENTIFIER);
	msg_set_uint64(m, AS_SMD_MSG_CLUSTER_KEY, g_cluster_key);

	// Operation remapping: an API-originated accept is sent as SET_FROM_PR,
	// and a delete request is sent as SET_ITEM.
	if (op == AS_SMD_MSG_OP_ACCEPT_THIS_METADATA &&
			(accept_opt & AS_SMD_ACCEPT_OPT_API) != 0) {
		op = AS_SMD_MSG_OP_SET_FROM_PR;
	}
	else if (op == AS_SMD_MSG_OP_DELETE_ITEM) {
		op = AS_SMD_MSG_OP_SET_ITEM;
	}

	msg_set_uint32(m, AS_SMD_MSG_OP, op);

	if (module_name) {
		msg_set_str(m, AS_SMD_MSG_MODULE_NAME, module_name, MSG_SET_COPY);

		// Single item optimized packing.
		// (Value, generation and timestamp are only set when non-NULL / non-zero.)
		if (num_items == 1) {
			msg_set_str(m, AS_SMD_MSG_SINGLE_KEY, items[0]->key,
					MSG_SET_COPY);

			if (items[0]->value) {
				msg_set_str(m, AS_SMD_MSG_SINGLE_VALUE, items[0]->value,
						MSG_SET_COPY);
			}

			if (items[0]->generation != 0) {
				msg_set_uint32(m, AS_SMD_MSG_SINGLE_GENERATION,
						items[0]->generation);
			}

			if (items[0]->timestamp != 0) {
				msg_set_uint64(m, AS_SMD_MSG_SINGLE_TIMESTAMP,
						items[0]->timestamp);
			}

			return m;
		}
	}

	// Nothing further to pack for an empty item list.
	if (num_items == 0) {
		return m;
	}

	if (! module_name) {
		// Sized by the number of registered modules; each run of items that
		// share a module name gets one entry in mod_vec / mod_counts.
		uint32_t mod_max = cf_rchash_get_size(g_smd->modules);
		uint32_t mod_counts[mod_max];
		uint32_t count = 0;
		const char *prev = NULL;
		cf_vector_define(mod_vec, sizeof(msg_buf_ele), mod_max, 0);

		// Assume same item module names are clustered together.
		for (uint32_t i = 0; i < num_items; i++) {
			// Same module as the previous item: extend the current run.
			if (count != 0 && strcmp(prev, items[i]->module_name) == 0) {
				mod_counts[count - 1]++;
				continue;
			}

			msg_buf_ele ele = {
					.sz = (uint32_t)strlen(items[i]->module_name),
					.ptr = (uint8_t *)items[i]->module_name
			};

			cf_vector_append(&mod_vec, &ele);
			prev = items[i]->module_name;

			// More runs than modules means the clustering assumption was violated.
			cf_assert(count < mod_max, AS_SMD, "unexpected item module name ordering");

			mod_counts[count++] = 1;
		}

		msg_msgpack_list_set_buf(m, AS_SMD_MSG_MODULE_LIST, &mod_vec);
		msg_msgpack_list_set_uint32(m, AS_SMD_MSG_MODULE_COUNTS, mod_counts,
				count);
	}

	// Small item counts use stack scratch space; larger ones use the heap.
	if (num_items < SMD_MAX_STACK_NUM_ITEMS) {
		uint32_t gen_list[num_items];
		cf_vector_define(key_vec, sizeof(msg_buf_ele), num_items, 0);
		cf_vector_define(value_vec, sizeof(msg_buf_ele), num_items, 0);

		smd_msg_fill_items(m, items, num_items, &key_vec, &value_vec, gen_list);
	}
	else {
		cf_vector key_vec;
		cf_vector value_vec;
		// NOTE(review): result is not checked here ~~ presumably cf_malloc()
		// does not return NULL; confirm.
		uint32_t *gen_list = cf_malloc(sizeof(uint32_t) * num_items);

		if (cf_vector_init(&key_vec, sizeof(msg_buf_ele), num_items, 0) != 0) {
			cf_crash(AS_SMD, "cf_vector_init");
		}

		if (cf_vector_init(&value_vec, sizeof(msg_buf_ele), num_items, 0) !=
				0) {
			cf_crash(AS_SMD, "cf_vector_init");
		}

		smd_msg_fill_items(m, items, num_items, &key_vec, &value_vec, gen_list);

		// Scratch space is released once the msg has been filled.
		cf_vector_destroy(&key_vec);
		cf_vector_destroy(&value_vec);
		cf_free(gen_list);
	}

	return m;
}

/*
 * Get or create a new System Metadata fabric msg to perform the given operation on the given metadata items.
 * (Thin wrapper: narrows num_items to uint32_t and forwards to smd_create_msg().)
 */
static msg *
as_smd_msg_get(as_smd_msg_op_t op, as_smd_item_t **item, size_t num_items, const char *module_name, uint32_t accept_opt)
{
	// TODO - collapse - don't need two functions any more.
+ return smd_create_msg(op, item, (uint32_t)num_items, module_name, accept_opt); +} + +/* + * Callback for fabric transact responses, both when forwarding metadata change commands to the SMD principal + * and when receiving message events from the SMD principal. + * + * Note: This function is currently shared between all System Metadata transactions, which works for now + * since the different transaction types don't require separate completion processing. + */ +static int transact_complete_fn(msg *response, void *udata, int fabric_err) +{ +// as_smd_t *smd = (as_smd_t *) udata; // (Not used.) + + if (!response) { + cf_warning(AS_SMD, "Null response message passed in transaction complete!"); + return -1; + } + + as_fabric_msg_put(response); + + if (AS_FABRIC_SUCCESS != fabric_err) { + cf_warning(AS_SMD, "System Metadata transaction failed with fabric error %d", fabric_err); + return -1; + } + + return 0; +} + +static void +smd_fabric_send(cf_node node_id, msg *m) +{ + if (node_id == g_config.self_node) { + as_smd_msgq_push(node_id, m, g_smd); + as_fabric_msg_put(m); + return; + } + + as_fabric_transact_start(node_id, m, AS_SMD_TRANSACT_TIMEOUT_MS, + transact_complete_fn, NULL); +} + +/* + * Send the metadata item change message to the SMD principal. + */ +static int as_smd_proxy_to_principal(as_smd_t *smd, as_smd_msg_op_t op, as_smd_item_t *item) +{ + if (as_smd_principal() == (cf_node)0) { + cf_warning(AS_SMD, "failed to get the SMD principal node ~~ Not proxying SMD msg"); + return -1; + } + + msg *msg = NULL; + + cf_debug(AS_SMD, "forwarding %s metadata request to SMD principal node %016lX", AS_SMD_MSG_OP_NAME(op), as_smd_principal()); + + // Get an existing (or create a new) System Metadata fabric msg for the appropriate operation and metadata item. 
+ size_t num_items = 1; + if (!(msg = as_smd_msg_get(op, &item, num_items, item->module_name, AS_SMD_ACCEPT_OPT_API))) { + cf_warning(AS_SMD, "failed to get a System Metadata fabric msg for operation %s transact start for module \"%s\"", AS_SMD_MSG_OP_NAME(op), item->module_name); + return -1; + } + + smd_fabric_send(as_smd_principal(), msg); + + return 0; +} + +/* + * Locally change a metadata item. + */ +static int as_smd_metadata_change_local(as_smd_t *smd, as_smd_msg_op_t op, as_smd_item_t *item) +{ + int retval = 0; + + as_smd_module_t *module_obj = NULL; + + cf_debug(AS_SMD, "System Metadata thread - locally %s'ing metadata: node %016lX ; action %s ; module \"%s\" ; key \"%s\"", + AS_SMD_MSG_OP_NAME(op), item->node_id, AS_SMD_ACTION_NAME(item->action), item->module_name, item->key); + + // Find the module's object. + if (CF_RCHASH_OK != (retval = cf_rchash_get(smd->modules, item->module_name, strlen(item->module_name) + 1, (void **) &module_obj))) { + cf_warning(AS_SMD, "failed to find System Metadata module \"%s\" (retval %d)", item->module_name, retval); + return retval; + } + + if (AS_SMD_ACTION_DELETE == item->action) { + // Delete the metadata from the module's local metadata hash table. + if (CF_RCHASH_OK != (retval = cf_rchash_delete(module_obj->my_metadata, item->key, strlen(item->key) + 1))) { + cf_warning(AS_SMD, "failed to delete key \"%s\" from System Metadata module \"%s\" (retval %d)", item->key, item->module_name, retval); + } + } else if (item->key) { + // Handle the Set case: + + // Select metadata local hash table for incoming metadata. + cf_rchash *metadata_hash = module_obj->my_metadata; + + // The length of the key string includes the NULL terminator. + uint32_t key_len = strlen(item->key) + 1; + + // If the item is local, simply use the key string within the item. + void *key = item->key; + + // Default to generation 1. + if (!item->generation) { + item->generation = 1; + } + + // Default timestamp to now. 
+ if (!item->timestamp) { + item->timestamp = cf_clepoch_milliseconds(); + } + + // Add new, replace or keep existing, metadata in the module's metadata hash table. + + as_smd_item_t *existing_item; + bool existing_wins = false; + + if (CF_RCHASH_OK == cf_rchash_get(metadata_hash, key, key_len, (void **)&existing_item)) { + existing_wins = (existing_item->generation > item->generation) || + ((existing_item->generation == item->generation) && + (existing_item->timestamp > item->timestamp)); + as_smd_item_destroy(existing_item); + } + + if (! existing_wins) { + // Add reference to item for storage in the hash table. + // (Note: One reference to the item will be released by the thread when it releases the containing command.) + cf_rc_reserve(item); + cf_rchash_put(metadata_hash, key, key_len, item); + } + } else { + cf_debug(AS_SMD, "(not setting empty metadata item for module \"%s\")", module_obj->module); + } + + // Give back the module reference. + cf_rc_release(module_obj); + + return retval; +} + +/* + * Handle a metadata change request by proxying to SMD principal or short-circuiting locally during node start-up. + */ +static int as_smd_metadata_change(as_smd_t *smd, as_smd_msg_op_t op, as_smd_item_t *item) +{ + int retval = 0; + + if ((AS_SMD_STATE_RUNNING == smd->state) && item->key) { + // Forward to SMD principal. + // [Ideally, would re-try or at least notify (via an as-yet nonexistent mechanism) upon failure.] + return as_smd_proxy_to_principal(smd, op, item); + } else { + // Short-circuit to handle change locally when this node is starting up + // or when an initially-empty module is being created, as indicated by NULL item key (and value.) 
+ + cf_debug(AS_SMD, "handling metadata change type %s locally: module \"%s\" ; key \"%s\"", AS_SMD_MSG_OP_NAME(op), item->module_name, item->key); + + if ((retval = as_smd_metadata_change_local(smd, op, item))) { + cf_warning(AS_SMD, "failed to %s a metadata item locally: module \"%s\" ; key \"%s\" ; value \"%s\"", AS_SMD_MSG_OP_NAME(op), item->module_name, item->key, item->value); + } + + uint32_t accept_opt = AS_SMD_ACCEPT_OPT_API; + as_smd_item_list_t *item_list = NULL; + + if (!item->key) { + // Empty key (and value) indicates creation of an initially-empty module. + accept_opt = AS_SMD_ACCEPT_OPT_CREATE; + } else { + // While restoring pass this info to the module as well. This is needed + // at the boot to make sure metadata init is done before the data init is done. + item_list = as_smd_item_list_alloc(1); + item_list->item[0] = item; + } + + as_smd_module_t *module_obj = as_smd_module_get(smd, item, NULL); + + // At the end of module creation, SMD will be persisted. + if (AS_SMD_ACCEPT_OPT_CREATE == accept_opt) { + module_obj->dirty = true; + } + + if (module_obj->accept_cb) { + // Invoke the module's registered accept policy callback function. + (module_obj->accept_cb)(module_obj->module, item_list, module_obj->accept_udata, accept_opt); + } + + // Persist the accepted metadata for this module. + if (as_smd_module_persist(module_obj)) { + cf_warning(AS_SMD, "failed to persist accepted metadata for module \"%s\"", module_obj->module); + } + + cf_rc_release(module_obj); + + if (item_list) { + cf_free(item_list); + } + } + + return retval; +} + +/* + * Type representing the state of a metadata get request. + */ +typedef struct as_smd_metadata_get_state_s { + size_t num_items; // Number of matching items. + as_smd_item_t *item; // Item to compare with each item. + as_smd_item_list_t *item_list; // List of matching items. + cf_rchash_reduce_fn reduce_fn; // Reduce function to apply to matching items. 
+} as_smd_metadata_get_state_t; + +/* + * Reduce function to count one metadata item. + */ +static int as_smd_count_matching_item_reduce_fn(const void *key, uint32_t keylen, void *object, void *udata) +{ +// char *smd_key = (char *) key; // (Not used.) + as_smd_item_t *item = (as_smd_item_t *) object; + as_smd_metadata_get_state_t *get_state = (as_smd_metadata_get_state_t *) udata; + + // Count each matching item. + if (!strcmp(get_state->item->key, "") || !strcmp(get_state->item->key, item->key)) { + get_state->num_items += 1; + } + + return 0; +} + +/* + * Reduce function to return a single metadata option, if it matches the pattern. + */ +static int as_smd_metadata_get_reduce_fn(const void *key, uint32_t keylen, void *object, void *udata) +{ +// char *smd_key = (char *) key; // (Not used.) + as_smd_item_t *item = (as_smd_item_t *) object; + as_smd_metadata_get_state_t *get_state = (as_smd_metadata_get_state_t *) udata; + as_smd_item_list_t *item_list = get_state->item_list; + + // Add each matching item to the list. + if (!strcmp(get_state->item->key, "") || !strcmp(get_state->item->key, item->key)) { + cf_rc_reserve(item); + item_list->item[item_list->num_items] = item; + item_list->num_items += 1; + } + + return 0; +} + +/* + * Reduce function to perform a given reduce function on each matching module. + */ +static int as_smd_matching_module_reduce_fn(const void *key, uint32_t keylen, void *object, void *udata) +{ + const char *module = (const char *) key; + as_smd_module_t *module_obj = (as_smd_module_t *) object; + as_smd_metadata_get_state_t *get_state = (as_smd_metadata_get_state_t *) udata; + + // Perform the given reduce function on matching module's metadata. + if (!strcmp(get_state->item->module_name, "") || !strcmp(get_state->item->module_name, module)) { + cf_rchash_reduce(module_obj->my_metadata, get_state->reduce_fn, get_state); + } + + return 0; +} + +/* + * Search for metadata according to the given search criteria. 
+ * The incoming item's module and/or key can be NULL to perform a wildcard match. + */ +static int as_smd_metadata_get(as_smd_t *smd, as_smd_cmd_t *cmd) +{ + as_smd_item_t *item = cmd->item; + int retval = 0; + + cf_debug(AS_SMD, "System Metadata thread - get metadata: module \"%s\" ; node %016lX ; key \"%s\"", item->module_name, item->node_id, item->key); + + // Extract the user's callback function and user data. + as_smd_get_cb get_cb = cmd->a; + void *get_udata = cmd->b; + + if (!get_cb) { + cf_warning(AS_SMD, "no System Metadata get callback supplied ~~ Ignoring get metadata request!"); + return -1; + } + + as_smd_metadata_get_state_t get_state; + get_state.num_items = 0; + get_state.item = item; + get_state.item_list = NULL; + get_state.reduce_fn = as_smd_count_matching_item_reduce_fn; + + // Count the number of matching items. + cf_rchash_reduce(smd->modules, as_smd_matching_module_reduce_fn, &get_state); + + // Allocate a list of sufficient size for the get result. + as_smd_item_list_t *item_list = as_smd_item_list_alloc(get_state.num_items); + get_state.item_list = item_list; + + // (Note: Use num_items to count the position for each metadata item.) + item_list->num_items = 0; + + // Add matching items to the list. + get_state.reduce_fn = as_smd_metadata_get_reduce_fn; + cf_rchash_reduce(smd->modules, as_smd_matching_module_reduce_fn, &get_state); + + // Invoke the user's callback function. + (get_cb)(item->module_name, item_list, get_udata); + + // Release the item list. + as_smd_item_list_destroy(item_list); + + return retval; +} + +/* + * Cleanly release all System Metadata resources. + */ +static void as_smd_terminate(as_smd_t *smd) +{ + cf_debug(AS_SMD, "SMD Terminate called"); + + // After this is NULLed out, no more messages will be sent to the System Metadata queue. + g_smd = NULL; + + // De-register the System Metadata fabric transact message type. + // [Note: Don't need to remove the handler, simply drop the msg in the handler function.] 
+// as_fabric_transact_register(M_TYPE_SMD, NULL, 0, NULL, NULL); + + // Go to the not started up yet state. + smd->state = AS_SMD_STATE_IDLE; + + // Destroy the message queue. + cf_queue_destroy(smd->msgq); + + // Release the scoreboard hash table. + cf_shash_destroy(smd->scoreboard); + + // Release the modules hash table. + cf_rchash_destroy(smd->modules); + + // Release the System Metadata object. + cf_free(smd); +} + +/* + * Reduce function to count one metadata item. + */ +static int as_smd_count_item_reduce_fn(const void *key, uint32_t keylen, void *object, void *udata) +{ +// char *smd_key = (char *) key; // (Not used.) +// as_smd_item_t *item = (as_smd_item_t *) object; // (Not used.) + size_t *num_items = (size_t *) udata; + + *num_items += 1; + + return 0; +} + +/* + * Reduce function to count metadata items in one module. + */ +static int as_smd_module_count_items_reduce_fn(const void *key, uint32_t keylen, void *object, void *udata) +{ +// char *module = (char *) key; // (Not used.) + as_smd_module_t *module_obj = (as_smd_module_t *) object; + size_t *num_items = (size_t *) udata; + + // Increase the running total by the count the number of metadata items in this module. + cf_rchash_reduce(module_obj->my_metadata, as_smd_count_item_reduce_fn, num_items); + + return 0; +} + +/* + * Reduce function to serialize one metadata item. + */ +static int as_smd_item_serialize_reduce_fn(const void *key, uint32_t keylen, void *object, void *udata) +{ +// char *smd_key = (char *) key; // (Not used.) + as_smd_item_t *item = (as_smd_item_t *) object; + as_smd_item_list_t *item_list = (as_smd_item_list_t *) udata; + + // Add a this metadata item to the list. + cf_rc_reserve(item); + item_list->item[item_list->num_items] = item; + item_list->num_items += 1; + + return 0; +} + +/* + * Reduce function to serialize all of a module's metadata items. 
+ */ +static int as_smd_module_serialize_reduce_fn(const void *key, uint32_t keylen, void *object, void *udata) +{ +// char *module = (char *) key; // (Not used.) + as_smd_module_t *module_obj = (as_smd_module_t *) object; + as_smd_item_list_t *item_list = (as_smd_item_list_t *) udata; + + // Serialize all of this module's metadata items. + cf_rchash_reduce(module_obj->my_metadata, as_smd_item_serialize_reduce_fn, item_list); + + return 0; +} + +static int as_smd_receive_metadata(as_smd_t *smd, as_smd_msg_t *smd_msg); + +static void +smd_expire_pending_merges() +{ + if (cf_queue_sz(&g_smd->pending_merge_queue) == 0) { + return; + } + + smd_pending_merge item; + uint64_t now = cf_getms(); + + while (cf_queue_pop(&g_smd->pending_merge_queue, &item, CF_QUEUE_NOWAIT) == + CF_QUEUE_OK) { + if (item.expire > now) { + cf_queue_push_head(&g_smd->pending_merge_queue, &item); + break; + } + + cf_free(item.m.module_name); + as_smd_item_list_destroy(item.m.items); + } +} + +static void +smd_process_pending_merges() +{ + uint64_t now = cf_getms(); + smd_pending_merge item; + int count = cf_queue_sz(&g_smd->pending_merge_queue); + + for (int i = 0; i < count; i++) { + cf_queue_pop(&g_smd->pending_merge_queue, &item, CF_QUEUE_NOWAIT); + + if (item.m.cluster_key == g_cluster_key) { + as_smd_receive_metadata(g_smd, &item.m); + } + else if (item.expire > now) { + cf_queue_push(&g_smd->pending_merge_queue, &item); + continue; + } + + cf_free(item.m.module_name); + as_smd_item_list_destroy(item.m.items); + } +} + +/* + * Handle a cluster state changed message. + * This function collects all metadata items in this node, from all the module, + * currently (UDF, SINDEX) and sends it to the SMD principal for merging the metadata. 
+ */ +static void as_smd_cluster_changed(as_smd_t *smd, as_smd_cmd_t *cmd) +{ + cf_debug(AS_SMD, "System Metadata thread received cluster state changed cmd event!"); + + g_cluster_key = (uint64_t)cmd->a; + g_cluster_size = (uint32_t)(uint64_t)cmd->b; + memcpy(g_succession, cmd->c, g_cluster_size * sizeof(cf_node)); + + cf_free(cmd->c); + + // Determine the number of metadata items to be sent. + size_t num_items = 0; + cf_rchash_reduce(smd->modules, as_smd_module_count_items_reduce_fn, &num_items); + + cf_debug(AS_SMD, "sending %zu serialized metadata items to the SMD principal", num_items); + + // Copy all reference-counted metadata item pointers from the hash table into an item list. + // (Note: Even if this node has no metadata items, we must still send a message to the principal.) + as_smd_item_list_t *item_list = as_smd_item_list_alloc(num_items); + // (Note: Use num_items to count the position for each serialized metadata item.) + item_list->num_items = 0; + cf_rchash_reduce(smd->modules, as_smd_module_serialize_reduce_fn, item_list); + + cf_debug(AS_SMD, "aspc(): num_items = %zu (%zu)", item_list->num_items, num_items); + + // Build a System Metadata fabric msg containing serialized metadata from the item list. + msg *msg = NULL; + as_smd_msg_op_t my_smd_op = AS_SMD_MSG_OP_MY_CURRENT_METADATA; + if (!(msg = as_smd_msg_get(my_smd_op, item_list->item, item_list->num_items, NULL, 0))) { + cf_crash(AS_SMD, "failed to get a System Metadata fabric msg for operation %s transact start", AS_SMD_MSG_OP_NAME(my_smd_op)); + } + + // The metadata has been copied into the fabric msg and can now be released. + as_smd_item_list_destroy(item_list); + + smd_fabric_send(as_smd_principal(), msg); + + smd_process_pending_merges(); +} + +/* + * Destroy a node's scoreboard hash table mapping module to metadata item count. 
+ */ +static int as_smd_scoreboard_reduce_delete_fn(const void *key, void *data, void *udata) +{ + cf_node node_id = (cf_node) key; + cf_shash *module_item_count_hash = *((cf_shash **) data); + + cf_debug(AS_SMD, "destroying module item count hash for node %016lX", node_id); + + cf_shash_destroy(module_item_count_hash); + + return CF_SHASH_REDUCE_DELETE; +} + +/* + * Remove the metadata item from the hash table. + */ +static int as_smd_reduce_delete_fn(const void *key, uint32_t keylen, void *object, void *udata) +{ + return CF_RCHASH_REDUCE_DELETE; +} + +/* + * Delete all of this module's external metadata items. + */ +static int as_smd_delete_external_metadata_reduce_fn(const void *key, uint32_t keylen, void *object, void *udata) +{ +// char *module = (char *) key; // (Not used.) + as_smd_module_t *module_obj = (as_smd_module_t *) object; + as_smd_t *smd = (as_smd_t *) udata; + + cf_rchash_reduce(module_obj->external_metadata, as_smd_reduce_delete_fn, smd); + cf_debug(AS_SMD, "All the entries in the scoreboard have been deleted"); + + return 0; +} + +/* + * Clear out the temporary state used to merge metadata upon cluster state change. + */ +static void as_smd_clear_scoreboard(as_smd_t *smd) +{ + cf_shash_reduce(smd->scoreboard, as_smd_scoreboard_reduce_delete_fn, smd); + cf_rchash_reduce(smd->modules, as_smd_delete_external_metadata_reduce_fn, smd); +} + +/* + * Apply a metadata change locally using the registered merge policy, defaulting to union. + */ +static int as_smd_apply_metadata_change(as_smd_t *smd, as_smd_module_t *module_obj, as_smd_msg_t *smd_msg) +{ + int retval = 0; + + as_smd_item_t *item = smd_msg->items->item[0]; // (Only log the fist item.) + cf_debug(AS_SMD, "System Metadata thread - applying metadata %s change: item 0: module \"%s\" ; key \"%s\" ; value \"%s\" ; action %d", + AS_SMD_MSG_OP_NAME(smd_msg->op), module_obj->module, item->key, item->value, item->action); + + // [Note: Only 1 item should ever be changed via this path.] 
+ if (1 != smd_msg->num_items) { + cf_crash(AS_SMD, "unexpected number of metadata items being changed: %d != 1", smd_msg->num_items); + } + +#if 0 + if (module_obj->merge_cb) { + // Invoke the module's registered merge policy callback function. + (module_obj->merge_cb)(module_obj->module, smd_msg->item, NULL, module_obj->merge_udata); + } else { +#endif + cf_debug(AS_SMD, "asamc(): num_items %d", smd_msg->num_items); + + // By default, simply perform a union operation on an item-by-item basis. + for (int i = 0; i < smd_msg->num_items; i++) { + item = smd_msg->items->item[i]; + if (module_obj->can_accept_cb) { + int ret = (module_obj->can_accept_cb)(module_obj->module, item, module_obj->can_accept_udata); + if (ret != 0) { + cf_debug(AS_SMD, "SMD principal rejected the user operation with error code %s", as_sindex_err_str(ret)); + continue; + } else { + cf_debug(AS_SMD, "SMD principal validity check succeeded."); + } + } + + // Default timestamp to now. + if (!item->timestamp) { + item->timestamp = cf_clepoch_milliseconds(); + } + + cf_debug(AS_SMD, "asamc(): processing item %d: module \"%s\" key \"%s\" action %s gen %u ts %lu", i, item->module_name, item->key, AS_SMD_ACTION_NAME(item->action), item->generation, item->timestamp); + + // Perform the appropriate union operation. + + as_smd_item_t *existing_item = NULL; + if (CF_RCHASH_OK == cf_rchash_get(module_obj->my_metadata, item->key, strlen(item->key) + 1, (void **) &existing_item)) { + cf_debug(AS_SMD, "asamc(): Old item exists."); + } else { + cf_debug(AS_SMD, "asamc(): Old item does not exist."); + + if (AS_SMD_ACTION_DELETE == item->action) { + cf_debug(AS_SMD, "deleting a non-extant item: module \"%s\" ; key \"%s\"", item->module_name, item->key); + } + } + + if (!existing_item) { + // For delete, if item already doesn't exist, there's nothing to do. + if (AS_SMD_ACTION_DELETE == item->action) { + continue; + } else { + // Otherwise, default to generation 1. 
+ if (!item->generation) { + item->generation = 1; + } + } + } + + // Choose the most up-to-date item data. + if (existing_item && (AS_SMD_ACTION_DELETE != item->action)) { + // Default to the next generation. + if (!item->generation) { + item->generation = existing_item->generation + 1; + } + + // Choose the newest first by the highest generation and second by the highest timestamp. + if ((existing_item->generation > item->generation) || + ((existing_item->generation == item->generation) && (existing_item->timestamp > item->timestamp))) { + + cf_debug(AS_SMD, "old item is newer"); + + // If the existing item is newer, skip the incoming item. + cf_rc_release(existing_item); + continue; + } else { + // Otherwise, advance the generation. + item->generation = existing_item->generation + 1; + + cf_debug(AS_SMD, "New items is newer: Going to gen %u ts %lu", item->generation, item->timestamp); + } + cf_rc_release(existing_item); + existing_item = NULL; + } + + // For each member of the succession list, + // Generate a new SMD fabric msg sharing the properties of the incoming msg event. + // Start a transaction to send the msg out to the node. + // The transaction recv function performs the accept metadata function locally. + + for (uint32_t i = 0; i < g_cluster_size; i++) { + msg *msg = NULL; + cf_node node_id = g_succession[i]; + as_smd_msg_op_t accept_op = AS_SMD_MSG_OP_ACCEPT_THIS_METADATA; + if (!(msg = as_smd_msg_get(accept_op, smd_msg->items->item, smd_msg->num_items, module_obj->module, AS_SMD_ACCEPT_OPT_API))) { + cf_warning(AS_SMD, "failed to get a System Metadata fabric msg for operation %s transact start ~~ Skipping node %016lX!", + AS_SMD_MSG_OP_NAME(accept_op), node_id); + continue; + } + + smd_fabric_send(node_id, msg); + } + } +#if 0 + } +#endif + + return retval; +} + +/* + * Increment hash table value by the given delta, starting from zero if not found, and return the new total. 
 */
static int as_smd_shash_incr(cf_shash *ht, as_smd_module_t *module_obj, size_t delta)
{
	size_t count = 0;

	// The table is keyed by the module object pointer itself.
	if (CF_SHASH_OK != cf_shash_get(ht, &module_obj, &count)) {
		// If not found, start at zero.
		count = 0;
	}

	count += delta;

	cf_shash_put(ht, &module_obj, &count);

	cf_debug(AS_SMD, "incrementing metadata item count for module \"%s\" to %zu", module_obj->module, count);

	// (The size_t total is narrowed to the int return type.)
	return count;
}

/*
 * Add the metadata items from this msg to the appropriate modules' external hash tables.
 *
 * Each item is stored under a composite key (node ID + key length + key
 * string), replacing any previous entry with a warning. Items whose module
 * cannot be obtained are skipped.
 * Returns a newly-created hash mapping module object pointer to the number of
 * items stored for that module.
 */
static cf_shash *as_smd_store_metadata_by_module(as_smd_t *smd, as_smd_msg_t *smd_msg)
{
	as_smd_item_list_t *items = smd_msg->items;
	cf_shash *module_item_count_hash = cf_shash_create(cf_shash_fn_ptr, sizeof(as_smd_module_t *), sizeof(size_t), 19, CF_SHASH_BIG_LOCK);

	for (int i = 0; i < items->num_items; i++) {
		as_smd_item_t *item = items->item[i];

		// Find the appropriate module's external hash table for this item.
		as_smd_module_t *module_obj = NULL;
		if (! (module_obj = as_smd_module_get(smd, item, NULL))) {
			cf_warning(AS_SMD, "failed to get System Metadata module \"%s\" ~~ Skipping item!", item->module_name);
			continue;
		}

		// The length of the key string includes the NULL terminator.
		uint32_t key_len = strlen(item->key) + 1;
		uint32_t stack_key_len = sizeof(as_smd_external_item_key_t) + key_len;

		// NOTE(review): alloca() inside the loop is only reclaimed when this
		// function returns, so stack use grows with the number of items.
		// NOTE(review): any padding bytes in the key struct are left
		// uninitialized yet the whole buffer is used as the hash key ~~ confirm
		// the struct layout has no padding.
		as_smd_external_item_key_t *stack_key = alloca(stack_key_len);
		if (!stack_key) {
			cf_crash(AS_SMD, "Failed to allocate stack key of size %d bytes!", stack_key_len);
		}
		stack_key->node_id = item->node_id;
		stack_key->key_len = key_len;
		memcpy(&(stack_key->key), item->key, key_len);

		// Warn if the item is already present.
		as_smd_item_t *old_item = NULL;
		cf_rchash *metadata_hash = module_obj->external_metadata;
		if (CF_RCHASH_OK == cf_rchash_get(metadata_hash, stack_key, stack_key_len, (void **) &old_item)) {
			cf_warning(AS_SMD, "found existing metadata item: node: %016lX module: \"%s\" key: \"%s\" value: \"%s\" ~~ Replacing with value: \"%s\"!",
					item->node_id, item->module_name, item->key, old_item->value, item->value);
			// Give back the item reference.
			cf_rc_release(old_item);
		}

		// Add reference to item for storage in the hash table.
		// (Note: One reference to the item will be released by the thread when it releases the containing msg.)
		cf_rc_reserve(item);

		// Insert the new metadata into the module's external metadata hash table, replacing any previous contents.
		cf_rchash_put(metadata_hash, stack_key, stack_key_len, item);

		cf_debug(AS_SMD, "Stored metadata by module for item %d: module \"%s\" ; key \"%s\"", i, module_obj->module, stack_key->key);
		// Increment the number of items for this module in this node's hash table.
		as_smd_shash_incr(module_item_count_hash, module_obj, 1);

		// Give back the module reference.
		cf_rc_release(module_obj);
	}

	return module_item_count_hash;
}

// Search state for reducing over a module's external metadata hash.
typedef struct smd_ext_item_search_s {
	cf_node node_id;                // Node whose items are wanted.
	as_smd_item_list_t *item_list;  // Output list, filled by smd_ext_items_fn().
	uint32_t count;                 // Match count, bumped by smd_ext_items_count_fn().
} smd_ext_item_search;

// Reduce function: append each external item belonging to the searched node
// to the search's item list, taking a reference for the list.
static int
smd_ext_items_fn(const void *key, uint32_t keylen, void *obj, void *udata)
{
	const as_smd_external_item_key_t *extkey =
			(const as_smd_external_item_key_t *)key;
	as_smd_item_t *item = (as_smd_item_t *)obj;
	smd_ext_item_search *search = (smd_ext_item_search *)udata;

	if (extkey->node_id == search->node_id) {
		cf_rc_reserve(item);
		// The list's num_items doubles as the insert position.
		search->item_list->item[search->item_list->num_items] = item;
		search->item_list->num_items++;
		cf_debug(AS_SMD, "For the node \"%016lX\", num_items is %zu", extkey->node_id, search->item_list->num_items);
	}

	return 0;
}

// Reduce function: count the external items belonging to the searched node.
static int
smd_ext_items_count_fn(const void *key, uint32_t keysz, void *obj, void *udata)
{
	const as_smd_external_item_key_t *extkey =
			(const as_smd_external_item_key_t *)key;
	smd_ext_item_search *search = (smd_ext_item_search *)udata;

	if (extkey->node_id == search->node_id) {
		search->count++;
	}

	return 0;
}

/*
 * Reduce function to create a list of metadata items from an rchash table.
 * (Each visited item is appended to the list in udata, with a reference taken for the list.)
 */
static int as_smd_list_items_reduce_fn(const void *key, uint32_t keylen, void *object, void *udata)
{
//	char *item_key = (char *) key; // (Not used.)
	as_smd_item_t *item = (as_smd_item_t *) object;
	as_smd_item_list_t *item_list = (as_smd_item_list_t *) udata;

	cf_debug(AS_SMD, "adding to item list item: node: %016lX ; module: \"%s\" ; key: \"%s\"", item->node_id, item->module_name, item->key);
	cf_debug(AS_SMD, "item list: %p", item_list);
	cf_debug(AS_SMD, "item list length: %zu", item_list->num_items);

	cf_rc_reserve(item);

	// The list's num_items doubles as the insert position.
	item_list->item[item_list->num_items] = item;
	item_list->num_items += 1;

	return 0;
}

/*
 * Invoke the merge policy callback function for this module.
+ *
+ * Used as a reduce function over the module table: the key is the module name
+ * and the object is the module. One item list is built per node in the
+ * succession list, the lists are merged (registered callback, or union by
+ * default), and the merged list is sent to every node in the succession list.
+ */
+static int as_smd_invoke_merge_reduce_fn(const void *key, uint32_t keylen, void *object, void *udata)
+{
+	const char *module = (const char *) key;
+	as_smd_module_t *module_obj = (as_smd_module_t *) object;
+
+	cf_debug(AS_SMD, "invoking merge policy for module \"%s\"", module);
+
+	as_smd_item_list_t *item_list_out = NULL;
+	as_smd_item_list_t *item_lists_in[g_cluster_size];
+	int list_num = (int)g_cluster_size;
+
+	// Two passes per node: count the node's items to size the list, then fill it.
+	for (uint32_t i = 0; i < g_cluster_size; i++) {
+		smd_ext_item_search search = {
+			.node_id = g_succession[i]
+		};
+
+		cf_rchash_reduce(module_obj->external_metadata, smd_ext_items_count_fn,
+				&search);
+		item_lists_in[i] = as_smd_item_list_alloc(search.count);
+
+		if (search.count != 0) {
+			search.item_list = item_lists_in[i];
+			item_lists_in[i]->num_items = 0;
+			cf_rchash_reduce(module_obj->external_metadata, smd_ext_items_fn,
+					&search);
+		}
+	}
+
+	// Merge the metadata item lists for this module.
+	if (module_obj->merge_cb) {
+		// Invoke the module's registered merge policy callback function.
+		// NOTE(review): item_list_out is dereferenced below, so the callback is assumed to always set it - confirm.
+		(module_obj->merge_cb)(module, &item_list_out, item_lists_in, list_num, module_obj->merge_udata);
+	} else {
+		cf_debug(AS_SMD, "no merge cb registered ~~ performing default merge policy: union");
+
+		// No merge policy registered ~~ Default to union.
+		cf_rchash *merge_hash = NULL;
+		cf_rchash_create(&merge_hash, cf_rchash_fn_fnv32, metadata_rchash_destructor_fn, 0, 127, 0);
+
+		// Run through all metadata items in all nodes' lists.
+		for (int i = 0; i < list_num; i++) {
+			if (item_lists_in[i]) {
+				for (int j = 0; j < item_lists_in[i]->num_items; j++) {
+					as_smd_item_t *new_item = item_lists_in[i]->item[j];
+					uint32_t key_len = strlen(new_item->key) + 1;
+
+					// Look for an existing item with this key.
+					as_smd_item_t *existing_item = NULL;
+					if (CF_RCHASH_OK != cf_rchash_get(merge_hash, new_item->key, key_len, (void **) &existing_item)) {
+						// If not found, insert this item.
+						cf_rc_reserve(new_item);
+						cf_rchash_put(merge_hash, new_item->key, key_len, new_item);
+					} else {
+						// Otherwise, choose a winner.
+						bool existing_wins;
+
+						if (module_obj->conflict_cb) {
+							// Use registered callback to determine winner.
+							existing_wins = (module_obj->conflict_cb)((char *)module, existing_item, new_item, module_obj->conflict_udata);
+						} else {
+							// Otherwise, choose a winner first by the highest generation and second by the highest timestamp.
+							existing_wins = (existing_item->generation > new_item->generation) ||
+									((existing_item->generation == new_item->generation) &&
+									 (existing_item->timestamp > new_item->timestamp));
+						}
+
+						// Leave existing item in hash, or replace existing item
+						// with new item (put releases existing item).
+						if (! existing_wins) {
+							cf_rc_reserve(new_item);
+							cf_rchash_put(merge_hash, new_item->key, key_len, new_item);
+						}
+
+						as_smd_item_destroy(existing_item); // for cf_rchash_get
+					}
+				}
+			}
+		}
+
+		// Create a merged items list.
+		size_t num_items = cf_rchash_get_size(merge_hash);
+		item_list_out = as_smd_item_list_alloc(num_items);
+
+		// Populate the merged items list from the hash table.
+		// (Note: Use num_items to count the position for each metadata item.)
+		item_list_out->num_items = 0;
+		cf_rchash_reduce(merge_hash, as_smd_list_items_reduce_fn, item_list_out);
+		cf_rchash_destroy(merge_hash);
+	}
+
+	// Send out a merged metadata msg via fabric transaction to every cluster node.
+	msg *msg = NULL;
+	as_smd_msg_op_t merge_op = AS_SMD_MSG_OP_ACCEPT_THIS_METADATA;
+	for (uint32_t i = 0; i < g_cluster_size; i++) {
+		cf_node node_id = g_succession[i];
+		if (!(msg = as_smd_msg_get(merge_op, item_list_out->item, item_list_out->num_items, module, AS_SMD_ACCEPT_OPT_MERGE))) {
+			cf_crash(AS_SMD, "failed to get a System Metadata fabric msg for operation %s", AS_SMD_MSG_OP_NAME(merge_op));
+		}
+
+		smd_fabric_send(node_id, msg);
+	}
+
+	// Release the item lists.
+	for (int i = 0; i < list_num; i++) {
+		as_smd_item_list_destroy(item_lists_in[i]);
+	}
+
+	// Release the merged items list.
+	as_smd_item_list_destroy(item_list_out);
+
+	return 0;
+}
+
+/*
+ * Park a received metadata message on the pending merge queue, with an expiry
+ * time SMD_PENDING_MERGE_TIMEOUT_SEC from now. The message's item list and
+ * module name are taken over by the queued copy.
+ */
+static void
+smd_add_pending_merge(as_smd_msg_t *sm)
+{
+	smd_pending_merge add = {
+		.m = *sm,
+		.expire = cf_getms() + SMD_PENDING_MERGE_TIMEOUT_SEC * 1000
+	};
+
+	// Steal memory from original.
+	sm->items = NULL;
+	sm->module_name = NULL;
+
+	cf_queue_push(&g_smd->pending_merge_queue, &add);
+}
+
+/*
+ * Receive a node's metadata on the SMD principal to be combined via the registered merge policy.
+ *
+ * Returns -1 when this node is not the principal or the message carries a
+ * non-current cluster key (in which case the message is queued as a pending
+ * merge), 0 otherwise.
+ */
+static int as_smd_receive_metadata(as_smd_t *smd, as_smd_msg_t *smd_msg)
+{
+	int retval = 0;
+
+	// Only the SMD principal receives other nodes' metadata.
+	if (g_config.self_node != as_smd_principal()) {
+		if (smd_msg->cluster_key != g_cluster_key) {
+			smd_add_pending_merge(smd_msg);
+		}
+
+		cf_debug(AS_SMD, "non-principal node %016lX received metadata from node %016lX", g_config.self_node, smd_msg->node_id);
+		return -1;
+	}
+
+	cf_debug(AS_SMD, "System Metadata thread - received %d metadata items from node %016lX", smd_msg->num_items, smd_msg->node_id);
+
+	if (g_cluster_key != smd_msg->cluster_key) {
+		smd_add_pending_merge(smd_msg);
+		cf_debug(AS_SMD, "received SMD with non-current cluster key (%016lx != %016lx) from node %016lX -> Pending",
+				smd_msg->cluster_key, g_cluster_key, smd_msg->node_id);
+		return -1;
+	}
+
+	// Store all of the metadata items received from this node in the appropriate module's external metadata hash table.
+	// And return the item counts by module in a hash table.
+	cf_shash *module_item_count_hash = NULL;
+	if (!(module_item_count_hash = as_smd_store_metadata_by_module(smd, smd_msg))) {
+		cf_crash(AS_SMD, "failed to store metadata by module from node %016lX", smd_msg->node_id);
+	}
+
+	// If something is already there, it's obsolete, so release it.
+	cf_shash *prev_module_item_count_hash = NULL;
+	if (CF_SHASH_OK == cf_shash_get(smd->scoreboard, &(smd_msg->node_id), &prev_module_item_count_hash)) {
+		cf_debug(AS_SMD, "found an obsolete module item count hash for node %016lX ~~ Deleting!", smd_msg->node_id);
+		if (CF_SHASH_OK != cf_shash_delete(smd->scoreboard, &(smd_msg->node_id))) {
+			cf_warning(AS_SMD, "failed to delete obsolete module item count hash for node %016lX", smd_msg->node_id);
+		}
+		cf_shash_destroy(prev_module_item_count_hash);
+	}
+
+	// Note that this node has provided its metadata for this cluster state change.
+	if (CF_SHASH_OK != cf_shash_put_unique(smd->scoreboard, &(smd_msg->node_id), &module_item_count_hash)) {
+		cf_warning(AS_SMD, "failed to put unique node %016lX into System Metadata scoreboard hash table", smd_msg->node_id);
+		// The scoreboard did not take the new count hash, so nothing else refers to it - release it here.
+		cf_shash_destroy(module_item_count_hash);
+	}
+
+	// Merge the metadata when all nodes have reported in.
+	if (cf_shash_get_size(smd->scoreboard) == g_cluster_size) {
+		cf_debug(AS_SMD, "received metadata from all %u cluster nodes ~~ invoking merge policies", g_cluster_size);
+
+		cf_debug(AS_SMD, "Invoking merge reduce in SMD principal");
+		// Invoke the merge policy for each module and send the results to all nodes.
+		cf_rchash_reduce(smd->modules, as_smd_invoke_merge_reduce_fn, smd);
+
+		// Clear out the state used to notify cluster nodes of the new metadata.
+		as_smd_clear_scoreboard(smd);
+	} else if (cf_shash_get_size(smd->scoreboard) > g_cluster_size) {
+		// Cluster is unstable.
+		// While one node is coming up, one of the other nodes has gone down.
+		// E.g. consider a 3 node cluster. Add a new node. Cluster size is 4.
+		// SMD principal has received information from 3 nodes and is waiting for the fourth node.
+		// So scoreboard size is 3.
+		// But now two nodes have gone down. Cluster size is reduced to 2.
+		as_smd_clear_scoreboard(smd);
+	} else {
+		cf_debug(AS_SMD, "Cluster size = %u and smd->scoreboard size = %d ", g_cluster_size, cf_shash_get_size(smd->scoreboard));
+	}
+
+	return retval;
+}
+
+/*
+ * Reduce function that asks the rchash to delete every element it visits.
+ */
+static int metadata_local_deleteall_fn(const void *key, uint32_t key_len, void *object, void *udata)
+{
+	return CF_RCHASH_REDUCE_DELETE;
+}
+
+/*
+ * Accept a metadata change from the SMD principal using the registered accept policy.
+ *
+ * Applies each item locally, invokes the module's accept callback (if any),
+ * then marks the module dirty and persists it. Returns the result of the last
+ * local change attempted (0 when there were no items).
+ */
+static int as_smd_accept_metadata(as_smd_t *smd, as_smd_module_t *module_obj, as_smd_msg_t *smd_msg)
+{
+	int retval = 0;
+
+	// There will be:
+	//   0 items when, after the merge, no valid metadata items were found according to the merge algorithm.
+	//   1 item when the user issues a set/delete metadata API call to a specific module (e.g., SINDEX, UDF.)
+	//   >= 1 items when, after the merge, a non-empty list of items is valid according to the merge algorithm.
+	if (smd_msg->items->num_items) {
+		// The log line below dereferences module_obj, so enforce the non-NULL
+		// invariant before logging rather than after it.
+		if (!module_obj) {
+			cf_crash(AS_SMD, "SMD module NULL in accept metadata!");
+		}
+
+		as_smd_item_t *item = smd_msg->items->item[0]; // (Only log the first item.)
+		cf_debug(AS_SMD, "System Metadata thread - accepting metadata %s change: %zu items: item 0: module \"%s\" ; key \"%s\" ; value \"%s\"",
+				AS_SMD_MSG_OP_NAME(smd_msg->op), smd_msg->items->num_items, module_obj->module, item->key, item->value);
+	} else {
+		// Allow empty item list for merge and module create.
+		if (smd_msg->options & (AS_SMD_ACCEPT_OPT_MERGE | AS_SMD_ACCEPT_OPT_CREATE)) {
+			cf_debug(AS_SMD, "System Metadata thread - accepting metadata %s change: Zero items coming from merge", AS_SMD_MSG_OP_NAME(smd_msg->op));
+		} else {
+			cf_debug(AS_SMD, "System Metadata thread - accepting metadata %s change: Zero items ~~ Returning!", AS_SMD_MSG_OP_NAME(smd_msg->op));
+			return retval;
+		}
+	}
+
+	cf_debug(AS_SMD, "accepting replacement metadata from incoming System Metadata msg");
+
+#if 1 // DEBUG
+	// It should never be null. Being defensive to bail out just in case.
+	// (Still required here for the zero-items path, which skips the check above.)
+	if (!module_obj) {
+		cf_crash(AS_SMD, "SMD module NULL in accept metadata!");
+	}
+#endif
+
+	// In case of merge (after cluster state change) drop the existing local metadata definitions
+	// This is done to clean up some metadata, which could have been dropped during the merge
+	if (smd_msg->options & AS_SMD_ACCEPT_OPT_MERGE) {
+		cf_rchash_reduce(module_obj->my_metadata, metadata_local_deleteall_fn, NULL);
+	}
+
+	for (int i = 0; i < smd_msg->items->num_items; i++) {
+		as_smd_item_t *item = smd_msg->items->item[i];
+		if ((retval = as_smd_metadata_change_local(smd, smd_msg->op, item))) {
+			cf_warning(AS_SMD, "failed to perform the default accept replace local metadata operation %s (rv %d) for item %d: module \"%s\" ; key \"%s\" ; value \"%s\"",
+					AS_SMD_MSG_OP_NAME(smd_msg->op), retval, i, item->module_name, item->key, item->value);
+		}
+	}
+
+	// Accept the metadata item list for this module.
+	if (module_obj->accept_cb) {
+		// Invoke the module's registered accept policy callback function.
+		cf_debug(AS_SMD, "Calling accept callback with OPT_MERGE for module %s with nitems %zu", smd_msg->module_name, smd_msg->items->num_items);
+		(module_obj->accept_cb)(module_obj->module, smd_msg->items, module_obj->accept_udata, smd_msg->options);
+	}
+
+	// SMD should now be persisted.
+	module_obj->dirty = true;
+
+	// Persist the accepted metadata for this module.
+	if (as_smd_module_persist(module_obj)) {
+		cf_warning(AS_SMD, "failed to persist accepted metadata for module \"%s\"", module_obj->module);
+	}
+
+	return retval;
+}
+
+/*
+ * Return the index recorded for this key in the map. A key not yet in the map
+ * is assigned the map's current size as its index, and that index is stored.
+ */
+static uint32_t key2idx_get_index(as_hashmap *map, const char *key)
+{
+	const as_integer *i = as_stringmap_get_integer((as_map *)map, key);
+
+	if (i) {
+		return (uint32_t)as_integer_get(i);
+	}
+
+	uint32_t new_index = as_hashmap_size(map);
+
+	as_stringmap_set_int64((as_map *)map, key, (int64_t)new_index);
+
+	return new_index;
+}
+
+/*
+ * Merge policy: for every key seen in any list, keep the winning version
+ * (highest generation, then highest timestamp) if the key appears at least
+ * (num_list + 1) / 2 times; otherwise emit a freshly allocated delete item
+ * for that key with generation one above the winner's.
+ *
+ * The merged list is returned via merged_list. Always returns 0.
+ */
+int as_smd_majority_consensus_merge(const char *module, as_smd_item_list_t **merged_list,
+		as_smd_item_list_t **lists_to_merge, size_t num_list, void *udata)
+{
+	typedef struct {
+		as_smd_item_t *item; // does not hold ref to item
+		uint32_t count;
+	} merge_item;
+
+	cf_vector merge_list;
+	as_hashmap key2idx;
+
+	as_hashmap_init(&key2idx, 1024);
+	cf_vector_init(&merge_list, sizeof(merge_item), 1024, 0);
+
+	for(size_t i = 0; i < num_list; i++) {
+		// Tolerate an absent list, as the default union merge policy does.
+		if (! lists_to_merge[i]) {
+			continue;
+		}
+
+		size_t num_items = lists_to_merge[i]->num_items;
+
+		for (size_t j = 0; j < num_items; j++) {
+			as_smd_item_t *item = lists_to_merge[i]->item[j];
+			uint32_t idx = key2idx_get_index(&key2idx, item->key);
+
+			// A new key gets the next index, i.e. one past the end of the vector.
+			if (idx >= cf_vector_size(&merge_list)) {
+				merge_item mitem = {
+					.item = item,
+					.count = 1
+				};
+
+				cf_vector_append(&merge_list, &mitem);
+				continue;
+			}
+
+			merge_item *p_mitem = (merge_item *)cf_vector_getp(&merge_list, idx);
+			bool existing_wins = (p_mitem->item->generation > item->generation) ||
+					((p_mitem->item->generation == item->generation) &&
+					 (p_mitem->item->timestamp > item->timestamp));
+
+			if (! existing_wins) {
+				p_mitem->item = item;
+			}
+
+			p_mitem->count++;
+		}
+	}
+
+	as_hashmap_destroy(&key2idx);
+	*merged_list = as_smd_item_list_alloc(cf_vector_size(&merge_list));
+
+	uint32_t majority_count = ((uint32_t)num_list + 1) / 2;
+
+	for (uint32_t i = 0; i < cf_vector_size(&merge_list); i++) {
+		merge_item *p_mitem = (merge_item *)cf_vector_getp(&merge_list, i);
+
+		if (p_mitem->count >= majority_count) {
+			// The merged list takes its own reference to the winning item.
+			cf_rc_reserve(p_mitem->item);
+			(*merged_list)->item[i] = p_mitem->item;
+		}
+		else {
+			as_smd_item_t *item = (as_smd_item_t *)cf_rc_alloc(sizeof(as_smd_item_t));
+
+			memset(item, 0, sizeof(as_smd_item_t));
+			item->action = AS_SMD_ACTION_DELETE;
+			item->key = cf_strdup(p_mitem->item->key);
+			item->generation = p_mitem->item->generation + 1;
+			item->timestamp = cf_clepoch_milliseconds();
+			(*merged_list)->item[i] = item;
+		}
+	}
+
+	cf_vector_destroy(&merge_list);
+
+	return 0;
+}
+
+/*
+ * Process an SMD event, which may be either an SMD API command or an incoming SMD fabric msg.
+ *
+ * API commands are dispatched on cmd->type, fabric messages on msg->op.
+ */
+static void as_smd_process_event (as_smd_t *smd, as_smd_event_t *evt)
+{
+	if (AS_SMD_CMD == evt->type) {
+
+		/***** Handle SMD API Command Event *****/
+
+		as_smd_cmd_t *cmd = &(evt->u.cmd);
+
+		cf_debug(AS_SMD, "SMD thread received command: \"%s\" ; options: 0x%08x", AS_SMD_CMD_TYPE_NAME(cmd->type), cmd->options);
+
+		if (cmd->item) {
+			cf_debug(AS_SMD, "SMD event item: node %016lX ; module \"%s\" ; key \"%s\" ; value %p ; generation %u ; timestamp %zu",
+					cmd->item->node_id, cmd->item->module_name, cmd->item->key, cmd->item->value, cmd->item->generation, cmd->item->timestamp);
+		}
+
+		switch (cmd->type) {
+			case AS_SMD_CMD_INIT:
+				smd->state = AS_SMD_STATE_INITIALIZED;
+				break;
+
+			case AS_SMD_CMD_START:
+				smd->state = AS_SMD_STATE_RUNNING;
+				break;
+
+			case AS_SMD_CMD_CREATE_MODULE:
+				as_smd_module_create(smd, cmd);
+				break;
+
+			case AS_SMD_CMD_DESTROY_MODULE:
+				as_smd_module_destroy(smd, cmd);
+				break;
+
+			case AS_SMD_CMD_SET_METADATA:
+			case AS_SMD_CMD_DELETE_METADATA:
+				as_smd_metadata_change(smd, CMD_TYPE2MSG_OP(cmd->type), cmd->item);
+				break;
+
+			case AS_SMD_CMD_GET_METADATA:
+				as_smd_metadata_get(smd, cmd);
+				break;
+
+			case AS_SMD_CMD_CLUSTER_CHANGED:
+				as_smd_cluster_changed(smd, cmd);
+				break;
+
+			case AS_SMD_CMD_INTERNAL:
+				if (cmd->options & AS_SMD_CMD_OPT_DUMP_SMD) {
+					as_smd_dump_metadata(smd, cmd);
+				} else {
+					cf_warning(AS_SMD, "Unknown System Metadata internal event options received: 0x%08x ~~ Ignoring event!", cmd->options);
+				}
+				break;
+
+			case AS_SMD_CMD_SHUTDOWN:
+				// Makes the as_smd_thr() loop exit.
+				smd->state = AS_SMD_STATE_EXITING;
+				break;
+
+			default:
+				cf_crash(AS_SMD, "received unknown System Metadata event type %d", cmd->type);
+				break;
+		}
+	} else if (AS_SMD_MSG == evt->type) {
+
+		/***** Handle SMD Fabric Transaction Message Event *****/
+
+		as_smd_msg_t *msg = &(evt->u.msg);
+		as_smd_item_t *item = NULL;
+
+		if (msg->num_items) {
+			item = msg->items->item[0]; // (Only log the first item.)
+			cf_debug(AS_SMD, "SMD thread received fabric msg event with op %s item: item 0: node %016lX module \"%s\" ; key \"%s\" ; value \"%s\"",
+					AS_SMD_MSG_OP_NAME(msg->op), item->node_id, item->module_name, item->key, item->value);
+		} else {
+			cf_debug(AS_SMD, "SMD thread received fabric msg event with op %s [Zero metadata items]", AS_SMD_MSG_OP_NAME(msg->op));
+			// Set / delete messages must carry an item.
+			if ((AS_SMD_MSG_OP_SET_ITEM == msg->op) || (AS_SMD_MSG_OP_DELETE_ITEM == msg->op)) {
+				cf_crash(AS_SMD, "SMD thread received invalid empty metadata items list from node %016lX for message %s",
+						msg->node_id, AS_SMD_MSG_OP_NAME(msg->op));
+			}
+		}
+
+		// Find (or create) the module's object.
+		as_smd_module_t *module_obj = as_smd_module_get(smd, (msg->num_items > 0 ? msg->items->item[0] : NULL), msg);
+
+		switch (msg->op) {
+			case AS_SMD_MSG_OP_SET_ITEM:
+			case AS_SMD_MSG_OP_DELETE_ITEM:
+				as_smd_apply_metadata_change(smd, module_obj, msg);
+				break;
+
+			case AS_SMD_MSG_OP_MY_CURRENT_METADATA:
+				as_smd_receive_metadata(smd, msg);
+				break;
+
+			case AS_SMD_MSG_OP_ACCEPT_THIS_METADATA:
+			case AS_SMD_MSG_OP_SET_FROM_PR:
+				as_smd_accept_metadata(smd, module_obj, msg);
+				break;
+		}
+
+		if (module_obj) {
+			// Give back the reference.
+			cf_rc_release(module_obj);
+		}
+	} else {
+		// This should never happen.
+		cf_warning(AS_SMD, "received unknown type of System Metadata event (%d)", evt->type);
+	}
+}
+
+/*
+ * Thread to handle all System Metadata events, incoming via the API or the fabric.
+ *
+ * Runs until the SMD state becomes AS_SMD_STATE_EXITING, then releases the
+ * SMD resources and returns NULL.
+ */
+void *as_smd_thr(void *arg)
+{
+	as_smd_t *smd = (as_smd_t *) arg;
+	int retval = 0;
+
+	cf_debug(AS_SMD, "System Metadata thread created");
+
+	// Receive incoming messages via the message queue.
+	// Process each message.
+	// Destroy the message after processing.
+
+	for ( ; smd->state != AS_SMD_STATE_EXITING ; ) {
+
+		as_smd_event_t *evt = NULL;
+
+		if ((retval = cf_queue_pop(smd->msgq, &evt, AS_SMD_WAIT_INTERVAL_MS))) {
+			if (CF_QUEUE_ERR == retval) {
+				cf_warning(AS_SMD, "failed to pop an event (retval %d)", retval);
+				// Nothing was popped, so evt is still NULL - there is no
+				// event to process or destroy.
+				continue;
+			}
+		}
+
+		if (CF_QUEUE_EMPTY == retval) {
+			// [Could handle any periodic / background events here when there's nothing else to do.]
+			cf_detail(AS_SMD, "System Metadata thread - received timeout event");
+			smd_expire_pending_merges();
+		} else {
+			as_smd_process_event(smd, evt);
+
+			// Release the event message.
+			as_smd_destroy_event(evt);
+		}
+	}
+
+	// Release System Metadata resources.
+	as_smd_terminate(smd);
+
+	// Exit the System Metadata thread.
+	return NULL;
+}
diff --git a/as/src/base/thr_batch.c b/as/src/base/thr_batch.c
new file mode 100644
index 00000000..9696f433
--- /dev/null
+++ b/as/src/base/thr_batch.c
@@ -0,0 +1,467 @@
+/*
+ * thr_batch.c
+ *
+ * Copyright (C) 2012-2015 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program.
If not, see http://www.gnu.org/licenses/
+ */
+
+#include "base/thr_batch.h"
+
+// NOTE(review): the system header names on the next seven lines were missing
+// and have been restored from the standard names this file uses (errno, bool,
+// offsetof, fixed-width ints, strlen/memcpy, MSG_* flags, usleep) - confirm
+// against upstream.
+#include <errno.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+#include <sys/socket.h>
+#include <unistd.h>
+
+#include "aerospike/as_thread_pool.h"
+#include "citrusleaf/alloc.h"
+#include "citrusleaf/cf_atomic.h"
+#include "citrusleaf/cf_clock.h"
+#include "citrusleaf/cf_digest.h"
+#include "citrusleaf/cf_queue.h"
+
+#include "dynbuf.h"
+#include "hist.h"
+#include "node.h"
+#include "socket.h"
+
+#include "base/cfg.h"
+#include "base/datamodel.h"
+#include "base/index.h"
+#include "base/proto.h"
+#include "base/stats.h"
+#include "base/transaction.h"
+#include "fabric/partition.h"
+#include "storage/storage.h"
+
+// One requested key and its processing state.
+typedef struct {
+	cf_node node;		// set when the partition reservation fails and reports another node
+	cf_digest keyd;
+	bool done;			// true once the digest was handled under a partition reservation
+} batch_digest;
+
+// The digest table of a batch request.
+typedef struct {
+	int n_digests;
+	batch_digest digest[];
+} batch_digests;
+
+// Everything a worker needs to serve one batch request.
+typedef struct {
+	uint64_t trid;
+	uint64_t end_time;
+	as_namespace* ns;
+	as_file_handle* fd_h;
+	batch_digests* digests;
+	cf_vector* binlist;
+	bool get_data;
+	bool complete;
+} batch_transaction;
+
+static as_thread_pool batch_direct_thread_pool;
+
+// Append to the buffer builder a response message holding only a result code,
+// followed by a digest field and a namespace field.
+static void
+as_msg_make_error_response_bufbuilder(cf_digest *keyd, int result_code,
+		cf_buf_builder **bb_r, const char *ns_name)
+{
+	size_t ns_len = strlen(ns_name);
+	size_t msg_sz = sizeof(as_msg) +
+			sizeof(as_msg_field) + sizeof(cf_digest) +
+			sizeof(as_msg_field) + ns_len;
+
+	uint8_t *buf;
+	cf_buf_builder_reserve(bb_r, (int)msg_sz, &buf);
+
+	as_msg *msgp = (as_msg *)buf;
+
+	msgp->header_sz = (uint8_t)sizeof(as_msg);
+	msgp->info1 = 0;
+	msgp->info2 = 0;
+	msgp->info3 = 0;
+	msgp->unused = 0;
+	msgp->result_code = (uint8_t)result_code;
+	msgp->generation = 0;
+	msgp->record_ttl = 0;
+	msgp->transaction_ttl = 0;
+	msgp->n_fields = 2;
+	msgp->n_ops = 0;
+	as_msg_swap_header(msgp);
+
+	buf += sizeof(as_msg);
+
+	as_msg_field *mf = (as_msg_field *)buf;
+
+	// NOTE(review): the "+ 1" presumably accounts for the field's type byte - confirm against as_msg_field.
+	mf->field_sz = sizeof(cf_digest) + 1;
+	mf->type = AS_MSG_FIELD_TYPE_DIGEST_RIPE;
+	memcpy(mf->data, keyd, sizeof(cf_digest));
+	as_msg_swap_field(mf);
+	buf += sizeof(as_msg_field) + sizeof(cf_digest);
+
+	mf = (as_msg_field *)buf;
+	mf->field_sz = (uint32_t)ns_len + 1;
+	mf->type = AS_MSG_FIELD_TYPE_NAMESPACE;
+	memcpy(mf->data, ns_name, ns_len);
+	as_msg_swap_field(mf);
+}
+
+// Build response to batch request.
+// Every digest not yet marked done is looked up locally; a record response or
+// a "not found" response is appended to *bb_r (created on first use).
+static void
+batch_build_response(batch_transaction* btr, cf_buf_builder** bb_r)
+{
+	as_namespace* ns = btr->ns;
+	batch_digests *bmds = btr->digests;
+	bool get_data = btr->get_data;
+	uint32_t yield_count = 0;
+
+	for (int i = 0; i < bmds->n_digests; i++)
+	{
+		batch_digest *bmd = &bmds->digest[i];
+
+		if (bmd->done == false) {
+			// try to get the key
+			as_partition_reservation rsv;
+			cf_node other_node = 0;
+
+			if (! *bb_r) {
+				*bb_r = cf_buf_builder_create_size(1024 * 4);
+			}
+
+			int rv = as_partition_reserve_read(ns, as_partition_getid(&bmd->keyd), &rsv, false, &other_node);
+
+			if (rv == 0) {
+				as_index_ref r_ref;
+				r_ref.skip_lock = false;
+				int rec_rv = as_record_get_live(rsv.tree, &bmd->keyd, &r_ref, ns);
+
+				if (rec_rv == 0) {
+					as_index *r = r_ref.r;
+
+					// Check to see this isn't a record waiting to die.
+					if (as_record_is_doomed(r, ns)) {
+						as_msg_make_error_response_bufbuilder(&bmd->keyd, AS_PROTO_RESULT_FAIL_NOT_FOUND, bb_r, ns->name);
+					}
+					else {
+						// Make sure it's brought in from storage if necessary.
+						as_storage_rd rd;
+						as_storage_record_open(ns, r, &rd);
+
+						if (get_data) {
+							as_storage_rd_load_n_bins(&rd); // TODO - handle error returned
+						}
+
+						// Note: this array must stay in scope until the
+						// response for this record has been built, since in the
+						// get data w/ record on device case, it's copied by
+						// reference directly into the record descriptor.
+						as_bin stack_bins[!get_data || ns->storage_data_in_memory ? 0 : rd.n_bins];
+
+						if (get_data) {
+							// Figure out which bins you want - for now, all.
+							as_storage_rd_load_bins(&rd, stack_bins); // TODO - handle error returned
+							rd.n_bins = as_bin_inuse_count(&rd);
+						}
+
+						as_msg_make_response_bufbuilder(bb_r, &rd, !get_data, false, false, btr->binlist);
+
+						as_storage_record_close(&rd);
+					}
+					as_record_done(&r_ref, ns);
+				}
+				else {
+					// TODO - what about empty records?
+					cf_debug(AS_BATCH, "batch_build_response: as_record_get returned %d : key %lx", rec_rv, *(uint64_t *)&bmd->keyd);
+					as_msg_make_error_response_bufbuilder(&bmd->keyd, AS_PROTO_RESULT_FAIL_NOT_FOUND, bb_r, ns->name);
+				}
+
+				bmd->done = true;
+
+				as_partition_release(&rsv);
+			}
+			else {
+				cf_debug(AS_BATCH, "batch_build_response: partition reserve read failed: rv %d", rv);
+
+				as_msg_make_error_response_bufbuilder(&bmd->keyd, AS_PROTO_RESULT_FAIL_NOT_FOUND, bb_r, ns->name);
+
+				// Remember which node the reservation pointed at, if any.
+				// (The digest is not marked done on this path.)
+				if (other_node != 0) {
+					bmd->node = other_node;
+					cf_debug(AS_BATCH, "other_node is: %lx", other_node);
+				} else {
+					cf_debug(AS_BATCH, "other_node is NULL.");
+				}
+			}
+
+			// Sleep briefly after every batch_priority digests processed.
+			// NOTE(review): assumes g_config.batch_priority is never 0 - confirm where the config is validated.
+			yield_count++;
+			if (yield_count % g_config.batch_priority == 0) {
+				usleep(1);
+			}
+		}
+	}
+}
+
+// Send response to client socket.
+// Returns 0 on success, -1 if the send failed.
+static int
+batch_send(cf_socket *sock, uint8_t* buf, size_t len, int flags)
+{
+	if (cf_socket_send_all(sock, buf, len, flags,
+			CF_SOCKET_TIMEOUT) < 0) {
+		// Common when a client aborts.
+		cf_debug(AS_BATCH, "batch send response error, errno %d fd %d",
+				errno, CSFD(sock));
+		return -1;
+	}
+
+	return 0;
+}
+
+// Send protocol header to the requesting client.
+// len is the size of the payload that will follow.
+static int
+batch_send_header(cf_socket *sock, size_t len)
+{
+	as_proto proto;
+	proto.version = PROTO_VERSION;
+	proto.type = PROTO_TYPE_AS_MSG;
+	proto.sz = len;
+	as_proto_swap(&proto);
+
+	// NOTE(review): 8 is presumably sizeof(as_proto) - confirm.
+	return batch_send(sock, (uint8_t*) &proto, 8, MSG_NOSIGNAL | MSG_MORE);
+}
+
+// Send protocol trailer to the requesting client.
+static int
+batch_send_final(cf_socket *sock, uint32_t result_code)
+{
+	cl_msg m;
+	m.proto.version = PROTO_VERSION;
+	m.proto.type = PROTO_TYPE_AS_MSG;
+	m.proto.sz = sizeof(as_msg);
+	as_proto_swap(&m.proto);
+	m.msg.header_sz = sizeof(as_msg);
+	m.msg.info1 = 0;
+	m.msg.info2 = 0;
+	m.msg.info3 = AS_MSG_INFO3_LAST;
+	m.msg.unused = 0;
+	m.msg.result_code = result_code;
+	m.msg.generation = 0;
+	m.msg.record_ttl = 0;
+	m.msg.transaction_ttl = 0;
+	m.msg.n_fields = 0;
+	m.msg.n_ops = 0;
+	as_msg_swap_header(&m.msg);
+
+	return batch_send(sock, (uint8_t*) &m, sizeof(m), MSG_NOSIGNAL);
+}
+
+
+// Release memory for batch transaction.
+// Each member is released only if set, and is cleared afterwards.
+static void
+batch_transaction_done(batch_transaction* btr, bool force_close)
+{
+	if (btr->fd_h) {
+		as_end_of_transaction(btr->fd_h, force_close);
+		btr->fd_h = 0;
+	}
+
+	if (btr->digests) {
+		cf_free(btr->digests);
+		btr->digests = 0;
+	}
+
+	if (btr->binlist) {
+		cf_vector_destroy(btr->binlist);
+		btr->binlist = 0;
+	}
+}
+
+// Process a batch request.
+// Builds the response, sends header + payload + trailer (or just the trailer
+// when nothing was built), then releases the transaction. The connection is
+// force-closed if any send failed.
+static void
+batch_process_request(batch_transaction* btr)
+{
+	// Keep the reaper at bay.
+	btr->fd_h->last_used = cf_getms();
+
+	cf_buf_builder* bb = 0;
+	batch_build_response(btr, &bb);
+
+	cf_socket *sock = &btr->fd_h->sock;
+	int brv;
+
+	if (bb) {
+		brv = batch_send_header(sock, bb->used_sz);
+
+		if (brv == 0) {
+			brv = batch_send(sock, bb->buf, bb->used_sz, MSG_NOSIGNAL | MSG_MORE);
+
+			if (brv == 0) {
+				brv = batch_send_final(sock, 0);
+			}
+		}
+		cf_buf_builder_free(bb);
+	}
+	else {
+		cf_info(AS_BATCH, " batch request: returned no local responses");
+		brv = batch_send_final(sock, 0);
+	}
+
+	batch_transaction_done(btr, brv != 0);
+}
+
+// Process one queue's batch requests.
+// Thread pool task function: udata is a batch_transaction.
+static void
+batch_worker(void* udata)
+{
+	batch_transaction* btr = (batch_transaction*)udata;
+
+	// Check for timeouts.
+	if (btr->end_time != 0 && cf_getns() > btr->end_time) {
+		cf_atomic64_incr(&g_stats.batch_timeout);
+
+		if (btr->fd_h) {
+			as_msg_send_reply(btr->fd_h, AS_PROTO_RESULT_FAIL_TIMEOUT,
+					0, 0, 0, 0, 0, btr->ns, btr->trid);
+			// Cleared so that batch_transaction_done() leaves the connection alone.
+			btr->fd_h = 0;
+		}
+		batch_transaction_done(btr, false);
+		return;
+	}
+
+	// Process batch request.
+	batch_process_request(btr);
+}
+
+// Create bin name list from message.
+// Returns 0 when the message has no ops. Names longer than AS_ID_BIN_SZ - 1
+// are truncated.
+static cf_vector*
+as_binlist_from_op(as_msg* msg)
+{
+	if (msg->n_ops == 0) {
+		return 0;
+	}
+
+	cf_vector* binlist = cf_vector_create(AS_ID_BIN_SZ, 5, 0);
+	as_msg_op* op = 0;
+	int n = 0;
+	int len;
+	char name[AS_ID_BIN_SZ];
+
+	while ((op = as_msg_op_iterate(msg, op, &n))) {
+		len = (op->name_sz <= AS_ID_BIN_SZ - 1)? op->name_sz : AS_ID_BIN_SZ - 1;
+		memcpy(name, op->name, len);
+		name[len] = 0;
+		cf_vector_append(binlist, name);
+	}
+	return binlist;
+}
+
+// Initialize batch queues and worker threads.
+// Returns the thread pool's init status (0 on success).
+int
+as_batch_direct_init()
+{
+	uint32_t threads = g_config.n_batch_threads;
+	cf_info(AS_BATCH, "starting %u batch-threads", threads);
+	int status = as_thread_pool_init_fixed(&batch_direct_thread_pool, threads, batch_worker, sizeof(batch_transaction), offsetof(batch_transaction,complete));
+
+	if (status) {
+		cf_warning(AS_BATCH, "Failed to initialize batch-threads to %u: %d", threads, status);
+	}
+	return status;
+}
+
+// Put batch request on a separate batch queue.
+// Returns 0 if the request was queued (the connection now belongs to the batch
+// worker), otherwise an AS_PROTO_RESULT_FAIL_* code, in which case the
+// connection is still attached to the transaction.
+int
+as_batch_direct_queue_task(as_transaction* tr, as_namespace *ns)
+{
+	cf_atomic64_incr(&g_stats.batch_initiate);
+
+	if (g_config.n_batch_threads <= 0) {
+		cf_warning(AS_BATCH, "batch-threads has been disabled.");
+		return AS_PROTO_RESULT_FAIL_BATCH_DISABLED;
+	}
+
+	as_msg* msg = &tr->msgp->msg;
+
+	as_msg_field* dfp = as_msg_field_get(msg, AS_MSG_FIELD_TYPE_DIGEST_RIPE_ARRAY);
+	if (! dfp) {
+		cf_warning(AS_BATCH, "Batch digests are required.");
+		return AS_PROTO_RESULT_FAIL_PARAMETER;
+	}
+
+	uint32_t n_digests = dfp->field_sz / sizeof(cf_digest);
+
+	if (n_digests > g_config.batch_max_requests) {
+		cf_warning(AS_BATCH, "Batch request size %u exceeds max %u.", n_digests, g_config.batch_max_requests);
+		return AS_PROTO_RESULT_FAIL_BATCH_MAX_REQUESTS;
+	}
+
+	batch_transaction btr;
+	btr.trid = as_transaction_trid(tr);
+	btr.end_time = tr->end_time;
+	btr.get_data = !(msg->info1 & AS_MSG_INFO1_GET_NO_BINS);
+	btr.complete = false;
+	btr.ns = ns;
+
+	// Create the master digest table.
+	btr.digests = (batch_digests*) cf_malloc(sizeof(batch_digests) + (sizeof(batch_digest) * n_digests));
+
+	batch_digests* bmd = btr.digests;
+	bmd->n_digests = n_digests;
+	uint8_t* digest_field_data = dfp->data;
+
+	for (uint32_t i = 0; i < n_digests; i++) {
+		bmd->digest[i].done = false;
+		bmd->digest[i].node = 0;
+		memcpy(&bmd->digest[i].keyd, digest_field_data, sizeof(cf_digest));
+		digest_field_data += sizeof(cf_digest);
+	}
+
+	btr.binlist = as_binlist_from_op(msg);
+
+	// Move the connection from the transaction to the batch transaction.
+	btr.fd_h = tr->from.proto_fd_h;
+	tr->from.proto_fd_h = NULL;
+	btr.fd_h->last_used = cf_getms();
+
+	int status = as_thread_pool_queue_task_fixed(&batch_direct_thread_pool, &btr);
+
+	if (status) {
+		cf_warning(AS_BATCH, "Batch enqueue failed");
+
+		// No worker will ever see this task, so undo the hand-off: give the
+		// connection back to the transaction (as on the earlier error returns)
+		// and free the digest table and bin list allocated above.
+		tr->from.proto_fd_h = btr.fd_h;
+		btr.fd_h = NULL;
+		batch_transaction_done(&btr, false);
+
+		return AS_PROTO_RESULT_FAIL_UNKNOWN;
+	}
+	return 0;
+}
+
+// Number of queued batch requests, or 0 if the pool has no dispatch queue.
+int
+as_batch_direct_queue_size()
+{
+	return batch_direct_thread_pool.dispatch_queue? cf_queue_sz(batch_direct_thread_pool.dispatch_queue) : 0;
+}
+
+// Resize the batch thread pool.
+// Returns -1 if threads exceeds MAX_BATCH_THREADS, otherwise the pool's resize
+// status. g_config.n_batch_threads is updated from the pool's thread_size.
+int
+as_batch_direct_threads_resize(uint32_t threads)
+{
+	if (threads > MAX_BATCH_THREADS) {
+		cf_warning(AS_BATCH, "batch-threads %u exceeds max %u", threads, MAX_BATCH_THREADS);
+		return -1;
+	}
+
+	cf_info(AS_BATCH, "Resize batch-threads from %u to %u", g_config.n_batch_threads, threads);
+	int status = as_thread_pool_resize(&batch_direct_thread_pool, threads);
+	g_config.n_batch_threads = batch_direct_thread_pool.thread_size;
+
+	if (status) {
+		cf_warning(AS_BATCH, "Failed to resize batch-threads. status=%d, batch-threads=%d",
+				status, g_config.n_batch_threads);
+	}
+	return status;
+}
diff --git a/as/src/base/thr_demarshal.c b/as/src/base/thr_demarshal.c
new file mode 100644
index 00000000..bf6f9b89
--- /dev/null
+++ b/as/src/base/thr_demarshal.c
@@ -0,0 +1,914 @@
+/*
+ * thr_demarshal.c
+ *
+ * Copyright (C) 2008-2014 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program.
If not, see http://www.gnu.org/licenses/
+ */
+
+#include "base/thr_demarshal.h"
+
+// NOTE(review): the header names of the eleven system includes below are
+// missing (only the bare directives remain) - restore them from upstream.
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include // for MIN()
+#include
+#include
+#include
+
+#include "citrusleaf/alloc.h"
+#include "citrusleaf/cf_atomic.h"
+#include "citrusleaf/cf_clock.h"
+#include "citrusleaf/cf_queue.h"
+
+#include "fault.h"
+#include "hardware.h"
+#include "hist.h"
+#include "socket.h"
+#include "tls.h"
+
+#include "base/as_stap.h"
+#include "base/batch.h"
+#include "base/cfg.h"
+#include "base/datamodel.h"
+#include "base/packet_compression.h"
+#include "base/proto.h"
+#include "base/security.h"
+#include "base/stats.h"
+#include "base/thr_info.h"
+#include "base/thr_tsvc.h"
+#include "base/transaction.h"
+#include "base/xdr_serverside.h"
+
+#define POLL_SZ 1024
+
+#define XDR_WRITE_BUFFER_SIZE (5 * 1024 * 1024)
+#define XDR_READ_BUFFER_SIZE (15 * 1024 * 1024)
+
+extern void *thr_demarshal(void *arg);
+
+// Per-thread poll instances and thread ids of the demarshal threads.
+typedef struct {
+	cf_poll polls[MAX_DEMARSHAL_THREADS];
+	unsigned int num_threads;
+	pthread_t dm_th[MAX_DEMARSHAL_THREADS];
+} demarshal_args;
+
+static demarshal_args *g_demarshal_args = 0;
+
+as_info_access g_access = {
+	.service = { .addrs = { .n_addrs = 0 }, .port = 0 },
+	.alt_service = { .addrs = { .n_addrs = 0 }, .port = 0 },
+	.tls_service = { .addrs = { .n_addrs = 0 }, .port = 0 },
+	.alt_tls_service = { .addrs = { .n_addrs = 0 }, .port = 0 }
+};
+
+cf_serv_cfg g_service_bind = { .n_cfgs = 0 };
+cf_tls_info *g_service_tls;
+
+static cf_sockets g_sockets;
+
+//
+// File handle reaper.
+//
+
+pthread_mutex_t g_file_handle_a_LOCK = PTHREAD_MUTEX_INITIALIZER;
+as_file_handle **g_file_handle_a = 0;
+uint32_t g_file_handle_a_sz;
+pthread_t g_demarshal_reaper_th;
+
+void *thr_demarshal_reaper_fn(void *arg);
+static cf_queue *g_freeslot = 0;
+
+// Re-enable (one-shot) read events for a connection on its poll instance.
+void
+thr_demarshal_rearm(as_file_handle *fd_h)
+{
+	// This causes ENOENT, when we reached NextEvent_FD_Cleanup (e.g, because
+	// the client disconnected) while the transaction was still ongoing.
+
+	static int32_t err_ok[] = { ENOENT };
+	CF_IGNORE_ERROR(cf_poll_modify_socket_forgiving(fd_h->poll, &fd_h->sock,
+			EPOLLIN | EPOLLONESHOT | EPOLLRDHUP, fd_h,
+			sizeof(err_ok) / sizeof(int32_t), err_ok));
+}
+
+// One-time setup of the file handle table: size it from RLIMIT_NOFILE, mark
+// every slot free, start the reaper thread and, if not configured, default
+// the client file descriptor limit to half the rlimit. Subsequent calls are
+// no-ops.
+void
+demarshal_file_handle_init()
+{
+	struct rlimit rl;
+
+	pthread_mutex_lock(&g_file_handle_a_LOCK);
+
+	if (g_file_handle_a == 0) {
+		if (-1 == getrlimit(RLIMIT_NOFILE, &rl)) {
+			cf_crash(AS_DEMARSHAL, "getrlimit: %s", cf_strerror(errno));
+		}
+
+		// Allocate the (zeroed) file handle pointer table, one slot per
+		// possible file descriptor.
+		g_file_handle_a = cf_calloc(rl.rlim_cur, sizeof(as_file_handle *));
+		g_file_handle_a_sz = rl.rlim_cur;
+
+		// Every slot starts out free.
+		// NOTE(review): assumes g_freeslot has been created before this is called - confirm.
+		for (int i = 0; i < g_file_handle_a_sz; i++) {
+			cf_queue_push(g_freeslot, &i);
+		}
+
+		// Without the reaper no connection is ever reaped, so fail loudly,
+		// like the getrlimit() failure above.
+		int rv = pthread_create(&g_demarshal_reaper_th, 0, thr_demarshal_reaper_fn, 0);
+
+		if (rv != 0) {
+			cf_crash(AS_DEMARSHAL, "failed to create demarshal reaper thread: %s", cf_strerror(rv));
+		}
+
+		// If config value is 0, set a maximum proto size based on the RLIMIT.
+		if (g_config.n_proto_fd_max == 0) {
+			g_config.n_proto_fd_max = rl.rlim_cur / 2;
+			cf_info(AS_DEMARSHAL, "setting default client file descriptors to %d", g_config.n_proto_fd_max);
+		}
+	}
+
+	pthread_mutex_unlock(&g_file_handle_a_LOCK);
+}
+
+// Keep track of the connections, since they're precious. Kill anything that
+// hasn't been used in a while. The file handle array keeps a reference count,
+// and allows a reaper to run through and find the ones to reap. The table is
+// only written by the demarshal threads, and only read by the reaper thread.
+void * +thr_demarshal_reaper_fn(void *arg) +{ + uint64_t last = cf_getms(); + + while (true) { + uint64_t now = cf_getms(); + uint32_t inuse_cnt = 0; + uint64_t kill_ms = g_config.proto_fd_idle_ms; + bool refresh = false; + + if (now - last > (uint64_t)g_config.sec_cfg.privilege_refresh_period * 1000) { + refresh = true; + last = now; + } + + pthread_mutex_lock(&g_file_handle_a_LOCK); + + for (int i = 0; i < g_file_handle_a_sz; i++) { + if (g_file_handle_a[i]) { + as_file_handle *fd_h = g_file_handle_a[i]; + + if (refresh) { + as_security_refresh(fd_h); + } + + // Reap, if asked to. + if (fd_h->reap_me) { + cf_debug(AS_DEMARSHAL, "Reaping FD %d as requested", CSFD(&fd_h->sock)); + g_file_handle_a[i] = 0; + cf_queue_push(g_freeslot, &i); + as_release_file_handle(fd_h); + fd_h = 0; + } + // Reap if past kill time. + else if ((0 != kill_ms) && (fd_h->last_used + kill_ms < now)) { + if (fd_h->fh_info & FH_INFO_DONOT_REAP) { + cf_debug(AS_DEMARSHAL, "Not reaping the fd %d as it has the protection bit set", CSFD(&fd_h->sock)); + inuse_cnt++; + continue; + } + + cf_socket_shutdown(&fd_h->sock); // will trigger epoll errors + cf_debug(AS_DEMARSHAL, "remove unused connection, fd %d", CSFD(&fd_h->sock)); + g_file_handle_a[i] = 0; + cf_queue_push(g_freeslot, &i); + as_release_file_handle(fd_h); + fd_h = 0; + g_stats.reaper_count++; + } + else { + inuse_cnt++; + } + } + } + + pthread_mutex_unlock(&g_file_handle_a_LOCK); + + if ((g_file_handle_a_sz / 10) > (g_file_handle_a_sz - inuse_cnt)) { + cf_warning(AS_DEMARSHAL, "less than ten percent file handles remaining: %d max %d inuse", + g_file_handle_a_sz, inuse_cnt); + } + + // Validate the system statistics. 
+ if (g_stats.proto_connections_opened - g_stats.proto_connections_closed != inuse_cnt) { + cf_debug(AS_DEMARSHAL, "reaper: mismatched connection count: %lu in stats vs %u calculated", + g_stats.proto_connections_opened - g_stats.proto_connections_closed, + inuse_cnt); + } + + sleep(1); + } + + return NULL; +} + +int +thr_demarshal_read_file(const char *path, char *buffer, size_t size) +{ + int res = -1; + int fd = open(path, O_RDONLY); + + if (fd < 0) { + cf_warning(AS_DEMARSHAL, "Failed to open %s for reading.", path); + goto cleanup0; + } + + size_t len = 0; + + while (len < size - 1) { + ssize_t n = read(fd, buffer + len, size - len - 1); + + if (n < 0) { + cf_warning(AS_DEMARSHAL, "Failed to read from %s", path); + goto cleanup1; + } + + if (n == 0) { + buffer[len] = 0; + res = 0; + goto cleanup1; + } + + len += n; + } + + cf_warning(AS_DEMARSHAL, "%s is too large.", path); + +cleanup1: + close(fd); + +cleanup0: + return res; +} + +int +thr_demarshal_read_integer(const char *path, int *value) +{ + char buffer[21]; + + if (thr_demarshal_read_file(path, buffer, sizeof(buffer)) < 0) { + return -1; + } + + char *end; + uint64_t x = strtoul(buffer, &end, 10); + + if (*end != '\n' || x > INT_MAX) { + cf_warning(AS_DEMARSHAL, "Invalid integer value in %s.", path); + return -1; + } + + *value = (int)x; + return 0; +} + +typedef enum { + BUFFER_TYPE_SEND, + BUFFER_TYPE_RECEIVE +} buffer_type; + +int +thr_demarshal_set_buffer(cf_socket *sock, buffer_type type, int size) +{ + static int rcv_max = -1; + static int snd_max = -1; + + const char *proc; + int *max; + + switch (type) { + case BUFFER_TYPE_RECEIVE: + proc = "/proc/sys/net/core/rmem_max"; + max = &rcv_max; + break; + + case BUFFER_TYPE_SEND: + proc = "/proc/sys/net/core/wmem_max"; + max = &snd_max; + break; + + default: + cf_crash(AS_DEMARSHAL, "Invalid buffer type: %d", (int32_t)type); + return -1; // cf_crash() should have a "noreturn" attribute, but is a macro + } + + int tmp = ck_pr_load_int(max); + + if (tmp 
< 0) { + if (thr_demarshal_read_integer(proc, &tmp) < 0) { + cf_warning(AS_DEMARSHAL, "Failed to read %s; should be at least %d. Please verify.", proc, size); + tmp = size; + } + } + + if (tmp < size) { + cf_warning(AS_DEMARSHAL, "Buffer limit is %d, should be at least %d. Please set %s accordingly.", + tmp, size, proc); + return -1; + } + + ck_pr_cas_int(max, -1, tmp); + + switch (type) { + case BUFFER_TYPE_RECEIVE: + cf_socket_set_receive_buffer(sock, size); + break; + + case BUFFER_TYPE_SEND: + cf_socket_set_send_buffer(sock, size); + break; + } + + return 0; +} + +int +thr_demarshal_config_xdr(cf_socket *sock) +{ + if (thr_demarshal_set_buffer(sock, BUFFER_TYPE_RECEIVE, XDR_READ_BUFFER_SIZE) < 0) { + return -1; + } + + if (thr_demarshal_set_buffer(sock, BUFFER_TYPE_SEND, XDR_WRITE_BUFFER_SIZE) < 0) { + return -1; + } + + cf_socket_set_window(sock, XDR_READ_BUFFER_SIZE); + cf_socket_enable_nagle(sock); + return 0; +} + +bool +peek_data_in_memory(const as_msg *m) +{ + as_msg_field *f = as_msg_field_get(m, AS_MSG_FIELD_TYPE_NAMESPACE); + + if (! f) { + // Should never happen, but don't bark here. + return false; + } + + as_namespace *ns = as_namespace_get_bymsgfield(f); + + // If ns is null, don't be the first to bark. + return ns && ns->storage_data_in_memory; +} + +// Set of threads which talk to client over the connection for doing the needful +// processing. Note that once fd is assigned to a thread all the work on that fd +// is done by that thread. Fair fd usage is expected of the client. First thread +// is special - also does accept [listens for new connections]. It is the only +// thread which does it. +void * +thr_demarshal(void *unused) +{ + cf_poll poll; + int nevents, i; + cf_clock last_fd_print = 0; + +#if defined(USE_SYSTEMTAP) + uint64_t nodeid = g_config.self_node; +#endif + + // Figure out my thread index. 
+ pthread_t self = pthread_self(); + int thr_id; + for (thr_id = 0; thr_id < MAX_DEMARSHAL_THREADS; thr_id++) { + if (0 != pthread_equal(g_demarshal_args->dm_th[thr_id], self)) + break; + } + + if (thr_id == MAX_DEMARSHAL_THREADS) { + cf_debug(AS_FABRIC, "Demarshal thread could not figure own ID, bogus, exit, fu!"); + return(0); + } + + if (g_config.auto_pin != CF_TOPO_AUTO_PIN_NONE) { + cf_detail(AS_DEMARSHAL, "pinning thread to CPU %d", thr_id); + cf_topo_pin_to_cpu((cf_topo_cpu_index)thr_id); + } + + cf_poll_create(&poll); + + // First thread accepts new connection at interface socket. + if (thr_id == 0) { + demarshal_file_handle_init(); + + cf_poll_add_sockets(poll, &g_sockets, EPOLLIN | EPOLLERR | EPOLLHUP); + cf_socket_show_server(AS_DEMARSHAL, "client", &g_sockets); + } + + g_demarshal_args->polls[thr_id] = poll; + cf_detail(AS_DEMARSHAL, "demarshal thread started: id %d", thr_id); + + int id_cntr = 0; + + // Demarshal transactions from the socket. + for ( ; ; ) { + cf_poll_event events[POLL_SZ]; + + cf_detail(AS_DEMARSHAL, "calling epoll"); + + nevents = cf_poll_wait(poll, events, POLL_SZ, -1); + cf_detail(AS_DEMARSHAL, "epoll event received: nevents %d", nevents); + + uint64_t now_ns = cf_getns(); + uint64_t now_ms = now_ns / 1000000; + + // Iterate over all events. + for (i = 0; i < nevents; i++) { + cf_socket *ssock = events[i].data; + + if (cf_sockets_has_socket(&g_sockets, ssock)) { + // Accept new connections on the service socket. + cf_socket csock; + cf_sock_addr sa; + + if (cf_socket_accept(ssock, &csock, &sa) < 0) { + // This means we're out of file descriptors - could be a SYN + // flood attack or misbehaving client. Eventually we'd like + // to make the reaper fairer, but for now we'll just have to + // ignore the accept error and move on. + if ((errno == EMFILE) || (errno == ENFILE)) { + if (last_fd_print != (cf_getms() / 1000L)) { + cf_warning(AS_DEMARSHAL, "Hit OS file descriptor limit (EMFILE on accept). 
Consider raising limit for uid %d", g_config.uid); + last_fd_print = cf_getms() / 1000L; + } + continue; + } + cf_crash(AS_DEMARSHAL, "accept: %s (errno %d)", cf_strerror(errno), errno); + } + + char sa_str[sizeof(((as_file_handle *)NULL)->client)]; + cf_sock_addr_to_string_safe(&sa, sa_str, sizeof(sa_str)); + cf_detail(AS_DEMARSHAL, "new connection: %s (fd %d)", sa_str, CSFD(&csock)); + + // Validate the limit of protocol connections we allow. + uint32_t conns_open = g_stats.proto_connections_opened - g_stats.proto_connections_closed; + cf_sock_cfg *cfg = ssock->cfg; + if (cfg->owner != CF_SOCK_OWNER_XDR && conns_open > g_config.n_proto_fd_max) { + if ((last_fd_print + 5000L) < cf_getms()) { // no more than 5 secs + cf_warning(AS_DEMARSHAL, "dropping incoming client connection: hit limit %d connections", conns_open); + last_fd_print = cf_getms(); + } + cf_socket_shutdown(&csock); + cf_socket_close(&csock); + cf_socket_term(&csock); + continue; + } + + // Initialize the TLS part of the socket. + if (cfg->owner == CF_SOCK_OWNER_SERVICE_TLS) { + tls_socket_prepare_server(g_service_tls, &csock); + } + + // Create as_file_handle and queue it up in epoll_fd for further + // communication on one of the demarshal threads. + as_file_handle *fd_h = cf_rc_alloc(sizeof(as_file_handle)); + + strcpy(fd_h->client, sa_str); + cf_socket_copy(&csock, &fd_h->sock); + + fd_h->last_used = cf_getms(); + fd_h->reap_me = false; + fd_h->proto = 0; + fd_h->proto_unread = (uint64_t)sizeof(as_proto); + fd_h->fh_info = 0; + fd_h->security_filter = as_security_filter_create(); + + // Insert into the global table so the reaper can manage it. Do + // this before queueing it up for demarshal threads - once + // EPOLL_CTL_ADD is done it's difficult to back out (if insert + // into global table fails) because fd state could be anything. 
+ cf_rc_reserve(fd_h); + + pthread_mutex_lock(&g_file_handle_a_LOCK); + + int j; + bool inserted = true; + + if (0 != cf_queue_pop(g_freeslot, &j, CF_QUEUE_NOWAIT)) { + inserted = false; + } + else { + g_file_handle_a[j] = fd_h; + } + + pthread_mutex_unlock(&g_file_handle_a_LOCK); + + if (!inserted) { + cf_info(AS_DEMARSHAL, "unable to add socket to file handle table"); + cf_socket_shutdown(&csock); + cf_socket_close(&csock); + cf_socket_term(&csock); + cf_rc_free(fd_h); // will free even with ref-count of 2 + } + else { + int32_t id; + + if (g_config.auto_pin == CF_TOPO_AUTO_PIN_NONE) { + cf_detail(AS_DEMARSHAL, "no CPU pinning - dispatching incoming connection round-robin"); + id = (id_cntr++) % g_demarshal_args->num_threads; + } + else { + id = cf_topo_socket_cpu(&fd_h->sock); + cf_detail(AS_DEMARSHAL, "incoming connection on CPU %d", id); + } + + fd_h->poll = g_demarshal_args->polls[id]; + + // Place the client socket in the event queue. + cf_poll_add_socket(fd_h->poll, &fd_h->sock, EPOLLIN | EPOLLONESHOT | EPOLLRDHUP, fd_h); + cf_atomic64_incr(&g_stats.proto_connections_opened); + } + } + else { + bool has_extra_ref = false; + as_file_handle *fd_h = events[i].data; + if (fd_h == 0) { + cf_info(AS_DEMARSHAL, "event with null handle, continuing"); + goto NextEvent; + } + + cf_detail(AS_DEMARSHAL, "epoll connection event: fd %d, events 0x%x", CSFD(&fd_h->sock), events[i].events); + + // Process data on an existing connection: this might be more + // activity on an already existing transaction, so we have some + // state to manage. 
+ cf_socket *sock = &fd_h->sock; + + if (events[i].events & (EPOLLRDHUP | EPOLLERR | EPOLLHUP)) { + cf_detail(AS_DEMARSHAL, "proto socket: remote close: fd %d event %x", CSFD(sock), events[i].events); + // no longer in use: out of epoll etc + goto NextEvent_FD_Cleanup; + } + + if (tls_socket_needs_handshake(&fd_h->sock)) { + int32_t tls_ev = tls_socket_accept(&fd_h->sock); + + if (tls_ev == EPOLLERR) { + goto NextEvent_FD_Cleanup; + } + + if (tls_ev == 0) { + tls_socket_must_not_have_data(&fd_h->sock, "service handshake"); + tls_ev = EPOLLIN; + } + + cf_poll_modify_socket(fd_h->poll, &fd_h->sock, + tls_ev | EPOLLONESHOT | EPOLLRDHUP, fd_h); + goto NextEvent; + } + + // If pointer is NULL, then we need to create a transaction and + // store it in the buffer. + if (fd_h->proto == NULL) { + int32_t recv_sz = cf_socket_recv(sock, (uint8_t *)&fd_h->proto_hdr + sizeof(as_proto) - fd_h->proto_unread, fd_h->proto_unread, 0); + + if (recv_sz <= 0) { + if (recv_sz != 0 && (errno == EAGAIN || errno == EWOULDBLOCK)) { + // This can happen because TLS protocol + // overhead can trip the epoll but no + // application-level bytes are actually + // available yet. + thr_demarshal_rearm(fd_h); + goto NextEvent; + } + cf_detail(AS_DEMARSHAL, "proto socket: read header fail: error: rv %d errno %d", recv_sz, errno); + goto NextEvent_FD_Cleanup; + } + + fd_h->proto_unread -= recv_sz; + + if (fd_h->proto_unread != 0) { + tls_socket_must_not_have_data(&fd_h->sock, "partial client read (size)"); + thr_demarshal_rearm(fd_h); + goto NextEvent; + } + + // Check for a TLS ClientHello arriving at a non-TLS socket. 
Heuristic: + // - tls[0] == ContentType.handshake (22) + // - tls[1] == ProtocolVersion.major (3) + // - tls[5] == HandshakeType.client_hello (1) + + uint8_t *tls = (uint8_t *)&fd_h->proto_hdr; + + if (tls[0] == 22 && tls[1] == 3 && tls[5] == 1) { + cf_warning(AS_DEMARSHAL, "ignoring incoming TLS connection from %s", fd_h->client); + goto NextEvent_FD_Cleanup; + + } + + if (fd_h->proto_hdr.version != PROTO_VERSION && + // For backward compatibility, allow version 0 with + // security messages. + ! (fd_h->proto_hdr.version == 0 && fd_h->proto_hdr.type == PROTO_TYPE_SECURITY)) { + cf_warning(AS_DEMARSHAL, "proto input from %s: unsupported proto version %u", + fd_h->client, fd_h->proto_hdr.version); + goto NextEvent_FD_Cleanup; + } + + // Swap the necessary elements of the as_proto. + as_proto_swap(&fd_h->proto_hdr); + + if (fd_h->proto_hdr.sz > PROTO_SIZE_MAX) { + cf_warning(AS_DEMARSHAL, "proto input from %s: msg greater than %d, likely request from non-Aerospike client, rejecting: sz %lu", + fd_h->client, PROTO_SIZE_MAX, (uint64_t)fd_h->proto_hdr.sz); + goto NextEvent_FD_Cleanup; + } + + // Allocate the complete message buffer. + fd_h->proto = cf_malloc(sizeof(as_proto) + fd_h->proto_hdr.sz); + + memcpy(fd_h->proto, &fd_h->proto_hdr, sizeof(as_proto)); + + fd_h->proto_unread = fd_h->proto->sz; + } + + if (fd_h->proto_unread != 0) { + // Read the data. + int32_t recv_sz = cf_socket_recv(sock, fd_h->proto->data + (fd_h->proto->sz - fd_h->proto_unread), fd_h->proto_unread, 0); + + if (recv_sz <= 0) { + if (recv_sz != 0 && (errno == EAGAIN || errno == EWOULDBLOCK)) { + thr_demarshal_rearm(fd_h); + goto NextEvent; + } + cf_info(AS_DEMARSHAL, "receive socket: fail? n %d errno %d %s closing connection.", recv_sz, errno, cf_strerror(errno)); + goto NextEvent_FD_Cleanup; + } + + // Decrement bytes-unread counter. 
+ cf_detail(AS_DEMARSHAL, "read fd %d (%d %lu)", CSFD(sock), recv_sz, fd_h->proto_unread); + fd_h->proto_unread -= recv_sz; + + if (fd_h->proto_unread != 0) { + tls_socket_must_not_have_data(&fd_h->sock, "partial client read (body)"); + thr_demarshal_rearm(fd_h); + goto NextEvent; + } + } + + tls_socket_must_not_have_data(&fd_h->sock, "full client read"); + cf_debug(AS_DEMARSHAL, "running on CPU %hu", cf_topo_current_cpu()); + + // fd_h->proto_unread == 0 - finished reading complete proto. + // In current pipelining model, can't rearm fd_h until end of + // transaction. + as_proto *proto_p = fd_h->proto; + + fd_h->proto = NULL; + fd_h->proto_unread = (uint64_t)sizeof(as_proto); + fd_h->last_used = now_ms; + + cf_rc_reserve(fd_h); + has_extra_ref = true; + + // Info protocol requests. + if (proto_p->type == PROTO_TYPE_INFO) { + as_info_transaction it = { fd_h, proto_p, now_ns }; + + as_info(&it); + goto NextEvent; + } + + // INIT_TR + as_transaction tr; + as_transaction_init_head(&tr, NULL, (cl_msg *)proto_p); + + tr.origin = FROM_CLIENT; + tr.from.proto_fd_h = fd_h; + tr.start_time = now_ns; + + if (! as_proto_is_valid_type(proto_p)) { + cf_warning(AS_DEMARSHAL, "unsupported proto message type %u", proto_p->type); + // We got a proto message type we don't recognize, so it + // may not do any good to send back an as_msg error, but + // it's the best we can do. At least we can keep the fd. + as_transaction_demarshal_error(&tr, AS_PROTO_RESULT_FAIL_UNKNOWN); + goto NextEvent; + } + + // Check if it's compressed. + if (tr.msgp->proto.type == PROTO_TYPE_AS_MSG_COMPRESSED) { + // Decompress it - allocate buffer to hold decompressed + // packet. + uint8_t *decompressed_buf = NULL; + size_t decompressed_buf_size = 0; + int rv = 0; + if ((rv = as_packet_decompression((uint8_t *)proto_p, &decompressed_buf, &decompressed_buf_size))) { + cf_warning(AS_DEMARSHAL, "as_proto decompression failed! 
(rv %d)", rv); + cf_warning_binary(AS_DEMARSHAL, (void *)proto_p, sizeof(as_proto) + proto_p->sz, CF_DISPLAY_HEX_SPACED, "compressed proto_p"); + as_transaction_demarshal_error(&tr, AS_PROTO_RESULT_FAIL_UNKNOWN); + goto NextEvent; + } + + // Free the compressed packet since we'll be using the + // decompressed packet from now on. + cf_free(proto_p); + + // Get original packet. + tr.msgp = (cl_msg *)decompressed_buf; + as_proto_swap(&(tr.msgp->proto)); + + if (! as_proto_wrapped_is_valid(&tr.msgp->proto, decompressed_buf_size)) { + cf_warning(AS_DEMARSHAL, "decompressed unusable proto: version %u, type %u, sz %lu [%lu]", + tr.msgp->proto.version, tr.msgp->proto.type, (uint64_t)tr.msgp->proto.sz, decompressed_buf_size); + as_transaction_demarshal_error(&tr, AS_PROTO_RESULT_FAIL_UNKNOWN); + goto NextEvent; + } + } + + // If it's an XDR connection and we haven't yet modified the connection settings, ... + if (tr.msgp->proto.type == PROTO_TYPE_AS_MSG && + as_transaction_is_xdr(&tr) && + (fd_h->fh_info & FH_INFO_XDR) == 0) { + // ... modify them. + if (thr_demarshal_config_xdr(&fd_h->sock) != 0) { + cf_warning(AS_DEMARSHAL, "Failed to configure XDR connection"); + goto NextEvent_FD_Cleanup; + } + + fd_h->fh_info |= FH_INFO_XDR; + } + + // Security protocol transactions. + if (tr.msgp->proto.type == PROTO_TYPE_SECURITY) { + as_security_transact(&tr); + goto NextEvent; + } + + // For now only AS_MSG's contribute to this benchmark. + if (g_config.svc_benchmarks_enabled) { + tr.benchmark_time = histogram_insert_data_point(g_stats.svc_demarshal_hist, now_ns); + } + + // Fast path for batch requests. + if (tr.msgp->msg.info1 & AS_MSG_INFO1_BATCH) { + as_batch_queue_task(&tr); + goto NextEvent; + } + + // Swap as_msg fields and bin-ops to host order, and flag + // which fields are present, to reduce re-parsing. + if (! 
as_transaction_prepare(&tr, true)) { + cf_warning(AS_DEMARSHAL, "bad client msg"); + as_transaction_demarshal_error(&tr, AS_PROTO_RESULT_FAIL_PARAMETER); + goto NextEvent; + } + + ASD_TRANS_DEMARSHAL(nodeid, (uint64_t) tr.msgp, as_transaction_trid(&tr)); + + // Directly process or queue the transaction. + if (g_config.n_namespaces_inlined != 0 && + (g_config.n_namespaces_not_inlined == 0 || + // Only peek if at least one of each config. + peek_data_in_memory(&tr.msgp->msg))) { + // Data-in-memory namespace - process in this thread. + as_tsvc_process_transaction(&tr); + } + else { + // Data-not-in-memory namespace - process via queues. + as_tsvc_enqueue(&tr); + } + + // Jump the proto message free & FD cleanup. If we get here, the + // above operations went smoothly. The message free & FD cleanup + // job is handled elsewhere as directed by + // thr_tsvc_process_or_enqueue(). + goto NextEvent; + +NextEvent_FD_Cleanup: + // If we allocated memory for the incoming message, free it. + if (fd_h->proto) { + cf_free(fd_h->proto); + fd_h->proto = 0; + } + // If fd has extra reference for transaction, release it. + if (has_extra_ref) { + cf_rc_release(fd_h); + } + // Remove the fd from the events list. + cf_poll_delete_socket(poll, sock); + pthread_mutex_lock(&g_file_handle_a_LOCK); + fd_h->reap_me = true; + as_release_file_handle(fd_h); + fd_h = 0; + pthread_mutex_unlock(&g_file_handle_a_LOCK); +NextEvent: + ; + } + + // We should never be canceled externally, but just in case... + pthread_testcancel(); + } + } + + return NULL; +} + +static void +add_local(cf_serv_cfg *serv_cfg, cf_sock_owner owner) +{ + // Localhost will only be added to the addresses, if we're not yet listening + // on wildcard ("any") or localhost. 
+ + cf_ip_port port = 0; + + for (uint32_t i = 0; i < serv_cfg->n_cfgs; ++i) { + if (serv_cfg->cfgs[i].owner != owner) { + continue; + } + + port = serv_cfg->cfgs[i].port; + + if (cf_ip_addr_is_any(&serv_cfg->cfgs[i].addr) || + cf_ip_addr_is_local(&serv_cfg->cfgs[i].addr)) { + return; + } + } + + if (port == 0) { + return; + } + + cf_sock_cfg sock_cfg; + cf_sock_cfg_init(&sock_cfg, owner); + sock_cfg.port = port; + cf_ip_addr_set_local(&sock_cfg.addr); + + if (cf_serv_cfg_add_sock_cfg(serv_cfg, &sock_cfg) < 0) { + cf_crash(AS_DEMARSHAL, "Couldn't add localhost listening address"); + } +} + +// Initialize the demarshal service, start demarshal threads. +int +as_demarshal_start() +{ + demarshal_args *dm = cf_malloc(sizeof(demarshal_args)); + memset(dm, 0, sizeof(demarshal_args)); + g_demarshal_args = dm; + + g_freeslot = cf_queue_create(sizeof(int), true); + + add_local(&g_service_bind, CF_SOCK_OWNER_SERVICE); + add_local(&g_service_bind, CF_SOCK_OWNER_SERVICE_TLS); + + as_xdr_info_port(&g_service_bind); + + if (cf_socket_init_server(&g_service_bind, &g_sockets) < 0) { + cf_crash(AS_DEMARSHAL, "Couldn't initialize service socket"); + } + + // Create all the epoll_fds and wait for all the threads to come up. + + cf_info(AS_DEMARSHAL, "starting %u demarshal threads", + g_config.n_service_threads); + + dm->num_threads = g_config.n_service_threads; + + for (int32_t i = 1; i < dm->num_threads; ++i) { + if (pthread_create(&dm->dm_th[i], NULL, thr_demarshal, NULL) != 0) { + cf_crash(AS_DEMARSHAL, "Can't create demarshal threads"); + } + } + + for (int32_t i = 1; i < dm->num_threads; i++) { + while (CEFD(dm->polls[i]) == 0) { + usleep(1000); + } + } + + // Create first thread which is the listener. We do this one last, as it + // requires the other threads' epoll instances. + if (pthread_create(&dm->dm_th[0], NULL, thr_demarshal, NULL) != 0) { + cf_crash(AS_DEMARSHAL, "Can't create demarshal threads"); + } + + // For orderly startup log, wait for endpoint setup. 
+ while (CEFD(dm->polls[0]) == 0) { + usleep(1000); + } + + return 0; +} diff --git a/as/src/base/thr_info.c b/as/src/base/thr_info.c new file mode 100644 index 00000000..7f445bac --- /dev/null +++ b/as/src/base/thr_info.c @@ -0,0 +1,7024 @@ +/* + * thr_info.c + * + * Copyright (C) 2008-2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. 
If not, see http://www.gnu.org/licenses/ + */ + +#include "base/thr_info.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "citrusleaf/alloc.h" +#include "citrusleaf/cf_queue.h" +#include "citrusleaf/cf_vector.h" + +#include "cf_str.h" +#include "dynbuf.h" +#include "fault.h" +#include "meminfo.h" +#include "shash.h" +#include "socket.h" + +#include "ai_obj.h" +#include "ai_btree.h" + +#include "base/batch.h" +#include "base/cfg.h" +#include "base/datamodel.h" +#include "base/features.h" +#include "base/index.h" +#include "base/monitor.h" +#include "base/scan.h" +#include "base/thr_batch.h" +#include "base/thr_demarshal.h" +#include "base/thr_info_port.h" +#include "base/thr_sindex.h" +#include "base/thr_tsvc.h" +#include "base/transaction.h" +#include "base/secondary_index.h" +#include "base/security.h" +#include "base/stats.h" +#include "base/system_metadata.h" +#include "base/truncate.h" +#include "base/udf_cask.h" +#include "base/xdr_config.h" +#include "base/xdr_serverside.h" +#include "fabric/exchange.h" +#include "fabric/fabric.h" +#include "fabric/hb.h" +#include "fabric/hlc.h" +#include "fabric/migrate.h" +#include "fabric/partition.h" +#include "fabric/partition_balance.h" +#include "fabric/roster.h" +#include "fabric/skew_monitor.h" +#include "transaction/proxy.h" +#include "transaction/rw_request_hash.h" + +#define STR_NS "ns" +#define STR_SET "set" +#define STR_INDEXNAME "indexname" +#define STR_NUMBIN "numbins" +#define STR_INDEXDATA "indexdata" +#define STR_TYPE_NUMERIC "numeric" +#define STR_TYPE_STRING "string" +#define STR_ITYPE "indextype" +#define STR_ITYPE_DEFAULT "DEFAULT" +#define STR_ITYPE_LIST "LIST" +#define STR_ITYPE_MAPKEYS "MAPKEYS" +#define STR_ITYPE_MAPVALUES "MAPVALUES" +#define STR_BINTYPE "bintype" + +extern int as_nsup_queue_get_size(); + +int info_get_objects(char *name, cf_dyn_buf *db); +int 
info_get_tree_sets(char *name, char *subtree, cf_dyn_buf *db); +int info_get_tree_bins(char *name, char *subtree, cf_dyn_buf *db); +int info_get_tree_sindexes(char *name, char *subtree, cf_dyn_buf *db); +int info_get_tree_statistics(char *name, char *subtree, cf_dyn_buf *db); +void as_storage_show_wblock_stats(as_namespace *ns); +void as_storage_summarize_wblock_stats(as_namespace *ns); +int as_storage_analyze_wblock(as_namespace* ns, int device_index, uint32_t wblock_id); + + +as_stats g_stats = { 0 }; // separate .c file not worth it + +uint64_t g_start_ms; // start time of the server + +static cf_queue *g_info_work_q = 0; + +// +// Info has its own fabric service +// which allows it to communicate things like the IP addresses of +// all the other nodes +// + +#define INFO_FIELD_OP 0 +#define INFO_FIELD_GENERATION 1 +#define INFO_FIELD_SERVICE_ADDRESS 2 +#define INFO_FIELD_ALT_ADDRESS 3 +#define INFO_FIELD_SERVICES_CLEAR_STD 4 +#define INFO_FIELD_SERVICES_TLS_STD 5 +#define INFO_FIELD_SERVICES_CLEAR_ALT 6 +#define INFO_FIELD_SERVICES_TLS_ALT 7 +#define INFO_FIELD_TLS_NAME 8 + +#define INFO_OP_UPDATE 0 +#define INFO_OP_ACK 1 +#define INFO_OP_UPDATE_REQ 2 + +msg_template info_mt[] = { + { INFO_FIELD_OP, M_FT_UINT32 }, + { INFO_FIELD_GENERATION, M_FT_UINT32 }, + { INFO_FIELD_SERVICE_ADDRESS, M_FT_STR }, + { INFO_FIELD_ALT_ADDRESS, M_FT_STR }, + { INFO_FIELD_SERVICES_CLEAR_STD, M_FT_STR }, + { INFO_FIELD_SERVICES_TLS_STD, M_FT_STR }, + { INFO_FIELD_SERVICES_CLEAR_ALT, M_FT_STR }, + { INFO_FIELD_SERVICES_TLS_ALT, M_FT_STR }, + { INFO_FIELD_TLS_NAME, M_FT_STR } +}; + +#define INFO_MSG_SCRATCH_SIZE 512 + + +// +// The dynamic list has a name, and a function to call +// + +typedef struct info_static_s { + struct info_static_s *next; + bool def; // default, but default is a reserved word + char *name; + char *value; + size_t value_sz; +} info_static; + + +typedef struct info_dynamic_s { + struct info_dynamic_s *next; + bool def; // default, but that's a reserved word + 
char *name; + as_info_get_value_fn value_fn; +} info_dynamic; + +typedef struct info_command_s { + struct info_command_s *next; + char *name; + as_info_command_fn command_fn; + as_sec_perm required_perm; // required security permission +} info_command; + +typedef struct info_tree_s { + struct info_tree_s *next; + char *name; + as_info_get_tree_fn tree_fn; +} info_tree; + + +#define EOL '\n' // incoming commands are separated by EOL +#define SEP '\t' +#define TREE_SEP '/' + +#define INFO_COMMAND_SINDEX_FAILCODE(num, message) \ + if (db) { \ + cf_dyn_buf_append_string(db, "FAIL:"); \ + cf_dyn_buf_append_int(db, num); \ + cf_dyn_buf_append_string(db, ": "); \ + cf_dyn_buf_append_string(db, message); \ + } + + +void +info_get_aggregated_namespace_stats(cf_dyn_buf *db) +{ + uint64_t total_objects = 0; + uint64_t total_tombstones = 0; + + for (uint32_t i = 0; i < g_config.n_namespaces; i++) { + as_namespace *ns = g_config.namespaces[i]; + + total_objects += ns->n_objects; + total_tombstones += ns->n_tombstones; + } + + info_append_uint64(db, "objects", total_objects); + info_append_uint64(db, "tombstones", total_tombstones); +} + +// #define INFO_SEGV_TEST 1 +#ifdef INFO_SEGV_TEST +char *segv_test = "segv test"; +int +info_segv_test(char *name, cf_dyn_buf *db) +{ + *segv_test = 'E'; + cf_dyn_buf_append_string(db, "segv"); + return(0); +} +#endif + +int +info_get_stats(char *name, cf_dyn_buf *db) +{ + info_append_uint32(db, "cluster_size", as_exchange_cluster_size()); + info_append_uint64_x(db, "cluster_key", as_exchange_cluster_key()); // not in ticker + info_append_bool(db, "cluster_integrity", as_clustering_has_integrity()); // not in ticker + info_append_bool(db, "cluster_is_member", ! 
as_clustering_is_orphan()); // not in ticker + as_hb_info_duplicates_get(db); // not in ticker + info_append_uint32(db, "cluster_clock_skew_stop_writes_sec", clock_skew_stop_writes_sec()); // not in ticker + info_append_uint64(db, "cluster_clock_skew", as_skew_monitor_skew()); + as_skew_monitor_info(db); + + info_append_uint64(db, "uptime", (cf_getms() - g_start_ms) / 1000); // not in ticker + + int freepct; + bool swapping; + + cf_meminfo(NULL, NULL, &freepct, &swapping); + info_append_int(db, "system_free_mem_pct", freepct); + info_append_bool(db, "system_swapping", swapping); + + size_t allocated_kbytes; + size_t active_kbytes; + size_t mapped_kbytes; + double efficiency_pct; + uint32_t site_count; + + cf_alloc_heap_stats(&allocated_kbytes, &active_kbytes, &mapped_kbytes, &efficiency_pct, + &site_count); + info_append_uint64(db, "heap_allocated_kbytes", allocated_kbytes); + info_append_uint64(db, "heap_active_kbytes", active_kbytes); + info_append_uint64(db, "heap_mapped_kbytes", mapped_kbytes); + info_append_int(db, "heap_efficiency_pct", (int)(efficiency_pct + 0.5)); + info_append_uint32(db, "heap_site_count", site_count); + + info_get_aggregated_namespace_stats(db); + + info_append_int(db, "tsvc_queue", as_tsvc_queue_get_size()); + info_append_int(db, "info_queue", as_info_queue_get_size()); + info_append_int(db, "delete_queue", as_nsup_queue_get_size()); + info_append_uint32(db, "rw_in_progress", rw_request_hash_count()); + info_append_uint32(db, "proxy_in_progress", as_proxy_hash_count()); + info_append_int(db, "tree_gc_queue", as_index_tree_gc_queue_size()); + + info_append_uint64(db, "client_connections", g_stats.proto_connections_opened - g_stats.proto_connections_closed); + info_append_uint64(db, "heartbeat_connections", g_stats.heartbeat_connections_opened - g_stats.heartbeat_connections_closed); + info_append_uint64(db, "fabric_connections", g_stats.fabric_connections_opened - g_stats.fabric_connections_closed); + + info_append_uint64(db, 
"heartbeat_received_self", g_stats.heartbeat_received_self); + info_append_uint64(db, "heartbeat_received_foreign", g_stats.heartbeat_received_foreign); + + + info_append_uint64(db, "reaped_fds", g_stats.reaper_count); // not in ticker + + info_append_uint64(db, "info_complete", g_stats.info_complete); // not in ticker + + info_append_uint64(db, "demarshal_error", g_stats.n_demarshal_error); + info_append_uint64(db, "early_tsvc_client_error", g_stats.n_tsvc_client_error); + info_append_uint64(db, "early_tsvc_batch_sub_error", g_stats.n_tsvc_batch_sub_error); + info_append_uint64(db, "early_tsvc_udf_sub_error", g_stats.n_tsvc_udf_sub_error); + + info_append_uint64(db, "batch_index_initiate", g_stats.batch_index_initiate); // not in ticker + + cf_dyn_buf_append_string(db, "batch_index_queue="); + as_batch_queues_info(db); // not in ticker + cf_dyn_buf_append_char(db, ';'); + + info_append_uint64(db, "batch_index_complete", g_stats.batch_index_complete); + info_append_uint64(db, "batch_index_error", g_stats.batch_index_errors); + info_append_uint64(db, "batch_index_timeout", g_stats.batch_index_timeout); + + // Everything below is not in ticker... 
+ + info_append_int(db, "batch_index_unused_buffers", as_batch_unused_buffers()); + info_append_uint64(db, "batch_index_huge_buffers", g_stats.batch_index_huge_buffers); + info_append_uint64(db, "batch_index_created_buffers", g_stats.batch_index_created_buffers); + info_append_uint64(db, "batch_index_destroyed_buffers", g_stats.batch_index_destroyed_buffers); + + info_append_uint64(db, "batch_initiate", g_stats.batch_initiate); + info_append_int(db, "batch_queue", as_batch_direct_queue_size()); + info_append_uint64(db, "batch_error", g_stats.batch_errors); + info_append_uint64(db, "batch_timeout", g_stats.batch_timeout); + + info_append_int(db, "scans_active", as_scan_get_active_job_count()); + + info_append_uint32(db, "query_short_running", g_query_short_running); + info_append_uint32(db, "query_long_running", g_query_long_running); + + info_append_uint64(db, "sindex_ucgarbage_found", g_stats.query_false_positives); + info_append_uint64(db, "sindex_gc_locktimedout", g_stats.sindex_gc_timedout); + info_append_uint64(db, "sindex_gc_list_creation_time", g_stats.sindex_gc_list_creation_time); + info_append_uint64(db, "sindex_gc_list_deletion_time", g_stats.sindex_gc_list_deletion_time); + info_append_uint64(db, "sindex_gc_objects_validated", g_stats.sindex_gc_objects_validated); + info_append_uint64(db, "sindex_gc_garbage_found", g_stats.sindex_gc_garbage_found); + info_append_uint64(db, "sindex_gc_garbage_cleaned", g_stats.sindex_gc_garbage_cleaned); + + char paxos_principal[16 + 1]; + sprintf(paxos_principal, "%lX", as_exchange_principal()); + info_append_string(db, "paxos_principal", paxos_principal); + + info_append_bool(db, "migrate_allowed", as_partition_balance_are_migrations_allowed()); + info_append_uint64(db, "migrate_partitions_remaining", as_partition_balance_remaining_migrations()); + + info_append_uint64(db, "fabric_bulk_send_rate", g_stats.fabric_bulk_s_rate); + info_append_uint64(db, "fabric_bulk_recv_rate", g_stats.fabric_bulk_r_rate); + 
info_append_uint64(db, "fabric_ctrl_send_rate", g_stats.fabric_ctrl_s_rate); + info_append_uint64(db, "fabric_ctrl_recv_rate", g_stats.fabric_ctrl_r_rate); + info_append_uint64(db, "fabric_meta_send_rate", g_stats.fabric_meta_s_rate); + info_append_uint64(db, "fabric_meta_recv_rate", g_stats.fabric_meta_r_rate); + info_append_uint64(db, "fabric_rw_send_rate", g_stats.fabric_rw_s_rate); + info_append_uint64(db, "fabric_rw_recv_rate", g_stats.fabric_rw_r_rate); + + as_xdr_get_stats(db); + + cf_dyn_buf_chomp(db); + + return 0; +} + +cf_atomic32 g_node_info_generation = 0; + + +int +info_get_cluster_generation(char *name, cf_dyn_buf *db) +{ + cf_dyn_buf_append_int(db, g_node_info_generation); + + return(0); +} + +void +info_get_printable_cluster_name(char *cluster_name) +{ + as_config_cluster_name_get(cluster_name); + if (cluster_name[0] == '\0'){ + strcpy(cluster_name, "null"); + } +} + +int +info_get_cluster_name(char *name, cf_dyn_buf *db) +{ + char cluster_name[AS_CLUSTER_NAME_SZ]; + info_get_printable_cluster_name(cluster_name); + cf_dyn_buf_append_string(db, cluster_name); + + return 0; +} + +int +info_get_features(char *name, cf_dyn_buf *db) +{ + cf_dyn_buf_append_string(db, as_features_info()); + + return 0; +} + +static cf_ip_port +bind_to_port(cf_serv_cfg *cfg, cf_sock_owner owner) +{ + for (uint32_t i = 0; i < cfg->n_cfgs; ++i) { + if (cfg->cfgs[i].owner == owner) { + return cfg->cfgs[i].port; + } + } + + return 0; +} + +char * +as_info_bind_to_string(const cf_serv_cfg *cfg, cf_sock_owner owner) +{ + cf_dyn_buf_define_size(db, 2500); + uint32_t count = 0; + + for (uint32_t i = 0; i < cfg->n_cfgs; ++i) { + if (cfg->cfgs[i].owner != owner) { + continue; + } + + if (count > 0) { + cf_dyn_buf_append_char(&db, ','); + } + + cf_dyn_buf_append_string(&db, cf_ip_addr_print(&cfg->cfgs[i].addr)); + ++count; + } + + char *string = cf_dyn_buf_strdup(&db); + cf_dyn_buf_free(&db); + return string != NULL ? 
string : cf_strdup("null"); +} + +static char * +access_to_string(cf_addr_list *addrs) +{ + cf_dyn_buf_define_size(db, 2500); + + for (uint32_t i = 0; i < addrs->n_addrs; ++i) { + if (i > 0) { + cf_dyn_buf_append_char(&db, ','); + } + + cf_dyn_buf_append_string(&db, addrs->addrs[i]); + } + + char *string = cf_dyn_buf_strdup(&db); + cf_dyn_buf_free(&db); + return string != NULL ? string : cf_strdup("null"); +} + +int +info_get_endpoints(char *name, cf_dyn_buf *db) +{ + cf_ip_port port = bind_to_port(&g_service_bind, CF_SOCK_OWNER_SERVICE); + info_append_int(db, "service.port", port); + + char *string = as_info_bind_to_string(&g_service_bind, CF_SOCK_OWNER_SERVICE); + info_append_string(db, "service.addresses", string); + cf_free(string); + + info_append_int(db, "service.access-port", g_access.service.port); + + string = access_to_string(&g_access.service.addrs); + info_append_string(db, "service.access-addresses", string); + cf_free(string); + + info_append_int(db, "service.alternate-access-port", g_access.alt_service.port); + + string = access_to_string(&g_access.alt_service.addrs); + info_append_string(db, "service.alternate-access-addresses", string); + cf_free(string); + + port = bind_to_port(&g_service_bind, CF_SOCK_OWNER_SERVICE_TLS); + info_append_int(db, "service.tls-port", port); + + string = as_info_bind_to_string(&g_service_bind, CF_SOCK_OWNER_SERVICE_TLS); + info_append_string(db, "service.tls-addresses", string); + cf_free(string); + + info_append_int(db, "service.tls-access-port", g_access.tls_service.port); + + string = access_to_string(&g_access.tls_service.addrs); + info_append_string(db, "service.tls-access-addresses", string); + cf_free(string); + + info_append_int(db, "service.tls-alternate-access-port", g_access.alt_tls_service.port); + + string = access_to_string(&g_access.alt_tls_service.addrs); + info_append_string(db, "service.tls-alternate-access-addresses", string); + cf_free(string); + + as_hb_info_endpoints_get(db); + + port = 
bind_to_port(&g_fabric_bind, CF_SOCK_OWNER_FABRIC); + info_append_int(db, "fabric.port", port); + + string = as_info_bind_to_string(&g_fabric_bind, CF_SOCK_OWNER_FABRIC); + info_append_string(db, "fabric.addresses", string); + cf_free(string); + + port = bind_to_port(&g_fabric_bind, CF_SOCK_OWNER_FABRIC_TLS); + info_append_int(db, "fabric.tls-port", port); + + string = as_info_bind_to_string(&g_fabric_bind, CF_SOCK_OWNER_FABRIC_TLS); + info_append_string(db, "fabric.tls-addresses", string); + cf_free(string); + + as_fabric_info_peer_endpoints_get(db); + + info_append_int(db, "info.port", g_info_port); + + string = as_info_bind_to_string(&g_info_bind, CF_SOCK_OWNER_INFO); + info_append_string(db, "info.addresses", string); + cf_free(string); + + cf_dyn_buf_chomp(db); + return(0); +} + +int +info_get_partition_generation(char *name, cf_dyn_buf *db) +{ + cf_dyn_buf_append_int(db, (int)g_partition_generation); + + return(0); +} + +int +info_get_partition_info(char *name, cf_dyn_buf *db) +{ + as_partition_getinfo_str(db); + + return(0); +} + +// Deprecate in "six months". 
+int +info_get_replicas_prole(char *name, cf_dyn_buf *db) +{ + as_partition_get_replicas_prole_str(db); + + return(0); +} + +int +info_get_replicas_master(char *name, cf_dyn_buf *db) +{ + as_partition_get_replicas_master_str(db); + + return(0); +} + +int +info_get_replicas_all(char *name, cf_dyn_buf *db) +{ + as_partition_get_replicas_all_str(db, false); + + return(0); +} + +int +info_get_replicas(char *name, cf_dyn_buf *db) +{ + as_partition_get_replicas_all_str(db, true); + + return(0); +} + +// +// COMMANDS +// + +int +info_command_get_sl(char *name, char *params, cf_dyn_buf *db) +{ + // Command Format: "get-sl:" + + as_exchange_info_get_succession(db); + + return 0; +} + +int +info_command_tip(char *name, char *params, cf_dyn_buf *db) +{ + cf_debug(AS_INFO, "tip command received: params %s", params); + + char host_str[50]; + int host_str_len = sizeof(host_str); + + char port_str[50]; + int port_str_len = sizeof(port_str); + int rv = -1; + + char tls_str[50]; + int tls_str_len = sizeof(tls_str); + + /* + * Command Format: "tip:host=;port=[;tls=]" + * + * where is an IP address and is a valid TCP port number. 
+ */ + + if (0 != as_info_parameter_get(params, "host", host_str, &host_str_len)) { + cf_warning(AS_INFO, "tip command: no host, must add a host parameter"); + goto Exit; + } + + if (0 != as_info_parameter_get(params, "port", port_str, &port_str_len)) { + cf_warning(AS_INFO, "tip command: no port, must have port"); + goto Exit; + } + + if (0 != as_info_parameter_get(params, "tls", tls_str, &tls_str_len)) { + strcpy(tls_str, "false"); + } + + int port = 0; + if (0 != cf_str_atoi(port_str, &port)) { + cf_warning(AS_INFO, "tip command: port must be an integer in: %s", port_str); + goto Exit; + } + + bool tls; + if (strcmp(tls_str, "true") == 0) { + tls = true; + } + else if (strcmp(tls_str, "false") == 0) { + tls = false; + } + else { + cf_warning(AS_INFO, "The \"%s:\" command argument \"tls\" value must be one of {\"true\", \"false\"}, not \"%s\"", name, tls_str); + goto Exit; + } + + rv = as_hb_mesh_tip(host_str, port, tls); + +Exit: + if (0 == rv) { + cf_dyn_buf_append_string(db, "ok"); + } else { + cf_dyn_buf_append_string(db, "error"); + } + + return(0); +} + +/* + * Command Format: "tip-clear:{host-port-list=}" + * + * where is either "all" or else a comma-separated list of items of the form: : + */ +int32_t +info_command_tip_clear(char* name, char* params, cf_dyn_buf* db) +{ + cf_info(AS_INFO, "tip clear command received: params %s", params); + + // Command Format: "tip-clear:{host-port-list=}" [the + // "host-port-list" argument is optional] + // where is either "all" or else a comma-separated list of items + // of the form: : or []: + + char host_port_list[3000]; + int host_port_list_len = sizeof(host_port_list); + host_port_list[0] = '\0'; + bool success = true; + uint32_t cleared = 0, not_found = 0; + + if (as_info_parameter_get(params, "host-port-list", host_port_list, + &host_port_list_len) == 0) { + if (0 != strcmp(host_port_list, "all")) { + char* save_ptr = NULL; + int port = -1; + char* host_port = + strtok_r(host_port_list, ",", &save_ptr); + + while 
(host_port != NULL) { + char* host_port_delim = ":"; + if (*host_port == '[') { + // Parse IPv6 address differently. + host_port++; + host_port_delim = "]"; + } + + char* host_port_save_ptr = NULL; + char* host = + strtok_r(host_port, host_port_delim, &host_port_save_ptr); + + if (host == NULL) { + cf_warning(AS_INFO, "tip clear command: invalid host:port string: %s", host_port); + success = false; + break; + } + + char* port_str = + strtok_r(NULL, host_port_delim, &host_port_save_ptr); + + if (port_str != NULL && *port_str == ':') { + // IPv6 case + port_str++; + } + if (port_str == NULL || + 0 != cf_str_atoi(port_str, &port)) { + cf_warning(AS_INFO, "tip clear command: port must be an integer in: %s", port_str); + success = false; + break; + } + + if (as_hb_mesh_tip_clear(host, port) == -1) { + success = false; + not_found++; + cf_warning(AS_INFO, "seed node %s:%d does not exist", host, port); + } else { + cleared++; + } + + host_port = strtok_r(NULL, ",", &save_ptr); + } + } else { + if (as_hb_mesh_tip_clear_all(&cleared)) { + success = false; + } + } + } else { + success = false; + } + + if (success) { + cf_info(AS_INFO, "tip clear command executed: cleared %"PRIu32", params %s", cleared, params); + cf_dyn_buf_append_string(db, "ok"); + } else { + cf_info(AS_INFO, "tip clear command failed: cleared %"PRIu32", params %s", cleared, params); + char error_msg[1024]; + sprintf(error_msg, "error: %"PRIu32" cleared, %"PRIu32" not found", cleared, not_found); + cf_dyn_buf_append_string(db, error_msg); + } + + return (0); +} + +int +info_command_show_devices(char *name, char *params, cf_dyn_buf *db) +{ + char ns_str[512]; + int ns_len = sizeof(ns_str); + + if (0 != as_info_parameter_get(params, "namespace", ns_str, &ns_len)) { + cf_info(AS_INFO, "show-devices requires namespace parameter"); + cf_dyn_buf_append_string(db, "error"); + return(0); + } + + as_namespace *ns = as_namespace_get_byname(ns_str); + if (!ns) { + cf_info(AS_INFO, "show-devices: namespace %s not 
found", ns_str); + cf_dyn_buf_append_string(db, "error"); + return(0); + } + as_storage_show_wblock_stats(ns); + + cf_dyn_buf_append_string(db, "ok"); + + return(0); +} + +int +info_command_dump_cluster(char *name, char *params, cf_dyn_buf *db) +{ + bool verbose = false; + char param_str[100]; + int param_str_len = sizeof(param_str); + + /* + * Command Format: "dump-cluster:{verbose=}" [the "verbose" argument is optional] + * + * where is one of: {"true" | "false"} and defaults to "false". + */ + param_str[0] = '\0'; + if (!as_info_parameter_get(params, "verbose", param_str, ¶m_str_len)) { + if (!strncmp(param_str, "true", 5)) { + verbose = true; + } else if (!strncmp(param_str, "false", 6)) { + verbose = false; + } else { + cf_warning(AS_INFO, "The \"%s:\" command argument \"verbose\" value must be one of {\"true\", \"false\"}, not \"%s\"", name, param_str); + cf_dyn_buf_append_string(db, "error"); + return 0; + } + } + as_clustering_dump(verbose); + as_exchange_dump(verbose); + cf_dyn_buf_append_string(db, "ok"); + return(0); +} + +int +info_command_dump_fabric(char *name, char *params, cf_dyn_buf *db) +{ + bool verbose = false; + char param_str[100]; + int param_str_len = sizeof(param_str); + + /* + * Command Format: "dump-fabric:{verbose=}" [the "verbose" argument is optional] + * + * where is one of: {"true" | "false"} and defaults to "false". 
+ */ + param_str[0] = '\0'; + if (!as_info_parameter_get(params, "verbose", param_str, ¶m_str_len)) { + if (!strncmp(param_str, "true", 5)) { + verbose = true; + } else if (!strncmp(param_str, "false", 6)) { + verbose = false; + } else { + cf_warning(AS_INFO, "The \"%s:\" command argument \"verbose\" value must be one of {\"true\", \"false\"}, not \"%s\"", name, param_str); + cf_dyn_buf_append_string(db, "error"); + return 0; + } + } + as_fabric_dump(verbose); + cf_dyn_buf_append_string(db, "ok"); + return(0); +} + +int +info_command_dump_hb(char *name, char *params, cf_dyn_buf *db) +{ + bool verbose = false; + char param_str[100]; + int param_str_len = sizeof(param_str); + + /* + * Command Format: "dump-hb:{verbose=}" [the "verbose" argument is optional] + * + * where is one of: {"true" | "false"} and defaults to "false". + */ + param_str[0] = '\0'; + if (!as_info_parameter_get(params, "verbose", param_str, ¶m_str_len)) { + if (!strncmp(param_str, "true", 5)) { + verbose = true; + } else if (!strncmp(param_str, "false", 6)) { + verbose = false; + } else { + cf_warning(AS_INFO, "The \"%s:\" command argument \"verbose\" value must be one of {\"true\", \"false\"}, not \"%s\"", name, param_str); + cf_dyn_buf_append_string(db, "error"); + return 0; + } + } + as_hb_dump(verbose); + cf_dyn_buf_append_string(db, "ok"); + return(0); +} + +int +info_command_dump_hlc(char *name, char *params, cf_dyn_buf *db) +{ + bool verbose = false; + char param_str[100]; + int param_str_len = sizeof(param_str); + + /* + * Command Format: "dump-hlc:{verbose=}" [the "verbose" argument is optional] + * + * where is one of: {"true" | "false"} and defaults to "false". 
+ */ + param_str[0] = '\0'; + if (!as_info_parameter_get(params, "verbose", param_str, ¶m_str_len)) { + if (!strncmp(param_str, "true", 5)) { + verbose = true; + } else if (!strncmp(param_str, "false", 6)) { + verbose = false; + } else { + cf_warning(AS_INFO, "The \"%s:\" command argument \"verbose\" value must be one of {\"true\", \"false\"}, not \"%s\"", name, param_str); + cf_dyn_buf_append_string(db, "error"); + return 0; + } + } + as_hlc_dump(verbose); + cf_dyn_buf_append_string(db, "ok"); + return(0); +} + + +int +info_command_dump_migrates(char *name, char *params, cf_dyn_buf *db) +{ + bool verbose = false; + char param_str[100]; + int param_str_len = sizeof(param_str); + + /* + * Command Format: "dump-migrates:{verbose=}" [the "verbose" argument is optional] + * + * where is one of: {"true" | "false"} and defaults to "false". + */ + param_str[0] = '\0'; + if (!as_info_parameter_get(params, "verbose", param_str, ¶m_str_len)) { + if (!strncmp(param_str, "true", 5)) { + verbose = true; + } else if (!strncmp(param_str, "false", 6)) { + verbose = false; + } else { + cf_warning(AS_INFO, "The \"%s:\" command argument \"verbose\" value must be one of {\"true\", \"false\"}, not \"%s\"", name, param_str); + cf_dyn_buf_append_string(db, "error"); + return 0; + } + } + as_migrate_dump(verbose); + cf_dyn_buf_append_string(db, "ok"); + return(0); +} + +int +info_command_dump_msgs(char *name, char *params, cf_dyn_buf *db) +{ + bool once = true; + char param_str[100]; + int param_str_len = sizeof(param_str); + + /* + * Command Format: "dump-msgs:{mode=}" [the "mode" argument is optional] + * + * where is one of: {"on" | "off" | "once"} and defaults to "once". 
+ */ + param_str[0] = '\0'; + if (!as_info_parameter_get(params, "mode", param_str, ¶m_str_len)) { + if (!strncmp(param_str, "on", 3)) { + g_config.fabric_dump_msgs = true; + } else if (!strncmp(param_str, "off", 4)) { + g_config.fabric_dump_msgs = false; + once = false; + } else if (!strncmp(param_str, "once", 5)) { + once = true; + } else { + cf_warning(AS_INFO, "The \"%s:\" command argument \"mode\" value must be one of {\"on\", \"off\", \"once\"}, not \"%s\"", name, param_str); + cf_dyn_buf_append_string(db, "error"); + return 0; + } + } + + if (once) { + as_fabric_msg_queue_dump(); + } + + cf_dyn_buf_append_string(db, "ok"); + return(0); +} + +static int +is_numeric_string(char *str) +{ + if (!*str) + return 0; + + while (isdigit(*str)) + str++; + + return (!*str); +} + +int +info_command_dump_wb(char *name, char *params, cf_dyn_buf *db) +{ + as_namespace *ns; + int device_index, wblock_id; + char param_str[100]; + int param_str_len; + + /* + * Command Format: "dump-wb:ns=;dev=;id=" + * + * where is the name of the namespace, + * is the drive number (a non-negative integer), and + * is a non-negative integer corresponding to an active wblock. 
+ */ + param_str[0] = '\0'; + param_str_len = sizeof(param_str); + if (!as_info_parameter_get(params, "ns", param_str, ¶m_str_len)) { + if (!(ns = as_namespace_get_byname(param_str))) { + cf_warning(AS_INFO, "The \"%s:\" command argument \"ns\" value must be the name of an existing namespace, not \"%s\"", name, param_str); + cf_dyn_buf_append_string(db, "error"); + return 0; + } + } else { + cf_warning(AS_INFO, "The \"%s:\" command requires an argument of the form \"ns=\"", name); + cf_dyn_buf_append_string(db, "error"); + return 0; + } + + param_str[0] = '\0'; + param_str_len = sizeof(param_str); + if (!as_info_parameter_get(params, "dev", param_str, ¶m_str_len)) { + if (!is_numeric_string(param_str) || (0 > (device_index = atoi(param_str)))) { + cf_warning(AS_INFO, "The \"%s:\" command argument \"dev\" value must be a non-negative integer, not \"%s\"", name, param_str); + cf_dyn_buf_append_string(db, "error"); + return 0; + } + } else { + cf_warning(AS_INFO, "The \"%s:\" command requires an argument of the form \"dev=\"", name); + cf_dyn_buf_append_string(db, "error"); + return 0; + } + + param_str[0] = '\0'; + param_str_len = sizeof(param_str); + if (!as_info_parameter_get(params, "id", param_str, ¶m_str_len)) { + if (!is_numeric_string(param_str) || (0 > (wblock_id = atoi(param_str)))) { + cf_warning(AS_INFO, "The \"%s:\" command argument \"id\" value must be a non-negative integer, not \"%s\"", name, param_str); + cf_dyn_buf_append_string(db, "error"); + return 0; + } + } else { + cf_warning(AS_INFO, "The \"%s:\" command requires an argument of the form \"id=\"", name); + cf_dyn_buf_append_string(db, "error"); + return 0; + } + + if (!as_storage_analyze_wblock(ns, device_index, (uint32_t) wblock_id)) + cf_dyn_buf_append_string(db, "ok"); + else + cf_dyn_buf_append_string(db, "error"); + + return(0); +} + +int +info_command_dump_wb_summary(char *name, char *params, cf_dyn_buf *db) +{ + as_namespace *ns; + char param_str[100]; + int param_str_len = 
sizeof(param_str); + + /* + * Command Format: "dump-wb-summary:ns=" + * + * where is the name of an existing namespace. + */ + param_str[0] = '\0'; + if (!as_info_parameter_get(params, "ns", param_str, ¶m_str_len)) { + if (!(ns = as_namespace_get_byname(param_str))) { + cf_warning(AS_INFO, "The \"%s:\" command argument \"ns\" value must be the name of an existing namespace, not \"%s\"", name, param_str); + cf_dyn_buf_append_string(db, "error"); + return(0); + } + } else { + cf_warning(AS_INFO, "The \"%s:\" command requires an argument of the form \"ns=\"", name); + cf_dyn_buf_append_string(db, "error"); + return 0; + } + + as_storage_summarize_wblock_stats(ns); + + cf_dyn_buf_append_string(db, "ok"); + + return(0); +} + +int +info_command_dump_rw_request_hash(char *name, char *params, cf_dyn_buf *db) +{ + rw_request_hash_dump(); + cf_dyn_buf_append_string(db, "ok"); + return(0); +} + +typedef struct rack_node_s { + uint32_t rack_id; + cf_node node; +} rack_node; + +// A comparison_fn_t used with qsort() - yields ascending rack-id order. +static inline int +compare_rack_nodes(const void* pa, const void* pb) +{ + uint32_t a = ((const rack_node*)pa)->rack_id; + uint32_t b = ((const rack_node*)pb)->rack_id; + + return a > b ? 1 : (a == b ? 
0 : -1); +} + +void +namespace_rack_info(as_namespace *ns, cf_dyn_buf *db, uint32_t *rack_ids, + uint32_t n_nodes, const char *tag) +{ + if (n_nodes == 0) { + return; + } + + rack_node rack_nodes[n_nodes]; + + for (uint32_t i = 0; i < n_nodes; i++) { + rack_nodes[i].rack_id = rack_ids[i]; + rack_nodes[i].node = ns->succession[i]; + } + + qsort(rack_nodes, n_nodes, sizeof(rack_node), compare_rack_nodes); + + uint32_t cur_id = rack_nodes[0].rack_id; + + cf_dyn_buf_append_string(db, tag); + cf_dyn_buf_append_uint32(db, cur_id); + cf_dyn_buf_append_char(db, '='); + cf_dyn_buf_append_uint64_x(db, rack_nodes[0].node); + + for (uint32_t i = 1; i < n_nodes; i++) { + if (rack_nodes[i].rack_id == cur_id) { + cf_dyn_buf_append_char(db, ','); + cf_dyn_buf_append_uint64_x(db, rack_nodes[i].node); + continue; + } + + cur_id = rack_nodes[i].rack_id; + + cf_dyn_buf_append_char(db, ':'); + cf_dyn_buf_append_string(db, tag); + cf_dyn_buf_append_uint32(db, cur_id); + cf_dyn_buf_append_char(db, '='); + cf_dyn_buf_append_uint64_x(db, rack_nodes[i].node); + } +} + +int +info_command_racks(char *name, char *params, cf_dyn_buf *db) +{ + // Command format: "racks:{namespace=}" + + char param_str[AS_ID_NAMESPACE_SZ] = { 0 }; + int param_str_len = (int)sizeof(param_str); + int rv = as_info_parameter_get(params, "namespace", param_str, ¶m_str_len); + + if (rv == -2) { + cf_warning(AS_INFO, "namespace parameter value too long"); + cf_dyn_buf_append_string(db, "ERROR::bad-namespace"); + return 0; + } + + if (rv == 0) { + as_namespace *ns = as_namespace_get_byname(param_str); + + if (! 
ns) { + cf_warning(AS_INFO, "unknown namespace %s", param_str); + cf_dyn_buf_append_string(db, "ERROR::unknown-namespace"); + return 0; + } + + as_exchange_info_lock(); + + namespace_rack_info(ns, db, ns->rack_ids, ns->cluster_size, "rack_"); + + if (ns->roster_count != 0) { + cf_dyn_buf_append_char(db, ':'); + namespace_rack_info(ns, db, ns->roster_rack_ids, ns->roster_count, "roster_rack_"); + } + + as_exchange_info_unlock(); + + return 0; + } + + for (uint32_t ns_ix = 0; ns_ix < g_config.n_namespaces; ns_ix++) { + as_namespace *ns = g_config.namespaces[ns_ix]; + + cf_dyn_buf_append_string(db, "ns="); + cf_dyn_buf_append_string(db, ns->name); + cf_dyn_buf_append_char(db, ':'); + + as_exchange_info_lock(); + + namespace_rack_info(ns, db, ns->rack_ids, ns->cluster_size, "rack_"); + + if (ns->roster_count != 0) { + cf_dyn_buf_append_char(db, ':'); + namespace_rack_info(ns, db, ns->roster_rack_ids, ns->roster_count, "roster_rack_"); + } + + as_exchange_info_unlock(); + + cf_dyn_buf_append_char(db, ';'); + } + + cf_dyn_buf_chomp(db); + + return 0; +} + +int +info_command_recluster(char *name, char *params, cf_dyn_buf *db) +{ + // Command format: "recluster:" + + int rv = as_clustering_cluster_reform(); + + // TODO - resolve error condition further? + cf_dyn_buf_append_string(db, + rv == 0 ? "ok" : (rv == 1 ? "ignored-by-non-principal" : "ERROR")); + + return 0; +} + +int +info_command_jem_stats(char *name, char *params, cf_dyn_buf *db) +{ + cf_debug(AS_INFO, "jem_stats command received: params %s", params); + + /* + * Command Format: "jem-stats:{file=;options=;sites=}" [the "file", "options", and "sites" arguments are optional] + * + * Logs the JEMalloc statistics to the console or an optionally-specified file pathname. + * Options may be a string containing any of the characters "gmablh", as defined by jemalloc(3) man page. + * The "sites" parameter optionally specifies a file to dump memory accounting information to. 
+ * [Note: Any options are only used if an output file is specified.] + */ + + char param_str[100]; + int param_str_len = sizeof(param_str); + char *file = NULL, *options = NULL, *sites = NULL; + + param_str[0] = '\0'; + if (!as_info_parameter_get(params, "file", param_str, ¶m_str_len)) { + file = cf_strdup(param_str); + } + + param_str[0] = '\0'; + param_str_len = sizeof(param_str); + if (!as_info_parameter_get(params, "options", param_str, ¶m_str_len)) { + options = cf_strdup(param_str); + } + + param_str[0] = '\0'; + param_str_len = sizeof(param_str); + if (!as_info_parameter_get(params, "sites", param_str, ¶m_str_len)) { + sites = cf_strdup(param_str); + } + + cf_alloc_log_stats(file, options); + + if (file) { + cf_free(file); + } + + if (options) { + cf_free(options); + } + + if (sites) { + cf_alloc_log_site_infos(sites); + cf_free(sites); + } + + cf_dyn_buf_append_string(db, "ok"); + return 0; +} + +/* + * Print out System Metadata info. + */ +int +info_command_dump_smd(char *name, char *params, cf_dyn_buf *db) +{ + cf_debug(AS_INFO, "dump-smd command received: params %s", params); + + bool verbose = false; + char param_str[100]; + int param_str_len = sizeof(param_str); + + /* + * Command Format: "dump-smd:{verbose=}" [the "verbose" argument is optional] + * + * where is one of: {"true" | "false"} and defaults to "false". + */ + param_str[0] = '\0'; + if (!as_info_parameter_get(params, "verbose", param_str, ¶m_str_len)) { + if (!strncmp(param_str, "true", 5)) { + verbose = true; + } else if (!strncmp(param_str, "false", 6)) { + verbose = false; + } else { + cf_warning(AS_INFO, "The \"%s:\" command argument \"verbose\" value must be one of {\"true\", \"false\"}, not \"%s\"", name, param_str); + cf_dyn_buf_append_string(db, "error"); + return 0; + } + } + + as_smd_dump(verbose); + cf_dyn_buf_append_string(db, "ok"); + + return 0; +} + +/* + * Print out Secondary Index info. 
+ */ +int +info_command_dump_si(char *name, char *params, cf_dyn_buf *db) +{ + cf_debug(AS_INFO, "dump-si command received: params %s", params); + + char param_str[100]; + int param_str_len = sizeof(param_str); + char *nsname = NULL, *indexname = NULL, *filename = NULL; + bool verbose = false; + + /* + * Command Format: "dump-si:ns=;indexname=;filename=;{verbose=}" [the "file" and "verbose" arguments are optional] + * + * where is one of: {"true" | "false"} and defaults to "false". + */ + param_str[0] = '\0'; + if (!as_info_parameter_get(params, "ns", param_str, ¶m_str_len)) { + nsname = cf_strdup(param_str); + } else { + cf_warning(AS_INFO, "The \"%s:\" command requires an \"ns\" parameter", name); + cf_dyn_buf_append_string(db, "error"); + goto cleanup; + } + + param_str[0] = '\0'; + param_str_len = sizeof(param_str); + if (!as_info_parameter_get(params, "indexname", param_str, ¶m_str_len)) { + indexname = cf_strdup(param_str); + } else { + cf_warning(AS_INFO, "The \"%s:\" command requires a \"indexname\" parameter", name); + cf_dyn_buf_append_string(db, "error"); + goto cleanup; + } + + param_str[0] = '\0'; + param_str_len = sizeof(param_str); + if (!as_info_parameter_get(params, "file", param_str, ¶m_str_len)) { + filename = cf_strdup(param_str); + } else { + cf_warning(AS_INFO, "The \"%s:\" command requires a \"filename\" parameter", name); + cf_dyn_buf_append_string(db, "error"); + goto cleanup; + } + + + param_str[0] = '\0'; + if (!as_info_parameter_get(params, "verbose", param_str, ¶m_str_len)) { + if (!strncmp(param_str, "true", 5)) { + verbose = true; + } else if (!strncmp(param_str, "false", 6)) { + verbose = false; + } else { + cf_warning(AS_INFO, "The \"%s:\" command argument \"verbose\" value must be one of {\"true\", \"false\"}, not \"%s\"", name, param_str); + cf_dyn_buf_append_string(db, "error"); + goto cleanup; + } + } + + as_sindex_dump(nsname, indexname, filename, verbose); + cf_dyn_buf_append_string(db, "ok"); + + + cleanup: + if (nsname) { + 
cf_free(nsname); + } + + if (indexname) { + cf_free(indexname); + } + + if (filename) { + cf_free(filename); + } + + return 0; +} + +/* + * Print out clock skew information. + */ +int +info_command_dump_skew(char *name, char *params, cf_dyn_buf *db) +{ + cf_debug(AS_INFO, "dump-skew command received: params %s", params); + + /* + * Command Format: "dump-skew:" + */ + as_skew_monitor_dump(); + cf_dyn_buf_append_string(db, "ok"); + return 0; +} + +int +info_command_mon_cmd(char *name, char *params, cf_dyn_buf *db) +{ + cf_debug(AS_INFO, "add-module command received: params %s", params); + + /* + * Command Format: "jobs:[module=;cmd=;]" + * asinfo -v 'jobs' -> list all jobs + * asinfo -v 'jobs:module=query' -> list all jobs for query module + * asinfo -v 'jobs:module=query;cmd=kill-job;trid=' + * asinfo -v 'jobs:module=query;cmd=set-priority;trid=;value=' + * + * where is one of following: + * - query + * - scan + */ + + char cmd[13]; + char module[21]; + char job_id[24]; + char val_str[11]; + int cmd_len = sizeof(cmd); + int module_len = sizeof(module); + int job_id_len = sizeof(job_id); + int val_len = sizeof(val_str); + uint64_t trid = 0; + uint32_t value = 0; + + cmd[0] = '\0'; + module[0] = '\0'; + job_id[0] = '\0'; + val_str[0] = '\0'; + + // Read the parameters: module cmd trid value + int rv = as_info_parameter_get(params, "module", module, &module_len); + if (rv == -1) { + as_mon_info_cmd(NULL, NULL, 0, 0, db); + return 0; + } + else if (rv == -2) { + cf_dyn_buf_append_string(db, "ERROR:"); + cf_dyn_buf_append_int(db, AS_PROTO_RESULT_FAIL_PARAMETER); + cf_dyn_buf_append_string(db, ":\"module\" parameter too long (> "); + cf_dyn_buf_append_int(db, module_len-1); + cf_dyn_buf_append_string(db, " chars)"); + return 0; + } + + rv = as_info_parameter_get(params, "cmd", cmd, &cmd_len); + if (rv == -1) { + as_mon_info_cmd(module, NULL, 0, 0, db); + return 0; + } + else if (rv == -2) { + cf_dyn_buf_append_string(db, "ERROR:"); + cf_dyn_buf_append_int(db, 
AS_PROTO_RESULT_FAIL_PARAMETER); + cf_dyn_buf_append_string(db, ":\"cmd\" parameter too long (> "); + cf_dyn_buf_append_int(db, cmd_len-1); + cf_dyn_buf_append_string(db, " chars)"); + return 0; + } + + rv = as_info_parameter_get(params, "trid", job_id, &job_id_len); + if (rv == 0) { + trid = strtoull(job_id, NULL, 10); + } + else if (rv == -1) { + cf_dyn_buf_append_string(db, "ERROR:"); + cf_dyn_buf_append_int(db, AS_PROTO_RESULT_FAIL_PARAMETER); + cf_dyn_buf_append_string(db, ":no \"trid\" parameter specified"); + return 0; + } + else if (rv == -2) { + cf_dyn_buf_append_string(db, "ERROR:"); + cf_dyn_buf_append_int(db, AS_PROTO_RESULT_FAIL_PARAMETER); + cf_dyn_buf_append_string(db, ":\"trid\" parameter too long (> "); + cf_dyn_buf_append_int(db, job_id_len-1); + cf_dyn_buf_append_string(db, " chars)"); + return 0; + } + + rv = as_info_parameter_get(params, "value", val_str, &val_len); + if (rv == 0) { + value = strtoul(val_str, NULL, 10); + } + else if (rv == -2) { + cf_dyn_buf_append_string(db, "ERROR:"); + cf_dyn_buf_append_int(db, AS_PROTO_RESULT_FAIL_PARAMETER); + cf_dyn_buf_append_string(db, ":\"value\" parameter too long (> "); + cf_dyn_buf_append_int(db, val_len-1); + cf_dyn_buf_append_string(db, " chars)"); + return 0; + } + + cf_info(AS_INFO, "%s %s %lu %u", module, cmd, trid, value); + as_mon_info_cmd(module, cmd, trid, value, db); + return 0; +} + + +static const char * +debug_allocations_string(void) +{ + switch (g_config.debug_allocations) { + case CF_ALLOC_DEBUG_NONE: + return "none"; + + case CF_ALLOC_DEBUG_TRANSIENT: + return "transient"; + + case CF_ALLOC_DEBUG_PERSISTENT: + return "persistent"; + + case CF_ALLOC_DEBUG_ALL: + return "all"; + + default: + cf_crash(CF_ALLOC, "invalid CF_ALLOC_DEBUG_* value"); + return NULL; + } +} + +static const char * +auto_pin_string(void) +{ + switch (g_config.auto_pin) { + case CF_TOPO_AUTO_PIN_NONE: + return "none"; + + case CF_TOPO_AUTO_PIN_CPU: + return "cpu"; + + case CF_TOPO_AUTO_PIN_NUMA: + return 
"numa"; + + default: + cf_crash(CF_ALLOC, "invalid CF_TOPO_AUTO_* value"); + return NULL; + } +} + +void +info_service_config_get(cf_dyn_buf *db) +{ + // Note - no user, group. + info_append_uint32(db, "paxos-single-replica-limit", g_config.paxos_single_replica_limit); + info_append_string_safe(db, "pidfile", g_config.pidfile); + info_append_int(db, "proto-fd-max", g_config.n_proto_fd_max); + + info_append_bool(db, "advertise-ipv6", cf_socket_advertises_ipv6()); + info_append_string(db, "auto-pin", auto_pin_string()); + info_append_int(db, "batch-threads", g_config.n_batch_threads); + info_append_uint32(db, "batch-max-buffers-per-queue", g_config.batch_max_buffers_per_queue); + info_append_uint32(db, "batch-max-requests", g_config.batch_max_requests); + info_append_uint32(db, "batch-max-unused-buffers", g_config.batch_max_unused_buffers); + info_append_uint32(db, "batch-priority", g_config.batch_priority); + info_append_uint32(db, "batch-index-threads", g_config.n_batch_index_threads); + + char cluster_name[AS_CLUSTER_NAME_SZ]; + info_get_printable_cluster_name(cluster_name); + info_append_string(db, "cluster-name", cluster_name); + + info_append_bool(db, "enable-benchmarks-fabric", g_config.fabric_benchmarks_enabled); + info_append_bool(db, "enable-benchmarks-svc", g_config.svc_benchmarks_enabled); + info_append_bool(db, "enable-hist-info", g_config.info_hist_enabled); + info_append_string(db, "feature-key-file", g_config.feature_key_file); + info_append_uint32(db, "hist-track-back", g_config.hist_track_back); + info_append_uint32(db, "hist-track-slice", g_config.hist_track_slice); + info_append_string_safe(db, "hist-track-thresholds", g_config.hist_track_thresholds); + info_append_int(db, "info-threads", g_config.n_info_threads); + info_append_bool(db, "log-local-time", cf_fault_is_using_local_time()); + info_append_uint32(db, "migrate-max-num-incoming", g_config.migrate_max_num_incoming); + info_append_uint32(db, "migrate-threads", g_config.n_migrate_threads); + 
info_append_uint32(db, "min-cluster-size", g_config.clustering_config.cluster_size_min); + info_append_uint64_x(db, "node-id", g_config.self_node); // may be configured or auto-generated + info_append_string_safe(db, "node-id-interface", g_config.node_id_interface); + info_append_uint32(db, "nsup-delete-sleep", g_config.nsup_delete_sleep); + info_append_uint32(db, "nsup-period", g_config.nsup_period); + info_append_bool(db, "nsup-startup-evict", g_config.nsup_startup_evict); + info_append_int(db, "proto-fd-idle-ms", g_config.proto_fd_idle_ms); + info_append_int(db, "proto-slow-netio-sleep-ms", g_config.proto_slow_netio_sleep_ms); // dynamic only + info_append_uint32(db, "query-batch-size", g_config.query_bsize); + info_append_uint32(db, "query-buf-size", g_config.query_buf_size); // dynamic only + info_append_uint32(db, "query-bufpool-size", g_config.query_bufpool_size); + info_append_bool(db, "query-in-transaction-thread", g_config.query_in_transaction_thr); + info_append_uint32(db, "query-long-q-max-size", g_config.query_long_q_max_size); + info_append_bool(db, "query-microbenchmark", g_config.query_enable_histogram); // dynamic only + info_append_bool(db, "query-pre-reserve-partitions", g_config.partitions_pre_reserved); + info_append_uint32(db, "query-priority", g_config.query_priority); + info_append_uint64(db, "query-priority-sleep-us", g_config.query_sleep_us); + info_append_uint64(db, "query-rec-count-bound", g_config.query_rec_count_bound); + info_append_bool(db, "query-req-in-query-thread", g_config.query_req_in_query_thread); + info_append_uint32(db, "query-req-max-inflight", g_config.query_req_max_inflight); + info_append_uint32(db, "query-short-q-max-size", g_config.query_short_q_max_size); + info_append_uint32(db, "query-threads", g_config.query_threads); + info_append_uint32(db, "query-threshold", g_config.query_threshold); + info_append_uint64(db, "query-untracked-time-ms", g_config.query_untracked_time_ms); + info_append_uint32(db, 
"query-worker-threads", g_config.query_worker_threads); + info_append_bool(db, "run-as-daemon", g_config.run_as_daemon); + info_append_uint32(db, "scan-max-active", g_config.scan_max_active); + info_append_uint32(db, "scan-max-done", g_config.scan_max_done); + info_append_uint32(db, "scan-max-udf-transactions", g_config.scan_max_udf_transactions); + info_append_uint32(db, "scan-threads", g_config.scan_threads); + info_append_uint32(db, "service-threads", g_config.n_service_threads); + info_append_uint32(db, "sindex-builder-threads", g_config.sindex_builder_threads); + info_append_uint32(db, "sindex-gc-max-rate", g_config.sindex_gc_max_rate); + info_append_uint32(db, "sindex-gc-period", g_config.sindex_gc_period); + info_append_uint32(db, "ticker-interval", g_config.ticker_interval); + info_append_int(db, "transaction-max-ms", (int)(g_config.transaction_max_ns / 1000000)); + info_append_uint32(db, "transaction-pending-limit", g_config.transaction_pending_limit); + info_append_uint32(db, "transaction-queues", g_config.n_transaction_queues); + info_append_uint32(db, "transaction-retry-ms", g_config.transaction_retry_ms); + info_append_uint32(db, "transaction-threads-per-queue", g_config.n_transaction_threads_per_queue); + info_append_string_safe(db, "work-directory", g_config.work_directory); + + info_append_string(db, "debug-allocations", debug_allocations_string()); + info_append_bool(db, "fabric-dump-msgs", g_config.fabric_dump_msgs); + info_append_uint32(db, "prole-extra-ttl", g_config.prole_extra_ttl); +} + +static void +append_addrs(cf_dyn_buf *db, const char *name, const cf_addr_list *list) +{ + for (uint32_t i = 0; i < list->n_addrs; ++i) { + info_append_string(db, name, list->addrs[i]); + } +} + +void +info_network_config_get(cf_dyn_buf *db) +{ + // Service: + + info_append_int(db, "service.port", g_config.service.bind_port); + append_addrs(db, "service.address", &g_config.service.bind); + info_append_int(db, "service.access-port", g_config.service.std_port); 
+ append_addrs(db, "service.access-address", &g_config.service.std); + info_append_int(db, "service.alternate-access-port", g_config.service.alt_port); + append_addrs(db, "service.alternate-access-address", &g_config.service.alt); + + info_append_int(db, "service.tls-port", g_config.tls_service.bind_port); + append_addrs(db, "service.tls-address", &g_config.tls_service.bind); + info_append_int(db, "service.tls-access-port", g_config.tls_service.std_port); + append_addrs(db, "service.tls-access-address", &g_config.tls_service.std); + info_append_int(db, "service.tls-alternate-access-port", g_config.tls_service.alt_port); + append_addrs(db, "service.tls-alternate-access-address", &g_config.tls_service.alt); + info_append_string_safe(db, "service.tls-name", g_config.tls_service.tls_our_name); + + for (uint32_t i = 0; i < g_config.tls_service.n_tls_peer_names; ++i) { + info_append_string(db, "service.tls-authenticate-client", + g_config.tls_service.tls_peer_names[i]); + } + + // Heartbeat: + + as_hb_info_config_get(db); + + // Fabric: + + append_addrs(db, "fabric.address", &g_config.fabric.bind); + info_append_int(db, "fabric.port", g_config.fabric.bind_port); + append_addrs(db, "fabric.tls-address", &g_config.tls_fabric.bind); + info_append_int(db, "fabric.tls-port", g_config.tls_fabric.bind_port); + info_append_string_safe(db, "fabric.tls-name", g_config.tls_fabric.tls_our_name); + info_append_int(db, "fabric.channel-bulk-fds", g_config.n_fabric_channel_fds[AS_FABRIC_CHANNEL_BULK]); + info_append_int(db, "fabric.channel-bulk-recv-threads", g_config.n_fabric_channel_recv_threads[AS_FABRIC_CHANNEL_BULK]); + info_append_int(db, "fabric.channel-ctrl-fds", g_config.n_fabric_channel_fds[AS_FABRIC_CHANNEL_CTRL]); + info_append_int(db, "fabric.channel-ctrl-recv-threads", g_config.n_fabric_channel_recv_threads[AS_FABRIC_CHANNEL_CTRL]); + info_append_int(db, "fabric.channel-meta-fds", g_config.n_fabric_channel_fds[AS_FABRIC_CHANNEL_META]); + info_append_int(db, 
"fabric.channel-meta-recv-threads", g_config.n_fabric_channel_recv_threads[AS_FABRIC_CHANNEL_META]); + info_append_int(db, "fabric.channel-rw-fds", g_config.n_fabric_channel_fds[AS_FABRIC_CHANNEL_RW]); + info_append_int(db, "fabric.channel-rw-recv-threads", g_config.n_fabric_channel_recv_threads[AS_FABRIC_CHANNEL_RW]); + info_append_bool(db, "fabric.keepalive-enabled", g_config.fabric_keepalive_enabled); + info_append_int(db, "fabric.keepalive-intvl", g_config.fabric_keepalive_intvl); + info_append_int(db, "fabric.keepalive-probes", g_config.fabric_keepalive_probes); + info_append_int(db, "fabric.keepalive-time", g_config.fabric_keepalive_time); + info_append_int(db, "fabric.latency-max-ms", g_config.fabric_latency_max_ms); + info_append_int(db, "fabric.recv-rearm-threshold", g_config.fabric_recv_rearm_threshold); + info_append_int(db, "fabric.send-threads", g_config.n_fabric_send_threads); + + // Info: + + append_addrs(db, "info.address", &g_config.info.bind); + info_append_int(db, "info.port", g_config.info.bind_port); + + // TLS: + + for (uint32_t i = 0; i < g_config.n_tls_specs; ++i) { + cf_tls_spec *spec = g_config.tls_specs + i; + char key[100]; + + snprintf(key, sizeof(key), "tls[%u].name", i); + info_append_string_safe(db, key, spec->name); + + snprintf(key, sizeof(key), "tls[%u].cert_file", i); + info_append_string_safe(db, key, spec->cert_file); + + snprintf(key, sizeof(key), "tls[%u].key_file", i); + info_append_string_safe(db, key, spec->key_file); + + snprintf(key, sizeof(key), "tls[%u].ca_file", i); + info_append_string_safe(db, key, spec->ca_file); + + snprintf(key, sizeof(key), "tls[%u].ca_path", i); + info_append_string_safe(db, key, spec->ca_path); + + snprintf(key, sizeof(key), "tls[%u].cert_blacklist", i); + info_append_string_safe(db, key, spec->cert_blacklist); + + snprintf(key, sizeof(key), "tls[%u].protocols", i); + info_append_string_safe(db, key, spec->protocols); + + snprintf(key, sizeof(key), "tls[%u].cipher_suite", i); + 
info_append_string_safe(db, key, spec->cipher_suite); + } +} + + +void +info_namespace_config_get(char* context, cf_dyn_buf *db) +{ + as_namespace *ns = as_namespace_get_byname(context); + + if (! ns) { + cf_dyn_buf_append_string(db, "namespace not found;"); // TODO - start with "error"? + return; + } + + info_append_uint32(db, "replication-factor", ns->cfg_replication_factor); + info_append_uint64(db, "memory-size", ns->memory_size); + info_append_uint64(db, "default-ttl", ns->default_ttl); + + info_append_bool(db, "enable-xdr", ns->enable_xdr); + info_append_bool(db, "sets-enable-xdr", ns->sets_enable_xdr); + info_append_bool(db, "ns-forward-xdr-writes", ns->ns_forward_xdr_writes); + info_append_bool(db, "allow-nonxdr-writes", ns->ns_allow_nonxdr_writes); + info_append_bool(db, "allow-xdr-writes", ns->ns_allow_xdr_writes); + + // Not true config, but act as config overrides: + cf_hist_track_get_settings(ns->read_hist, db); + cf_hist_track_get_settings(ns->query_hist, db); + cf_hist_track_get_settings(ns->udf_hist, db); + cf_hist_track_get_settings(ns->write_hist, db); + + info_append_uint32(db, "cold-start-evict-ttl", ns->cold_start_evict_ttl); + + if (ns->conflict_resolution_policy == AS_NAMESPACE_CONFLICT_RESOLUTION_POLICY_GENERATION) { + info_append_string(db, "conflict-resolution-policy", "generation"); + } + else if (ns->conflict_resolution_policy == AS_NAMESPACE_CONFLICT_RESOLUTION_POLICY_LAST_UPDATE_TIME) { + info_append_string(db, "conflict-resolution-policy", "last-update-time"); + } + else { + info_append_string(db, "conflict-resolution-policy", "undefined"); + } + + info_append_bool(db, "data-in-index", ns->data_in_index); + info_append_bool(db, "disable-write-dup-res", ns->write_dup_res_disabled); + info_append_bool(db, "disallow-null-setname", ns->disallow_null_setname); + info_append_bool(db, "enable-benchmarks-batch-sub", ns->batch_sub_benchmarks_enabled); + info_append_bool(db, "enable-benchmarks-read", ns->read_benchmarks_enabled); + 
info_append_bool(db, "enable-benchmarks-udf", ns->udf_benchmarks_enabled); + info_append_bool(db, "enable-benchmarks-udf-sub", ns->udf_sub_benchmarks_enabled); + info_append_bool(db, "enable-benchmarks-write", ns->write_benchmarks_enabled); + info_append_bool(db, "enable-hist-proxy", ns->proxy_hist_enabled); + info_append_uint32(db, "evict-hist-buckets", ns->evict_hist_buckets); + info_append_uint32(db, "evict-tenths-pct", ns->evict_tenths_pct); + info_append_uint32(db, "high-water-disk-pct", ns->hwm_disk_pct); + info_append_uint32(db, "high-water-memory-pct", ns->hwm_memory_pct); + info_append_uint64(db, "max-ttl", ns->max_ttl); + info_append_uint32(db, "migrate-order", ns->migrate_order); + info_append_uint32(db, "migrate-retransmit-ms", ns->migrate_retransmit_ms); + info_append_uint32(db, "migrate-sleep", ns->migrate_sleep); + info_append_uint32(db, "obj-size-hist-max", ns->obj_size_hist_max); // not original, may have been rounded + info_append_uint32(db, "partition-tree-locks", ns->tree_shared.n_lock_pairs); + info_append_uint32(db, "partition-tree-sprigs", ns->tree_shared.n_sprigs); + info_append_uint32(db, "rack-id", ns->rack_id); + info_append_string(db, "read-consistency-level-override", NS_READ_CONSISTENCY_LEVEL_NAME()); + info_append_bool(db, "single-bin", ns->single_bin); + info_append_uint32(db, "stop-writes-pct", ns->stop_writes_pct); + info_append_bool(db, "strong-consistency", ns->cp); + info_append_bool(db, "strong-consistency-allow-expunge", ns->cp_allow_drops); + info_append_uint32(db, "tomb-raider-eligible-age", ns->tomb_raider_eligible_age); + info_append_uint32(db, "tomb-raider-period", ns->tomb_raider_period); + info_append_string(db, "write-commit-level-override", NS_WRITE_COMMIT_LEVEL_NAME()); + + info_append_string(db, "storage-engine", + (ns->storage_type == AS_STORAGE_ENGINE_MEMORY ? "memory" : + (ns->storage_type == AS_STORAGE_ENGINE_SSD ? 
"device" : "illegal"))); + + if (ns->storage_type == AS_STORAGE_ENGINE_SSD) { + for (int i = 0; i < AS_STORAGE_MAX_DEVICES; i++) { + if (! ns->storage_devices[i]) { + break; + } + + info_append_string(db, "storage-engine.device", ns->storage_devices[i]); + } + + for (int i = 0; i < AS_STORAGE_MAX_FILES; i++) { + if (! ns->storage_files[i]) { + break; + } + + info_append_string(db, "storage-engine.file", ns->storage_files[i]); + } + + // TODO - how to report the shadows? + + info_append_uint64(db, "storage-engine.filesize", ns->storage_filesize); + info_append_string_safe(db, "storage-engine.scheduler-mode", ns->storage_scheduler_mode); + info_append_uint32(db, "storage-engine.write-block-size", ns->storage_write_block_size); + info_append_bool(db, "storage-engine.data-in-memory", ns->storage_data_in_memory); + info_append_bool(db, "storage-engine.cold-start-empty", ns->storage_cold_start_empty); + info_append_bool(db, "storage-engine.commit-to-device", ns->storage_commit_to_device); + info_append_uint32(db, "storage-engine.commit-min-size", ns->storage_commit_min_size); + info_append_uint32(db, "storage-engine.defrag-lwm-pct", ns->storage_defrag_lwm_pct); + info_append_uint32(db, "storage-engine.defrag-queue-min", ns->storage_defrag_queue_min); + info_append_uint32(db, "storage-engine.defrag-sleep", ns->storage_defrag_sleep); + info_append_int(db, "storage-engine.defrag-startup-minimum", ns->storage_defrag_startup_minimum); + info_append_bool(db, "storage-engine.disable-odirect", ns->storage_disable_odirect); + info_append_bool(db, "storage-engine.enable-benchmarks-storage", ns->storage_benchmarks_enabled); + info_append_bool(db, "storage-engine.enable-osync", ns->storage_enable_osync); + info_append_string_safe(db, "storage-engine.encryption-key-file", ns->storage_encryption_key_file); + info_append_uint64(db, "storage-engine.flush-max-ms", ns->storage_flush_max_us / 1000); + info_append_uint64(db, "storage-engine.fsync-max-sec", ns->storage_fsync_max_us / 
1000000); + info_append_uint64(db, "storage-engine.max-write-cache", ns->storage_max_write_cache); + info_append_uint32(db, "storage-engine.min-avail-pct", ns->storage_min_avail_pct); + info_append_uint32(db, "storage-engine.post-write-queue", ns->storage_post_write_queue); + info_append_uint32(db, "storage-engine.tomb-raider-sleep", ns->storage_tomb_raider_sleep); + info_append_uint32(db, "storage-engine.write-threads", ns->storage_write_threads); + } + + info_append_uint32(db, "sindex.num-partitions", ns->sindex_num_partitions); + + info_append_bool(db, "geo2dsphere-within.strict", ns->geo2dsphere_within_strict); + info_append_uint32(db, "geo2dsphere-within.min-level", (uint32_t)ns->geo2dsphere_within_min_level); + info_append_uint32(db, "geo2dsphere-within.max-level", (uint32_t)ns->geo2dsphere_within_max_level); + info_append_uint32(db, "geo2dsphere-within.max-cells", (uint32_t)ns->geo2dsphere_within_max_cells); + info_append_uint32(db, "geo2dsphere-within.level-mod", (uint32_t)ns->geo2dsphere_within_level_mod); + info_append_uint32(db, "geo2dsphere-within.earth-radius-meters", ns->geo2dsphere_within_earth_radius_meters); +} + + +// TODO - security API? 
+void +info_security_config_get(cf_dyn_buf *db) +{ + info_append_bool(db, "enable-security", g_config.sec_cfg.security_enabled); + info_append_uint32(db, "privilege-refresh-period", g_config.sec_cfg.privilege_refresh_period); + info_append_uint32(db, "report-authentication-sinks", g_config.sec_cfg.report.authentication); + info_append_uint32(db, "report-data-op-sinks", g_config.sec_cfg.report.data_op); + info_append_uint32(db, "report-sys-admin-sinks", g_config.sec_cfg.report.sys_admin); + info_append_uint32(db, "report-user-admin-sinks", g_config.sec_cfg.report.user_admin); + info_append_uint32(db, "report-violation-sinks", g_config.sec_cfg.report.violation); + info_append_int(db, "syslog-local", g_config.sec_cfg.syslog_local); +} + + +void +info_command_config_get_with_params(char *name, char *params, cf_dyn_buf *db) +{ + char context[1024]; + int context_len = sizeof(context); + + if (as_info_parameter_get(params, "context", context, &context_len) != 0) { + cf_dyn_buf_append_string(db, "Error: Invalid get-config parameter;"); + return; + } + + if (strcmp(context, "service") == 0) { + info_service_config_get(db); + } + else if (strcmp(context, "network") == 0) { + info_network_config_get(db); + } + else if (strcmp(context, "namespace") == 0) { + context_len = sizeof(context); + + if (as_info_parameter_get(params, "id", context, &context_len) != 0) { + cf_dyn_buf_append_string(db, "Error:invalid id;"); + return; + } + + info_namespace_config_get(context, db); + } + else if (strcmp(context, "security") == 0) { + info_security_config_get(db); + } + else if (strcmp(context, "xdr") == 0) { + as_xdr_get_config(db); + } + else { + cf_dyn_buf_append_string(db, "Error:Invalid context;"); + } +} + + +int +info_command_config_get(char *name, char *params, cf_dyn_buf *db) +{ + cf_debug(AS_INFO, "config-get command received: params %s", params); + + if (params && *params != 0) { + info_command_config_get_with_params(name, params, db); + cf_dyn_buf_chomp(db); + return 0; + } + 
+ // We come here when context is not mentioned. + // In that case we want to print everything. + info_service_config_get(db); + info_network_config_get(db); + info_security_config_get(db); + as_xdr_get_config(db); + + cf_dyn_buf_chomp(db); + + return 0; +} + + +// +// config-set:context=service;variable=value; +// config-set:context=network;variable=heartbeat.value; +// config-set:context=namespace;id=test;variable=value; +// +int +info_command_config_set_threadsafe(char *name, char *params, cf_dyn_buf *db) +{ + cf_debug(AS_INFO, "config-set command received: params %s", params); + + char context[1024]; + int context_len = sizeof(context); + int val; + char bool_val[2][6] = {"false", "true"}; + + if (0 != as_info_parameter_get(params, "context", context, &context_len)) + goto Error; + if (strcmp(context, "service") == 0) { + context_len = sizeof(context); + if (0 == as_info_parameter_get(params, "advertise-ipv6", context, &context_len)) { + if (strcmp(context, "true") == 0 || strcmp(context, "yes") == 0) { + cf_socket_set_advertise_ipv6(true); + } + else if (strcmp(context, "false") == 0 || strcmp(context, "no") == 0) { + cf_socket_set_advertise_ipv6(false); + } + else { + goto Error; + } + } + else if (0 == as_info_parameter_get(params, "transaction-threads-per-queue", context, &context_len)) { + if (0 != cf_str_atoi(context, &val)) { + goto Error; + } + if (val < 1 || val > MAX_TRANSACTION_THREADS_PER_QUEUE) { + cf_warning(AS_INFO, "transaction-threads-per-queue must be between 1 and %u", MAX_TRANSACTION_THREADS_PER_QUEUE); + goto Error; + } + cf_info(AS_INFO, "Changing value of transaction-threads-per-queue from %u to %d ", g_config.n_transaction_threads_per_queue, val); + as_tsvc_set_threads_per_queue((uint32_t)val); + } + else if (0 == as_info_parameter_get(params, "transaction-retry-ms", context, &context_len)) { + if (0 != cf_str_atoi(context, &val)) + goto Error; + if (val == 0) + goto Error; + cf_info(AS_INFO, "Changing value of transaction-retry-ms from 
%d to %d ", g_config.transaction_retry_ms, val); + g_config.transaction_retry_ms = val; + } + else if (0 == as_info_parameter_get(params, "transaction-max-ms", context, &context_len)) { + if (0 != cf_str_atoi(context, &val)) + goto Error; + cf_info(AS_INFO, "Changing value of transaction-retry-ms from %"PRIu64" to %d ", (g_config.transaction_max_ns / 1000000), val); + g_config.transaction_max_ns = (uint64_t)val * 1000000; + } + else if (0 == as_info_parameter_get(params, "transaction-pending-limit", context, &context_len)) { + if (0 != cf_str_atoi(context, &val)) + goto Error; + cf_info(AS_INFO, "Changing value of transaction-pending-limit from %d to %d ", g_config.transaction_pending_limit, val); + g_config.transaction_pending_limit = val; + } + else if (0 == as_info_parameter_get(params, "ticker-interval", context, &context_len)) { + if (0 != cf_str_atoi(context, &val)) + goto Error; + cf_info(AS_INFO, "Changing value of ticker-interval from %d to %d ", g_config.ticker_interval, val); + g_config.ticker_interval = val; + } + else if (0 == as_info_parameter_get(params, "scan-max-active", context, &context_len)) { + if (0 != cf_str_atoi(context, &val)) + goto Error; + if (val < 0 || val > 200) { + goto Error; + } + cf_info(AS_INFO, "Changing value of scan-max-active from %d to %d ", g_config.scan_max_active, val); + g_config.scan_max_active = val; + as_scan_limit_active_jobs(g_config.scan_max_active); + } + else if (0 == as_info_parameter_get(params, "scan-max-done", context, &context_len)) { + if (0 != cf_str_atoi(context, &val)) + goto Error; + if (val < 0 || val > 1000) { + goto Error; + } + cf_info(AS_INFO, "Changing value of scan-max-done from %d to %d ", g_config.scan_max_done, val); + g_config.scan_max_done = val; + as_scan_limit_finished_jobs(g_config.scan_max_done); + } + else if (0 == as_info_parameter_get(params, "scan-max-udf-transactions", context, &context_len)) { + if (0 != cf_str_atoi(context, &val)) + goto Error; + cf_info(AS_INFO, "Changing value 
of scan-max-udf-transactions from %d to %d ", g_config.scan_max_udf_transactions, val); + g_config.scan_max_udf_transactions = val; + } + else if (0 == as_info_parameter_get(params, "scan-threads", context, &context_len)) { + if (0 != cf_str_atoi(context, &val)) + goto Error; + if (val < 0 || val > 128) { + goto Error; + } + cf_info(AS_INFO, "Changing value of scan-threads from %d to %d ", g_config.scan_threads, val); + g_config.scan_threads = val; + as_scan_resize_thread_pool(g_config.scan_threads); + } + else if (0 == as_info_parameter_get(params, "batch-index-threads", context, &context_len)) { + if (0 != cf_str_atoi(context, &val)) + goto Error; + if (0 != as_batch_threads_resize(val)) + goto Error; + } + else if (0 == as_info_parameter_get(params, "batch-threads", context, &context_len)) { + if (0 != cf_str_atoi(context, &val)) + goto Error; + if (0 != as_batch_direct_threads_resize(val)) + goto Error; + } + else if (0 == as_info_parameter_get(params, "batch-max-requests", context, &context_len)) { + if (0 != cf_str_atoi(context, &val)) + goto Error; + cf_info(AS_INFO, "Changing value of batch-max-requests from %d to %d ", g_config.batch_max_requests, val); + g_config.batch_max_requests = val; + } + else if (0 == as_info_parameter_get(params, "batch-max-buffers-per-queue", context, &context_len)) { + if (0 != cf_str_atoi(context, &val)) + goto Error; + cf_info(AS_INFO, "Changing value of batch-max-buffers-per-queue from %d to %d ", g_config.batch_max_buffers_per_queue, val); + g_config.batch_max_buffers_per_queue = val; + } + else if (0 == as_info_parameter_get(params, "batch-max-unused-buffers", context, &context_len)) { + if (0 != cf_str_atoi(context, &val)) + goto Error; + cf_info(AS_INFO, "Changing value of batch-max-unused-buffers from %d to %d ", g_config.batch_max_unused_buffers, val); + g_config.batch_max_unused_buffers = val; + } + else if (0 == as_info_parameter_get(params, "batch-priority", context, &context_len)) { + if (0 != cf_str_atoi(context, 
&val)) + goto Error; + cf_info(AS_INFO, "Changing value of batch-priority from %d to %d ", g_config.batch_priority, val); + g_config.batch_priority = val; + } + else if (0 == as_info_parameter_get(params, "proto-fd-max", context, &context_len)) { + if (0 != cf_str_atoi(context, &val)) + goto Error; + cf_info(AS_INFO, "Changing value of proto-fd-max from %d to %d ", g_config.n_proto_fd_max, val); + g_config.n_proto_fd_max = val; + } + else if (0 == as_info_parameter_get(params, "proto-fd-idle-ms", context, &context_len)) { + if (0 != cf_str_atoi(context, &val)) + goto Error; + cf_info(AS_INFO, "Changing value of proto-fd-idle-ms from %d to %d ", g_config.proto_fd_idle_ms, val); + g_config.proto_fd_idle_ms = val; + } + else if (0 == as_info_parameter_get(params, "proto-slow-netio-sleep-ms", context, &context_len)) { + if (0 != cf_str_atoi(context, &val)) + goto Error; + cf_info(AS_INFO, "Changing value of proto-slow-netio-sleep-ms from %d to %d ", g_config.proto_slow_netio_sleep_ms, val); + g_config.proto_slow_netio_sleep_ms = val; + } + else if (0 == as_info_parameter_get(params, "nsup-delete-sleep", context, &context_len)) { + if (0 != cf_str_atoi(context, &val)) + goto Error; + cf_info(AS_INFO, "Changing value of nsup-delete-sleep from %d to %d ", g_config.nsup_delete_sleep, val); + g_config.nsup_delete_sleep = val; + } + else if (0 == as_info_parameter_get(params, "nsup-period", context, &context_len)) { + if (0 != cf_str_atoi(context, &val)) + goto Error; + cf_info(AS_INFO, "Changing value of nsup-period from %d to %d ", g_config.nsup_period, val); + g_config.nsup_period = val; + } + else if (0 == as_info_parameter_get( params, "cluster-name", context, &context_len)){ + if (!as_config_cluster_name_set(context)) { + goto Error; + } + cf_info(AS_INFO, "Changing value of cluster-name to '%s'", context); + } + else if (0 == as_info_parameter_get(params, "migrate-max-num-incoming", context, &context_len)) { + if (0 != cf_str_atoi(context, &val)) { + goto Error; + } + 
if ((uint32_t)val > AS_MIGRATE_LIMIT_MAX_NUM_INCOMING) { + cf_warning(AS_INFO, "migrate-max-num-incoming %d must be >= 0 and <= %u", val, AS_MIGRATE_LIMIT_MAX_NUM_INCOMING); + goto Error; + } + cf_info(AS_INFO, "Changing value of migrate-max-num-incoming from %u to %d ", g_config.migrate_max_num_incoming, val); + g_config.migrate_max_num_incoming = (uint32_t)val; + } + else if (0 == as_info_parameter_get(params, "migrate-threads", context, &context_len)) { + if (0 != cf_str_atoi(context, &val)) { + goto Error; + } + if ((uint32_t)val > MAX_NUM_MIGRATE_XMIT_THREADS) { + cf_warning(AS_INFO, "migrate-threads %d must be >= 0 and <= %u", val, MAX_NUM_MIGRATE_XMIT_THREADS); + goto Error; + } + cf_info(AS_INFO, "Changing value of migrate-threads from %u to %d ", g_config.n_migrate_threads, val); + as_migrate_set_num_xmit_threads(val); + } + else if (0 == as_info_parameter_get(params, "min-cluster-size", context, &context_len)) { + if (0 != cf_str_atoi(context, &val) || (0 > val) || (as_clustering_cluster_size_min_set(val) < 0)) + goto Error; + } + else if (0 == as_info_parameter_get(params, "prole-extra-ttl", context, &context_len)) { + if (0 != cf_str_atoi(context, &val)) { + goto Error; + } + cf_info(AS_INFO, "Changing value of prole-extra-ttl from %d to %d ", g_config.prole_extra_ttl, val); + g_config.prole_extra_ttl = val; + } + else if (0 == as_info_parameter_get(params, "query-buf-size", context, &context_len)) { + uint64_t val = atoll(context); + cf_debug(AS_INFO, "query-buf-size = %"PRIu64"", val); + if (val < 1024) { + goto Error; + } + cf_info(AS_INFO, "Changing value of query-buf-size from %"PRIu64" to %"PRIu64"", g_config.query_buf_size, val); + g_config.query_buf_size = val; + } + else if (0 == as_info_parameter_get(params, "query-threshold", context, &context_len)) { + uint64_t val = atoll(context); + cf_debug(AS_INFO, "query-threshold = %"PRIu64"", val); + if ((int64_t)val <= 0) { + goto Error; + } + cf_info(AS_INFO, "Changing value of query-threshold from 
%u to %"PRIu64, g_config.query_threshold, val); + g_config.query_threshold = val; + } + else if (0 == as_info_parameter_get(params, "query-untracked-time-ms", context, &context_len)) { + uint64_t val = atoll(context); + cf_debug(AS_INFO, "query-untracked-time = %"PRIu64" milli seconds", val); + if ((int64_t)val < 0) { + goto Error; + } + cf_info(AS_INFO, "Changing value of query-untracked-time from %"PRIu64" milli seconds to %"PRIu64" milli seconds", + g_config.query_untracked_time_ms, val); + g_config.query_untracked_time_ms = val; + } + else if (0 == as_info_parameter_get(params, "query-rec-count-bound", context, &context_len)) { + uint64_t val = atoll(context); + cf_debug(AS_INFO, "query-rec-count-bound = %"PRIu64"", val); + if ((int64_t)val <= 0) { + goto Error; + } + cf_info(AS_INFO, "Changing value of query-rec-count-bound from %"PRIu64" to %"PRIu64" ", g_config.query_rec_count_bound, val); + g_config.query_rec_count_bound = val; + } + else if (0 == as_info_parameter_get(params, "sindex-builder-threads", context, &context_len)) { + int val = 0; + if (0 != cf_str_atoi(context, &val) || (val > MAX_SINDEX_BUILDER_THREADS)) { + cf_warning(AS_INFO, "sindex-builder-threads: value must be <= %d, not %s", MAX_SINDEX_BUILDER_THREADS, context); + goto Error; + } + cf_info(AS_INFO, "Changing value of sindex-builder-threads from %u to %d", g_config.sindex_builder_threads, val); + g_config.sindex_builder_threads = (uint32_t)val; + as_sbld_resize_thread_pool(g_config.sindex_builder_threads); + } + else if (0 == as_info_parameter_get(params, "sindex-gc-max-rate", context, &context_len)) { + if (0 != cf_str_atoi(context, &val)) + goto Error; + cf_info(AS_INFO, "Changing value of sindex-gc-max-rate from %d to %d ", g_config.sindex_gc_max_rate, val); + g_config.sindex_gc_max_rate = (uint32_t)val; + } + else if (0 == as_info_parameter_get(params, "sindex-gc-period", context, &context_len)) { + if (0 != cf_str_atoi(context, &val)) + goto Error; + cf_info(AS_INFO, "Changing value 
of sindex-gc-period from %d to %d ", g_config.sindex_gc_period, val); + g_config.sindex_gc_period = (uint32_t)val; + } + else if (0 == as_info_parameter_get(params, "query-threads", context, &context_len)) { + uint64_t val = atoll(context); + cf_info(AS_INFO, "query-threads = %"PRIu64, val); + if (val == 0) { + cf_warning(AS_INFO, "query-threads should be a number %s", context); + goto Error; + } + int old_val = g_config.query_threads; + int new_val = 0; + if (as_query_reinit(val, &new_val) != AS_QUERY_OK) { + cf_warning(AS_INFO, "Config not changed."); + goto Error; + } + + cf_info(AS_INFO, "Changing value of query-threads from %d to %d", + old_val, new_val); + } + else if (0 == as_info_parameter_get(params, "query-worker-threads", context, &context_len)) { + uint64_t val = atoll(context); + cf_info(AS_INFO, "query-worker-threads = %"PRIu64, val); + if (val == 0) { + cf_warning(AS_INFO, "query-worker-threads should be a number %s", context); + goto Error; + } + int old_val = g_config.query_threads; + int new_val = 0; + if (as_query_worker_reinit(val, &new_val) != AS_QUERY_OK) { + cf_warning(AS_INFO, "Config not changed."); + goto Error; + } + cf_info(AS_INFO, "Changing value of query-worker-threads from %d to %d", + old_val, new_val); + } + else if (0 == as_info_parameter_get(params, "query-priority", context, &context_len)) { + uint64_t val = atoll(context); + cf_info(AS_INFO, "query_priority = %"PRIu64, val); + if (val == 0) { + cf_warning(AS_INFO, "query_priority should be a number %s", context); + goto Error; + } + cf_info(AS_INFO, "Changing value of query-priority from %d to %"PRIu64, g_config.query_priority, val); + g_config.query_priority = val; + } + else if (0 == as_info_parameter_get(params, "query-priority-sleep-us", context, &context_len)) { + uint64_t val = atoll(context); + if(val == 0) { + cf_warning(AS_INFO, "query_sleep should be a number %s", context); + goto Error; + } + cf_info(AS_INFO, "Changing value of query-sleep from %"PRIu64" uSec to 
%"PRIu64" uSec ", g_config.query_sleep_us, val); + g_config.query_sleep_us = val; + } + else if (0 == as_info_parameter_get(params, "query-batch-size", context, &context_len)) { + uint64_t val = atoll(context); + cf_info(AS_INFO, "query-batch-size = %"PRIu64, val); + if((int)val <= 0) { + cf_warning(AS_INFO, "query-batch-size should be a positive number"); + goto Error; + } + cf_info(AS_INFO, "Changing value of query-batch-size from %d to %"PRIu64, g_config.query_bsize, val); + g_config.query_bsize = val; + } + else if (0 == as_info_parameter_get(params, "query-req-max-inflight", context, &context_len)) { + uint64_t val = atoll(context); + cf_info(AS_INFO, "query-req-max-inflight = %"PRIu64, val); + if((int)val <= 0) { + cf_warning(AS_INFO, "query-req-max-inflight should be a positive number"); + goto Error; + } + cf_info(AS_INFO, "Changing value of query-req-max-inflight from %d to %"PRIu64, g_config.query_req_max_inflight, val); + g_config.query_req_max_inflight = val; + } + else if (0 == as_info_parameter_get(params, "query-bufpool-size", context, &context_len)) { + uint64_t val = atoll(context); + cf_info(AS_INFO, "query-bufpool-size = %"PRIu64, val); + if((int)val <= 0) { + cf_warning(AS_INFO, "query-bufpool-size should be a positive number"); + goto Error; + } + cf_info(AS_INFO, "Changing value of query-bufpool-size from %d to %"PRIu64, g_config.query_bufpool_size, val); + g_config.query_bufpool_size = val; + } + else if (0 == as_info_parameter_get(params, "query-in-transaction-thread", context, &context_len)) { + if (strncmp(context, "true", 4) == 0 || strncmp(context, "yes", 3) == 0) { + cf_info(AS_INFO, "Changing value of query-in-transaction-thread from %s to %s", bool_val[g_config.query_in_transaction_thr], context); + g_config.query_in_transaction_thr = true; + } + else if (strncmp(context, "false", 5) == 0 || strncmp(context, "no", 2) == 0) { + cf_info(AS_INFO, "Changing value of query-in-transaction-thread from %s to %s", 
bool_val[g_config.query_in_transaction_thr], context); + g_config.query_in_transaction_thr = false; + } + else + goto Error; + } + else if (0 == as_info_parameter_get(params, "query-req-in-query-thread", context, &context_len)) { + if (strncmp(context, "true", 4) == 0 || strncmp(context, "yes", 3) == 0) { + cf_info(AS_INFO, "Changing value of query-req-in-query-thread from %s to %s", bool_val[g_config.query_req_in_query_thread], context); + g_config.query_req_in_query_thread = true; + + } + else if (strncmp(context, "false", 5) == 0 || strncmp(context, "no", 2) == 0) { + cf_info(AS_INFO, "Changing value of query-req-in-query-thread from %s to %s", bool_val[g_config.query_req_in_query_thread], context); + g_config.query_req_in_query_thread = false; + } + else + goto Error; + } + else if (0 == as_info_parameter_get(params, "query-short-q-max-size", context, &context_len)) { + uint64_t val = atoll(context); + cf_info(AS_INFO, "query-short-q-max-size = %"PRIu64, val); + if((int)val <= 0) { + cf_warning(AS_INFO, "query-short-q-max-size should be a positive number"); + goto Error; + } + cf_info(AS_INFO, "Changing value of query-short-q-max-size from %d to %"PRIu64, g_config.query_short_q_max_size, val); + g_config.query_short_q_max_size = val; + } + else if (0 == as_info_parameter_get(params, "query-long-q-max-size", context, &context_len)) { + uint64_t val = atoll(context); + cf_info(AS_INFO, "query-long-q-max-size = %"PRIu64, val); + if((int)val <= 0) { + cf_warning(AS_INFO, "query-long-q-max-size should be a positive number"); + goto Error; + } + cf_info(AS_INFO, "Changing value of query-longq-max-size from %d to %"PRIu64, g_config.query_long_q_max_size, val); + g_config.query_long_q_max_size = val; + } + else if (0 == as_info_parameter_get(params, "enable-benchmarks-fabric", context, &context_len)) { + if (strncmp(context, "true", 4) == 0 || strncmp(context, "yes", 3) == 0) { + cf_info(AS_INFO, "Changing value of enable-benchmarks-fabric to %s", context); + 
g_config.fabric_benchmarks_enabled = true; + } + else if (strncmp(context, "false", 5) == 0 || strncmp(context, "no", 2) == 0) { + cf_info(AS_INFO, "Changing value of enable-benchmarks-fabric to %s", context); + g_config.fabric_benchmarks_enabled = false; + histogram_clear(g_stats.fabric_send_init_hists[AS_FABRIC_CHANNEL_BULK]); + histogram_clear(g_stats.fabric_send_fragment_hists[AS_FABRIC_CHANNEL_BULK]); + histogram_clear(g_stats.fabric_recv_fragment_hists[AS_FABRIC_CHANNEL_BULK]); + histogram_clear(g_stats.fabric_recv_cb_hists[AS_FABRIC_CHANNEL_BULK]); + histogram_clear(g_stats.fabric_send_init_hists[AS_FABRIC_CHANNEL_CTRL]); + histogram_clear(g_stats.fabric_send_fragment_hists[AS_FABRIC_CHANNEL_CTRL]); + histogram_clear(g_stats.fabric_recv_fragment_hists[AS_FABRIC_CHANNEL_CTRL]); + histogram_clear(g_stats.fabric_recv_cb_hists[AS_FABRIC_CHANNEL_CTRL]); + histogram_clear(g_stats.fabric_send_init_hists[AS_FABRIC_CHANNEL_META]); + histogram_clear(g_stats.fabric_send_fragment_hists[AS_FABRIC_CHANNEL_META]); + histogram_clear(g_stats.fabric_recv_fragment_hists[AS_FABRIC_CHANNEL_META]); + histogram_clear(g_stats.fabric_recv_cb_hists[AS_FABRIC_CHANNEL_META]); + histogram_clear(g_stats.fabric_send_init_hists[AS_FABRIC_CHANNEL_RW]); + histogram_clear(g_stats.fabric_send_fragment_hists[AS_FABRIC_CHANNEL_RW]); + histogram_clear(g_stats.fabric_recv_fragment_hists[AS_FABRIC_CHANNEL_RW]); + histogram_clear(g_stats.fabric_recv_cb_hists[AS_FABRIC_CHANNEL_RW]); + } + } + else if (0 == as_info_parameter_get(params, "enable-benchmarks-svc", context, &context_len)) { + if (strncmp(context, "true", 4) == 0 || strncmp(context, "yes", 3) == 0) { + cf_info(AS_INFO, "Changing value of enable-benchmarks-svc to %s", context); + g_config.svc_benchmarks_enabled = true; + } + else if (strncmp(context, "false", 5) == 0 || strncmp(context, "no", 2) == 0) { + cf_info(AS_INFO, "Changing value of enable-benchmarks-svc to %s", context); + g_config.svc_benchmarks_enabled = false; + 
histogram_clear(g_stats.svc_demarshal_hist); + histogram_clear(g_stats.svc_queue_hist); + } + } + else if (0 == as_info_parameter_get(params, "enable-hist-info", context, &context_len)) { + if (strncmp(context, "true", 4) == 0 || strncmp(context, "yes", 3) == 0) { + cf_info(AS_INFO, "Changing value of enable-hist-info to %s", context); + g_config.info_hist_enabled = true; + } + else if (strncmp(context, "false", 5) == 0 || strncmp(context, "no", 2) == 0) { + cf_info(AS_INFO, "Changing value of enable-hist-info to %s", context); + g_config.info_hist_enabled = false; + histogram_clear(g_stats.info_hist); + } + } + else if (0 == as_info_parameter_get(params, "query-microbenchmark", context, &context_len)) { + if (strncmp(context, "true", 4) == 0 || strncmp(context, "yes", 3) == 0) { + cf_info(AS_INFO, "Changing value of query-enable-histogram to %s", context); + g_config.query_enable_histogram = true; + } + else if (strncmp(context, "false", 5) == 0 || strncmp(context, "no", 2) == 0) { + cf_info(AS_INFO, "Changing value of query-enable-histogram to %s", context); + g_config.query_enable_histogram = false; + } + else { + goto Error; + } + } + else if (0 == as_info_parameter_get(params, "query-pre-reserve-partitions", context, &context_len)) { + if (strncmp(context, "true", 4) == 0 || strncmp(context, "yes", 3) == 0) { + cf_info(AS_INFO, "Changing value of query-pre-reserve-partitions to %s", context); + g_config.partitions_pre_reserved = true; + } + else if (strncmp(context, "false", 5) == 0 || strncmp(context, "no", 2) == 0) { + cf_info(AS_INFO, "Changing value of query-pre-reserve-partitions to %s", context); + g_config.partitions_pre_reserved = false; + } + else { + goto Error; + } + } + else { + goto Error; + } + } + else if (strcmp(context, "network") == 0) { + context_len = sizeof(context); + if (0 == as_info_parameter_get(params, "heartbeat.interval", context, &context_len)) { + if (0 != cf_str_atoi(context, &val)) + goto Error; + if (as_hb_tx_interval_set(val) 
!= 0) { + goto Error; + } + } + else if (0 == as_info_parameter_get(params, "heartbeat.timeout", context, &context_len)) { + if (0 != cf_str_atoi(context, &val)) + goto Error; + if (as_hb_max_intervals_missed_set(val) != 0){ + goto Error; + } + } + else if (0 == as_info_parameter_get(params, "heartbeat.mtu", context, &context_len)) { + if (0 != cf_str_atoi(context, &val)) + goto Error; + as_hb_override_mtu_set(val); + } + else if (0 == as_info_parameter_get(params, "heartbeat.protocol", context, &context_len)) { + as_hb_protocol protocol = (!strcmp(context, "v3") ? AS_HB_PROTOCOL_V3 : + (!strcmp(context, "reset") ? AS_HB_PROTOCOL_RESET : + (!strcmp(context, "none") ? AS_HB_PROTOCOL_NONE : + AS_HB_PROTOCOL_UNDEF))); + if (AS_HB_PROTOCOL_UNDEF == protocol) { + cf_warning(AS_INFO, "heartbeat protocol version %s not supported", context); + goto Error; + } + cf_info(AS_INFO, "Changing value of heartbeat protocol version to %s", context); + if (0 > as_hb_protocol_set(protocol)) + goto Error; + } + else if (0 == as_info_parameter_get(params, "fabric.channel-bulk-recv-threads", context, &context_len)) { + if (0 != cf_str_atoi(context, &val)) { + goto Error; + } + if (val < 1 || val > MAX_FABRIC_CHANNEL_THREADS) { + cf_warning(AS_INFO, "fabric.channel-bulk-recv-threads must be between 1 and %u", MAX_FABRIC_CHANNEL_THREADS); + goto Error; + } + cf_info(AS_FABRIC, "changing fabric.channel-bulk-recv-threads from %u to %d", g_config.n_fabric_channel_recv_threads[AS_FABRIC_CHANNEL_BULK], val); + as_fabric_set_recv_threads(AS_FABRIC_CHANNEL_BULK, val); + } + else if (0 == as_info_parameter_get(params, "fabric.channel-ctrl-recv-threads", context, &context_len)) { + if (0 != cf_str_atoi(context, &val)) { + goto Error; + } + if (val < 1 || val > MAX_FABRIC_CHANNEL_THREADS) { + cf_warning(AS_INFO, "fabric.channel-ctrl-recv-threads must be between 1 and %u", MAX_FABRIC_CHANNEL_THREADS); + goto Error; + } + cf_info(AS_FABRIC, "changing fabric.channel-ctrl-recv-threads from %u to %d", 
g_config.n_fabric_channel_recv_threads[AS_FABRIC_CHANNEL_CTRL], val); + as_fabric_set_recv_threads(AS_FABRIC_CHANNEL_CTRL, val); + } + else if (0 == as_info_parameter_get(params, "fabric.channel-meta-recv-threads", context, &context_len)) { + if (0 != cf_str_atoi(context, &val)) { + goto Error; + } + if (val < 1 || val > MAX_FABRIC_CHANNEL_THREADS) { + cf_warning(AS_INFO, "fabric.channel-meta-recv-threads must be between 1 and %u", MAX_FABRIC_CHANNEL_THREADS); + goto Error; + } + cf_info(AS_FABRIC, "changing fabric.channel-meta-recv-threads from %u to %d", g_config.n_fabric_channel_recv_threads[AS_FABRIC_CHANNEL_META], val); + as_fabric_set_recv_threads(AS_FABRIC_CHANNEL_META, val); + } + else if (0 == as_info_parameter_get(params, "fabric.channel-rw-recv-threads", context, &context_len)) { + if (0 != cf_str_atoi(context, &val)) { + goto Error; + } + if (val < 1 || val > MAX_FABRIC_CHANNEL_THREADS) { + cf_warning(AS_INFO, "fabric.channel-rw-recv-threads must be between 1 and %u", MAX_FABRIC_CHANNEL_THREADS); + goto Error; + } + cf_info(AS_FABRIC, "changing fabric.channel-rw-recv-threads from %u to %d", g_config.n_fabric_channel_recv_threads[AS_FABRIC_CHANNEL_RW], val); + as_fabric_set_recv_threads(AS_FABRIC_CHANNEL_RW, val); + } + else if (0 == as_info_parameter_get(params, "fabric.recv-rearm-threshold", context, &context_len)) { + if (0 != cf_str_atoi(context, &val)) { + goto Error; + } + + if (val < 0 || val > 1024 * 1024) { + goto Error; + } + + g_config.fabric_recv_rearm_threshold = (uint32_t)val; + } + else + goto Error; + } + else if (strcmp(context, "namespace") == 0) { + context_len = sizeof(context); + if (0 != as_info_parameter_get(params, "id", context, &context_len)) + goto Error; + as_namespace *ns = as_namespace_get_byname(context); + if (!ns) + goto Error; + + context_len = sizeof(context); + // configure namespace/set related parameters: + if (0 == as_info_parameter_get(params, "set", context, &context_len)) { + if (context_len == 0 || context_len 
>= AS_SET_NAME_MAX_SIZE) { + cf_warning(AS_INFO, "illegal length %d for set name %s", + context_len, context); + goto Error; + } + + char set_name[AS_SET_NAME_MAX_SIZE]; + size_t set_name_len = (size_t)context_len; + + strcpy(set_name, context); + + // Ideally, set operations should not be part of configs. But, + // set-delete is exception for historical reasons. Do an early check + // and bail out if set doesn't exist. + uint16_t set_id = as_namespace_get_set_id(ns, set_name); + if (set_id == INVALID_SET_ID) { + context_len = sizeof(context); + if (0 == as_info_parameter_get(params, "set-delete", context, + &context_len)) { + cf_warning(AS_INFO, "set-delete failed because set %s doesn't exist in ns %s", + set_name, ns->name); + goto Error; + } + } + + // configurations should create set if it doesn't exist. + // checks if there is a vmap set with the same name and if so returns + // a ptr to it. if not, it creates an set structure, initializes it + // and returns a ptr to it. + as_set *p_set = NULL; + if (as_namespace_get_create_set_w_len(ns, set_name, set_name_len, + &p_set, NULL) != 0) { + goto Error; + } + + context_len = sizeof(context); + if (0 == as_info_parameter_get(params, "set-enable-xdr", context, &context_len)) { + // TODO - make sure context is null-terminated. 
+ if ((strncmp(context, "true", 4) == 0) || (strncmp(context, "yes", 3) == 0)) { + cf_info(AS_INFO, "Changing value of set-enable-xdr of ns %s set %s to %s", ns->name, p_set->name, context); + cf_atomic32_set(&p_set->enable_xdr, AS_SET_ENABLE_XDR_TRUE); + } + else if ((strncmp(context, "false", 5) == 0) || (strncmp(context, "no", 2) == 0)) { + cf_info(AS_INFO, "Changing value of set-enable-xdr of ns %s set %s to %s", ns->name, p_set->name, context); + cf_atomic32_set(&p_set->enable_xdr, AS_SET_ENABLE_XDR_FALSE); + } + else if (strncmp(context, "use-default", 11) == 0) { + cf_info(AS_INFO, "Changing value of set-enable-xdr of ns %s set %s to %s", ns->name, p_set->name, context); + cf_atomic32_set(&p_set->enable_xdr, AS_SET_ENABLE_XDR_DEFAULT); + } + else { + goto Error; + } + } + else if (0 == as_info_parameter_get(params, "set-disable-eviction", context, &context_len)) { + if ((strncmp(context, "true", 4) == 0) || (strncmp(context, "yes", 3) == 0)) { + cf_info(AS_INFO, "Changing value of set-disable-eviction of ns %s set %s to %s", ns->name, p_set->name, context); + DISABLE_SET_EVICTION(p_set, true); + } + else if ((strncmp(context, "false", 5) == 0) || (strncmp(context, "no", 2) == 0)) { + cf_info(AS_INFO, "Changing value of set-disable-eviction of ns %s set %s to %s", ns->name, p_set->name, context); + DISABLE_SET_EVICTION(p_set, false); + } + else { + goto Error; + } + } + else if (0 == as_info_parameter_get(params, "set-stop-writes-count", context, &context_len)) { + uint64_t val = atoll(context); + cf_info(AS_INFO, "Changing value of set-stop-writes-count of ns %s set %s to %lu", ns->name, p_set->name, val); + cf_atomic64_set(&p_set->stop_writes_count, val); + } + else { + goto Error; + } + } + else if (0 == as_info_parameter_get(params, "memory-size", context, &context_len)) { + uint64_t val; + + if (0 != cf_str_atoi_u64(context, &val)) { + goto Error; + } + cf_debug(AS_INFO, "memory-size = %"PRIu64"", val); + if (val > ns->memory_size) + ns->memory_size = 
val; + if (val < (ns->memory_size / 2L)) { // protect so someone does not reduce memory to below 1/2 current value + goto Error; + } + cf_info(AS_INFO, "Changing value of memory-size of ns %s from %"PRIu64" to %"PRIu64, ns->name, ns->memory_size, val); + ns->memory_size = val; + } + else if (0 == as_info_parameter_get(params, "high-water-disk-pct", context, &context_len)) { + if (0 != cf_str_atoi(context, &val) || val < 0 || val > 100) { + goto Error; + } + cf_info(AS_INFO, "Changing value of high-water-disk-pct of ns %s from %u to %d ", ns->name, ns->hwm_disk_pct, val); + ns->hwm_disk_pct = (uint32_t)val; + } + else if (0 == as_info_parameter_get(params, "high-water-memory-pct", context, &context_len)) { + if (0 != cf_str_atoi(context, &val) || val < 0 || val > 100) { + goto Error; + } + cf_info(AS_INFO, "Changing value of high-water-memory-pct memory of ns %s from %u to %d ", ns->name, ns->hwm_memory_pct, val); + ns->hwm_memory_pct = (uint32_t)val; + } + else if (0 == as_info_parameter_get(params, "evict-tenths-pct", context, &context_len)) { + cf_info(AS_INFO, "Changing value of evict-tenths-pct memory of ns %s from %d to %d ", ns->name, ns->evict_tenths_pct, atoi(context)); + ns->evict_tenths_pct = atoi(context); + } + else if (0 == as_info_parameter_get(params, "evict-hist-buckets", context, &context_len)) { + if (0 != cf_str_atoi(context, &val) || val < 100 || val > 10000000) { + goto Error; + } + cf_info(AS_INFO, "Changing value of evict-hist-buckets of ns %s from %u to %d ", ns->name, ns->evict_hist_buckets, val); + ns->evict_hist_buckets = (uint32_t)val; + } + else if (0 == as_info_parameter_get(params, "stop-writes-pct", context, &context_len)) { + if (0 != cf_str_atoi(context, &val) || val < 0 || val > 100) { + goto Error; + } + cf_info(AS_INFO, "Changing value of stop-writes-pct memory of ns %s from %u to %d ", ns->name, ns->stop_writes_pct, val); + ns->stop_writes_pct = (uint32_t)val; + } + else if (0 == as_info_parameter_get(params, "default-ttl", 
context, &context_len)) { + uint64_t val; + if (cf_str_atoi_seconds(context, &val) != 0) { + cf_warning(AS_INFO, "default-ttl must be an unsigned number with time unit (s, m, h, or d)"); + goto Error; + } + if (val > ns->max_ttl) { + cf_warning(AS_INFO, "default-ttl must be <= max-ttl (%lu seconds)", ns->max_ttl); + goto Error; + } + cf_info(AS_INFO, "Changing value of default-ttl memory of ns %s from %"PRIu64" to %"PRIu64" ", ns->name, ns->default_ttl, val); + ns->default_ttl = val; + } + else if (0 == as_info_parameter_get(params, "max-ttl", context, &context_len)) { + uint64_t val; + if (cf_str_atoi_seconds(context, &val) != 0) { + cf_warning(AS_INFO, "max-ttl must be an unsigned number with time unit (s, m, h, or d)"); + goto Error; + } + if (val == 0 || val > MAX_ALLOWED_TTL) { + cf_warning(AS_INFO, "max-ttl must be non-zero and <= %u seconds", MAX_ALLOWED_TTL); + goto Error; + } + if (val < ns->default_ttl) { + cf_warning(AS_INFO, "max-ttl must be >= default-ttl (%lu seconds)", ns->default_ttl); + goto Error; + } + cf_info(AS_INFO, "Changing value of max-ttl memory of ns %s from %"PRIu64" to %"PRIu64" ", ns->name, ns->max_ttl, val); + ns->max_ttl = val; + } + else if (0 == as_info_parameter_get(params, "migrate-order", context, &context_len)) { + if (0 != cf_str_atoi(context, &val) || val < 1 || val > 10) { + goto Error; + } + cf_info(AS_INFO, "Changing value of migrate-order of ns %s from %u to %d", ns->name, ns->migrate_order, val); + ns->migrate_order = (uint32_t)val; + } + else if (0 == as_info_parameter_get(params, "migrate-retransmit-ms", context, &context_len)) { + if (0 != cf_str_atoi(context, &val)) { + goto Error; + } + cf_info(AS_INFO, "Changing value of migrate-retransmit-ms of ns %s from %u to %d", ns->name, ns->migrate_retransmit_ms, val); + ns->migrate_retransmit_ms = (uint32_t)val; + } + else if (0 == as_info_parameter_get(params, "migrate-sleep", context, &context_len)) { + if (0 != cf_str_atoi(context, &val)) { + goto Error; + } + 
cf_info(AS_INFO, "Changing value of migrate-sleep of ns %s from %u to %d", ns->name, ns->migrate_sleep, val); + ns->migrate_sleep = (uint32_t)val; + } + else if (0 == as_info_parameter_get(params, "tomb-raider-eligible-age", context, &context_len)) { + uint64_t val; + if (cf_str_atoi_seconds(context, &val) != 0) { + cf_warning(AS_INFO, "tomb-raider-eligible-age must be an unsigned number with time unit (s, m, h, or d)"); + goto Error; + } + cf_info(AS_INFO, "Changing value of tomb-raider-eligible-age of ns %s from %u to %lu", ns->name, ns->tomb_raider_eligible_age, val); + ns->tomb_raider_eligible_age = (uint32_t)val; + } + else if (0 == as_info_parameter_get(params, "tomb-raider-period", context, &context_len)) { + uint64_t val; + if (cf_str_atoi_seconds(context, &val) != 0) { + cf_warning(AS_INFO, "tomb-raider-period must be an unsigned number with time unit (s, m, h, or d)"); + goto Error; + } + cf_info(AS_INFO, "Changing value of tomb-raider-period of ns %s from %u to %lu", ns->name, ns->tomb_raider_period, val); + ns->tomb_raider_period = (uint32_t)val; + } + else if (0 == as_info_parameter_get(params, "tomb-raider-sleep", context, &context_len)) { + if (0 != cf_str_atoi(context, &val)) { + goto Error; + } + cf_info(AS_INFO, "Changing value of tomb-raider-sleep of ns %s from %u to %d", ns->name, ns->storage_tomb_raider_sleep, val); + ns->storage_tomb_raider_sleep = (uint32_t)val; + } + else if (0 == as_info_parameter_get(params, "obj-size-hist-max", context, &context_len)) { + uint32_t hist_max = (uint32_t)atoi(context); + uint32_t round_to = OBJ_SIZE_HIST_NUM_BUCKETS; + uint32_t round_max = hist_max ? 
((hist_max + round_to - 1) / round_to) * round_to : round_to; + if (round_max != hist_max) { + cf_info(AS_INFO, "rounding obj-size-hist-max %u up to %u", hist_max, round_max); + } + cf_info(AS_INFO, "Changing value of obj-size-hist-max of ns %s to %u", ns->name, round_max); + cf_atomic32_set(&ns->obj_size_hist_max, round_max); // in 128-byte blocks + } + else if (0 == as_info_parameter_get(params, "rack-id", context, &context_len)) { + if (as_config_error_enterprise_only()) { + cf_warning(AS_INFO, "rack-id is enterprise-only"); + goto Error; + } + if (0 != cf_str_atoi(context, &val)) { + goto Error; + } + if ((uint32_t)val > MAX_RACK_ID) { + cf_warning(AS_INFO, "rack-id %d must be >= 0 and <= %u", val, MAX_RACK_ID); + goto Error; + } + cf_info(AS_INFO, "Changing value of rack-id of ns %s from %u to %d", ns->name, ns->rack_id, val); + ns->rack_id = (uint32_t)val; + } + else if (0 == as_info_parameter_get(params, "conflict-resolution-policy", context, &context_len)) { + if (ns->cp) { + cf_warning(AS_INFO, "{%s} 'conflict-resolution-policy' is not applicable with 'strong-consistency'", ns->name); + goto Error; + } + if (strncmp(context, "generation", 10) == 0) { + cf_info(AS_INFO, "Changing value of conflict-resolution-policy of ns %s from %d to %s", ns->name, ns->conflict_resolution_policy, context); + ns->conflict_resolution_policy = AS_NAMESPACE_CONFLICT_RESOLUTION_POLICY_GENERATION; + } + else if (strncmp(context, "last-update-time", 16) == 0) { + cf_info(AS_INFO, "Changing value of conflict-resolution-policy of ns %s from %d to %s", ns->name, ns->conflict_resolution_policy, context); + ns->conflict_resolution_policy = AS_NAMESPACE_CONFLICT_RESOLUTION_POLICY_LAST_UPDATE_TIME; + } + else { + goto Error; + } + } + else if (0 == as_info_parameter_get(params, "defrag-lwm-pct", context, &context_len)) { + if (0 != cf_str_atoi(context, &val)) { + goto Error; + } + cf_info(AS_INFO, "Changing value of defrag-lwm-pct of ns %s from %d to %d ", ns->name, 
ns->storage_defrag_lwm_pct, val); + + uint32_t old_val = ns->storage_defrag_lwm_pct; + + ns->storage_defrag_lwm_pct = val; + ns->defrag_lwm_size = (ns->storage_write_block_size * ns->storage_defrag_lwm_pct) / 100; + + if (ns->storage_defrag_lwm_pct > old_val) { + as_storage_defrag_sweep(ns); + } + } + else if (0 == as_info_parameter_get(params, "defrag-queue-min", context, &context_len)) { + if (0 != cf_str_atoi(context, &val)) { + goto Error; + } + cf_info(AS_INFO, "Changing value of defrag-queue-min of ns %s from %u to %d", ns->name, ns->storage_defrag_queue_min, val); + ns->storage_defrag_queue_min = (uint32_t)val; + } + else if (0 == as_info_parameter_get(params, "defrag-sleep", context, &context_len)) { + if (0 != cf_str_atoi(context, &val)) { + goto Error; + } + cf_info(AS_INFO, "Changing value of defrag-sleep of ns %s from %u to %d", ns->name, ns->storage_defrag_sleep, val); + ns->storage_defrag_sleep = (uint32_t)val; + } + else if (0 == as_info_parameter_get(params, "flush-max-ms", context, &context_len)) { + if (0 != cf_str_atoi(context, &val)) { + goto Error; + } + cf_info(AS_INFO, "Changing value of flush-max-ms of ns %s from %lu to %d", ns->name, ns->storage_flush_max_us / 1000, val); + ns->storage_flush_max_us = (uint64_t)val * 1000; + } + else if (0 == as_info_parameter_get(params, "fsync-max-sec", context, &context_len)) { + if (0 != cf_str_atoi(context, &val)) { + goto Error; + } + cf_info(AS_INFO, "Changing value of fsync-max-sec of ns %s from %lu to %d", ns->name, ns->storage_fsync_max_us / 1000000, val); + ns->storage_fsync_max_us = (uint64_t)val * 1000000; + } + else if (0 == as_info_parameter_get(params, "enable-xdr", context, &context_len)) { + if (strncmp(context, "true", 4) == 0 || strncmp(context, "yes", 3) == 0) { + cf_info(AS_INFO, "Changing value of enable-xdr of ns %s from %s to %s", ns->name, bool_val[ns->enable_xdr], context); + ns->enable_xdr = true; + } + else if (strncmp(context, "false", 5) == 0 || strncmp(context, "no", 2) == 0) 
{ + cf_info(AS_INFO, "Changing value of enable-xdr of ns %s from %s to %s", ns->name, bool_val[ns->enable_xdr], context); + ns->enable_xdr = false; + } + else { + goto Error; + } + } + else if (0 == as_info_parameter_get(params, "sets-enable-xdr", context, &context_len)) { + if (strncmp(context, "true", 4) == 0 || strncmp(context, "yes", 3) == 0) { + cf_info(AS_INFO, "Changing value of sets-enable-xdr of ns %s from %s to %s", ns->name, bool_val[ns->sets_enable_xdr], context); + ns->sets_enable_xdr = true; + } + else if (strncmp(context, "false", 5) == 0 || strncmp(context, "no", 2) == 0) { + cf_info(AS_INFO, "Changing value of sets-enable-xdr of ns %s from %s to %s", ns->name, bool_val[ns->sets_enable_xdr], context); + ns->sets_enable_xdr = false; + } + else { + goto Error; + } + } + else if (0 == as_info_parameter_get(params, "ns-forward-xdr-writes", context, &context_len)) { + if (strncmp(context, "true", 4) == 0 || strncmp(context, "yes", 3) == 0) { + cf_info(AS_INFO, "Changing value of ns-forward-xdr-writes of ns %s from %s to %s", ns->name, bool_val[ns->ns_forward_xdr_writes], context); + ns->ns_forward_xdr_writes = true; + } + else if (strncmp(context, "false", 5) == 0 || strncmp(context, "no", 2) == 0) { + cf_info(AS_INFO, "Changing value of ns-forward-xdr-writes of ns %s from %s to %s", ns->name, bool_val[ns->ns_forward_xdr_writes], context); + ns->ns_forward_xdr_writes = false; + } + else { + goto Error; + } + } + else if (0 == as_info_parameter_get(params, "allow-nonxdr-writes", context, &context_len)) { + if (strncmp(context, "true", 4) == 0 || strncmp(context, "yes", 3) == 0) { + cf_info(AS_INFO, "Changing value of allow-nonxdr-writes of ns %s from %s to %s", ns->name, bool_val[ns->ns_allow_nonxdr_writes], context); + ns->ns_allow_nonxdr_writes = true; + } + else if (strncmp(context, "false", 5) == 0 || strncmp(context, "no", 2) == 0) { + cf_info(AS_INFO, "Changing value of allow-nonxdr-writes of ns %s from %s to %s", ns->name, 
bool_val[ns->ns_allow_nonxdr_writes], context); + ns->ns_allow_nonxdr_writes = false; + } + else { + goto Error; + } + } + else if (0 == as_info_parameter_get(params, "allow-xdr-writes", context, &context_len)) { + if (strncmp(context, "true", 4) == 0 || strncmp(context, "yes", 3) == 0) { + cf_info(AS_INFO, "Changing value of allow-xdr-writes of ns %s from %s to %s", ns->name, bool_val[ns->ns_allow_xdr_writes], context); + ns->ns_allow_xdr_writes = true; + } + else if (strncmp(context, "false", 5) == 0 || strncmp(context, "no", 2) == 0) { + cf_info(AS_INFO, "Changing value of allow-xdr-writes of ns %s from %s to %s", ns->name, bool_val[ns->ns_allow_xdr_writes], context); + ns->ns_allow_xdr_writes = false; + } + else { + goto Error; + } + } + else if (0 == as_info_parameter_get(params, "strong-consistency-allow-expunge", context, &context_len)) { + if (strncmp(context, "true", 4) == 0 || strncmp(context, "yes", 3) == 0) { + cf_info(AS_INFO, "Changing value of strong-consistency-allow-expunge of ns %s from %s to %s", ns->name, bool_val[ns->cp_allow_drops], context); + ns->cp_allow_drops = true; + } + else if (strncmp(context, "false", 5) == 0 || strncmp(context, "no", 2) == 0) { + cf_info(AS_INFO, "Changing value of strong-consistency-allow-expunge of ns %s from %s to %s", ns->name, bool_val[ns->cp_allow_drops], context); + ns->cp_allow_drops = false; + } + else { + goto Error; + } + } + else if (0 == as_info_parameter_get(params, "disable-write-dup-res", context, &context_len)) { + if (ns->cp) { + cf_warning(AS_INFO, "{%s} 'disable-write-dup-res' is not applicable with 'strong-consistency'", ns->name); + goto Error; + } + if (strncmp(context, "true", 4) == 0 || strncmp(context, "yes", 3) == 0) { + cf_info(AS_INFO, "Changing value of disable-write-dup-res of ns %s from %s to %s", ns->name, bool_val[ns->write_dup_res_disabled], context); + ns->write_dup_res_disabled = true; + } + else if (strncmp(context, "false", 5) == 0 || strncmp(context, "no", 2) == 0) { + 
cf_info(AS_INFO, "Changing value of disable-write-dup-res of ns %s from %s to %s", ns->name, bool_val[ns->write_dup_res_disabled], context); + ns->write_dup_res_disabled = false; + } + else { + goto Error; + } + } + else if (0 == as_info_parameter_get(params, "disallow-null-setname", context, &context_len)) { + if (strncmp(context, "true", 4) == 0 || strncmp(context, "yes", 3) == 0) { + cf_info(AS_INFO, "Changing value of disallow-null-setname of ns %s from %s to %s", ns->name, bool_val[ns->disallow_null_setname], context); + ns->disallow_null_setname = true; + } + else if (strncmp(context, "false", 5) == 0 || strncmp(context, "no", 2) == 0) { + cf_info(AS_INFO, "Changing value of disallow-null-setname of ns %s from %s to %s", ns->name, bool_val[ns->disallow_null_setname], context); + ns->disallow_null_setname = false; + } + else { + goto Error; + } + } + else if (0 == as_info_parameter_get(params, "enable-benchmarks-batch-sub", context, &context_len)) { + if (strncmp(context, "true", 4) == 0 || strncmp(context, "yes", 3) == 0) { + cf_info(AS_INFO, "Changing value of enable-benchmarks-batch-sub of ns %s from %s to %s", ns->name, bool_val[ns->batch_sub_benchmarks_enabled], context); + ns->batch_sub_benchmarks_enabled = true; + } + else if (strncmp(context, "false", 5) == 0 || strncmp(context, "no", 2) == 0) { + cf_info(AS_INFO, "Changing value of enable-benchmarks-batch-sub of ns %s from %s to %s", ns->name, bool_val[ns->batch_sub_benchmarks_enabled], context); + ns->batch_sub_benchmarks_enabled = false; + histogram_clear(ns->batch_sub_start_hist); + histogram_clear(ns->batch_sub_restart_hist); + histogram_clear(ns->batch_sub_dup_res_hist); + histogram_clear(ns->batch_sub_repl_ping_hist); + histogram_clear(ns->batch_sub_read_local_hist); + histogram_clear(ns->batch_sub_response_hist); + } + else { + goto Error; + } + } + else if (0 == as_info_parameter_get(params, "enable-benchmarks-read", context, &context_len)) { + if (strncmp(context, "true", 4) == 0 || 
strncmp(context, "yes", 3) == 0) { + cf_info(AS_INFO, "Changing value of enable-benchmarks-read of ns %s from %s to %s", ns->name, bool_val[ns->read_benchmarks_enabled], context); + ns->read_benchmarks_enabled = true; + } + else if (strncmp(context, "false", 5) == 0 || strncmp(context, "no", 2) == 0) { + cf_info(AS_INFO, "Changing value of enable-benchmarks-read of ns %s from %s to %s", ns->name, bool_val[ns->read_benchmarks_enabled], context); + ns->read_benchmarks_enabled = false; + histogram_clear(ns->read_start_hist); + histogram_clear(ns->read_restart_hist); + histogram_clear(ns->read_dup_res_hist); + histogram_clear(ns->read_repl_ping_hist); + histogram_clear(ns->read_local_hist); + histogram_clear(ns->read_response_hist); + } + else { + goto Error; + } + } + else if (0 == as_info_parameter_get(params, "enable-benchmarks-storage", context, &context_len)) { + if (strncmp(context, "true", 4) == 0 || strncmp(context, "yes", 3) == 0) { + cf_info(AS_INFO, "Changing value of enable-benchmarks-storage of ns %s from %s to %s", ns->name, bool_val[ns->storage_benchmarks_enabled], context); + ns->storage_benchmarks_enabled = true; + } + else if (strncmp(context, "false", 5) == 0 || strncmp(context, "no", 2) == 0) { + cf_info(AS_INFO, "Changing value of enable-benchmarks-storage of ns %s from %s to %s", ns->name, bool_val[ns->storage_benchmarks_enabled], context); + ns->storage_benchmarks_enabled = false; + as_storage_histogram_clear_all(ns); + } + else { + goto Error; + } + } + else if (0 == as_info_parameter_get(params, "enable-benchmarks-udf", context, &context_len)) { + if (strncmp(context, "true", 4) == 0 || strncmp(context, "yes", 3) == 0) { + cf_info(AS_INFO, "Changing value of enable-benchmarks-udf of ns %s from %s to %s", ns->name, bool_val[ns->udf_benchmarks_enabled], context); + ns->udf_benchmarks_enabled = true; + } + else if (strncmp(context, "false", 5) == 0 || strncmp(context, "no", 2) == 0) { + cf_info(AS_INFO, "Changing value of enable-benchmarks-udf of 
ns %s from %s to %s", ns->name, bool_val[ns->udf_benchmarks_enabled], context); + ns->udf_benchmarks_enabled = false; + histogram_clear(ns->udf_start_hist); + histogram_clear(ns->udf_restart_hist); + histogram_clear(ns->udf_dup_res_hist); + histogram_clear(ns->udf_master_hist); + histogram_clear(ns->udf_repl_write_hist); + histogram_clear(ns->udf_response_hist); + } + else { + goto Error; + } + } + else if (0 == as_info_parameter_get(params, "enable-benchmarks-udf-sub", context, &context_len)) { + if (strncmp(context, "true", 4) == 0 || strncmp(context, "yes", 3) == 0) { + cf_info(AS_INFO, "Changing value of enable-benchmarks-udf-sub of ns %s from %s to %s", ns->name, bool_val[ns->udf_sub_benchmarks_enabled], context); + ns->udf_sub_benchmarks_enabled = true; + } + else if (strncmp(context, "false", 5) == 0 || strncmp(context, "no", 2) == 0) { + cf_info(AS_INFO, "Changing value of enable-benchmarks-udf-sub of ns %s from %s to %s", ns->name, bool_val[ns->udf_sub_benchmarks_enabled], context); + ns->udf_sub_benchmarks_enabled = false; + histogram_clear(ns->udf_sub_start_hist); + histogram_clear(ns->udf_sub_restart_hist); + histogram_clear(ns->udf_sub_dup_res_hist); + histogram_clear(ns->udf_sub_master_hist); + histogram_clear(ns->udf_sub_repl_write_hist); + histogram_clear(ns->udf_sub_response_hist); + } + else { + goto Error; + } + } + else if (0 == as_info_parameter_get(params, "enable-benchmarks-write", context, &context_len)) { + if (strncmp(context, "true", 4) == 0 || strncmp(context, "yes", 3) == 0) { + cf_info(AS_INFO, "Changing value of enable-benchmarks-write of ns %s from %s to %s", ns->name, bool_val[ns->write_benchmarks_enabled], context); + ns->write_benchmarks_enabled = true; + } + else if (strncmp(context, "false", 5) == 0 || strncmp(context, "no", 2) == 0) { + cf_info(AS_INFO, "Changing value of enable-benchmarks-write of ns %s from %s to %s", ns->name, bool_val[ns->write_benchmarks_enabled], context); + ns->write_benchmarks_enabled = false; + 
histogram_clear(ns->write_start_hist); + histogram_clear(ns->write_restart_hist); + histogram_clear(ns->write_dup_res_hist); + histogram_clear(ns->write_master_hist); + histogram_clear(ns->write_repl_write_hist); + histogram_clear(ns->write_response_hist); + } + else { + goto Error; + } + } + else if (0 == as_info_parameter_get(params, "enable-hist-proxy", context, &context_len)) { + if (strncmp(context, "true", 4) == 0 || strncmp(context, "yes", 3) == 0) { + cf_info(AS_INFO, "Changing value of enable-hist-proxy of ns %s from %s to %s", ns->name, bool_val[ns->proxy_hist_enabled], context); + ns->proxy_hist_enabled = true; + } + else if (strncmp(context, "false", 5) == 0 || strncmp(context, "no", 2) == 0) { + cf_info(AS_INFO, "Changing value of enable-hist-proxy of ns %s from %s to %s", ns->name, bool_val[ns->proxy_hist_enabled], context); + ns->proxy_hist_enabled = false; + histogram_clear(ns->proxy_hist); + } + else { + goto Error; + } + } + else if (0 == as_info_parameter_get(params, "max-write-cache", context, &context_len)) { + uint64_t val_u64; + + if (0 != cf_str_atoi_u64(context, &val_u64)) { + goto Error; + } + if (val_u64 < (1024 * 1024 * 4)) { // TODO - why enforce this? And here, but not cfg.c? 
+ cf_warning(AS_INFO, "can't set max-write-cache less than 4M"); + goto Error; + } + cf_info(AS_INFO, "Changing value of max-write-cache of ns %s from %lu to %lu ", ns->name, ns->storage_max_write_cache, val_u64); + ns->storage_max_write_cache = val_u64; + ns->storage_max_write_q = (int)(ns->storage_max_write_cache / ns->storage_write_block_size); + } + else if (0 == as_info_parameter_get(params, "min-avail-pct", context, &context_len)) { + ns->storage_min_avail_pct = atoi(context); + cf_info(AS_INFO, "Changing value of min-avail-pct of ns %s from %u to %u ", ns->name, ns->storage_min_avail_pct, atoi(context)); + } + else if (0 == as_info_parameter_get(params, "post-write-queue", context, &context_len)) { + if (ns->storage_data_in_memory) { + cf_warning(AS_INFO, "ns %s, can't set post-write-queue if data-in-memory", ns->name); + goto Error; + } + if (0 != cf_str_atoi(context, &val)) { + cf_warning(AS_INFO, "ns %s, post-write-queue %s is not a number", ns->name, context); + goto Error; + } + if ((uint32_t)val > (4 * 1024)) { + cf_warning(AS_INFO, "ns %s, post-write-queue %u must be < 4K", ns->name, val); + goto Error; + } + cf_info(AS_INFO, "Changing value of post-write-queue of ns %s from %d to %d ", ns->name, ns->storage_post_write_queue, val); + cf_atomic32_set(&ns->storage_post_write_queue, (uint32_t)val); + } + else if (0 == as_info_parameter_get(params, "read-consistency-level-override", context, &context_len)) { + if (ns->cp) { + cf_warning(AS_INFO, "{%s} 'read-consistency-level-override' is not applicable with 'strong-consistency'", ns->name); + goto Error; + } + char *original_value = NS_READ_CONSISTENCY_LEVEL_NAME(); + if (strcmp(context, "all") == 0) { + ns->read_consistency_level = AS_READ_CONSISTENCY_LEVEL_ALL; + } + else if (strcmp(context, "off") == 0) { + ns->read_consistency_level = AS_READ_CONSISTENCY_LEVEL_PROTO; + } + else if (strcmp(context, "one") == 0) { + ns->read_consistency_level = AS_READ_CONSISTENCY_LEVEL_ONE; + } + else { + goto Error; + 
} + if (strcmp(original_value, context)) { + cf_info(AS_INFO, "Changing value of read-consistency-level-override of ns %s from %s to %s", ns->name, original_value, context); + } + } + else if (0 == as_info_parameter_get(params, "write-commit-level-override", context, &context_len)) { + if (ns->cp) { + cf_warning(AS_INFO, "{%s} 'write-commit-level-override' is not applicable with 'strong-consistency'", ns->name); + goto Error; + } + char *original_value = NS_WRITE_COMMIT_LEVEL_NAME(); + if (strcmp(context, "all") == 0) { + ns->write_commit_level = AS_WRITE_COMMIT_LEVEL_ALL; + } + else if (strcmp(context, "master") == 0) { + ns->write_commit_level = AS_WRITE_COMMIT_LEVEL_MASTER; + } + else if (strcmp(context, "off") == 0) { + ns->write_commit_level = AS_WRITE_COMMIT_LEVEL_PROTO; + } + else { + goto Error; + } + if (strcmp(original_value, context)) { + cf_info(AS_INFO, "Changing value of write-commit-level-override of ns %s from %s to %s", ns->name, original_value, context); + } + } + else if (0 == as_info_parameter_get(params, "geo2dsphere-within-max-cells", context, &context_len)) { + if (0 != cf_str_atoi(context, &val)) { + cf_warning(AS_INFO, "ns %s, geo2dsphere-within-max-cells %s is not a number", ns->name, context); + goto Error; + } + if (val <= 0) { + cf_warning(AS_INFO, "ns %s, geo2dsphere-within-max-cells %u must be > 0", ns->name, val); + goto Error; + } + if ((uint32_t)val > (MAX_REGION_CELLS)) { + cf_warning(AS_INFO, "ns %s, geo2dsphere-within-max-cells %u must be <= %u", ns->name, val, MAX_REGION_CELLS); + goto Error; + } + cf_info(AS_INFO, "Changing value of geo2dsphere-within-max-cells of ns %s from %d to %d ", + ns->name, ns->geo2dsphere_within_max_cells, val); + ns->geo2dsphere_within_max_cells = val; + } + else { + if (as_xdr_set_config_ns(ns->name, params) == false) { + goto Error; + } + } + } // end of namespace stanza + else if (strcmp(context, "security") == 0) { + context_len = sizeof(context); + if (0 == as_info_parameter_get(params, 
"privilege-refresh-period", context, &context_len)) { + if (0 != cf_str_atoi(context, &val) || val < 10 || val > 60 * 60 * 24) { + cf_warning(AS_INFO, "privilege-refresh-period must be an unsigned integer between 10 and 86400"); + goto Error; + } + cf_info(AS_INFO, "Changing value of privilege-refresh-period from %u to %d", g_config.sec_cfg.privilege_refresh_period, val); + g_config.sec_cfg.privilege_refresh_period = (uint32_t)val; + } + else { + goto Error; + } + } + else if (strcmp(context, "xdr") == 0) { + if (as_xdr_set_config(params) == false) { + goto Error; + } + } + else + goto Error; + + cf_info(AS_INFO, "config-set command completed: params %s",params); + cf_dyn_buf_append_string(db, "ok"); + return(0); + +Error: + cf_dyn_buf_append_string(db, "error"); + return(0); +} + +// Protect all set-config commands from concurrency issues. +static pthread_mutex_t g_set_cfg_lock = PTHREAD_MUTEX_INITIALIZER; + +int +info_command_config_set(char *name, char *params, cf_dyn_buf *db) +{ + pthread_mutex_lock(&g_set_cfg_lock); + + int result = info_command_config_set_threadsafe(name, params, db); + + pthread_mutex_unlock(&g_set_cfg_lock); + + return result; +} + +// +// log-set:log=id;context=foo;level=bar +// ie: +// log-set:log=0;context=rw;level=debug + + +int +info_command_log_set(char *name, char *params, cf_dyn_buf *db) +{ + cf_debug(AS_INFO, "log-set command received: params %s", params); + + char id_str[50]; + int id_str_len = sizeof(id_str); + int id = -1; + bool found_id = true; + cf_fault_sink *s = 0; + + if (0 != as_info_parameter_get(params, "id", id_str, &id_str_len)) { + if (0 != as_info_parameter_get(params, "log", id_str, &id_str_len)) { + cf_debug(AS_INFO, "log set command: no log id to be set - doing all"); + found_id = false; + } + } + if (found_id == true) { + if (0 != cf_str_atoi(id_str, &id) ) { + cf_info(AS_INFO, "log set command: id must be an integer, is: %s", id_str); + cf_dyn_buf_append_string(db, "error-id-not-integer"); + return(0); + } + s 
= cf_fault_sink_get_id(id); + if (!s) { + cf_info(AS_INFO, "log set command: sink id %d invalid", id); + cf_dyn_buf_append_string(db, "error-bad-id"); + return(0); + } + } + + // now, loop through all context strings. If we find a known context string, + // do the set + for (int c_id = 0; c_id < CF_FAULT_CONTEXT_UNDEF; c_id++) { + + char level_str[50]; + int level_str_len = sizeof(level_str); + char *context = cf_fault_context_strings[c_id]; + if (0 != as_info_parameter_get(params, context, level_str, &level_str_len)) { + continue; + } + for (uint32_t i = 0; level_str[i]; i++) level_str[i] = toupper(level_str[i]); + + if (0 != cf_fault_sink_addcontext(s, context, level_str)) { + cf_info(AS_INFO, "log set command: addcontext failed: context %s level %s", context, level_str); + cf_dyn_buf_append_string(db, "error-invalid-context-or-level"); + return(0); + } + } + + cf_info(AS_INFO, "log-set command executed: params %s", params); + + cf_dyn_buf_append_string(db, "ok"); + + return(0); +} + + +// latency:hist=reads;back=180;duration=60;slice=10; +// throughput:hist=reads;back=180;duration=60;slice=10; +// hist-track-start:hist=reads;back=43200;slice=30;thresholds=1,4,16,64; +// hist-track-stop:hist=reads; +// +// hist - optional histogram name - if none, command applies to all cf_hist_track objects +// +// for start command: +// back - total time span in seconds over which to cache data +// slice - period in seconds at which to cache histogram data +// thresholds - comma-separated bucket (ms) values to track, must be powers of 2. 
e.g: +// 1,4,16,64 +// defaults are: +// - config value for back - mandatory, serves as flag for tracking +// - config value if it exists for slice, otherwise 10 seconds +// - config value if it exists for thresholds, otherwise internal defaults (1,8,64) +// +// for query commands: +// back - start search this many seconds before now, default: minimum to get last slice +// using back=0 will get cached data from oldest cached data +// duration - seconds (forward) from start to search, default 0: everything to present +// slice - intervals (in seconds) to analyze, default 0: everything as one slice +// +// e.g. query: +// latency:hist=reads;back=180;duration=60;slice=10; +// output (CF_HIST_TRACK_FMT_PACKED format) is: +// requested value latency:hist=reads;back=180;duration=60;slice=10 +// value is reads:23:26:24-GMT,ops/sec,>1ms,>8ms,>64ms;23:26:34,30618.2,0.05,0.00,0.00; +// 23:26:44,31942.1,0.02,0.00,0.00;23:26:54,30966.9,0.01,0.00,0.00;23:27:04,30380.4,0.01,0.00,0.00; +// 23:27:14,37833.6,0.01,0.00,0.00;23:27:24,38502.7,0.01,0.00,0.00;23:27:34,39191.4,0.02,0.00,0.00; +// +// explanation: +// 23:26:24-GMT - timestamp of histogram starting first slice +// ops/sec,>1ms,>8ms,>64ms - labels for the columns: throughput, and which thresholds +// 23:26:34,30618.2,0.05,0.00,0.00; - timestamp of histogram ending slice, throughput, latencies + +int +info_command_hist_track(char *name, char *params, cf_dyn_buf *db) +{ + cf_debug(AS_INFO, "hist track %s command received: params %s", name, params); + + char value_str[50]; + int value_str_len = sizeof(value_str); + cf_hist_track* hist_p = NULL; + + if (0 != as_info_parameter_get(params, "hist", value_str, &value_str_len)) { + cf_debug(AS_INFO, "hist track %s command: no histogram specified - doing all", name); + } + else { + if (*value_str == '{') { + char* ns_name = value_str + 1; + char* ns_name_end = strchr(ns_name, '}'); + as_namespace* ns = as_namespace_get_bybuf((uint8_t*)ns_name, ns_name_end - ns_name); + + if (! 
ns) { + cf_info(AS_INFO, "hist track %s command: unrecognized histogram: %s", name, value_str); + cf_dyn_buf_append_string(db, "error-bad-hist-name"); + return 0; + } + + char* hist_name = ns_name_end + 1; + + if (*hist_name++ != '-') { + cf_info(AS_INFO, "hist track %s command: unrecognized histogram: %s", name, value_str); + cf_dyn_buf_append_string(db, "error-bad-hist-name"); + return 0; + } + + if (0 == strcmp(hist_name, "read")) { + hist_p = ns->read_hist; + } + else if (0 == strcmp(hist_name, "write")) { + hist_p = ns->write_hist; + } + else if (0 == strcmp(hist_name, "udf")) { + hist_p = ns->udf_hist; + } + else if (0 == strcmp(hist_name, "query")) { + hist_p = ns->query_hist; + } + else { + cf_info(AS_INFO, "hist track %s command: unrecognized histogram: %s", name, value_str); + cf_dyn_buf_append_string(db, "error-bad-hist-name"); + return 0; + } + } + else { + cf_info(AS_INFO, "hist track %s command: unrecognized histogram: %s", name, value_str); + cf_dyn_buf_append_string(db, "error-bad-hist-name"); + return 0; + } + } + + if (0 == strcmp(name, "hist-track-stop")) { + if (hist_p) { + cf_hist_track_stop(hist_p); + } + else { + for (uint32_t i = 0; i < g_config.n_namespaces; i++) { + as_namespace* ns = g_config.namespaces[i]; + + cf_hist_track_stop(ns->read_hist); + cf_hist_track_stop(ns->write_hist); + cf_hist_track_stop(ns->udf_hist); + cf_hist_track_stop(ns->query_hist); + } + } + + cf_dyn_buf_append_string(db, "ok"); + + return 0; + } + + bool start_cmd = 0 == strcmp(name, "hist-track-start"); + + // Note - default query params will get the most recent saved slice. + uint32_t back_sec = start_cmd ? g_config.hist_track_back : (g_config.hist_track_slice * 2) - 1; + uint32_t slice_sec = start_cmd ? g_config.hist_track_slice : 0; + int i; + + value_str_len = sizeof(value_str); + + if (0 == as_info_parameter_get(params, "back", value_str, &value_str_len)) { + if (0 == cf_str_atoi(value_str, &i)) { + back_sec = i >= 0 ? 
(uint32_t)i : (uint32_t)-i; + } + else { + cf_info(AS_INFO, "hist track %s command: back is not a number, using default", name); + } + } + + value_str_len = sizeof(value_str); + + if (0 == as_info_parameter_get(params, "slice", value_str, &value_str_len)) { + if (0 == cf_str_atoi(value_str, &i)) { + slice_sec = i >= 0 ? (uint32_t)i : (uint32_t)-i; + } + else { + cf_info(AS_INFO, "hist track %s command: slice is not a number, using default", name); + } + } + + if (start_cmd) { + char* thresholds = g_config.hist_track_thresholds; + + value_str_len = sizeof(value_str); + + if (0 == as_info_parameter_get(params, "thresholds", value_str, &value_str_len)) { + thresholds = value_str; + } + + cf_debug(AS_INFO, "hist track start command: back %u, slice %u, thresholds %s", + back_sec, slice_sec, thresholds ? thresholds : "null"); + + if (hist_p) { + if (cf_hist_track_start(hist_p, back_sec, slice_sec, thresholds)) { + cf_dyn_buf_append_string(db, "ok"); + } + else { + cf_dyn_buf_append_string(db, "error-bad-start-params"); + } + } + else { + for (uint32_t i = 0; i < g_config.n_namespaces; i++) { + as_namespace* ns = g_config.namespaces[i]; + + if ( ! (cf_hist_track_start(ns->read_hist, back_sec, slice_sec, thresholds) && + cf_hist_track_start(ns->write_hist, back_sec, slice_sec, thresholds) && + cf_hist_track_start(ns->udf_hist, back_sec, slice_sec, thresholds) && + cf_hist_track_start(ns->query_hist, back_sec, slice_sec, thresholds))) { + + cf_dyn_buf_append_string(db, "error-bad-start-params"); + return 0; + } + } + + cf_dyn_buf_append_string(db, "ok"); + } + + return 0; + } + + // From here on it's latency or throughput... + + uint32_t duration_sec = 0; + + value_str_len = sizeof(value_str); + + if (0 == as_info_parameter_get(params, "duration", value_str, &value_str_len)) { + if (0 == cf_str_atoi(value_str, &i)) { + duration_sec = i >= 0 ? 
(uint32_t)i : (uint32_t)-i; + } + else { + cf_info(AS_INFO, "hist track %s command: duration is not a number, using default", name); + } + } + + bool throughput_only = 0 == strcmp(name, "throughput"); + + cf_debug(AS_INFO, "hist track %s command: back %u, duration %u, slice %u", + name, back_sec, duration_sec, slice_sec); + + if (hist_p) { + cf_hist_track_get_info(hist_p, back_sec, duration_sec, slice_sec, throughput_only, CF_HIST_TRACK_FMT_PACKED, db); + } + else { + for (uint32_t i = 0; i < g_config.n_namespaces; i++) { + as_namespace* ns = g_config.namespaces[i]; + + cf_hist_track_get_info(ns->read_hist, back_sec, duration_sec, slice_sec, throughput_only, CF_HIST_TRACK_FMT_PACKED, db); + cf_hist_track_get_info(ns->write_hist, back_sec, duration_sec, slice_sec, throughput_only, CF_HIST_TRACK_FMT_PACKED, db); + cf_hist_track_get_info(ns->udf_hist, back_sec, duration_sec, slice_sec, throughput_only, CF_HIST_TRACK_FMT_PACKED, db); + cf_hist_track_get_info(ns->query_hist, back_sec, duration_sec, slice_sec, throughput_only, CF_HIST_TRACK_FMT_PACKED, db); + } + } + + cf_dyn_buf_chomp(db); + + return 0; +} + +// TODO - separate all these CP-related info commands. + +// Format is: +// +// revive:{namespace=} +// +int +info_command_revive(char *name, char *params, cf_dyn_buf *db) +{ + char ns_name[AS_ID_NAMESPACE_SZ] = { 0 }; + int ns_name_len = (int)sizeof(ns_name); + int rv = as_info_parameter_get(params, "namespace", ns_name, &ns_name_len); + + if (rv == -2) { + cf_warning(AS_INFO, "revive: namespace parameter value too long"); + cf_dyn_buf_append_string(db, "ERROR::bad-namespace"); + return 0; + } + + if (rv == 0) { + as_namespace *ns = as_namespace_get_byname(ns_name); + + if (! ns) { + cf_warning(AS_INFO, "revive: unknown namespace %s", ns_name); + cf_dyn_buf_append_string(db, "ERROR::unknown-namespace"); + return 0; + } + + if (! 
as_partition_balance_revive(ns)) { + cf_warning(AS_INFO, "revive: failed - recluster in progress"); + cf_dyn_buf_append_string(db, "ERROR::failed-revive"); + return 0; + } + + cf_info(AS_INFO, "revive: complete - issue 'recluster:' command"); + cf_dyn_buf_append_string(db, "ok"); + return 0; + } + + for (uint32_t ns_ix = 0; ns_ix < g_config.n_namespaces; ns_ix++) { + as_namespace *ns = g_config.namespaces[ns_ix]; + + if (! as_partition_balance_revive(ns)) { + cf_warning(AS_INFO, "revive: failed - recluster in progress"); + cf_dyn_buf_append_string(db, "ERROR::failed-revive"); + return 0; + } + } + + cf_info(AS_INFO, "revive: complete - issue 'recluster:' command"); + cf_dyn_buf_append_string(db, "ok"); + return 0; +} + +void +namespace_roster_info(as_namespace *ns, cf_dyn_buf *db) +{ + as_exchange_info_lock(); + + cf_dyn_buf_append_string(db, "roster="); + + if (ns->roster_count == 0) { + cf_dyn_buf_append_string(db, "null"); + } + else { + for (uint32_t n = 0; n < ns->roster_count; n++) { + cf_dyn_buf_append_uint64_x(db, ns->roster[n]); + + if (ns->roster_rack_ids[n] != 0) { + cf_dyn_buf_append_char(db, ROSTER_ID_PAIR_SEPARATOR); + cf_dyn_buf_append_uint32(db, ns->roster_rack_ids[n]); + } + + cf_dyn_buf_append_char(db, ','); + } + + cf_dyn_buf_chomp(db); + } + + cf_dyn_buf_append_char(db, ':'); + + cf_dyn_buf_append_string(db, "pending_roster="); + + if (ns->smd_roster_count == 0) { + cf_dyn_buf_append_string(db, "null"); + } + else { + for (uint32_t n = 0; n < ns->smd_roster_count; n++) { + cf_dyn_buf_append_uint64_x(db, ns->smd_roster[n]); + + if (ns->smd_roster_rack_ids[n] != 0) { + cf_dyn_buf_append_char(db, ROSTER_ID_PAIR_SEPARATOR); + cf_dyn_buf_append_uint32(db, ns->smd_roster_rack_ids[n]); + } + + cf_dyn_buf_append_char(db, ','); + } + + cf_dyn_buf_chomp(db); + } + + cf_dyn_buf_append_char(db, ':'); + + cf_dyn_buf_append_string(db, "observed_nodes="); + + if (ns->observed_cluster_size == 0) { + cf_dyn_buf_append_string(db, "null"); + } + else { + for 
(uint32_t n = 0; n < ns->observed_cluster_size; n++) { + cf_dyn_buf_append_uint64_x(db, ns->observed_succession[n]); + + if (ns->rack_ids[n] != 0) { + cf_dyn_buf_append_char(db, ROSTER_ID_PAIR_SEPARATOR); + cf_dyn_buf_append_uint32(db, ns->rack_ids[n]); + } + + cf_dyn_buf_append_char(db, ','); + } + + cf_dyn_buf_chomp(db); + } + + as_exchange_info_unlock(); +} + +// Format is: +// +// roster:{namespace=} +// +int +info_command_roster(char *name, char *params, cf_dyn_buf *db) +{ + char ns_name[AS_ID_NAMESPACE_SZ] = { 0 }; + int ns_name_len = (int)sizeof(ns_name); + int rv = as_info_parameter_get(params, "namespace", ns_name, &ns_name_len); + + if (rv == -2) { + cf_warning(AS_INFO, "namespace parameter value too long"); + cf_dyn_buf_append_string(db, "ERROR::bad-namespace"); + return 0; + } + + if (rv == 0) { + as_namespace *ns = as_namespace_get_byname(ns_name); + + if (! ns) { + cf_warning(AS_INFO, "unknown namespace %s", ns_name); + cf_dyn_buf_append_string(db, "ERROR::unknown-namespace"); + return 0; + } + + namespace_roster_info(ns, db); + + return 0; + } + + for (uint32_t ns_ix = 0; ns_ix < g_config.n_namespaces; ns_ix++) { + as_namespace *ns = g_config.namespaces[ns_ix]; + + cf_dyn_buf_append_string(db, "ns="); + cf_dyn_buf_append_string(db, ns->name); + cf_dyn_buf_append_char(db, ':'); + + namespace_roster_info(ns, db); + + cf_dyn_buf_append_char(db, ';'); + } + + cf_dyn_buf_chomp(db); + + return 0; +} + +// Format is: +// +// roster-set:namespace=;nodes= +// +// where is comma-separated list of node-id:rack-id pairs, and +// the :rack-id may be absent, indicating a rack-id of 0. +// +int +info_command_roster_set(char *name, char *params, cf_dyn_buf *db) +{ + // Get the namespace name. 
+ + char ns_name[AS_ID_NAMESPACE_SZ]; + int ns_name_len = (int)sizeof(ns_name); + int ns_rv = as_info_parameter_get(params, "namespace", ns_name, &ns_name_len); + + if (ns_rv != 0 || ns_name_len == 0) { + cf_warning(AS_INFO, "roster-set command: missing or invalid namespace name in command"); + cf_dyn_buf_append_string(db, "ERROR::namespace-name"); + return 0; + } + + // Get the nodes list. + + char nodes[AS_CLUSTER_SZ * ROSTER_STRING_ELE_LEN]; + int nodes_len = (int)sizeof(nodes); + int nodes_rv = as_info_parameter_get(params, "nodes", nodes, &nodes_len); + + if (nodes_rv == -2 || (nodes_rv == 0 && nodes_len == 0)) { + cf_warning(AS_INFO, "roster-set command: invalid nodes in command"); + cf_dyn_buf_append_string(db, "ERROR::nodes"); + return 0; + } + + // Issue the roster-set command. + + bool ok = as_roster_set_nodes_cmd(ns_name, nodes); + + cf_dyn_buf_append_string(db, ok ? "ok" : "ERROR::roster-set"); + + return 0; +} + +// Format is one of: +// +// truncate:namespace=;set=;lut= +// truncate:namespace=;set= +// +// truncate:namespace=;lut= +// truncate:namespace= +// +int +info_command_truncate(char *name, char *params, cf_dyn_buf *db) +{ + // Get the namespace name. + + char ns_name[AS_ID_NAMESPACE_SZ]; + int ns_name_len = (int)sizeof(ns_name); + int ns_rv = as_info_parameter_get(params, "namespace", ns_name, &ns_name_len); + + if (ns_rv != 0 || ns_name_len == 0) { + cf_warning(AS_INFO, "truncate command: missing or invalid namespace name in command"); + cf_dyn_buf_append_string(db, "ERROR::namespace-name"); + return 0; + } + + // Get the set-name if there is one. 
+ + char set_name[AS_SET_NAME_MAX_SIZE]; + int set_name_len = (int)sizeof(set_name); + int set_rv = as_info_parameter_get(params, "set", set_name, &set_name_len); + + if (set_rv == -2 || (set_rv == 0 && set_name_len == 0)) { + cf_warning(AS_INFO, "truncate command: invalid set name in command"); + cf_dyn_buf_append_string(db, "ERROR::set-name"); + return 0; + } + + // Get the threshold last-update-time if there is one. + + char lut_str[24]; // allow decimal, hex or octal in C constant format + int lut_str_len = (int)sizeof(lut_str); + int lut_rv = as_info_parameter_get(params, "lut", lut_str, &lut_str_len); + + if (lut_rv == -2 || (lut_rv == 0 && lut_str_len == 0)) { + cf_warning(AS_INFO, "truncate command: invalid last-update-time in command"); + cf_dyn_buf_append_string(db, "ERROR::last-update-time"); + return 0; + } + + // Issue the truncate command. + + bool ok = as_truncate_cmd(ns_name, + set_rv == 0 ? set_name : NULL, + lut_rv == 0 ? lut_str : NULL); + + cf_dyn_buf_append_string(db, ok ? "ok" : "ERROR::truncate"); + + return 0; +} + +// Format is one of: +// +// truncate-undo:namespace=;set= +// +// truncate-undo:namespace= +// +int +info_command_truncate_undo(char *name, char *params, cf_dyn_buf *db) +{ + // Get the namespace name. + + char ns_name[AS_ID_NAMESPACE_SZ]; + int ns_name_len = (int)sizeof(ns_name); + int ns_rv = as_info_parameter_get(params, "namespace", ns_name, &ns_name_len); + + if (ns_rv != 0 || ns_name_len == 0) { + cf_warning(AS_INFO, "truncate-undo command: missing or invalid namespace name in command"); + cf_dyn_buf_append_string(db, "ERROR::namespace-name"); + return 0; + } + + // Get the set-name if there is one. 
+ + char set_name[AS_SET_NAME_MAX_SIZE]; + int set_name_len = (int)sizeof(set_name); + int set_rv = as_info_parameter_get(params, "set", set_name, &set_name_len); + + if (set_rv == -2 || (set_rv == 0 && set_name_len == 0)) { + cf_warning(AS_INFO, "truncate-undo command: invalid set name in command"); + cf_dyn_buf_append_string(db, "ERROR::set-name"); + return 0; + } + + // Issue the truncate-undo command. + + as_truncate_undo_cmd(ns_name, set_rv == 0 ? set_name : NULL); + + cf_dyn_buf_append_string(db, "ok"); + + return 0; +} + +// +// Log a message to the server. +// Limited to 2048 characters. +// +// Format: +// log-message:message=[;who=] +// +// Example: +// log-message:message=Example Log Message;who=Aerospike User +// +int +info_command_log_message(char *name, char *params, cf_dyn_buf *db) +{ + char who[128]; + int who_len = sizeof(who); + if (0 != as_info_parameter_get(params, "who", who, &who_len)) { + strcpy(who, "unknown"); + } + + char message[2048]; + int message_len = sizeof(message); + if (0 == as_info_parameter_get(params, "message", message, &message_len)) { + cf_info(AS_INFO, "%s: %s", who, message); + } + + return 0; +} + +// Generic info system functions +// These functions act when an INFO message comes in over the PROTO pipe +// collects the static and dynamic portions, puts it in a 'dyn buf', +// and sends a reply +// + +// Error strings for security check results. 
+static void +append_sec_err_str(cf_dyn_buf *db, uint32_t result, as_sec_perm cmd_perm) { + switch (result) { + case AS_SEC_ERR_NOT_AUTHENTICATED: + cf_dyn_buf_append_string(db, "ERROR:"); + cf_dyn_buf_append_uint32(db, result); + cf_dyn_buf_append_string(db, ":not authenticated"); + return; + case AS_SEC_ERR_ROLE_VIOLATION: + switch (cmd_perm) { + case PERM_INDEX_MANAGE: + INFO_COMMAND_SINDEX_FAILCODE(result, "role violation"); + return; + case PERM_UDF_MANAGE: + cf_dyn_buf_append_string(db, "error=role_violation"); + return; + default: + break; + } + cf_dyn_buf_append_string(db, "ERROR:"); + cf_dyn_buf_append_uint32(db, result); + cf_dyn_buf_append_string(db, ":role violation"); + return; + default: + cf_dyn_buf_append_string(db, "ERROR:"); + cf_dyn_buf_append_uint32(db, result); + cf_dyn_buf_append_string(db, ":unexpected security error"); + return; + } +} + +static pthread_mutex_t g_info_lock = PTHREAD_MUTEX_INITIALIZER; +info_static *static_head = 0; +info_dynamic *dynamic_head = 0; +info_tree *tree_head = 0; +info_command *command_head = 0; +// +// Pull up all elements in both list into the buffers +// (efficient enough if you're looking for lots of things) +// But only gets 'default' values +// + +int +info_all(const as_file_handle* fd_h, cf_dyn_buf *db) +{ + uint8_t auth_result = as_security_check(fd_h, PERM_NONE); + + if (auth_result != AS_PROTO_RESULT_OK) { + as_security_log(fd_h, auth_result, PERM_NONE, "info-all request", NULL); + append_sec_err_str(db, auth_result, PERM_NONE); + cf_dyn_buf_append_char(db, EOL); + return 0; + } + + info_static *s = static_head; + while (s) { + if (s->def == true) { + cf_dyn_buf_append_string( db, s->name); + cf_dyn_buf_append_char( db, SEP ); + cf_dyn_buf_append_buf( db, (uint8_t *) s->value, s->value_sz); + cf_dyn_buf_append_char( db, EOL ); + } + s = s->next; + } + + info_dynamic *d = dynamic_head; + while (d) { + if (d->def == true) { + cf_dyn_buf_append_string( db, d->name); + cf_dyn_buf_append_char(db, SEP ); + 
d->value_fn(d->name, db); + cf_dyn_buf_append_char(db, EOL); + } + d = d->next; + } + + return(0); +} + +// +// Parse the input buffer. It contains a list of keys that should be spit back. +// Do the parse, call the necessary function collecting the information in question +// Filling the dynbuf + +int +info_some(char *buf, char *buf_lim, const as_file_handle* fd_h, cf_dyn_buf *db) +{ + uint8_t auth_result = as_security_check(fd_h, PERM_NONE); + + if (auth_result != AS_PROTO_RESULT_OK) { + // TODO - log null-terminated buf as detail? + as_security_log(fd_h, auth_result, PERM_NONE, "info request", NULL); + append_sec_err_str(db, auth_result, PERM_NONE); + cf_dyn_buf_append_char(db, EOL); + return 0; + } + + // For each incoming name + char *c = buf; + char *tok = c; + + while (c < buf_lim) { + + if ( *c == EOL ) { + *c = 0; + char *name = tok; + bool handled = false; + + // search the static queue first always + info_static *s = static_head; + while (s) { + if (strcmp(s->name, name) == 0) { + // return exact command string received from client + cf_dyn_buf_append_string( db, name); + cf_dyn_buf_append_char( db, SEP ); + cf_dyn_buf_append_buf( db, (uint8_t *) s->value, s->value_sz); + cf_dyn_buf_append_char( db, EOL ); + handled = true; + break; + } + s = s->next; + } + + // didn't find in static, try dynamic + if (!handled) { + info_dynamic *d = dynamic_head; + while (d) { + if (strcmp(d->name, name) == 0) { + // return exact command string received from client + cf_dyn_buf_append_string( db, d->name); + cf_dyn_buf_append_char(db, SEP ); + d->value_fn(d->name, db); + cf_dyn_buf_append_char(db, EOL); + handled = true; + break; + } + d = d->next; + } + } + + // search the tree + if (!handled) { + + // see if there's a '/', + char *branch = strchr( name, TREE_SEP); + if (branch) { + *branch = 0; + branch++; + + info_tree *t = tree_head; + while (t) { + if (strcmp(t->name, name) == 0) { + // return exact command string received from client + cf_dyn_buf_append_string( 
db, t->name); + cf_dyn_buf_append_char( db, TREE_SEP); + cf_dyn_buf_append_string( db, branch); + cf_dyn_buf_append_char(db, SEP ); + t->tree_fn(t->name, branch, db); + cf_dyn_buf_append_char(db, EOL); + break; + } + t = t->next; + } + } + } + + tok = c + 1; + } + // commands have parameters + else if ( *c == ':' ) { + *c = 0; + char *name = tok; + + // parse parameters + tok = c + 1; + // make sure c doesn't go beyond buf_lim + while (*c != EOL && c < buf_lim-1) c++; + if (*c != EOL) { + cf_warning(AS_INFO, "Info '%s' parameter not terminated with '\\n'.", name); + break; + } + *c = 0; + char *param = tok; + + // search the command list + info_command *cmd = command_head; + while (cmd) { + if (strcmp(cmd->name, name) == 0) { + // return exact command string received from client + cf_dyn_buf_append_string( db, name); + cf_dyn_buf_append_char( db, ':'); + cf_dyn_buf_append_string( db, param); + cf_dyn_buf_append_char( db, SEP ); + + uint8_t result = as_security_check(fd_h, cmd->required_perm); + + as_security_log(fd_h, result, cmd->required_perm, name, param); + + if (result == AS_PROTO_RESULT_OK) { + cmd->command_fn(cmd->name, param, db); + } + else { + append_sec_err_str(db, result, cmd->required_perm); + } + + cf_dyn_buf_append_char( db, EOL ); + break; + } + cmd = cmd->next; + } + + if (!cmd) { + cf_info(AS_INFO, "received command %s, not registered", name); + } + + tok = c + 1; + } + + c++; + + } + return(0); +} + +int +as_info_buffer(uint8_t *req_buf, size_t req_buf_len, cf_dyn_buf *rsp) +{ + // Either we'e doing all, or doing some + if (req_buf_len == 0) { + info_all(NULL, rsp); + } + else { + info_some((char *)req_buf, (char *)(req_buf + req_buf_len), NULL, rsp); + } + + return(0); +} + +// +// Worker threads! +// these actually do the work. 
There is a lot of network activity, +// writes and such, don't want to clog up the main queue +// + +void * +thr_info_fn(void *unused) +{ + for ( ; ; ) { + + as_info_transaction it; + + if (0 != cf_queue_pop(g_info_work_q, &it, CF_QUEUE_FOREVER)) { + cf_crash(AS_TSVC, "unable to pop from info work queue"); + } + + as_file_handle *fd_h = it.fd_h; + as_proto *pr = it.proto; + + // Allocate an output buffer sufficiently large to avoid ever resizing + cf_dyn_buf_define_size(db, 128 * 1024); + // write space for the header + uint64_t h = 0; + cf_dyn_buf_append_buf(&db, (uint8_t *) &h, sizeof(h)); + + // Either we'e doing all, or doing some + if (pr->sz == 0) { + info_all(fd_h, &db); + } + else { + info_some((char *)pr->data, (char *)pr->data + pr->sz, fd_h, &db); + } + + // write the proto header in the space we pre-wrote + db.buf[0] = 2; + db.buf[1] = 1; + uint64_t sz = db.used_sz - 8; + db.buf[4] = (sz >> 24) & 0xff; + db.buf[5] = (sz >> 16) & 0xff; + db.buf[6] = (sz >> 8) & 0xff; + db.buf[7] = sz & 0xff; + + // write the data buffer + if (cf_socket_send_all(&fd_h->sock, db.buf, db.used_sz, + MSG_NOSIGNAL, CF_SOCKET_TIMEOUT) < 0) { + cf_info(AS_INFO, "thr_info: can't write all bytes, fd %d error %d", + CSFD(&fd_h->sock), errno); + as_end_of_transaction_force_close(fd_h); + fd_h = NULL; + } + + cf_dyn_buf_free(&db); + + cf_free(pr); + + if (fd_h) { + as_end_of_transaction_ok(fd_h); + fd_h = NULL; + } + + G_HIST_INSERT_DATA_POINT(info_hist, it.start_time); + cf_atomic64_incr(&g_stats.info_complete); + } + + return NULL; +} + +// +// received an info request from a file descriptor +// Called by the thr_tsvc when an info message is seen +// calls functions info_all or info_some to collect the response +// calls write to send the response back +// +// Proto will be freed by the caller +// + +void +as_info(as_info_transaction *it) +{ + cf_queue_push(g_info_work_q, it); +} + +// Return the number of pending Info requests in the queue. 
+int +as_info_queue_get_size() +{ + return cf_queue_sz(g_info_work_q); +} + +// Registers a dynamic name-value calculator. +// the get_value_fn will be called if a request comes in for this name. +// only does the registration! +// def means it's part of the default results - will get invoked for a blank info command (asinfo -v "") + + +int +as_info_set_dynamic(char *name, as_info_get_value_fn gv_fn, bool def) +{ + int rv = -1; + pthread_mutex_lock(&g_info_lock); + + info_dynamic *e = dynamic_head; + while (e) { + if (strcmp(name, e->name) == 0) { + e->value_fn = gv_fn; + break; + } + + e = e->next; + } + + if (!e) { + e = cf_malloc(sizeof(info_dynamic)); + e->def = def; + e->name = cf_strdup(name); + e->value_fn = gv_fn; + e->next = dynamic_head; + dynamic_head = e; + } + rv = 0; + + pthread_mutex_unlock(&g_info_lock); + return(rv); +} + + +// Registers a tree-based name-value calculator. +// the get_value_fn will be called if a request comes in for this name. +// only does the registration! + + +int +as_info_set_tree(char *name, as_info_get_tree_fn gv_fn) +{ + int rv = -1; + pthread_mutex_lock(&g_info_lock); + + info_tree *e = tree_head; + while (e) { + if (strcmp(name, e->name) == 0) { + e->tree_fn = gv_fn; + break; + } + + e = e->next; + } + + if (!e) { + e = cf_malloc(sizeof(info_tree)); + e->name = cf_strdup(name); + e->tree_fn = gv_fn; + e->next = tree_head; + tree_head = e; + } + rv = 0; + + pthread_mutex_unlock(&g_info_lock); + return(rv); +} + + +// Registers a command handler +// the get_value_fn will be called if a request comes in for this name, and +// parameters will be passed in +// This function only does the registration! 
+ +int +as_info_set_command(char *name, as_info_command_fn command_fn, as_sec_perm required_perm) +{ + int rv = -1; + pthread_mutex_lock(&g_info_lock); + + info_command *e = command_head; + while (e) { + if (strcmp(name, e->name) == 0) { + e->command_fn = command_fn; + break; + } + + e = e->next; + } + + if (!e) { + e = cf_malloc(sizeof(info_command)); + e->name = cf_strdup(name); + e->command_fn = command_fn; + e->required_perm = required_perm; + e->next = command_head; + command_head = e; + } + rv = 0; + + pthread_mutex_unlock(&g_info_lock); + return(rv); +} + + + +// +// Sets a static name-value pair +// def means it's part of the default set - will get returned if nothing is passed + +int +as_info_set_buf(const char *name, const uint8_t *value, size_t value_sz, bool def) +{ + pthread_mutex_lock(&g_info_lock); + + // Delete case + if (value_sz == 0 || value == 0) { + + info_static *p = 0; + info_static *e = static_head; + + while (e) { + if (strcmp(name, e->name) == 0) { + if (p) { + p->next = e->next; + cf_free(e->name); + cf_free(e->value); + cf_free(e); + } + else { + info_static *_t = static_head->next; + cf_free(e->name); + cf_free(e->value); + cf_free(static_head); + static_head = _t; + } + break; + } + p = e; + e = e->next; + } + } + // insert case + else { + + info_static *e = static_head; + + // search for old value and overwrite + while(e) { + if (strcmp(name, e->name) == 0) { + cf_free(e->value); + e->value = cf_malloc(value_sz); + memcpy(e->value, value, value_sz); + e->value_sz = value_sz; + break; + } + e = e->next; + } + + // not found, insert fresh + if (e == 0) { + info_static *_t = cf_malloc(sizeof(info_static)); + _t->next = static_head; + _t->def = def; + _t->name = cf_strdup(name); + _t->value = cf_malloc(value_sz); + memcpy(_t->value, value, value_sz); + _t->value_sz = value_sz; + static_head = _t; + } + } + + pthread_mutex_unlock(&g_info_lock); + return(0); + +} + +// +// A helper function. 
Commands have the form: +// cmd:param=value;param=value +// +// The main parser gives us the entire parameter string +// so use this function to scan through and get the particular parameter value +// you're looking for +// +// The 'param_string' is the param passed by the command parser into a command +// +// @return 0 : success +// -1 : parameter not found +// -2 : parameter found but value is too long +// + +int +as_info_parameter_get(char *param_str, char *param, char *value, int *value_len) +{ + cf_detail(AS_INFO, "parameter get: paramstr %s seeking param %s", param_str, param); + + char *c = param_str; + char *tok = param_str; + int param_len = strlen(param); + + while (*c) { + if (*c == '=') { + if ( ( param_len == c - tok) && (0 == memcmp(tok, param, param_len) ) ) { + c++; + tok = c; + while ( *c != 0 && *c != ';') c++; + if (*value_len <= c - tok) { + // The found value is too long. + return(-2); + } + *value_len = c - tok; + memcpy(value, tok, *value_len); + value[*value_len] = 0; + return(0); + } + c++; + } + else if (*c == ';') { + c++; + tok = c; + } + else c++; + + } + + return(-1); +} + +int +as_info_set(const char *name, const char *value, bool def) +{ + return(as_info_set_buf(name, (const uint8_t *) value, strlen(value), def ) ); +} + +// +// +// service interfaces management +// +// There's a worker thread - info_interfaces_fn --- +// which continually polls the interfaces to see if anything changed. +// When it changes, it updates a generation count. +// There's a hash table of all the other nodes in the cluster, and a counter +// to see that they're all up-to-date on the generation +// +// +// The fabric message in question can be expanded to do more than service interfaces. 
// By expanding the 'info_node_info' structure, and the fabric_msg, you can carry
// more dynamic information than just the remote node's interfaces
// But that's all that we can think of at the moment - the paxos communication method
// makes sure that the distributed key system is properly distributed
//

// This node's own service strings, as sent to peers by
// info_node_info_reduce_fn(). info_interfaces_fn() updates them and bumps
// g_serv_gen while holding g_serv_lock.
static pthread_mutex_t g_serv_lock = PTHREAD_MUTEX_INITIALIZER;
static char *g_serv_legacy = NULL; // legacy-address service list, ';'-separated
static char *g_serv_clear_std = NULL; // non-TLS standard list, ','-separated
static char *g_serv_clear_alt = NULL; // non-TLS alternate list, ','-separated
static char *g_serv_tls_std = NULL; // TLS standard list, ','-separated
static char *g_serv_tls_alt = NULL; // TLS alternate list, ','-separated
static char *g_serv_tls_name = NULL; // TLS name advertised to peers
static uint32_t g_serv_gen = 0; // generation of the strings above
static cf_atomic64 g_peers_gen = 1; // incremented whenever a peer is added, removed or changed

//
// What other nodes are out there, and what are their ip addresses?
//

typedef struct info_node_info_s {
	char *service_addr; // string representing the service address
	char *alternate_addr; // string representing the alternate address
	uint32_t generation; // acked generation counter
	char *services_clear_std; // non-TLS standard services list
	char *services_tls_std; // TLS standard services list
	char *services_clear_alt; // non-TLS alternate services list
	char *services_tls_alt; // TLS alternate services list
	char *tls_name; // TLS name
	uint64_t last_changed; // generation count of last modification (for delta updates)
} info_node_info;

// Projection: picks the one string of a node's info that a reduce should
// print. May return NULL when the node has no such string.
typedef const char *(*info_node_proj_fn)(info_node_info *info);

// Context handed to the services-printing reduce functions.
typedef struct services_printer_s {
	info_node_proj_fn proj; // selects the string to print per node
	cf_dyn_buf *db; // output buffer
	const char *strip; // NOTE(review): presumably a port suffix to strip - confirm against users
	uint64_t since; // NOTE(review): presumably only nodes changed after this generation are printed - confirm
	bool with_tls_name; // NOTE(review): presumably adds the TLS name to each entry - confirm
	int32_t count; // number of entries appended so far
} services_printer;

// Context handed to info_port_savings_reduce_fn().
typedef struct port_savings_context_s {
	info_node_proj_fn proj; // selects the string to examine per node
	uint64_t since; // nodes with last_changed <= since are skipped
	uint32_t port_savings[65536]; // NOTE(review): presumably one counter per port number - confirm
} port_savings_context;

// To avoid the services bug, g_info_node_info_hash should *always* be a subset
// of g_info_node_info_history_hash.
In order to ensure this, every modification +// of g_info_node_info_hash should first involve grabbing the lock for the same +// key in g_info_node_info_history_hash. +cf_shash *g_info_node_info_history_hash = NULL; +cf_shash *g_info_node_info_hash = NULL; + +int info_node_info_reduce_fn(const void *key, void *data, void *udata); + +static char * +format_services_string(const char **addrs, uint32_t n_addrs, cf_ip_port port, char sep) +{ + if (n_addrs == 0) { + return NULL; + } + + cf_dyn_buf_define(db); + + for (uint32_t i = 0; i < n_addrs; ++i) { + if (cf_ip_addr_is_dns_name(addrs[i])) { + cf_dyn_buf_append_string(&db, addrs[i]); + cf_dyn_buf_append_char(&db, ':'); + cf_dyn_buf_append_string(&db, cf_ip_port_print(port)); + } + else { + cf_sock_addr addr; + CF_NEVER_FAILS(cf_sock_addr_from_host_port(addrs[i], port, &addr)); + cf_dyn_buf_append_string(&db, cf_sock_addr_print(&addr)); + } + + cf_dyn_buf_append_char(&db, sep); + } + + if (n_addrs > 0) { + cf_dyn_buf_chomp(&db); + } + + char *res = cf_dyn_buf_strdup(&db); + cf_dyn_buf_free(&db); + return res; +} + +static char * +format_services_addr(cf_ip_addr *addrs, int32_t n_addrs, cf_ip_port port, char sep) +{ + if (n_addrs == 0) { + return NULL; + } + + cf_dyn_buf_define(db); + + for (int32_t i = 0; i < n_addrs; ++i) { + cf_sock_addr addr; + cf_sock_addr_from_addr_port(&addrs[i], port, &addr); + cf_dyn_buf_append_string(&db, cf_sock_addr_print(&addr)); + cf_dyn_buf_append_char(&db, sep); + } + + if (n_addrs > 0) { + cf_dyn_buf_chomp(&db); + } + + char *res = cf_dyn_buf_strdup(&db); + cf_dyn_buf_free(&db); + return res; +} + +static bool +detect_name_change(char **tls_name) +{ + char *node_name = cf_node_name(); + + if (node_name[0] == 0) { + cf_free(node_name); + node_name = NULL; + } + + if (*tls_name == NULL && node_name == NULL) { + return false; + } + + if (*tls_name != NULL && node_name != NULL && strcmp(*tls_name, node_name) == 0) { + cf_free(node_name); + return false; + } + + if (*tls_name != NULL) { + 
cf_free(*tls_name); + } + + *tls_name = node_name; + return true; +} + +static uint32_t +filter_legacy(const char **from, uint32_t n_from, const char **to) +{ + uint32_t n_to = 0; + + for (uint32_t i = 0; i < n_from; ++i) { + if (cf_ip_addr_str_is_legacy(from[i])) { + to[n_to] = from[i]; + ++n_to; + } + } + + return n_to; +} + +static void +set_static_services(void) +{ + const char *filter[CF_SOCK_CFG_MAX]; + uint32_t n_filter; + + if (g_access.service.addrs.n_addrs > 0) { + n_filter = filter_legacy(g_access.service.addrs.addrs, g_access.service.addrs.n_addrs, + filter); + g_serv_legacy = format_services_string(filter, n_filter, g_access.service.port, ';'); + + if (cf_ip_addr_legacy_only()) { + g_serv_clear_std = format_services_string(filter, n_filter, g_access.service.port, ','); + } + else { + g_serv_clear_std = format_services_string(g_access.service.addrs.addrs, + g_access.service.addrs.n_addrs, g_access.service.port, ','); + } + } + + if (g_access.alt_service.addrs.n_addrs > 0) { + if (cf_ip_addr_legacy_only()) { + n_filter = filter_legacy(g_access.alt_service.addrs.addrs, + g_access.alt_service.addrs.n_addrs, filter); + g_serv_clear_alt = format_services_string(filter, n_filter, g_access.alt_service.port, + ','); + } + else { + g_serv_clear_alt = format_services_string(g_access.alt_service.addrs.addrs, + g_access.alt_service.addrs.n_addrs, g_access.alt_service.port, ','); + } + } + + if (g_access.tls_service.addrs.n_addrs > 0 && g_access.tls_service.port != 0) { + if (cf_ip_addr_legacy_only()) { + n_filter = filter_legacy(g_access.tls_service.addrs.addrs, + g_access.tls_service.addrs.n_addrs, filter); + g_serv_tls_std = format_services_string(filter, n_filter, g_access.tls_service.port, + ','); + } + else { + g_serv_tls_std = format_services_string(g_access.tls_service.addrs.addrs, + g_access.tls_service.addrs.n_addrs, g_access.tls_service.port, ','); + } + } + + if (g_access.alt_tls_service.addrs.n_addrs > 0 && g_access.alt_tls_service.port != 0) { + if 
(cf_ip_addr_legacy_only()) { + n_filter = filter_legacy(g_access.alt_tls_service.addrs.addrs, + g_access.alt_tls_service.addrs.n_addrs, filter); + g_serv_tls_alt = format_services_string(filter, n_filter, g_access.alt_tls_service.port, + ','); + } + else { + g_serv_tls_alt = format_services_string(g_access.alt_tls_service.addrs.addrs, + g_access.alt_tls_service.addrs.n_addrs, g_access.alt_tls_service.port, ','); + } + } +} + +void +info_node_info_tend() +{ + cf_shash_reduce(g_info_node_info_hash, info_node_info_reduce_fn, 0); +} + +void * +info_interfaces_fn(void *unused) +{ + cf_ip_addr legacy[CF_SOCK_CFG_MAX]; + uint32_t n_legacy = 0; + + cf_ip_addr addrs[CF_SOCK_CFG_MAX]; + uint32_t n_addrs = 0; + + char *tls_name = NULL; + bool flag = cf_ip_addr_legacy_only(); + + while (true) { + bool chg_flag = cf_ip_addr_legacy_only() != flag; + bool chg_legacy = cf_inter_detect_changes_legacy(legacy, &n_legacy, CF_SOCK_CFG_MAX); + bool chg_any; + + if (cf_ip_addr_legacy_only()) { + chg_any = cf_inter_detect_changes_legacy(addrs, &n_addrs, CF_SOCK_CFG_MAX); + } + else { + chg_any = cf_inter_detect_changes(addrs, &n_addrs, CF_SOCK_CFG_MAX); + } + + if (n_legacy + n_addrs == 0) { + cf_warning(AS_INFO, "No network interface addresses detected for client access"); + } + + bool chg_name = detect_name_change(&tls_name); + + if (chg_flag || chg_legacy || chg_any || chg_name) { + pthread_mutex_lock(&g_serv_lock); + + if (chg_flag) { + set_static_services(); + flag = cf_ip_addr_legacy_only(); + } + + if (chg_legacy && g_access.service.addrs.n_addrs == 0) { + if (g_serv_legacy != NULL) { + cf_free(g_serv_legacy); + } + + g_serv_legacy = format_services_addr(legacy, n_legacy, g_access.service.port, ';'); + } + + if (chg_any && g_access.service.addrs.n_addrs == 0) { + if (g_serv_clear_std != NULL) { + cf_free(g_serv_clear_std); + } + + g_serv_clear_std = format_services_addr(addrs, n_addrs, g_access.service.port, ','); + } + + if (chg_any && g_access.tls_service.port != 0 && + 
g_access.tls_service.addrs.n_addrs == 0) { + if (g_serv_tls_std != NULL) { + cf_free(g_serv_tls_std); + } + + g_serv_tls_std = format_services_addr(addrs, n_addrs, g_access.tls_service.port, + ','); + } + + if (chg_name && g_config.tls_service.tls_our_name == NULL) { + g_serv_tls_name = tls_name; + } + + ++g_serv_gen; + pthread_mutex_unlock(&g_serv_lock); + } + + info_node_info_tend(); + sleep(2); + } + + return NULL; +} + +// Free the service strings of an info node. + +static void +free_node_info_service(char **string) +{ + if (*string) { + cf_free(*string); + *string = 0; + } +} + +static void +free_node_info_services(info_node_info *info) +{ + free_node_info_service(&info->service_addr); + free_node_info_service(&info->alternate_addr); + free_node_info_service(&info->services_clear_std); + free_node_info_service(&info->services_tls_std); + free_node_info_service(&info->services_clear_alt); + free_node_info_service(&info->services_tls_alt); + free_node_info_service(&info->tls_name); +} + +// Resets the service strings of an info node without freeing them. + +static void +reset_node_info_services(info_node_info *info) +{ + info->service_addr = 0; + info->alternate_addr = 0; + info->services_clear_std = 0; + info->services_tls_std = 0; + info->services_clear_alt = 0; + info->services_tls_alt = 0; + info->tls_name = 0; +} + +// Clone the service strings of an info node. + +static char * +clone_node_info_service(const char *string) +{ + return string ? 
cf_strdup(string) : 0; +} + +static void +clone_node_info_services(info_node_info *from, info_node_info *to) +{ + to->service_addr = clone_node_info_service(from->service_addr); + to->alternate_addr = clone_node_info_service(from->alternate_addr); + to->services_clear_std = clone_node_info_service(from->services_clear_std); + to->services_tls_std = clone_node_info_service(from->services_tls_std); + to->services_clear_alt = clone_node_info_service(from->services_clear_alt); + to->services_tls_alt = clone_node_info_service(from->services_tls_alt); + to->tls_name = clone_node_info_service(from->tls_name); +} + +// Compare the service strings of two info nodes. + +static bool +compare_node_info_service(const char *lhs, const char *rhs) +{ + if (!lhs || !rhs) { + return !lhs && !rhs; + } + + return strcmp(lhs, rhs) == 0; +} + +static bool +compare_node_info_services(info_node_info *lhs, info_node_info *rhs) +{ + return compare_node_info_service(lhs->service_addr, rhs->service_addr) && + compare_node_info_service(lhs->alternate_addr, rhs->alternate_addr) && + compare_node_info_service(lhs->services_clear_std, rhs->services_clear_std) && + compare_node_info_service(lhs->services_tls_std, rhs->services_tls_std) && + compare_node_info_service(lhs->services_clear_alt, rhs->services_clear_alt) && + compare_node_info_service(lhs->services_tls_alt, rhs->services_tls_alt) && + compare_node_info_service(lhs->tls_name, rhs->tls_name); +} + +// Dump the service strings of an info node. 
+ +static void +dump_node_info_services(info_node_info *info) +{ + cf_debug(AS_INFO, "Service address: %s", cf_str_safe_as_null(info->service_addr)); + cf_debug(AS_INFO, "Alternate address: %s", cf_str_safe_as_null(info->alternate_addr)); + cf_debug(AS_INFO, "Clear, standard: %s", cf_str_safe_as_null(info->services_clear_std)); + cf_debug(AS_INFO, "TLS, standard: %s", cf_str_safe_as_null(info->services_tls_std)); + cf_debug(AS_INFO, "Clear, alternate: %s", cf_str_safe_as_null(info->services_clear_alt)); + cf_debug(AS_INFO, "TLS, alternate: %s", cf_str_safe_as_null(info->services_tls_alt)); + cf_debug(AS_INFO, "TLS name: %s", cf_str_safe_as_null(info->tls_name)); +} + +// This reduce function will eliminate elements from the info hash +// which are no longer in the succession list + +typedef struct reduce_context_s { + uint32_t cluster_size; + cf_node *succession; + uint32_t n_deleted; + cf_node deleted[AS_CLUSTER_SZ]; +} reduce_context; + +int32_t +info_clustering_event_reduce_fn(const void *key, void *data, void *udata) +{ + const cf_node *node = key; + info_node_info *info = data; + reduce_context *context = udata; + + for (uint32_t i = 0; i < context->cluster_size; ++i) { + if (*node == context->succession[i]) { + return CF_SHASH_OK; + } + } + + cf_debug(AS_INFO, "Clustering event reduce: removing node %" PRIx64, *node); + + uint32_t n = context->n_deleted; + context->deleted[n] = *node; + ++context->n_deleted; + + free_node_info_services(info); + return CF_SHASH_REDUCE_DELETE; +} + +// +// Maintain the info_node_info hash as a shadow of the succession list +// +static void +info_clustering_event_listener(const as_exchange_cluster_changed_event* event, void* udata) +{ + uint64_t start_ms = cf_getms(); + cf_debug(AS_INFO, "Info received new clustering state"); + + info_node_info temp; + temp.generation = 0; + temp.last_changed = 0; + reset_node_info_services(&temp); + + uint32_t i; + + for (i = 0; i < event->cluster_size; ++i) { + cf_node member_nodeid = 
event->succession[i]; + + if (member_nodeid == g_config.self_node) { + continue; + } + + info_node_info *info_history; + pthread_mutex_t *vlock_history; + + if (cf_shash_get_vlock(g_info_node_info_history_hash, &member_nodeid, (void **)&info_history, + &vlock_history) != CF_SHASH_OK) { + // This may fail, but this is OK. This should only fail when info_msg_fn is also trying + // to add this key, so either way the entry will be in the hash table. + cf_shash_put_unique(g_info_node_info_history_hash, &member_nodeid, &temp); + + if (cf_shash_get_vlock(g_info_node_info_history_hash, &member_nodeid, + (void **)&info_history, &vlock_history) != CF_SHASH_OK) { + cf_crash(AS_INFO, + "Could not create info history hash entry for %" PRIx64, member_nodeid); + continue; + } + } + + info_node_info *info; + pthread_mutex_t *vlock; + + if (cf_shash_get_vlock(g_info_node_info_hash, &member_nodeid, (void **)&info, + &vlock) != CF_SHASH_OK) { + clone_node_info_services(info_history, &temp); + temp.last_changed = cf_atomic64_incr(&g_peers_gen); + + if (cf_shash_put_unique(g_info_node_info_hash, &member_nodeid, &temp) == CF_SHASH_OK) { + reset_node_info_services(&temp); + info_history->last_changed = 0; // See info_clustering_event_reduce_fn(). 
+ cf_debug(AS_INFO, "Peers generation %" PRId64 ": added node %" PRIx64, + temp.last_changed, member_nodeid); + } + else { + free_node_info_services(&temp); + cf_crash(AS_INFO, + "Could not insert node %" PRIx64 " from clustering notification", member_nodeid); + } + + temp.last_changed = 0; + } + else { + pthread_mutex_unlock(vlock); + } + + pthread_mutex_unlock(vlock_history); + } + + uint32_t before = cf_shash_get_size(g_info_node_info_hash); + cf_debug(AS_INFO, "Clustering succession list has %d element(s), info hash has %u", i, before); + + reduce_context cont = { .cluster_size = event->cluster_size, .succession = event->succession, .n_deleted = 0 }; + cf_shash_reduce(g_info_node_info_hash, info_clustering_event_reduce_fn, &cont); + + // While an alumni is gone, its last_changed field is non-zero. When it comes back, the + // field goes back to zero. + + for (uint32_t i = 0; i < cont.n_deleted; ++i) { + cf_debug(AS_INFO, "Updating alumni %" PRIx64, cont.deleted[i]); + info_node_info *info_history; + pthread_mutex_t *vlock_history; + + if (cf_shash_get_vlock(g_info_node_info_history_hash, &cont.deleted[i], + (void **)&info_history, &vlock_history) != CF_SHASH_OK) { + cf_crash(AS_INFO, "Removing a node (%" PRIx64 ") that is not an alumni", + cont.deleted[i]); + } + + info_history->last_changed = cf_atomic64_incr(&g_peers_gen); + cf_debug(AS_INFO, "Peers generation %" PRId64 ": removed node %" PRIx64, + info_history->last_changed, cont.deleted[i]); + pthread_mutex_unlock(vlock_history); + } + + uint32_t after = cf_shash_get_size(g_info_node_info_hash); + cf_debug(AS_INFO, "After removal, info hash has %u element(s)", after); + + cf_atomic32_incr(&g_node_info_generation); + cf_debug(AS_INFO, "info_clustering_event_listener took %" PRIu64 " ms", cf_getms() - start_ms); + + // Trigger an immediate tend to start peer list update across the cluster. 
+ info_node_info_tend(); +} + +// This goes in a reduce function for retransmitting my information to another node + +int +info_node_info_reduce_fn(const void *key, void *data, void *udata) +{ + const cf_node *node = (const cf_node *)key; + info_node_info *infop = (info_node_info *) data; + + if (infop->generation < g_serv_gen) { + + cf_debug(AS_INFO, "sending service string %s to node %"PRIx64, g_serv_legacy, *node); + + pthread_mutex_lock(&g_serv_lock); + + msg *m = as_fabric_msg_get(M_TYPE_INFO); + + // If we don't have the remote node's service address, request it via our update info. msg. + msg_set_uint32(m, INFO_FIELD_OP, infop->service_addr && infop->services_clear_std ? + INFO_OP_UPDATE : INFO_OP_UPDATE_REQ); + msg_set_uint32(m, INFO_FIELD_GENERATION, g_serv_gen); + + if (g_serv_legacy) { + msg_set_str(m, INFO_FIELD_SERVICE_ADDRESS, g_serv_legacy, MSG_SET_COPY); + } + + // Legacy alternate address field. + for (uint32_t i = 0; i < g_access.alt_service.addrs.n_addrs; ++i) { + if (cf_ip_addr_str_is_legacy(g_access.alt_service.addrs.addrs[i])) { + char tmp[250]; + snprintf(tmp, sizeof(tmp), "%s:%d", g_access.alt_service.addrs.addrs[i], + g_access.service.port); + msg_set_str(m, INFO_FIELD_ALT_ADDRESS, tmp, MSG_SET_COPY); + break; + } + } + + if (g_serv_clear_std) { + msg_set_str(m, INFO_FIELD_SERVICES_CLEAR_STD, g_serv_clear_std, MSG_SET_COPY); + } + + if (g_serv_tls_std) { + msg_set_str(m, INFO_FIELD_SERVICES_TLS_STD, g_serv_tls_std, MSG_SET_COPY); + } + + if (g_serv_clear_alt) { + msg_set_str(m, INFO_FIELD_SERVICES_CLEAR_ALT, g_serv_clear_alt, MSG_SET_COPY); + } + + if (g_serv_tls_alt) { + msg_set_str(m, INFO_FIELD_SERVICES_TLS_ALT, g_serv_tls_alt, MSG_SET_COPY); + } + + if (g_serv_tls_name) { + msg_set_str(m, INFO_FIELD_TLS_NAME, g_serv_tls_name, MSG_SET_COPY); + } + + pthread_mutex_unlock(&g_serv_lock); + + if (as_fabric_send(*node, m, AS_FABRIC_CHANNEL_CTRL) != + AS_FABRIC_SUCCESS) { + as_fabric_msg_put(m); + } + } + + return(0); +} + +static char * 
+convert_legacy_services(const char *legacy) +{ + if (legacy == NULL) { + return NULL; + } + + char *res = cf_strdup(legacy); + + for (size_t i = 0; res[i] != 0; ++i) { + if (res[i] == ';') { + res[i] = ','; + } + } + + return res; +} + +// +// Receive a message from a remote node, jam it in my table +// + +int +info_msg_fn(cf_node node, msg *m, void *udata) +{ + uint32_t op; + + if (msg_get_uint32(m, INFO_FIELD_OP, &op) != 0) { + as_fabric_msg_put(m); + return 0; + } + + switch (op) { + case INFO_OP_UPDATE: + case INFO_OP_UPDATE_REQ: + { + cf_debug(AS_INFO, "Received service address from node %" PRIx64 "; op = %u", node, op); + info_node_info temp; + temp.generation = 0; + temp.last_changed = 0; + reset_node_info_services(&temp); + bool node_info_tend_required = false; + + info_node_info *info_history; + pthread_mutex_t *vlock_history; + + if (cf_shash_get_vlock(g_info_node_info_history_hash, &node, (void **)&info_history, + &vlock_history) != CF_SHASH_OK) { + // This may fail, but this is ok. This should only fail when as_info_paxos_event + // is concurrently trying to add this key, so either way the entry will be in the + // hash table. 
+ cf_shash_put_unique(g_info_node_info_history_hash, &node, &temp); + + if (cf_shash_get_vlock(g_info_node_info_history_hash, &node, (void **)&info_history, + &vlock_history) != CF_SHASH_OK) { + cf_crash(AS_INFO, + "Could not create info history hash entry for %" PRIx64, node); + break; + } + } + + free_node_info_services(info_history); + + if (msg_get_str(m, INFO_FIELD_SERVICE_ADDRESS, &info_history->service_addr, + 0, MSG_GET_COPY_MALLOC) != 0 || !info_history->service_addr) { + cf_debug(AS_INFO, "No service address in message from node %" PRIx64, node); + } + + if (msg_get_str(m, INFO_FIELD_ALT_ADDRESS, &info_history->alternate_addr, + 0, MSG_GET_COPY_MALLOC) != 0) { + cf_debug(AS_INFO, "No alternate address message from node %" PRIx64, node); + } + + if (msg_get_str(m, INFO_FIELD_SERVICES_CLEAR_STD, &info_history->services_clear_std, + 0, MSG_GET_COPY_MALLOC) != 0 || !info_history->services_clear_std) { + cf_debug(AS_INFO, "No services-clear-std in message from node %" PRIx64, node); + info_history->services_clear_std = + convert_legacy_services(info_history->service_addr); + } + + if (msg_get_str(m, INFO_FIELD_SERVICES_TLS_STD, &info_history->services_tls_std, + 0, MSG_GET_COPY_MALLOC) != 0) { + cf_debug(AS_INFO, "No services-tls-std in message from node %" PRIx64, node); + } + + if (msg_get_str(m, INFO_FIELD_SERVICES_CLEAR_ALT, &info_history->services_clear_alt, + 0, MSG_GET_COPY_MALLOC) != 0) { + cf_debug(AS_INFO, "No services-clear-alt in message from node %" PRIx64, node); + info_history->services_clear_alt = + convert_legacy_services(info_history->alternate_addr); + } + + if (msg_get_str(m, INFO_FIELD_SERVICES_TLS_ALT, &info_history->services_tls_alt, + 0, MSG_GET_COPY_MALLOC) != 0) { + cf_debug(AS_INFO, "No services-tls-alt in message from node %" PRIx64, node); + } + + if (msg_get_str(m, INFO_FIELD_TLS_NAME, &info_history->tls_name, + 0, MSG_GET_COPY_MALLOC) != 0) { + cf_debug(AS_INFO, "No tls-name in message from node %" PRIx64, node); + } + + 
dump_node_info_services(info_history); + + info_node_info *info; + pthread_mutex_t *vlock; + info_node_info info_to_tend = { 0 }; + + if (cf_shash_get_vlock(g_info_node_info_hash, &node, (void **)&info, &vlock) == CF_SHASH_OK) { + if (!compare_node_info_services(info_history, info)) { + cf_debug(AS_INFO, "Changed node info entry, was:"); + dump_node_info_services(info); + info->last_changed = cf_atomic64_incr(&g_peers_gen); + cf_debug(AS_INFO, "Peers generation %" PRId64 ": updated node %" PRIx64, + info->last_changed, node); + } + + free_node_info_services(info); + clone_node_info_services(info_history, info); + if (INFO_OP_UPDATE_REQ == op) { + cf_debug(AS_INFO, "Received request for info update from node %" PRIx64 " ~~ setting node's info generation to 0!", node); + info->generation = 0; + node_info_tend_required = true; + memcpy(&info_to_tend, info, sizeof(info_to_tend)); + } + + pthread_mutex_unlock(vlock); + } + else { + // Before history hash was added to the code base, we would throw away the message + // in this case. + cf_debug(AS_INFO, "Node %" PRIx64 " not in info hash, saving service address in info history hash", node); + } + + pthread_mutex_unlock(vlock_history); + + // Send the ACK. + msg_preserve_fields(m, 1, INFO_FIELD_GENERATION); + msg_set_uint32(m, INFO_FIELD_OP, INFO_OP_ACK); + + int rv = as_fabric_send(node, m, AS_FABRIC_CHANNEL_CTRL); + + if (rv != AS_FABRIC_SUCCESS) { + cf_warning(AS_INFO, "Failed to send message %p with type %d to node %"PRIu64" (rv %d)", + m, (int32_t)m->type, node, rv); + as_fabric_msg_put(m); + } + + if (node_info_tend_required) { + // Send our service update to the source. + info_node_info_reduce_fn(&node, &info_to_tend, NULL); + } + } + + break; + + case INFO_OP_ACK: + { + + cf_debug(AS_INFO, " received ACK from node %"PRIx64, node); + + // TODO - dangerous to continue if no generation ??? 
+ uint32_t gen = 0; + msg_get_uint32(m, INFO_FIELD_GENERATION, &gen); + info_node_info *info; + pthread_mutex_t *vlock; + if (0 == cf_shash_get_vlock(g_info_node_info_hash, &node, (void **) &info, &vlock)) { + + info->generation = gen; + + pthread_mutex_unlock(vlock); + } + + as_fabric_msg_put(m); + + } + break; + + default: + as_fabric_msg_put(m); + break; + } + + return(0); +} + +// +// This dynamic function reduces the info_node_info hash and builds up the string of services +// + +int32_t +info_get_x_legacy_reduce_fn(const void *key, void *data, void *udata) +{ + services_printer *sp = udata; + info_node_info *info = data; + + info_node_proj_fn proj = sp->proj; + cf_dyn_buf *db = sp->db; + const char *services = proj(info); + + if (services == NULL) { + return 0; + } + + if (sp->count > 0) { + cf_dyn_buf_append_char(db, ';'); + } + + cf_dyn_buf_append_string(db, services); + ++sp->count; + return 0; +} + +int32_t +info_get_x_legacy_reduce(cf_shash *h, info_node_proj_fn proj, cf_dyn_buf *db) +{ + services_printer sp = { .proj = proj, .db = db }; + cf_shash_reduce(h, info_get_x_legacy_reduce_fn, (void *)&sp); + return 0; +} + +static const char * +project_services(info_node_info *info) +{ + return info->service_addr; +} + +int32_t +info_get_services(char *name, cf_dyn_buf *db) +{ + return info_get_x_legacy_reduce(g_info_node_info_hash, project_services, db); +} + +int32_t +info_get_services_alumni(char *name, cf_dyn_buf *db) +{ + return info_get_x_legacy_reduce(g_info_node_info_history_hash, project_services, db); +} + +static const char * +project_alt_addr(info_node_info *info) +{ + return info->alternate_addr; +} + +int32_t +info_get_alt_addr(char *name, cf_dyn_buf *db) +{ + return info_get_x_legacy_reduce(g_info_node_info_hash, project_alt_addr, db); +} + +int32_t +info_port_savings_reduce_fn(const void *key, void *data, void *udata) +{ + port_savings_context *psc = udata; + info_node_info *info = data; + + if (info->last_changed <= psc->since) { + return 0; + 
} + + const char *services = psc->proj(info); + + if (services == NULL) { + return 0; + } + + int32_t curr; + + for (int32_t end = strlen(services); end > 0; end = curr) { + int32_t mult = 1; + int32_t port = 0; + + for (curr = end - 1; curr >= 0; --curr) { + char ch = services[curr]; + + if (ch == ':') { + break; + } + + if (ch < '0' || ch > '9') { + cf_warning(AS_INFO, "Invalid port number in services string: %s", services); + return 0; + } + + port += (ch - '0') * mult; + mult *= 10; + } + + int32_t savings = end - curr; + cf_debug(AS_INFO, "Default port %d saves %d byte(s)", port, savings); + psc->port_savings[port] += savings; + + while (curr >= 0 && services[curr] != ',') { + --curr; + } + } + + return 0; +} + +static char * +strip_service_suffixes(const char *services, const char *strip) +{ + const int32_t services_len = strlen(services); + const int32_t strip_len = strlen(strip); + + char *clone = cf_strdup(services); + + int32_t left = services_len; + int32_t right = services_len; + + while (left >= strip_len) { + if (memcmp(clone + left - strip_len, strip, strip_len) == 0) { + left -= strip_len; + } + + while (left > 0) { + clone[--right] = clone[--left]; + + if (clone[left] == ',') { + break; + } + } + } + + memmove(clone, clone + right, services_len - right + 1); + return clone; +} + +int32_t +info_get_services_x_reduce_fn(const void *key, void *data, void *udata) +{ + services_printer *sp = udata; + const cf_node *node = key; + info_node_info *info = data; + + if (info->last_changed <= sp->since) { + return 0; + } + + const char *services = sp->proj(info); + + if (services == NULL) { + return 0; + } + + cf_dyn_buf *db = sp->db; + + if (sp->count > 0) { + cf_dyn_buf_append_char(db, ','); + } + + char node_id[17]; + cf_str_itoa_u64(*node, node_id, 16); + + cf_dyn_buf_append_char(db, '['); + cf_dyn_buf_append_string(db, node_id); + cf_dyn_buf_append_char(db, ','); + + if (sp->with_tls_name && info->tls_name) { + cf_dyn_buf_append_string(db, 
info->tls_name); + } + + cf_dyn_buf_append_char(db, ','); + cf_dyn_buf_append_char(db, '['); + + if (sp->strip != NULL) { + char *stripped = strip_service_suffixes(services, sp->strip); + cf_dyn_buf_append_string(db, stripped); + cf_free(stripped); + } + else { + cf_dyn_buf_append_string(db, services); + } + + cf_dyn_buf_append_char(db, ']'); + cf_dyn_buf_append_char(db, ']'); + + ++sp->count; + return 0; +} + +int32_t +info_get_services_x(cf_shash *h, info_node_proj_fn proj, cf_dyn_buf *db, uint64_t since, + bool with_tls_name) +{ + // Pick the default port that saves us the most space. + port_savings_context psc = { .proj = proj, .since = since }; + cf_shash_reduce(h, info_port_savings_reduce_fn, &psc); + + int32_t best_savings = 0; + int32_t best_port = 0; + + for (int32_t i = 0; i < 65536; ++i) { + if (psc.port_savings[i] > best_savings) { + best_savings = psc.port_savings[i]; + best_port = i; + } + } + + cf_debug(AS_INFO, "Best default port is %d, saves %d byte(s)", best_port, best_savings); + + cf_dyn_buf_append_uint64(db, cf_atomic64_get(g_peers_gen)); + cf_dyn_buf_append_char(db, ','); + + if (best_port > 0) { + cf_dyn_buf_append_int(db, best_port); + } + + cf_dyn_buf_append_char(db, ','); + + cf_dyn_buf_append_char(db, '['); + + char strip[7]; + snprintf(strip, sizeof(strip), ":%d", best_port); + + services_printer sp = { .proj = proj, .db = db, .strip = strip, .since = since, + .with_tls_name = with_tls_name }; + cf_shash_reduce(h, info_get_services_x_reduce_fn, (void *)&sp); + + cf_dyn_buf_append_char(db, ']'); + return sp.count; +} + +int32_t +info_get_services_x_gone_reduce_fn(const void *key, void *data, void *udata) +{ + services_printer *sp = udata; + const cf_node *node = key; + info_node_info *info = data; + + if (info->last_changed <= sp->since || sp->proj(info) == NULL) { + return 0; + } + + cf_dyn_buf *db = sp->db; + + if (sp->count > 0) { + cf_dyn_buf_append_char(db, ','); + } + + char node_id[17]; + cf_str_itoa_u64(*node, node_id, 16); + + 
cf_dyn_buf_append_char(db, '['); + cf_dyn_buf_append_string(db, node_id); + cf_dyn_buf_append_char(db, ','); + cf_dyn_buf_append_char(db, ','); + cf_dyn_buf_append_char(db, ']'); + + ++sp->count; + return 0; +} + +void +info_get_services_x_delta(info_node_proj_fn proj, cf_dyn_buf *db, char *params, bool with_tls_name) +{ + uint64_t since; + + if (cf_str_atoi_64(params, (int64_t *)&since) < 0) { + cf_warning(AS_INFO, "Invalid peers generation %s", params); + cf_dyn_buf_append_string(db, "ERROR"); + return; + } + + uint64_t orig_gen = cf_atomic64_get(g_peers_gen); + + while (true) { + int32_t count = info_get_services_x(g_info_node_info_hash, proj, db, since, with_tls_name); + cf_dyn_buf_chomp(db); // Remove the "]". + + services_printer sp = { .proj = proj, .db = db, .since = since, .count = count }; + cf_shash_reduce(g_info_node_info_history_hash, info_get_services_x_gone_reduce_fn, &sp); + + cf_dyn_buf_append_char(db, ']'); // Re-add the "]". + + // Doing the above two reductions doesn't happen atomically. Theoretically, peers can + // arrive or leave between the two invocations, leading to duplicate or missing peers in + // the list. In this case, simply try again. 
+ + uint64_t gen = cf_atomic64_get(g_peers_gen); + + if (gen == orig_gen) { + break; + } + + db->used_sz = 0; + orig_gen = gen; + } +} + +static const char * +project_services_clear_std(info_node_info *info) +{ + return info->services_clear_std; +} + +int32_t +info_get_services_clear_std(char *name, cf_dyn_buf *db) +{ + info_get_services_x(g_info_node_info_hash, project_services_clear_std, db, 0, false); + return 0; +} + +int32_t +info_get_services_clear_std_delta(char *name, char *params, cf_dyn_buf *db) +{ + info_get_services_x_delta(project_services_clear_std, db, params, false); + return 0; +} + +int32_t +info_get_alumni_clear_std(char *name, cf_dyn_buf *db) +{ + info_get_services_x(g_info_node_info_history_hash, project_services_clear_std, db, 0, false); + return 0; +} + +static const char * +project_services_tls_std(info_node_info *info) +{ + return info->services_tls_std; +} + +int32_t +info_get_services_tls_std(char *name, cf_dyn_buf *db) +{ + info_get_services_x(g_info_node_info_hash, project_services_tls_std, db, 0, true); + return 0; +} + +int32_t +info_get_services_tls_std_delta(char *name, char *params, cf_dyn_buf *db) +{ + info_get_services_x_delta(project_services_tls_std, db, params, true); + return 0; +} + +int32_t +info_get_alumni_tls_std(char *name, cf_dyn_buf *db) +{ + info_get_services_x(g_info_node_info_history_hash, project_services_tls_std, db, 0, true); + return 0; +} + +static const char * +project_services_clear_alt(info_node_info *info) +{ + return info->services_clear_alt; +} + +int32_t +info_get_services_clear_alt(char *name, cf_dyn_buf *db) +{ + info_get_services_x(g_info_node_info_hash, project_services_clear_alt, db, 0, false); + return 0; +} + +int32_t +info_get_services_clear_alt_delta(char *name, char *params, cf_dyn_buf *db) +{ + info_get_services_x_delta(project_services_clear_alt, db, params, false); + return 0; +} + +static const char * +project_services_tls_alt(info_node_info *info) +{ + return info->services_tls_alt; +} + 
+int32_t +info_get_services_tls_alt(char *name, cf_dyn_buf *db) +{ + info_get_services_x(g_info_node_info_hash, project_services_tls_alt, db, 0, true); + return 0; +} + +int32_t +info_get_services_tls_alt_delta(char *name, char *params, cf_dyn_buf *db) +{ + info_get_services_x_delta(project_services_tls_alt, db, params, true); + return 0; +} + +int32_t +info_get_services_generation(char *name, cf_dyn_buf *db) +{ + cf_dyn_buf_append_uint64(db, cf_atomic64_get(g_peers_gen)); + return 0; +} + +// +// This dynamic function removes nodes from g_info_node_info_history_hash that +// aren't present in g_info_node_info_hash. +// +int +history_purge_reduce_fn(const void *key, void *data, void *udata) +{ + return CF_SHASH_OK == cf_shash_get(g_info_node_info_hash, key, NULL) ? CF_SHASH_OK : CF_SHASH_REDUCE_DELETE; +} + +int +info_services_alumni_reset(char *name, cf_dyn_buf *db) +{ + cf_shash_reduce(g_info_node_info_history_hash, history_purge_reduce_fn, NULL); + cf_info(AS_INFO, "services alumni list reset"); + cf_dyn_buf_append_string(db, "ok"); + + return(0); +} + + + +// +// Iterate through the current namespace list and cons up a string +// + +int +info_get_namespaces(char *name, cf_dyn_buf *db) +{ + for (uint32_t i = 0; i < g_config.n_namespaces; i++) { + cf_dyn_buf_append_string(db, g_config.namespaces[i]->name); + cf_dyn_buf_append_char(db, ';'); + } + + if (g_config.n_namespaces > 0) { + cf_dyn_buf_chomp(db); + } + + return(0); +} + +int +info_get_logs(char *name, cf_dyn_buf *db) +{ + cf_fault_sink_strlist(db); + return(0); +} + +int +info_get_objects(char *name, cf_dyn_buf *db) +{ + uint64_t objects = 0; + + for (uint32_t i = 0; i < g_config.n_namespaces; i++) { + objects += g_config.namespaces[i]->n_objects; + } + + cf_dyn_buf_append_uint64(db, objects); + return(0); +} + +int +info_get_sets(char *name, cf_dyn_buf *db) +{ + return info_get_tree_sets(name, "", db); +} + +int +info_get_bins(char *name, cf_dyn_buf *db) +{ + return info_get_tree_bins(name, "", db); +} + 
+int +info_get_config( char* name, cf_dyn_buf *db) +{ + return info_command_config_get(name, NULL, db); +} + +int +info_get_sindexes(char *name, cf_dyn_buf *db) +{ + return info_get_tree_sindexes(name, "", db); +} + + +void +info_get_namespace_info(as_namespace *ns, cf_dyn_buf *db) +{ + // Cluster size. + + // Using ns_ prefix to avoid confusion with global cluster_size. + info_append_uint32(db, "ns_cluster_size", ns->cluster_size); + + // Using effective_ prefix to avoid confusion with configured value. + info_append_uint32(db, "effective_replication_factor", ns->replication_factor); + + // Object counts. + + info_append_uint64(db, "objects", ns->n_objects); + info_append_uint64(db, "tombstones", ns->n_tombstones); + + repl_stats mp; + as_partition_get_replica_stats(ns, &mp); + + info_append_uint64(db, "master_objects", mp.n_master_objects); + info_append_uint64(db, "master_tombstones", mp.n_master_tombstones); + info_append_uint64(db, "prole_objects", mp.n_prole_objects); + info_append_uint64(db, "prole_tombstones", mp.n_prole_tombstones); + info_append_uint64(db, "non_replica_objects", mp.n_non_replica_objects); + info_append_uint64(db, "non_replica_tombstones", mp.n_non_replica_tombstones); + + // Consistency info. + + info_append_uint32(db, "dead_partitions", ns->n_dead_partitions); + info_append_uint32(db, "unavailable_partitions", ns->n_unavailable_partitions); + info_append_bool(db, "clock_skew_stop_writes", ns->clock_skew_stop_writes); + + // Expiration & eviction (nsup) stats. 
+ + info_append_bool(db, "stop_writes", ns->stop_writes != 0); + info_append_bool(db, "hwm_breached", ns->hwm_breached != 0); + + info_append_uint64(db, "current_time", as_record_void_time_get()); + info_append_uint64(db, "non_expirable_objects", ns->non_expirable_objects); + info_append_uint64(db, "expired_objects", ns->n_expired_objects); + info_append_uint64(db, "evicted_objects", ns->n_evicted_objects); + info_append_uint64(db, "evict_ttl", ns->evict_ttl); + info_append_uint32(db, "nsup_cycle_duration", ns->nsup_cycle_duration); + info_append_uint32(db, "nsup_cycle_sleep_pct", ns->nsup_cycle_sleep_pct); + + // Truncate stats. + + info_append_uint64(db, "truncate_lut", ns->truncate.lut); + info_append_uint64(db, "truncated_records", ns->truncate.n_records); + + // Memory usage stats. + + uint64_t data_memory = ns->n_bytes_memory; + uint64_t index_memory = as_index_size_get(ns) * (ns->n_objects + ns->n_tombstones); + uint64_t sindex_memory = ns->n_bytes_sindex_memory; + uint64_t used_memory = data_memory + index_memory + sindex_memory; + + info_append_uint64(db, "memory_used_bytes", used_memory); + info_append_uint64(db, "memory_used_data_bytes", data_memory); + info_append_uint64(db, "memory_used_index_bytes", index_memory); + info_append_uint64(db, "memory_used_sindex_bytes", sindex_memory); + + uint64_t free_pct = (ns->memory_size != 0 && (ns->memory_size > used_memory)) ? + ((ns->memory_size - used_memory) * 100L) / ns->memory_size : 0; + + info_append_uint64(db, "memory_free_pct", free_pct); + + // Persistent memory block keys' namespace ID (enterprise only). + info_append_uint32(db, "xmem_id", ns->xmem_id); + + // Remaining bin-name slots (yes, this can be negative). + if (! ns->single_bin) { + info_append_int(db, "available_bin_names", BIN_NAMES_QUOTA - (int)cf_vmapx_count(ns->p_bin_name_vmap)); + } + + // Persistent storage stats. 
+ + if (ns->storage_type == AS_STORAGE_ENGINE_SSD) { + int available_pct = 0; + uint64_t inuse_disk_bytes = 0; + as_storage_stats(ns, &available_pct, &inuse_disk_bytes); + + info_append_uint64(db, "device_total_bytes", ns->ssd_size); + info_append_uint64(db, "device_used_bytes", inuse_disk_bytes); + + free_pct = (ns->ssd_size != 0 && (ns->ssd_size > inuse_disk_bytes)) ? + ((ns->ssd_size - inuse_disk_bytes) * 100L) / ns->ssd_size : 0; + + info_append_uint64(db, "device_free_pct", free_pct); + info_append_int(db, "device_available_pct", available_pct); + + if (! ns->storage_data_in_memory) { + info_append_int(db, "cache_read_pct", (int)(ns->cache_read_pct + 0.5)); + } + } + + // Migration stats. + + info_append_uint64(db, "migrate_tx_partitions_imbalance", ns->migrate_tx_partitions_imbalance); + + info_append_uint64(db, "migrate_tx_instances", ns->migrate_tx_instance_count); + info_append_uint64(db, "migrate_rx_instances", ns->migrate_rx_instance_count); + + info_append_uint64(db, "migrate_tx_partitions_active", ns->migrate_tx_partitions_active); + info_append_uint64(db, "migrate_rx_partitions_active", ns->migrate_rx_partitions_active); + + info_append_uint64(db, "migrate_tx_partitions_initial", ns->migrate_tx_partitions_initial); + info_append_uint64(db, "migrate_tx_partitions_remaining", ns->migrate_tx_partitions_remaining); + + info_append_uint64(db, "migrate_rx_partitions_initial", ns->migrate_rx_partitions_initial); + info_append_uint64(db, "migrate_rx_partitions_remaining", ns->migrate_rx_partitions_remaining); + + info_append_uint64(db, "migrate_records_skipped", ns->migrate_records_skipped); + info_append_uint64(db, "migrate_records_transmitted", ns->migrate_records_transmitted); + info_append_uint64(db, "migrate_record_retransmits", ns->migrate_record_retransmits); + info_append_uint64(db, "migrate_record_receives", ns->migrate_record_receives); + + info_append_uint64(db, "migrate_signals_active", ns->migrate_signals_active); + info_append_uint64(db, 
"migrate_signals_remaining", ns->migrate_signals_remaining); + + info_append_uint64(db, "appeals_tx_active", ns->appeals_tx_active); + info_append_uint64(db, "appeals_rx_active", ns->appeals_rx_active); + + info_append_uint64(db, "appeals_tx_remaining", ns->appeals_tx_remaining); + + info_append_uint64(db, "appeals_records_exonerated", ns->appeals_records_exonerated); + + // From-client transaction stats. + + info_append_uint64(db, "client_tsvc_error", ns->n_client_tsvc_error); + info_append_uint64(db, "client_tsvc_timeout", ns->n_client_tsvc_timeout); + + info_append_uint64(db, "client_proxy_complete", ns->n_client_proxy_complete); + info_append_uint64(db, "client_proxy_error", ns->n_client_proxy_error); + info_append_uint64(db, "client_proxy_timeout", ns->n_client_proxy_timeout); + + info_append_uint64(db, "client_read_success", ns->n_client_read_success); + info_append_uint64(db, "client_read_error", ns->n_client_read_error); + info_append_uint64(db, "client_read_timeout", ns->n_client_read_timeout); + info_append_uint64(db, "client_read_not_found", ns->n_client_read_not_found); + + info_append_uint64(db, "client_write_success", ns->n_client_write_success); + info_append_uint64(db, "client_write_error", ns->n_client_write_error); + info_append_uint64(db, "client_write_timeout", ns->n_client_write_timeout); + + // Subset of n_client_write_... above, respectively. + info_append_uint64(db, "xdr_write_success", ns->n_xdr_write_success); + info_append_uint64(db, "xdr_write_error", ns->n_xdr_write_error); + info_append_uint64(db, "xdr_write_timeout", ns->n_xdr_write_timeout); + + info_append_uint64(db, "client_delete_success", ns->n_client_delete_success); + info_append_uint64(db, "client_delete_error", ns->n_client_delete_error); + info_append_uint64(db, "client_delete_timeout", ns->n_client_delete_timeout); + info_append_uint64(db, "client_delete_not_found", ns->n_client_delete_not_found); + + // Subset of n_client_delete_... above, respectively. 
+ info_append_uint64(db, "xdr_delete_success", ns->n_xdr_delete_success); + info_append_uint64(db, "xdr_delete_error", ns->n_xdr_delete_error); + info_append_uint64(db, "xdr_delete_timeout", ns->n_xdr_delete_timeout); + info_append_uint64(db, "xdr_delete_not_found", ns->n_xdr_delete_not_found); + + info_append_uint64(db, "client_udf_complete", ns->n_client_udf_complete); + info_append_uint64(db, "client_udf_error", ns->n_client_udf_error); + info_append_uint64(db, "client_udf_timeout", ns->n_client_udf_timeout); + + info_append_uint64(db, "client_lang_read_success", ns->n_client_lang_read_success); + info_append_uint64(db, "client_lang_write_success", ns->n_client_lang_write_success); + info_append_uint64(db, "client_lang_delete_success", ns->n_client_lang_delete_success); + info_append_uint64(db, "client_lang_error", ns->n_client_lang_error); + + // Batch sub-transaction stats. + + info_append_uint64(db, "batch_sub_tsvc_error", ns->n_batch_sub_tsvc_error); + info_append_uint64(db, "batch_sub_tsvc_timeout", ns->n_batch_sub_tsvc_timeout); + + info_append_uint64(db, "batch_sub_proxy_complete", ns->n_batch_sub_proxy_complete); + info_append_uint64(db, "batch_sub_proxy_error", ns->n_batch_sub_proxy_error); + info_append_uint64(db, "batch_sub_proxy_timeout", ns->n_batch_sub_proxy_timeout); + + info_append_uint64(db, "batch_sub_read_success", ns->n_batch_sub_read_success); + info_append_uint64(db, "batch_sub_read_error", ns->n_batch_sub_read_error); + info_append_uint64(db, "batch_sub_read_timeout", ns->n_batch_sub_read_timeout); + info_append_uint64(db, "batch_sub_read_not_found", ns->n_batch_sub_read_not_found); + + // Internal-UDF sub-transaction stats. 
+ + info_append_uint64(db, "udf_sub_tsvc_error", ns->n_udf_sub_tsvc_error); + info_append_uint64(db, "udf_sub_tsvc_timeout", ns->n_udf_sub_tsvc_timeout); + + info_append_uint64(db, "udf_sub_udf_complete", ns->n_udf_sub_udf_complete); + info_append_uint64(db, "udf_sub_udf_error", ns->n_udf_sub_udf_error); + info_append_uint64(db, "udf_sub_udf_timeout", ns->n_udf_sub_udf_timeout); + + info_append_uint64(db, "udf_sub_lang_read_success", ns->n_udf_sub_lang_read_success); + info_append_uint64(db, "udf_sub_lang_write_success", ns->n_udf_sub_lang_write_success); + info_append_uint64(db, "udf_sub_lang_delete_success", ns->n_udf_sub_lang_delete_success); + info_append_uint64(db, "udf_sub_lang_error", ns->n_udf_sub_lang_error); + + // Transaction retransmit stats. + + info_append_uint64(db, "retransmit_client_read_dup_res", ns->n_retransmit_client_read_dup_res); + + info_append_uint64(db, "retransmit_client_write_dup_res", ns->n_retransmit_client_write_dup_res); + info_append_uint64(db, "retransmit_client_write_repl_write", ns->n_retransmit_client_write_repl_write); + + info_append_uint64(db, "retransmit_client_delete_dup_res", ns->n_retransmit_client_delete_dup_res); + info_append_uint64(db, "retransmit_client_delete_repl_write", ns->n_retransmit_client_delete_repl_write); + + info_append_uint64(db, "retransmit_client_udf_dup_res", ns->n_retransmit_client_udf_dup_res); + info_append_uint64(db, "retransmit_client_udf_repl_write", ns->n_retransmit_client_udf_repl_write); + + info_append_uint64(db, "retransmit_batch_sub_dup_res", ns->n_retransmit_batch_sub_dup_res); + + info_append_uint64(db, "retransmit_udf_sub_dup_res", ns->n_retransmit_udf_sub_dup_res); + info_append_uint64(db, "retransmit_udf_sub_repl_write", ns->n_retransmit_udf_sub_repl_write); + + // Scan stats. 
+ + info_append_uint64(db, "scan_basic_complete", ns->n_scan_basic_complete); + info_append_uint64(db, "scan_basic_error", ns->n_scan_basic_error); + info_append_uint64(db, "scan_basic_abort", ns->n_scan_basic_abort); + + info_append_uint64(db, "scan_aggr_complete", ns->n_scan_aggr_complete); + info_append_uint64(db, "scan_aggr_error", ns->n_scan_aggr_error); + info_append_uint64(db, "scan_aggr_abort", ns->n_scan_aggr_abort); + + info_append_uint64(db, "scan_udf_bg_complete", ns->n_scan_udf_bg_complete); + info_append_uint64(db, "scan_udf_bg_error", ns->n_scan_udf_bg_error); + info_append_uint64(db, "scan_udf_bg_abort", ns->n_scan_udf_bg_abort); + + // Query stats. + + uint64_t agg = ns->n_aggregation; + uint64_t agg_success = ns->n_agg_success; + uint64_t agg_err = ns->n_agg_errs; + uint64_t agg_abort = ns->n_agg_abort; + uint64_t agg_records = ns->agg_num_records; + + uint64_t lkup = ns->n_lookup; + uint64_t lkup_success = ns->n_lookup_success; + uint64_t lkup_err = ns->n_lookup_errs; + uint64_t lkup_abort = ns->n_lookup_abort; + uint64_t lkup_records = ns->lookup_num_records; + + info_append_uint64(db, "query_reqs", ns->query_reqs); + info_append_uint64(db, "query_fail", ns->query_fail); + + info_append_uint64(db, "query_short_queue_full", ns->query_short_queue_full); + info_append_uint64(db, "query_long_queue_full", ns->query_long_queue_full); + info_append_uint64(db, "query_short_reqs", ns->query_short_reqs); + info_append_uint64(db, "query_long_reqs", ns->query_long_reqs); + + info_append_uint64(db, "query_agg", agg); + info_append_uint64(db, "query_agg_success", agg_success); + info_append_uint64(db, "query_agg_error", agg_err); + info_append_uint64(db, "query_agg_abort", agg_abort); + info_append_uint64(db, "query_agg_avg_rec_count", agg ? 
agg_records / agg : 0); + + info_append_uint64(db, "query_lookups", lkup); + info_append_uint64(db, "query_lookup_success", lkup_success); + info_append_uint64(db, "query_lookup_error", lkup_err); + info_append_uint64(db, "query_lookup_abort", lkup_abort); + info_append_uint64(db, "query_lookup_avg_rec_count", lkup ? lkup_records / lkup : 0); + + info_append_uint64(db, "query_udf_bg_success", ns->n_query_udf_bg_success); + info_append_uint64(db, "query_udf_bg_failure", ns->n_query_udf_bg_failure); + + // Geospatial query stats: + info_append_uint64(db, "geo_region_query_reqs", ns->geo_region_query_count); + info_append_uint64(db, "geo_region_query_cells", ns->geo_region_query_cells); + info_append_uint64(db, "geo_region_query_points", ns->geo_region_query_points); + info_append_uint64(db, "geo_region_query_falsepos", ns->geo_region_query_falsepos); + + // Re-replication stats - relevant only for enterprise edition. + + info_append_uint64(db, "re_repl_success", ns->n_re_repl_success); + info_append_uint64(db, "re_repl_error", ns->n_re_repl_error); + info_append_uint64(db, "re_repl_timeout", ns->n_re_repl_timeout); + + // Special errors that deserve their own counters: + + info_append_uint64(db, "fail_xdr_forbidden", ns->n_fail_xdr_forbidden); + info_append_uint64(db, "fail_key_busy", ns->n_fail_key_busy); + info_append_uint64(db, "fail_generation", ns->n_fail_generation); + info_append_uint64(db, "fail_record_too_big", ns->n_fail_record_too_big); + + // Special non-error counters: + + info_append_uint64(db, "deleted_last_bin", ns->n_deleted_last_bin); +} + +// +// Iterate through the current namespace list and cons up a string +// + +int +info_get_tree_namespace(char *name, char *subtree, cf_dyn_buf *db) +{ + as_namespace *ns = as_namespace_get_byname(subtree); + + if (! ns) { + cf_dyn_buf_append_string(db, "type=unknown"); // TODO - better message? 
+ return 0; + } + + info_get_namespace_info(ns, db); + info_namespace_config_get(ns->name, db); + + cf_dyn_buf_chomp(db); + + return 0; +} + +int +info_get_tree_sets(char *name, char *subtree, cf_dyn_buf *db) +{ + char *set_name = NULL; + as_namespace *ns = NULL; + + // if there is a subtree, get the namespace + if (subtree && strlen(subtree) > 0) { + // see if subtree has a sep as well + set_name = strchr(subtree, TREE_SEP); + + // pull out namespace, and namespace name... + if (set_name) { + int ns_name_len = (set_name - subtree); + char ns_name[ns_name_len + 1]; + memcpy(ns_name, subtree, ns_name_len); + ns_name[ns_name_len] = '\0'; + ns = as_namespace_get_byname(ns_name); + set_name++; // currently points to the TREE_SEP, which is not what we want. + } + else { + ns = as_namespace_get_byname(subtree); + } + + if (!ns) { + cf_dyn_buf_append_string(db, "ns_type=unknown"); + return(0); + } + } + + // format w/o namespace is ns1:set1:prop1=val1:prop2=val2:..propn=valn;ns1:set2...;ns2:set1...; + if (!ns) { + for (uint32_t i = 0; i < g_config.n_namespaces; i++) { + as_namespace_get_set_info(g_config.namespaces[i], set_name, db); + } + } + // format w namespace w/o set name is ns:set1:prop1=val1:prop2=val2...propn=valn;ns:set2...; + // format w namespace & set name is prop1=val1:prop2=val2...propn=valn; + else { + as_namespace_get_set_info(ns, set_name, db); + } + return(0); +} + +int +info_get_tree_statistics(char *name, char *subtree, cf_dyn_buf *db) +{ + if (strcmp(subtree, "xdr") == 0) { + as_xdr_get_stats(db); + cf_dyn_buf_chomp(db); + return 0; + } + + cf_dyn_buf_append_string(db, "error"); + return -1; +} + +int +info_get_tree_bins(char *name, char *subtree, cf_dyn_buf *db) +{ + as_namespace *ns = NULL; + + // if there is a subtree, get the namespace + if (subtree && strlen(subtree) > 0) { + ns = as_namespace_get_byname(subtree); + + if (!ns) { + cf_dyn_buf_append_string(db, "ns_type=unknown"); + return 0; + } + } + + // format w/o namespace is + // 
ns:num-bin-names=val1,bin-names-quota=val2,name1,name2,...;ns:... + if (!ns) { + for (uint32_t i = 0; i < g_config.n_namespaces; i++) { + as_namespace_get_bins_info(g_config.namespaces[i], db, true); + } + } + // format w/namespace is + // num-bin-names=val1,bin-names-quota=val2,name1,name2,... + else { + as_namespace_get_bins_info(ns, db, false); + } + + return 0; +} + +int +info_command_hist_dump(char *name, char *params, cf_dyn_buf *db) +{ + char value_str[128]; + int value_str_len = sizeof(value_str); + + if (0 != as_info_parameter_get(params, "ns", value_str, &value_str_len)) { + cf_info(AS_INFO, "hist-dump %s command: no namespace specified", name); + cf_dyn_buf_append_string(db, "error-no-namespace"); + return 0; + } + + as_namespace *ns = as_namespace_get_byname(value_str); + + if (!ns) { + cf_info(AS_INFO, "hist-dump %s command: unknown namespace: %s", name, value_str); + cf_dyn_buf_append_string(db, "error-unknown-namespace"); + return 0; + } + + value_str_len = sizeof(value_str); + + if (0 != as_info_parameter_get(params, "hist", value_str, &value_str_len)) { + cf_info(AS_INFO, "hist-dump %s command:", name); + cf_dyn_buf_append_string(db, "error-no-hist-name"); + + return 0; + } + + // get optional set field + char set_name_str[AS_SET_NAME_MAX_SIZE]; + int set_name_str_len = sizeof(set_name_str); + set_name_str[0] = 0; + + as_info_parameter_get(params, "set", set_name_str, &set_name_str_len); + + // format is ns1:ns_hist1=bucket_count,offset,b1,b2,b3...; + as_namespace_get_hist_info(ns, set_name_str, value_str, db, true); + + return 0; +} + + +int +info_get_tree_log(char *name, char *subtree, cf_dyn_buf *db) +{ + // see if subtree has a sep as well + int sink_id; + char *context = strchr(subtree, TREE_SEP); + if (context) { // this means: log/id/context , + *context = 0; + context++; + + if (0 != cf_str_atoi(subtree, &sink_id)) return(-1); + + cf_fault_sink_context_strlist(sink_id, context, db); + } + else { // this means just: log/id , so get all 
contexts + if (0 != cf_str_atoi(subtree, &sink_id)) return(-1); + + cf_fault_sink_context_all_strlist(sink_id, db); + } + + return(0); +} + + +int +info_get_tree_sindexes(char *name, char *subtree, cf_dyn_buf *db) +{ + char *index_name = NULL; + as_namespace *ns = NULL; + + // if there is a subtree, get the namespace + if (subtree && strlen(subtree) > 0) { + // see if subtree has a sep as well + index_name = strchr(subtree, TREE_SEP); + + // pull out namespace, and namespace name... + if (index_name) { + int ns_name_len = (index_name - subtree); + char ns_name[ns_name_len + 1]; + memcpy(ns_name, subtree, ns_name_len); + ns_name[ns_name_len] = '\0'; + ns = as_namespace_get_byname(ns_name); + index_name++; // currently points to the TREE_SEP, which is not what we want. + } + else { + ns = as_namespace_get_byname(subtree); + } + + if (!ns) { + cf_dyn_buf_append_string(db, "ns_type=unknown"); + return(0); + } + } + + // format w/o namespace is: + // ns=ns1:set=set1:indexname=index1:prop1=val1:...:propn=valn;ns=ns1:set=set2:indexname=index2:...;ns=ns2:set=set1:...; + if (!ns) { + for (uint32_t i = 0; i < g_config.n_namespaces; i++) { + as_sindex_list_str(g_config.namespaces[i], db); + } + } + // format w namespace w/o index name is: + // ns=ns1:set=set1:indexname=index1:prop1=val1:...:propn=valn;ns=ns1:set=set2:indexname=indexname2:...; + else if (!index_name) { + as_sindex_list_str(ns, db); + } + else { + // format w namespace & index name is: + // prop1=val1;prop2=val2;...;propn=valn + int resp = as_sindex_stats_str(ns, index_name, db); + if (resp) { + cf_warning(AS_INFO, "Failed to get statistics for index %s: err = %d", index_name, resp); + INFO_COMMAND_SINDEX_FAILCODE( + as_sindex_err_to_clienterr(resp, __FILE__, __LINE__), + as_sindex_err_str(resp)); + } + } + return(0); +} + +int32_t +info_get_service(char *name, cf_dyn_buf *db) +{ + pthread_mutex_lock(&g_serv_lock); + cf_dyn_buf_append_string(db, g_serv_legacy != NULL ? 
g_serv_legacy : ""); + pthread_mutex_unlock(&g_serv_lock); + return 0; +} + +int32_t +info_get_service_clear_std(char *name, cf_dyn_buf *db) +{ + pthread_mutex_lock(&g_serv_lock); + cf_dyn_buf_append_string(db, g_serv_clear_std != NULL ? g_serv_clear_std : ""); + pthread_mutex_unlock(&g_serv_lock); + return 0; +} + +int32_t +info_get_service_tls_std(char *name, cf_dyn_buf *db) +{ + pthread_mutex_lock(&g_serv_lock); + cf_dyn_buf_append_string(db, g_serv_tls_std != NULL ? g_serv_tls_std : ""); + pthread_mutex_unlock(&g_serv_lock); + return 0; +} + +int32_t +info_get_service_clear_alt(char *name, cf_dyn_buf *db) +{ + pthread_mutex_lock(&g_serv_lock); + cf_dyn_buf_append_string(db, g_serv_clear_alt != NULL ? g_serv_clear_alt : ""); + pthread_mutex_unlock(&g_serv_lock); + return 0; +} + +int32_t +info_get_service_tls_alt(char *name, cf_dyn_buf *db) +{ + pthread_mutex_lock(&g_serv_lock); + cf_dyn_buf_append_string(db, g_serv_tls_alt != NULL ? g_serv_tls_alt : ""); + pthread_mutex_unlock(&g_serv_lock); + return 0; +} + +// SINDEX wire protocol examples: +// 1.) NUMERIC: sindex-create:ns=usermap;set=demo;indexname=um_age;indexdata=age,numeric +// 2.) STRING: sindex-create:ns=usermap;set=demo;indexname=um_state;indexdata=state,string +/* + * Parameters: + * params --- string passed to asinfo call + * imd -- parses the params and fills this sindex struct. + * + * Returns + * AS_SINDEX_OK if it successfully fills up imd + * AS_SINDEX_ERR_PARAM otherwise + * TODO REVIEW : send cmd as argument + */ +int +as_info_parse_params_to_sindex_imd(char* params, as_sindex_metadata *imd, cf_dyn_buf* db, + bool is_create, bool *is_smd_op, char * OP) +{ + if (! imd) { + cf_warning(AS_INFO, "%s : Failed. internal error.", OP); + return AS_SINDEX_ERR_PARAM; + } + + char indexname_str[AS_ID_INAME_SZ]; + int indname_len = sizeof(indexname_str); + int ret = as_info_parameter_get(params, STR_INDEXNAME, indexname_str, + &indname_len); + if ( ret == -1 ) { + cf_warning(AS_INFO, "%s : Failed. 
Missing Index name.", OP); + INFO_COMMAND_SINDEX_FAILCODE(AS_PROTO_RESULT_FAIL_PARAMETER, + "Missing Index name"); + return AS_SINDEX_ERR_PARAM; + } + else if ( ret == -2 ) { + cf_warning(AS_INFO, "%s : Failed. Index name longer than allowed %d.", + OP, AS_ID_INAME_SZ-1); + INFO_COMMAND_SINDEX_FAILCODE(AS_PROTO_RESULT_FAIL_PARAMETER, + "Index name too long"); + return AS_SINDEX_ERR_PARAM; + } + + char cmd[128]; + snprintf(cmd, 128, "%s %s", OP, indexname_str); + + char ns_str[AS_ID_NAMESPACE_SZ]; + int ns_len = sizeof(ns_str); + ret = as_info_parameter_get(params, STR_NS, ns_str, &ns_len); + if ( ret == -1 ) { + cf_warning(AS_INFO, "%s : Failed. Missing Namespace name.", cmd); + INFO_COMMAND_SINDEX_FAILCODE(AS_PROTO_RESULT_FAIL_PARAMETER, + "Missing Namespace name"); + return AS_SINDEX_ERR_PARAM; + } + else if (ret == -2 ) { + cf_warning(AS_INFO, "%s : Failed. Namespace name longer than allowed %d.", + cmd, AS_ID_NAMESPACE_SZ - 1); + INFO_COMMAND_SINDEX_FAILCODE(AS_PROTO_RESULT_FAIL_PARAMETER, + "Namespace name too long"); + return AS_SINDEX_ERR_PARAM; + } + + as_namespace *ns = as_namespace_get_byname(ns_str); + if (! ns) { + cf_warning(AS_INFO, "%s : Failed. Namespace '%s' not found %d", + cmd, ns_str, ns_len); + INFO_COMMAND_SINDEX_FAILCODE(AS_PROTO_RESULT_FAIL_PARAMETER, "Namespace Not Found"); + return AS_SINDEX_ERR_PARAM; + } + if (ns->single_bin) { + cf_warning(AS_INFO, "%s : Failed. 
Secondary Index is not allowed on single bin " + "namespace '%s'.", cmd, ns_str); + INFO_COMMAND_SINDEX_FAILCODE(AS_PROTO_RESULT_FAIL_PARAMETER, + "Single bin namespace"); + return AS_SINDEX_ERR_PARAM; + } + + char set_str[AS_SET_NAME_MAX_SIZE]; + int set_len = sizeof(set_str); + if (imd->set) { + cf_free(imd->set); + imd->set = NULL; + } + ret = as_info_parameter_get(params, STR_SET, set_str, &set_len); + if (!ret && set_len != 0) { + if (as_namespace_get_create_set_w_len(ns, set_str, set_len, NULL, NULL) + != 0) { + INFO_COMMAND_SINDEX_FAILCODE(AS_PROTO_RESULT_FAIL_PARAMETER, + "Set name quota full"); + return AS_SINDEX_ERR_PARAM; + } + imd->set = cf_strdup(set_str); + } else if (ret == -2) { + cf_warning(AS_INFO, "%s : Failed. Setname longer than %d for index.", + cmd, AS_SET_NAME_MAX_SIZE - 1); + INFO_COMMAND_SINDEX_FAILCODE(AS_PROTO_RESULT_FAIL_PARAMETER, + "Set name too long"); + return AS_SINDEX_ERR_PARAM; + } + + char cluster_op[6]; + int cluster_op_len = sizeof(cluster_op); + if (as_info_parameter_get(params, "cluster_op", cluster_op, &cluster_op_len) + != 0) { + *is_smd_op = true; + } + else if (strcmp(cluster_op, "true") == 0) { + *is_smd_op = true; + } + else if (strcmp(cluster_op, "false") == 0) { + *is_smd_op = false; + } + + // Delete only need parsing till here + if (!is_create) { + imd->ns_name = cf_strdup(ns->name); + imd->iname = cf_strdup(indexname_str); + return 0; + } + + char indextype_str[AS_SINDEX_TYPE_STR_SIZE]; + int indtype_len = sizeof(indextype_str); + ret = as_info_parameter_get(params, STR_ITYPE, indextype_str, &indtype_len); + if (ret == -1) { + // if not specified the index type is DEFAULT + imd->itype = AS_SINDEX_ITYPE_DEFAULT; + } + else if (ret == -2) { + cf_warning(AS_INFO, "%s : Failed. 
Indextype str longer than allowed %d.", + cmd, AS_SINDEX_TYPE_STR_SIZE-1); + INFO_COMMAND_SINDEX_FAILCODE(AS_PROTO_RESULT_FAIL_PARAMETER, + "Indextype is too long"); + return AS_SINDEX_ERR_PARAM; + + } + else { + if (strncasecmp(indextype_str, STR_ITYPE_DEFAULT, 7) == 0) { + imd->itype = AS_SINDEX_ITYPE_DEFAULT; + } + else if (strncasecmp(indextype_str, STR_ITYPE_LIST, 4) == 0) { + imd->itype = AS_SINDEX_ITYPE_LIST; + } + else if (strncasecmp(indextype_str, STR_ITYPE_MAPKEYS, 7) == 0) { + imd->itype = AS_SINDEX_ITYPE_MAPKEYS; + } + else if (strncasecmp(indextype_str, STR_ITYPE_MAPVALUES, 9) == 0) { + imd->itype = AS_SINDEX_ITYPE_MAPVALUES; + } + else { + cf_warning(AS_INFO, "%s : Failed. Invalid indextype '%s'.", cmd, + indextype_str); + INFO_COMMAND_SINDEX_FAILCODE(AS_PROTO_RESULT_FAIL_PARAMETER, + "Invalid indextype. Should be one of [DEFAULT, LIST, MAPKEYS, MAPVALUES]"); + return AS_SINDEX_ERR_PARAM; + } + } + + // Indexdata = binpath,keytype + char indexdata_str[AS_SINDEXDATA_STR_SIZE]; + int indexdata_len = sizeof(indexdata_str); + if (as_info_parameter_get(params, STR_INDEXDATA, indexdata_str, + &indexdata_len)) { + cf_warning(AS_INFO, "%s : Failed. Invalid indexdata '%s'.", cmd, + indexdata_str); + INFO_COMMAND_SINDEX_FAILCODE(AS_PROTO_RESULT_FAIL_PARAMETER, + "Invalid indexdata"); + return AS_SINDEX_ERR_PARAM; + } + + cf_vector *str_v = cf_vector_create(sizeof(void *), 10, VECTOR_FLAG_INITZERO); + cf_str_split(",", indexdata_str, str_v); + if ((cf_vector_size(str_v)) > 2) { + cf_warning(AS_INFO, "%s : Failed. >1 bins specified in indexdata.", + cmd); + INFO_COMMAND_SINDEX_FAILCODE(AS_PROTO_RESULT_FAIL_PARAMETER, + "Number of bins more than 1"); + cf_vector_destroy(str_v); + return AS_SINDEX_ERR_PARAM; + } + + char *path_str = NULL; + cf_vector_get(str_v, 0, &path_str); + if (! path_str) { + cf_warning(AS_INFO, "%s : Failed. 
Missing Bin Name.", cmd); + INFO_COMMAND_SINDEX_FAILCODE(AS_PROTO_RESULT_FAIL_PARAMETER, + "Missing Bin name"); + cf_vector_destroy(str_v); + return AS_SINDEX_ERR_PARAM; + } + + if (as_sindex_extract_bin_path(imd, path_str) + || ! imd->bname) { + cf_warning(AS_INFO, "%s : Failed. Invalid Bin Path '%s'.", cmd, path_str); + INFO_COMMAND_SINDEX_FAILCODE(AS_PROTO_RESULT_FAIL_PARAMETER, + "Invalid Bin path"); + return AS_SINDEX_ERR_PARAM; + } + + if (imd->bname && strlen(imd->bname) >= AS_ID_BIN_SZ) { + cf_warning(AS_INFO, "%s : Failed. Bin Name longer than allowed %d", + cmd, AS_ID_BIN_SZ - 1); + INFO_COMMAND_SINDEX_FAILCODE(AS_PROTO_RESULT_FAIL_PARAMETER, "Bin Name too long"); + cf_vector_destroy(str_v); + return AS_SINDEX_ERR_PARAM; + } + + char *type_str = NULL; + cf_vector_get(str_v, 1, &type_str); + if (! type_str) { + cf_warning(AS_INFO, "%s : Failed. Missing Bin type", cmd); + INFO_COMMAND_SINDEX_FAILCODE(AS_PROTO_RESULT_FAIL_PARAMETER, + "Missing Bin Type."); + cf_vector_destroy(str_v); + return AS_SINDEX_ERR_PARAM; + } + + as_sindex_ktype ktype = as_sindex_ktype_from_string(type_str); + if (ktype == COL_TYPE_INVALID) { + cf_warning(AS_INFO, "%s : Failed. Invalid Bin type '%s'.", cmd, type_str); + INFO_COMMAND_SINDEX_FAILCODE(AS_PROTO_RESULT_FAIL_PARAMETER, + "Invalid Bin type. Supported types [Numeric, String, Geo2dsphere]"); + cf_vector_destroy(str_v); + return AS_SINDEX_ERR_PARAM; + } + imd->sktype = ktype; + + + + cf_vector_destroy(str_v); + + if (is_create) { + imd->ns_name = cf_strdup(ns->name); + imd->iname = cf_strdup(indexname_str); + } + imd->path_str = cf_strdup(path_str); + return AS_SINDEX_OK; +} + +int info_command_sindex_create(char *name, char *params, cf_dyn_buf *db) +{ + as_sindex_metadata imd; + memset((void *)&imd, 0, sizeof(imd)); + bool is_smd_op = true; + + // Check info-command params for correctness. 
+ int res = as_info_parse_params_to_sindex_imd(params, &imd, db, true, &is_smd_op, "SINDEX CREATE"); + + if (res != 0) { + goto ERR; + } + + as_namespace *ns = as_namespace_get_byname(imd.ns_name); + res = as_sindex_create_check_params(ns, &imd); + + if (res == AS_SINDEX_ERR_FOUND) { + cf_warning(AS_INFO, "SINDEX CREATE: Index already exists on namespace '%s', either with same name '%s' or same bin '%s' / type '%s' combination.", + imd.ns_name, imd.iname, imd.bname, + as_sindex_ktype_str(imd.sktype)); + INFO_COMMAND_SINDEX_FAILCODE(AS_PROTO_RESULT_FAIL_INDEX_FOUND, + "Index with the same name already exists or this bin has already been indexed."); + goto ERR; + } + else if (res == AS_SINDEX_ERR_MAXCOUNT) { + cf_warning(AS_INFO, "SINDEX CREATE : More than %d index are not allowed per namespace.", AS_SINDEX_MAX); + INFO_COMMAND_SINDEX_FAILCODE(AS_PROTO_RESULT_FAIL_INDEX_MAXCOUNT, + "Reached maximum number of sindex allowed"); + goto ERR; + } + + if (is_smd_op == true) + { + cf_info(AS_INFO, "SINDEX CREATE : Request received for %s:%s via SMD", imd.ns_name, imd.iname); + + char smd_key[SINDEX_SMD_KEY_SIZE]; + + as_sindex_imd_to_smd_key(&imd, smd_key); + res = as_smd_set_metadata(SINDEX_MODULE, smd_key, imd.iname); + + if (res != 0) { + cf_warning(AS_INFO, "SINDEX CREATE : Queuing the index %s metadata to SMD failed with error %s", + imd.iname, as_sindex_err_str(res)); + INFO_COMMAND_SINDEX_FAILCODE(AS_PROTO_RESULT_FAIL_PARAMETER, as_sindex_err_str(res)); + goto ERR; + } + } + else if (is_smd_op == false) { + cf_info(AS_INFO, "SINDEX CREATE : Request received for %s:%s via info", imd.ns_name, imd.iname); + res = as_sindex_create(ns, &imd); + if (0 != res) { + cf_warning(AS_INFO, "SINDEX CREATE : Failed with error %s for index %s", + as_sindex_err_str(res), imd.iname); + INFO_COMMAND_SINDEX_FAILCODE(as_sindex_err_to_clienterr(res, __FILE__, __LINE__), + as_sindex_err_str(res)); + goto ERR; + } + } + cf_dyn_buf_append_string(db, "OK"); +ERR: + as_sindex_imd_free(&imd); + 
return(0); + +} + +int info_command_sindex_delete(char *name, char *params, cf_dyn_buf *db) { + as_sindex_metadata imd; + memset((void *)&imd, 0, sizeof(imd)); + bool is_smd_op = true; + int res = as_info_parse_params_to_sindex_imd(params, &imd, db, false, &is_smd_op, "SINDEX DROP"); + + if (res != 0) { + goto ERR; + } + + as_namespace *ns = as_namespace_get_byname(imd.ns_name); + + // Do not use as_sindex_exists_by_defn() here, it'll fail because bname is null. + if (!as_sindex_delete_checker(ns, &imd)) { + cf_warning(AS_INFO, "SINDEX DROP : Index %s:%s does not exist on the system", + imd.ns_name, imd.iname); + INFO_COMMAND_SINDEX_FAILCODE(AS_PROTO_RESULT_FAIL_INDEX_NOTFOUND, + "Index does not exist on the system."); + goto ERR; + } + + if (is_smd_op == true) + { + cf_info(AS_INFO, "SINDEX DROP : Request received for %s:%s via SMD", imd.ns_name, imd.iname); + + char smd_key[SINDEX_SMD_KEY_SIZE]; + + if (as_sindex_delete_imd_to_smd_key(ns, &imd, smd_key)) { + res = as_smd_delete_metadata(SINDEX_MODULE, smd_key); + } + else { + res = AS_SINDEX_ERR_NOTFOUND; + } + + if (0 != res) { + cf_warning(AS_INFO, "SINDEX DROP : Queuing the index %s metadata to SMD failed with error %s", + imd.iname, as_sindex_err_str(res)); + INFO_COMMAND_SINDEX_FAILCODE(AS_PROTO_RESULT_FAIL_PARAMETER, as_sindex_err_str(res)); + goto ERR; + } + } + else if(is_smd_op == false) + { + cf_info(AS_INFO, "SINDEX DROP : Request received for %s:%s via info", imd.ns_name, imd.iname); + res = as_sindex_destroy(ns, &imd); + if (0 != res) { + cf_warning(AS_INFO, "SINDEX DROP : Failed with error %s for index %s", + as_sindex_err_str(res), imd.iname); + INFO_COMMAND_SINDEX_FAILCODE(as_sindex_err_to_clienterr(res, __FILE__, __LINE__), + as_sindex_err_str(res)); + goto ERR; + } + } + + cf_dyn_buf_append_string(db, "OK"); +ERR: + as_sindex_imd_free(&imd); + return 0; +} + +int +as_info_parse_ns_iname(char* params, as_namespace ** ns, char ** iname, cf_dyn_buf* db, char * sindex_cmd) +{ + char 
ns_str[AS_ID_NAMESPACE_SZ]; + int ns_len = sizeof(ns_str); + int ret = 0; + + ret = as_info_parameter_get(params, "ns", ns_str, &ns_len); + if (ret) { + if (ret == -2) { + cf_warning(AS_INFO, "%s : namespace name exceeds max length %d", + sindex_cmd, AS_ID_NAMESPACE_SZ); + INFO_COMMAND_SINDEX_FAILCODE(AS_PROTO_RESULT_FAIL_PARAMETER, + "Namespace name exceeds max length"); + } + else { + cf_warning(AS_INFO, "%s : invalid namespace %s", sindex_cmd, ns_str); + INFO_COMMAND_SINDEX_FAILCODE(AS_PROTO_RESULT_FAIL_PARAMETER, + "Namespace Not Specified"); + } + return -1; + } + + *ns = as_namespace_get_byname(ns_str); + if (!*ns) { + cf_warning(AS_INFO, "%s : namespace %s not found", sindex_cmd, ns_str); + INFO_COMMAND_SINDEX_FAILCODE(AS_PROTO_RESULT_FAIL_PARAMETER, + "Namespace Not Found"); + return -1; + } + + // get indexname + char index_name_str[AS_ID_INAME_SZ]; + int index_len = sizeof(index_name_str); + ret = as_info_parameter_get(params, "indexname", index_name_str, &index_len); + if (ret) { + if (ret == -2) { + cf_warning(AS_INFO, "%s : indexname exceeds max length %d", sindex_cmd, AS_ID_INAME_SZ); + INFO_COMMAND_SINDEX_FAILCODE(AS_PROTO_RESULT_FAIL_PARAMETER, + "Index Name exceeds max length"); + } + else { + cf_warning(AS_INFO, "%s : invalid indexname %s", sindex_cmd, index_name_str); + INFO_COMMAND_SINDEX_FAILCODE(AS_PROTO_RESULT_FAIL_PARAMETER, + "Index Name Not Specified"); + } + return -1; + } + + cf_info(AS_SINDEX, "%s : received request on index %s - namespace %s", + sindex_cmd, index_name_str, ns_str); + + *iname = cf_strdup(index_name_str); + + return 0; +} + +int info_command_abort_scan(char *name, char *params, cf_dyn_buf *db) { + char context[100]; + int context_len = sizeof(context); + int rv = -1; + if (0 == as_info_parameter_get(params, "id", context, &context_len)) { + uint64_t trid; + trid = strtoull(context, NULL, 10); + if (trid != 0) { + rv = as_scan_abort(trid); + } + } + + if (rv != 0) { + cf_dyn_buf_append_string(db, "ERROR:"); + 
cf_dyn_buf_append_int(db, AS_PROTO_RESULT_FAIL_NOT_FOUND); + cf_dyn_buf_append_string(db, ":Transaction Not Found"); + } + else { + cf_dyn_buf_append_string(db, "OK"); + } + + return 0; +} + +int info_command_abort_all_scans(char *name, char *params, cf_dyn_buf *db) { + + int n_scans_killed = as_scan_abort_all(); + + cf_dyn_buf_append_string(db, "OK - number of scans killed: "); + cf_dyn_buf_append_int(db, n_scans_killed); + + return 0; +} + +int info_command_query_kill(char *name, char *params, cf_dyn_buf *db) { + char context[100]; + int context_len = sizeof(context); + int rv = AS_QUERY_ERR; + if (0 == as_info_parameter_get(params, "trid", context, &context_len)) { + uint64_t trid; + trid = strtoull(context, NULL, 10); + if (trid != 0) { + rv = as_query_kill(trid); + } + } + + if (AS_QUERY_OK != rv) { + cf_dyn_buf_append_string(db, "Transaction Not Found"); + } + else { + cf_dyn_buf_append_string(db, "Ok"); + } + + return 0; + + + +} +int info_command_sindex_stat(char *name, char *params, cf_dyn_buf *db) { + as_namespace *ns = NULL; + char * iname = NULL; + + if (as_info_parse_ns_iname(params, &ns, &iname, db, "SINDEX STAT")) { + return 0; + } + + int resp = as_sindex_stats_str(ns, iname, db); + if (resp) { + cf_warning(AS_INFO, "SINDEX STAT : for index %s - ns %s failed with error %d", + iname, ns->name, resp); + INFO_COMMAND_SINDEX_FAILCODE( + as_sindex_err_to_clienterr(resp, __FILE__, __LINE__), + as_sindex_err_str(resp)); + } + + if (iname) { + cf_free(iname); + } + return(0); +} + + +// sindex-histogram:ns=test_D;indexname=indname;enable=true/false +int info_command_sindex_histogram(char *name, char *params, cf_dyn_buf *db) +{ + as_namespace * ns = NULL; + char * iname = NULL; + if (as_info_parse_ns_iname(params, &ns, &iname, db, "SINDEX HISTOGRAM")) { + return 0; + } + + char op[10]; + int op_len = sizeof(op); + + if (as_info_parameter_get(params, "enable", op, &op_len)) { + cf_info(AS_INFO, "SINDEX HISTOGRAM : invalid OP"); + cf_dyn_buf_append_string(db, 
"Invalid Op"); + goto END; + } + + bool enable = false; + if (!strncmp(op, "true", 5) && op_len != 5) { + enable = true; + } + else if (!strncmp(op, "false", 6) && op_len != 6) { + enable = false; + } + else { + cf_info(AS_INFO, "SINDEX HISTOGRAM : invalid OP"); + cf_dyn_buf_append_string(db, "Invalid Op"); + goto END; + } + + int resp = as_sindex_histogram_enable(ns, iname, enable); + if (resp) { + cf_warning(AS_INFO, "SINDEX HISTOGRAM : for index %s - ns %s failed with error %d", + iname, ns->name, resp); + INFO_COMMAND_SINDEX_FAILCODE( + as_sindex_err_to_clienterr(resp, __FILE__, __LINE__), + as_sindex_err_str(resp)); + } else { + cf_dyn_buf_append_string(db, "Ok"); + cf_info(AS_INFO, "SINDEX HISTOGRAM : for index %s - ns %s histogram is set as %s", + iname, ns->name, op); + } + +END: + if (iname) { + cf_free(iname); + } + return(0); +} + +int info_command_sindex_list(char *name, char *params, cf_dyn_buf *db) { + bool listall = true; + char ns_str[128]; + int ns_len = sizeof(ns_str); + if (!as_info_parameter_get(params, "ns", ns_str, &ns_len)) { + listall = false; + } + + if (listall) { + bool found = false; + for (int i = 0; i < g_config.n_namespaces; i++) { + as_namespace *ns = g_config.namespaces[i]; + if (ns) { + if (!as_sindex_list_str(ns, db)) { + found = true; + } + else { + cf_detail(AS_INFO, "No indexes for namespace %s", ns->name); + } + } + } + + if (found) { + cf_dyn_buf_chomp(db); + } + else { + cf_dyn_buf_append_string(db, "Empty"); + } + } + else { + as_namespace *ns = as_namespace_get_byname(ns_str); + if (!ns) { + cf_warning(AS_INFO, "SINDEX LIST : ns %s not found", ns_str); + INFO_COMMAND_SINDEX_FAILCODE(AS_PROTO_RESULT_FAIL_PARAMETER, "Namespace Not Found"); + return 0; + } else { + if (as_sindex_list_str(ns, db)) { + cf_info(AS_INFO, "ns not found"); + cf_dyn_buf_append_string(db, "Empty"); + } + return 0; + } + } + return(0); +} + +// Defined in "make_in/version.c" (auto-generated by the build system.) 
+extern const char aerospike_build_id[]; +extern const char aerospike_build_time[]; +extern const char aerospike_build_type[]; +extern const char aerospike_build_os[]; +extern const char aerospike_build_features[]; + +int +as_info_init() +{ + // g_info_node_info_history_hash is a hash of all nodes that have ever been + // recognized by this node - either via paxos or info messages. + g_info_node_info_history_hash = cf_shash_create(cf_nodeid_shash_fn, sizeof(cf_node), sizeof(info_node_info), 64, CF_SHASH_BIG_LOCK); + + // g_info_node_info_hash is a hash of all nodes *currently* in the cluster. + // This hash should *always* be a subset of g_info_node_info_history_hash - + // to ensure this, you should take the lock on the corresponding key in + // info_history_hash before modifying an element in this hash table. This + // hash is used to create the services list. + g_info_node_info_hash = cf_shash_create(cf_nodeid_shash_fn, sizeof(cf_node), sizeof(info_node_info), 64, CF_SHASH_BIG_LOCK); + + // create worker threads + g_info_work_q = cf_queue_create(sizeof(as_info_transaction), true); + + char vstr[64]; + sprintf(vstr, "%s build %s", aerospike_build_type, aerospike_build_id); + + // Set some basic values + as_info_set("version", vstr, true); // Returns the edition and build number. + as_info_set("build", aerospike_build_id, true); // Returns the build number for this server. + as_info_set("build_os", aerospike_build_os, true); // Return the OS used to create this build. + as_info_set("build_time", aerospike_build_time, true); // Return the creation time of this build. + as_info_set("edition", aerospike_build_type, true); // Return the edition of this build. + as_info_set("digests", "RIPEMD160", false); // Returns the hashing algorithm used by the server for key hashing. + as_info_set("status", "ok", false); // Always returns ok, used to verify service port is open. + as_info_set("STATUS", "OK", false); // Always returns OK, used to verify service port is open. 
+ + char istr[1024]; + cf_str_itoa(AS_PARTITIONS, istr, 10); + as_info_set("partitions", istr, false); // Returns the number of partitions used to hash keys across. + + cf_str_itoa_u64(g_config.self_node, istr, 16); + as_info_set("node", istr, true); // Node ID. Unique 15 character hex string for each node based on the mac address and port. + as_info_set("name", istr, false); // Alias to 'node'. + // Returns list of features supported by this server + static char features[1024]; + strcat(features, "peers;cdt-list;cdt-map;pipelining;geo;float;batch-index;replicas;replicas-all;replicas-master;replicas-prole;udf"); + strcat(features, aerospike_build_features); + as_info_set("features", features, true); + as_hb_mode hb_mode; + as_hb_info_listen_addr_get(&hb_mode, istr, sizeof(istr)); + as_info_set( hb_mode == AS_HB_MODE_MESH ? "mesh" : "mcast", istr, false); + + // All commands accepted by asinfo/telnet + as_info_set("help", "alloc-info;asm;bins;build;build_os;build_time;cluster-name;config-get;config-set;" + "df;digests;dump-cluster;dump-fabric;dump-hb;dump-migrates;dump-msgs;dump-rw;" + "dump-si;dump-skew;dump-smd;dump-wb;dump-wb-summary;feature-key;get-config;get-sl;hist-dump;" + "hist-track-start;hist-track-stop;jem-stats;jobs;latency;log;log-set;" + "log-message;logs;mcast;mem;mesh;mstats;mtrace;name;namespace;namespaces;node;" + "racks;recluster;revive;roster;roster-set;service;services;services-alumni;services-alumni-reset;set-config;" + "set-log;sets;set-sl;show-devices;sindex;sindex-create;sindex-delete;" + "sindex-histogram;" + "smd;statistics;status;tip;tip-clear;truncate;truncate-undo;version;", + false); + /* + * help intentionally does not include the following: + * cluster-generation;features;objects; + * partition-generation;partition-info;partitions;replicas-master; + * replicas-prole;replicas-read;replicas-write;throughput + */ + + // Set up some dynamic functions + as_info_set_dynamic("alumni-clear-std", info_get_alumni_clear_std, false); // 
Supersedes "services-alumni" for non-TLS service. + as_info_set_dynamic("alumni-tls-std", info_get_alumni_tls_std, false); // Supersedes "services-alumni" for TLS service. + as_info_set_dynamic("bins", info_get_bins, false); // Returns bin usage information and used bin names. + as_info_set_dynamic("cluster-generation", info_get_cluster_generation, true); // Returns cluster generation. + as_info_set_dynamic("cluster-name", info_get_cluster_name, false); // Returns cluster name. + as_info_set_dynamic("endpoints", info_get_endpoints, false); // Returns the expanded bind / access address configuration. + as_info_set_dynamic("feature-key", info_get_features, false); // Returns the contents of the feature key (except signature). + as_info_set_dynamic("get-config", info_get_config, false); // Returns running config for specified context. + as_info_set_dynamic("logs", info_get_logs, false); // Returns a list of log file locations in use by this server. + as_info_set_dynamic("namespaces", info_get_namespaces, false); // Returns a list of namespace defined on this server. + as_info_set_dynamic("objects", info_get_objects, false); // Returns the number of objects stored on this server. + as_info_set_dynamic("partition-generation", info_get_partition_generation, true); // Returns the current partition generation. + as_info_set_dynamic("partition-info", info_get_partition_info, false); // Returns partition ownership information. + as_info_set_dynamic("peers-clear-alt", info_get_services_clear_alt, false); // Supersedes "services-alternate" for non-TLS, alternate addresses. + as_info_set_dynamic("peers-clear-std", info_get_services_clear_std, false); // Supersedes "services" for non-TLS, standard addresses. + as_info_set_dynamic("peers-generation", info_get_services_generation, false); // Returns the generation of the peers-*-* services lists. 
+ as_info_set_dynamic("peers-tls-alt", info_get_services_tls_alt, false); // Supersedes "services-alternate" for TLS, alternate addresses. + as_info_set_dynamic("peers-tls-std", info_get_services_tls_std, false); // Supersedes "services" for TLS, standard addresses. + as_info_set_dynamic("replicas", info_get_replicas, false); // Same as replicas-all, but includes regime. + as_info_set_dynamic("replicas-all", info_get_replicas_all, false); // Base 64 encoded binary representation of partitions this node is replica for. + as_info_set_dynamic("replicas-master", info_get_replicas_master, false); // Base 64 encoded binary representation of partitions this node is master (replica) for. + as_info_set_dynamic("replicas-prole", info_get_replicas_prole, false); // Base 64 encoded binary representation of partitions this node is prole (replica) for. + as_info_set_dynamic("service", info_get_service, false); // IP address and server port for this node, expected to be a single. + // address/port per node, may be multiple address if this node is configured. + // to listen on multiple interfaces (typically not advised). + as_info_set_dynamic("service-clear-alt", info_get_service_clear_alt, false); // Supersedes "service". The alternate address and port for this node's non-TLS + // client service. + as_info_set_dynamic("service-clear-std", info_get_service_clear_std, false); // Supersedes "service". The address and port for this node's non-TLS client service. + as_info_set_dynamic("service-tls-alt", info_get_service_tls_alt, false); // Supersedes "service". The alternate address and port for this node's TLS + // client service. + as_info_set_dynamic("service-tls-std", info_get_service_tls_std, false); // Supersedes "service". The address and port for this node's TLS client service. + as_info_set_dynamic("services", info_get_services, true); // List of addresses of neighbor cluster nodes to advertise for Application to connect. 
+ as_info_set_dynamic("services-alternate", info_get_alt_addr, false); // IP address mapping from internal to public ones + as_info_set_dynamic("services-alumni", info_get_services_alumni, true); // All neighbor addresses (services) this server has ever know about. + as_info_set_dynamic("services-alumni-reset", info_services_alumni_reset, false); // Reset the services alumni to equal services. + as_info_set_dynamic("sets", info_get_sets, false); // Returns set statistics for all or a particular set. + as_info_set_dynamic("statistics", info_get_stats, true); // Returns system health and usage stats for this server. + +#ifdef INFO_SEGV_TEST + as_info_set_dynamic("segvtest", info_segv_test, true); +#endif + + // Tree-based names + as_info_set_tree("bins", info_get_tree_bins); // Returns bin usage information and used bin names for all or a particular namespace. + as_info_set_tree("log", info_get_tree_log); // + as_info_set_tree("namespace", info_get_tree_namespace); // Returns health and usage stats for a particular namespace. + as_info_set_tree("sets", info_get_tree_sets); // Returns set statistics for all or a particular set. + as_info_set_tree("statistics", info_get_tree_statistics); + + // Define commands + as_info_set_command("config-get", info_command_config_get, PERM_NONE); // Returns running config for specified context. + as_info_set_command("config-set", info_command_config_set, PERM_SET_CONFIG); // Set a configuration parameter at run time, configuration parameter must be dynamic. + as_info_set_command("dump-cluster", info_command_dump_cluster, PERM_LOGGING_CTRL); // Print debug information about clustering and exchange to the log file. + as_info_set_command("dump-fabric", info_command_dump_fabric, PERM_LOGGING_CTRL); // Print debug information about fabric to the log file. + as_info_set_command("dump-hb", info_command_dump_hb, PERM_LOGGING_CTRL); // Print debug information about heartbeat state to the log file. 
+ as_info_set_command("dump-hlc", info_command_dump_hlc, PERM_LOGGING_CTRL); // Print debug information about Hybrid Logical Clock to the log file. + as_info_set_command("dump-migrates", info_command_dump_migrates, PERM_LOGGING_CTRL); // Print debug information about migration. + as_info_set_command("dump-msgs", info_command_dump_msgs, PERM_LOGGING_CTRL); // Print debug information about existing 'msg' objects and queues to the log file. + as_info_set_command("dump-rw", info_command_dump_rw_request_hash, PERM_LOGGING_CTRL); // Print debug information about transaction hash table to the log file. + as_info_set_command("dump-si", info_command_dump_si, PERM_LOGGING_CTRL); // Print information about a Secondary Index + as_info_set_command("dump-skew", info_command_dump_skew, PERM_LOGGING_CTRL); // Print information about clock skew + as_info_set_command("dump-smd", info_command_dump_smd, PERM_LOGGING_CTRL); // Print information about System Metadata (SMD) to the log file. + as_info_set_command("dump-wb", info_command_dump_wb, PERM_LOGGING_CTRL); // Print debug information about Write Bocks (WB) to the log file. + as_info_set_command("dump-wb-summary", info_command_dump_wb_summary, PERM_LOGGING_CTRL); // Print summary information about all Write Blocks (WB) on a device to the log file. + as_info_set_command("get-config", info_command_config_get, PERM_NONE); // Returns running config for all or a particular context. + as_info_set_command("get-sl", info_command_get_sl, PERM_NONE); // Get the Paxos succession list. + as_info_set_command("hist-dump", info_command_hist_dump, PERM_NONE); // Returns a histogram snapshot for a particular histogram. + as_info_set_command("hist-track-start", info_command_hist_track, PERM_SERVICE_CTRL); // Start or Restart histogram tracking. + as_info_set_command("hist-track-stop", info_command_hist_track, PERM_SERVICE_CTRL); // Stop histogram tracking. 
+ as_info_set_command("jem-stats", info_command_jem_stats, PERM_LOGGING_CTRL); // Print JEMalloc statistics to the log file. + as_info_set_command("latency", info_command_hist_track, PERM_NONE); // Returns latency and throughput information. + as_info_set_command("log-message", info_command_log_message, PERM_NONE); // Log a message. + as_info_set_command("log-set", info_command_log_set, PERM_LOGGING_CTRL); // Set values in the log system. + as_info_set_command("peers-clear-alt", info_get_services_clear_alt_delta, PERM_NONE); // The delta update version of "peers-clear-alt". + as_info_set_command("peers-clear-std", info_get_services_clear_std_delta, PERM_NONE); // The delta update version of "peers-clear-std". + as_info_set_command("peers-tls-alt", info_get_services_tls_alt_delta, PERM_NONE); // The delta update version of "peers-tls-alt". + as_info_set_command("peers-tls-std", info_get_services_tls_std_delta, PERM_NONE); // The delta update version of "peers-tls-std". + as_info_set_command("racks", info_command_racks, PERM_NONE); // Rack-aware information. + as_info_set_command("recluster", info_command_recluster, PERM_NONE); // Force cluster to re-form. FIXME - what permission? + as_info_set_command("revive", info_command_revive, PERM_NONE); // Mark all partitions as "trusted". + as_info_set_command("roster", info_command_roster, PERM_NONE); // Roster information. + as_info_set_command("roster-set", info_command_roster_set, PERM_NONE); // Set the entire roster. FIXME - what permission? + as_info_set_command("set-config", info_command_config_set, PERM_SET_CONFIG); // Set config values. + as_info_set_command("set-log", info_command_log_set, PERM_LOGGING_CTRL); // Set values in the log system. + as_info_set_command("show-devices", info_command_show_devices, PERM_LOGGING_CTRL); // Print snapshot of wblocks to the log file. + as_info_set_command("throughput", info_command_hist_track, PERM_NONE); // Returns throughput info. 
+ as_info_set_command("tip", info_command_tip, PERM_SERVICE_CTRL); // Add external IP to mesh-mode heartbeats. + as_info_set_command("tip-clear", info_command_tip_clear, PERM_SERVICE_CTRL); // Clear tip list from mesh-mode heartbeats. + as_info_set_command("truncate", info_command_truncate, PERM_TRUNCATE); // Truncate a namespace or set. + as_info_set_command("truncate-undo", info_command_truncate_undo, PERM_TRUNCATE); // Undo a truncate command. + as_info_set_command("xdr-command", as_info_command_xdr, PERM_SERVICE_CTRL); // Command to XDR module. + + // SINDEX + as_info_set_dynamic("sindex", info_get_sindexes, false); + as_info_set_tree("sindex", info_get_tree_sindexes); + as_info_set_command("sindex-create", info_command_sindex_create, PERM_INDEX_MANAGE); // Create a secondary index. + as_info_set_command("sindex-delete", info_command_sindex_delete, PERM_INDEX_MANAGE); // Delete a secondary index. + + // UDF + as_info_set_dynamic("udf-list", udf_cask_info_list, false); + as_info_set_command("udf-put", udf_cask_info_put, PERM_UDF_MANAGE); + as_info_set_command("udf-get", udf_cask_info_get, PERM_NONE); + as_info_set_command("udf-remove", udf_cask_info_remove, PERM_UDF_MANAGE); + as_info_set_command("udf-clear-cache", udf_cask_info_clear_cache, PERM_UDF_MANAGE); + + // JOBS + as_info_set_command("jobs", info_command_mon_cmd, PERM_JOB_MONITOR); // Manipulate the multi-key lookup monitoring infrastructure. + + // Undocumented Secondary Index Command + as_info_set_command("sindex-histogram", info_command_sindex_histogram, PERM_SERVICE_CTRL); + + as_info_set_dynamic("query-list", as_query_list, false); + as_info_set_command("query-kill", info_command_query_kill, PERM_QUERY_MANAGE); + as_info_set_command("scan-abort", info_command_abort_scan, PERM_SCAN_MANAGE); // Abort a scan with a given id. + as_info_set_command("scan-abort-all", info_command_abort_all_scans, PERM_SCAN_MANAGE); // Abort all scans. 
+ as_info_set_dynamic("scan-list", as_scan_list, false); // List info for all scan jobs. + as_info_set_command("sindex-stat", info_command_sindex_stat, PERM_NONE); + as_info_set_command("sindex-list", info_command_sindex_list, PERM_NONE); + as_info_set_dynamic("sindex-builder-list", as_sbld_list, false); // List info for all secondary index builder jobs. + + as_xdr_info_init(); + + // Spin up the Info threads *after* all static and dynamic Info commands have been added + // so we can guarantee that the static and dynamic lists will never again be changed. + pthread_attr_t thr_attr; + pthread_attr_init(&thr_attr); + pthread_attr_setdetachstate(&thr_attr, PTHREAD_CREATE_DETACHED); + + for (int i = 0; i < g_config.n_info_threads; i++) { + pthread_t tid; + if (0 != pthread_create(&tid, &thr_attr, thr_info_fn, (void *) 0 )) { + cf_crash(AS_INFO, "pthread_create: %s", cf_strerror(errno)); + } + } + + as_fabric_register_msg_fn(M_TYPE_INFO, info_mt, sizeof(info_mt), INFO_MSG_SCRATCH_SIZE, info_msg_fn, 0 /* udata */ ); + + as_exchange_register_listener(info_clustering_event_listener, NULL); + + // Initialize services info exchange machinery. + set_static_services(); + + if (g_config.tls_service.tls_our_name != NULL) { + g_serv_tls_name = g_config.tls_service.tls_our_name; + } + + ++g_serv_gen; + + pthread_t info_interfaces_th; + pthread_create(&info_interfaces_th, &thr_attr, info_interfaces_fn, 0); + return(0); +} diff --git a/as/src/base/thr_info_port.c b/as/src/base/thr_info_port.c new file mode 100644 index 00000000..bdc4c4fc --- /dev/null +++ b/as/src/base/thr_info_port.c @@ -0,0 +1,316 @@ +/* + * thr_info_port.c + * + * Copyright (C) 2008-2014 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. 
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+#include "base/thr_info_port.h"
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "citrusleaf/alloc.h"
+#include "citrusleaf/cf_atomic.h"
+
+#include "cf_str.h"
+#include "dynbuf.h"
+#include "fault.h"
+#include "socket.h"
+
+#include "base/cfg.h"
+#include "base/thr_info.h"
+
+// Maximum number of poll events fetched per cf_poll_wait() call.
+#define POLL_SZ 1024
+
+// State for any open info port.
+// One of these is allocated per accepted connection and released by
+// info_port_state_free().
+typedef struct {
+	int recv_pos; // number of request bytes currently held in recv_buf
+	int recv_alloc; // allocated size of recv_buf
+	uint8_t *recv_buf; // accumulates request bytes until a CR or LF is seen
+
+	int xmit_pos; // where we're currently writing
+	int xmit_limit; // the end of the write buffer
+	int xmit_alloc; // allocated size of xmit_buf
+	uint8_t *xmit_buf; // response bytes queued for sending
+
+	cf_socket sock; // the client connection
+
+} info_port_state;
+
+// Bind configuration and port for the info listener. The thread is not started
+// at all when g_info_port is 0.
+cf_serv_cfg g_info_bind = { .n_cfgs = 0 };
+cf_ip_port g_info_port = 0;
+
+// Listening sockets, populated by cf_socket_init_server().
+static cf_sockets g_sockets;
+
+// Using int for 4-byte size, but maintaining bool semantics.
+static volatile int g_started = false;
+
+// Release everything owned by a connection's state: both buffers, the socket,
+// and finally the state struct itself. The struct is filled with 0xFF bytes
+// before being freed.
+void
+info_port_state_free(info_port_state *ips)
+{
+	if (ips->recv_buf) cf_free(ips->recv_buf);
+	if (ips->xmit_buf) cf_free(ips->xmit_buf);
+	cf_socket_close(&ips->sock);
+	cf_socket_term(&ips->sock);
+	memset(ips, -1, sizeof(info_port_state));
+	cf_free(ips);
+}
+
+
+// Handle a readable event on a connection: pull in whatever bytes are
+// available and, once a CR or LF terminated request is buffered, run it through
+// as_info_buffer() and queue the response in the transmit buffer.
+//
+// Returns 0 on success (including "nothing to read yet"), -1 if the read
+// failed or the client sent a 0xFF byte. The caller closes the connection on a
+// negative return.
+//
+// NOTE(review): at most one request is processed per call, so additional
+// complete requests received in the same read stay buffered until the next
+// readable event - confirm this is acceptable with edge-triggered polling.
+int
+thr_info_port_readable(info_port_state *ips)
+{
+	int sz = cf_socket_available(&ips->sock);
+
+	if (sz == 0) {
+		return 0;
+	}
+
+	// Make sure we've got some reasonable space in the read buffer.
+	if (ips->recv_alloc - ips->recv_pos < sz) {
+		int new_sz = sz + ips->recv_pos + 100;
+		ips->recv_buf = cf_realloc(ips->recv_buf, new_sz);
+		ips->recv_alloc = new_sz;
+	}
+
+	int n = cf_socket_recv(&ips->sock, ips->recv_buf + ips->recv_pos, ips->recv_alloc - ips->recv_pos, 0);
+	if (n < 0) {
+		if (errno != EAGAIN) {
+			cf_detail(AS_INFO_PORT, "info socket: read fail: error: rv %d sz was %d errno %d", n, ips->recv_alloc - ips->recv_pos, errno);
+		}
+		return -1;
+	}
+	ips->recv_pos += n;
+
+	// What about a control-c?
+	if (-1 != cf_str_strnchr(ips->recv_buf, ips->recv_pos, 0xFF)) {
+		cf_debug(AS_INFO_PORT, "recived a control c, aborting");
+		return -1;
+	}
+
+	// See if we've got a CR or LF in the buf yet.
+	int cr = cf_str_strnchr(ips->recv_buf, ips->recv_pos, '\r');
+	int lf = cf_str_strnchr(ips->recv_buf, ips->recv_pos, '\n');
+	if ((cr >= 0) || (lf >= 0)) {
+		size_t len;
+		// Take the closest of cr or lf.
+		if (-1 == lf) {
+			len = cr;
+		}
+		else if (-1 == cr) {
+			len = lf;
+		}
+		else {
+			len = lf < cr ? lf : cr;
+		}
+
+		// We have a message. Process it.
+		cf_dyn_buf_define(db);
+
+		// Normalize the terminator to LF and include it in the request.
+		ips->recv_buf[len] = '\n';
+		len++;
+
+		// Fill out the db buffer with the response (always returns 0).
+		as_info_buffer(ips->recv_buf, len, &db);
+		if (db.used_sz == 0) cf_dyn_buf_append_char(&db, '\n');
+
+		// See if it has a tab, get that location. It probably does.
+		// After the increment, tab is the offset just past the first tab, or 0
+		// if there was none - only the bytes from there on are sent.
+		int tab = cf_str_strnchr(db.buf, db.used_sz , '\t');
+		tab++;
+
+		// Swallow any further CR/LF bytes directly following the request.
+		while (len < ips->recv_pos &&
+				((ips->recv_buf[len] == '\r') || (ips->recv_buf[len] == '\n'))) {
+
+			len ++ ;
+		}
+
+		// Shift any unconsumed bytes to the front of the receive buffer.
+		if (ips->recv_pos - len > 0) {
+			memmove(ips->recv_buf, ips->recv_buf + len, ips->recv_pos - len);
+			ips->recv_pos -= len;
+		}
+		else {
+			ips->recv_pos = 0;
+		}
+
+		// Queue the response - set to the xmit buf.
+		if (ips->xmit_alloc - ips->xmit_limit < db.used_sz) {
+			ips->xmit_buf = cf_realloc(ips->xmit_buf, db.used_sz + ips->xmit_limit);
+			ips->xmit_alloc = db.used_sz + ips->xmit_limit;
+		}
+		memcpy(ips->xmit_buf + ips->xmit_limit, db.buf + tab, db.used_sz - tab);
+		ips->xmit_limit += db.used_sz - tab;
+
+		cf_dyn_buf_free(&db);
+	}
+
+	return 0;
+}
+
+
+// Handle a writable event on a connection: send as much of the queued response
+// as the socket accepts, resetting the transmit buffer once it is fully
+// drained.
+//
+// Returns -1 only if the send failed with an errno other than EAGAIN, else 0.
+int
+thr_info_port_writable(info_port_state *ips)
+{
+	// Do we have bytes to write?
+	if (ips->xmit_limit > 0) {
+
+		// Write them!
+		int rv = cf_socket_send(&ips->sock, ips->xmit_buf + ips->xmit_pos, ips->xmit_limit - ips->xmit_pos , MSG_NOSIGNAL);
+		if (rv < 0) {
+			if (errno != EAGAIN) {
+				return -1;
+			}
+		}
+		else if (rv == 0) {
+			cf_debug(AS_INFO_PORT, "send with return value 0");
+			return 0;
+		}
+		else {
+			ips->xmit_pos += rv;
+			if (ips->xmit_pos == ips->xmit_limit) {
+				ips->xmit_pos = ips->xmit_limit = 0;
+			}
+		}
+	}
+
+	return 0;
+}
+
+
+// Demarshal info socket connections.
+// Thread body: sets up the listening sockets, then loops forever accepting
+// connections and dispatching read/write events to the handlers above. The
+// arg parameter is unused.
+void *
+thr_info_port_fn(void *arg)
+{
+	cf_poll poll;
+	cf_debug(AS_INFO_PORT, "Info port process started");
+
+	// Start the listener socket. Note that because this is done after privilege
+	// de-escalation, we can't use privileged ports.
+
+	if (cf_socket_init_server(&g_info_bind, &g_sockets) < 0) {
+		cf_crash(AS_INFO_PORT, "Couldn't initialize service sockets");
+	}
+
+	cf_poll_create(&poll);
+	cf_poll_add_sockets(poll, &g_sockets, EPOLLIN | EPOLLERR | EPOLLHUP);
+	cf_socket_show_server(AS_INFO_PORT, "info", &g_sockets);
+
+	// Lets as_info_port_start() return.
+	g_started = true;
+
+	while (true) {
+		cf_poll_event events[POLL_SZ];
+		int32_t n_ev = cf_poll_wait(poll, events, POLL_SZ, -1);
+
+		for (int32_t i = 0; i < n_ev; ++i) {
+			cf_socket *ssock = events[i].data;
+
+			// An event on one of the listening sockets is a new connection,
+			// anything else carries a connection's info_port_state.
+			if (cf_sockets_has_socket(&g_sockets, ssock)) {
+				cf_socket csock;
+				cf_sock_addr addr;
+
+				if (cf_socket_accept(ssock, &csock, &addr) < 0) {
+					// This means we're out of file descriptors.
+					if (errno == EMFILE) {
+						cf_warning(AS_INFO_PORT, "Too many file descriptors in use, consider raising limit");
+						continue;
+					}
+
+					cf_crash(AS_INFO_PORT, "cf_socket_accept() failed");
+				}
+
+				cf_detail(AS_INFO_PORT, "New connection: %s", cf_sock_addr_print(&addr));
+				info_port_state *ips = cf_malloc(sizeof(info_port_state));
+
+				ips->recv_pos = 0;
+				ips->recv_alloc = 100;
+				ips->recv_buf = cf_malloc(100);
+				ips->xmit_limit = ips->xmit_pos = 0;
+				ips->xmit_alloc = 100;
+				ips->xmit_buf = cf_malloc(100);
+				cf_socket_copy(&csock, &ips->sock);
+
+				cf_poll_add_socket(poll, &csock, EPOLLIN | EPOLLOUT | EPOLLET | EPOLLRDHUP, ips);
+			}
+			else {
+				info_port_state *ips = events[i].data;
+
+				if (ips == NULL) {
+					cf_crash(AS_INFO_PORT, "Event with null handle");
+				}
+
+				cf_detail(AS_INFO_PORT, "Events %x on FD %d", events[i].events, CSFD(&ips->sock));
+
+				if (events[i].events & (EPOLLRDHUP | EPOLLERR | EPOLLHUP)) {
+					cf_detail(AS_INFO_PORT, "Remote close on FD %d", CSFD(&ips->sock));
+					cf_poll_delete_socket(poll, &ips->sock);
+					info_port_state_free(ips);
+					continue;
+				}
+
+				// Read first, so a response queued by the read can be sent by
+				// the write check below within the same event.
+				if ((events[i].events & EPOLLIN) != 0 && thr_info_port_readable(ips) < 0) {
+					cf_poll_delete_socket(poll, &ips->sock);
+					info_port_state_free(ips);
+					continue;
+				}
+
+				if ((events[i].events & EPOLLOUT) != 0 && thr_info_port_writable(ips) < 0) {
+					cf_poll_delete_socket(poll, &ips->sock);
+					info_port_state_free(ips);
+					continue;
+				}
+			}
+
+			pthread_testcancel();
+		}
+	}
+
+	return NULL;
+}
+
+
+// Start the detached info port thread, if an info port is configured, and
+// block until the thread has set up its listening sockets.
+void
+as_info_port_start()
+{
+	if (g_info_port == 0) {
+		return;
+	}
+
+	cf_info(AS_INFO_PORT, "starting info port thread");
+
+	pthread_t thread;
+	pthread_attr_t attrs;
+
+	pthread_attr_init(&attrs);
+	pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED);
+
+	if (pthread_create(&thread, &attrs, thr_info_port_fn, NULL) != 0) {
+		cf_crash(AS_INFO_PORT, "failed to create info port thread");
+	}
+
+	// The attributes object is only needed for the create call - release it.
+	pthread_attr_destroy(&attrs);
+
+	// For orderly startup log, wait for endpoint setup.
+	while (! g_started) {
+		usleep(1000);
+	}
+}
diff --git a/as/src/base/thr_nsup.c b/as/src/base/thr_nsup.c
new file mode 100644
index 00000000..525c9c91
--- /dev/null
+++ b/as/src/base/thr_nsup.c
@@ -0,0 +1,1276 @@
+/*
+ * thr_nsup.c
+ *
+ * Copyright (C) 2008-2016 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program.
If not, see http://www.gnu.org/licenses/ + */ + +/* + * namespace supervisor + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include // for MIN and MAX + +#include "citrusleaf/alloc.h" +#include "citrusleaf/cf_atomic.h" +#include "citrusleaf/cf_clock.h" +#include "citrusleaf/cf_digest.h" +#include "citrusleaf/cf_queue.h" + +#include "fault.h" +#include "hardware.h" +#include "linear_hist.h" +#include "vmapx.h" + +#include "base/cfg.h" +#include "base/datamodel.h" +#include "base/index.h" +#include "base/proto.h" +#include "base/thr_sindex.h" +#include "base/thr_tsvc.h" +#include "base/transaction.h" +#include "base/xdr_serverside.h" +#include "fabric/partition.h" +#include "storage/storage.h" + + +//========================================================== +// Typedefs & constants. +// + +#define EVAL_STOP_WRITES_PERIOD 10 // seconds + + +//========================================================== +// Forward declarations. +// + +static bool eval_stop_writes(as_namespace *ns); +static bool eval_hwm_breached(as_namespace *ns); + + +//========================================================== +// Eviction during cold start. +// +// No real need for this to be in thr_nsup.c, except maybe +// for convenient comparison to run-time eviction. +// + +#define EVAL_WRITE_STATE_FREQUENCY 1024 +#define COLD_START_HIST_MIN_BUCKETS 100000 // histogram memory is transient + + +//------------------------------------------------ +// Reduce callback prepares for cold start eviction. 
+// - builds cold start eviction histogram
+//
+typedef struct cold_start_evict_prep_info_s {
+	as_namespace* ns;
+	linear_hist* hist;
+	bool* sets_not_evicting;
+} cold_start_evict_prep_info;
+
+// Per-record callback: adds the record's void-time to the histogram when the
+// record has a void-time and its set has not opted out of eviction.
+static void
+cold_start_evict_prep_reduce_cb(as_index_ref* r_ref, void* udata)
+{
+	cold_start_evict_prep_info* prep = (cold_start_evict_prep_info*)udata;
+	as_index* rec = r_ref->r;
+	uint32_t set_id = as_index_get_set_id(rec);
+	uint32_t void_time = rec->void_time;
+
+	bool evictable = void_time != 0 && ! prep->sets_not_evicting[set_id];
+
+	if (evictable) {
+		linear_hist_insert_data_point(prep->hist, void_time);
+	}
+
+	as_record_done(r_ref, prep->ns);
+}
+
+//------------------------------------------------
+// Threads prepare for cold start eviction.
+//
+typedef struct evict_prep_thread_info_s {
+	as_namespace* ns;
+	cf_atomic32* p_pid;
+	uint32_t i_cpu;
+	linear_hist* hist;
+	bool* sets_not_evicting;
+} evict_prep_thread_info;
+
+// Thread body: pins itself to its assigned CPU, then keeps claiming partition
+// ids from the shared counter and reducing them into this thread's histogram
+// until all partitions are taken.
+void*
+run_cold_start_evict_prep(void* udata)
+{
+	evict_prep_thread_info* thread_info = (evict_prep_thread_info*)udata;
+
+	cf_topo_pin_to_cpu((cf_topo_cpu_index)thread_info->i_cpu);
+
+	as_namespace* ns = thread_info->ns;
+
+	cold_start_evict_prep_info cb_info = {
+		.ns = ns,
+		.hist = thread_info->hist,
+		.sets_not_evicting = thread_info->sets_not_evicting
+	};
+
+	for (;;) {
+		int pid = (int)cf_atomic32_incr(thread_info->p_pid);
+
+		if (pid >= AS_PARTITIONS) {
+			break;
+		}
+
+		// Don't bother with partition reservations - it's startup.
+		as_index_reduce_live(ns->partitions[pid].vp, cold_start_evict_prep_reduce_cb, &cb_info);
+	}
+
+	return NULL;
+}
+
+//------------------------------------------------
+// Reduce callback evicts records on cold start.
+// - evicts based on calculated threshold
+//
+typedef struct cold_start_evict_info_s {
+	as_namespace* ns;
+	as_partition* p_partition;
+	bool* sets_not_evicting;
+	uint32_t num_evicted;
+	uint32_t num_0_void_time;
+} cold_start_evict_info;
+
+// Per-record callback: counts records without a void-time, and deletes records
+// whose void-time is below the namespace's cold start threshold unless their
+// set has opted out of eviction.
+static void
+cold_start_evict_reduce_cb(as_index_ref* r_ref, void* udata)
+{
+	cold_start_evict_info* evict = (cold_start_evict_info*)udata;
+	as_index* rec = r_ref->r;
+	as_namespace* ns = evict->ns;
+	as_partition* partition = evict->p_partition;
+	uint32_t set_id = as_index_get_set_id(rec);
+	uint32_t void_time = rec->void_time;
+
+	if (void_time == 0) {
+		evict->num_0_void_time++;
+	}
+	else if (! evict->sets_not_evicting[set_id] &&
+			void_time < ns->cold_start_threshold_void_time) {
+		as_index_delete(partition->vp, &rec->keyd);
+		evict->num_evicted++;
+	}
+
+	as_record_done(r_ref, ns);
+}
+
+//------------------------------------------------
+// Threads do cold start eviction.
+//
+typedef struct evict_thread_info_s {
+	as_namespace* ns;
+	cf_atomic32 pid;
+	cf_atomic32 i_cpu;
+	bool* sets_not_evicting;
+	cf_atomic32 total_evicted;
+	cf_atomic32 total_0_void_time;
+} evict_thread_info;
+
+// Thread body: claims a CPU index and pins to it, then keeps claiming
+// partition ids from the shared counter and evicting from them. The thread's
+// local counts are folded into the shared totals once it runs out of
+// partitions.
+void*
+run_cold_start_evict(void* udata)
+{
+	evict_thread_info* shared = (evict_thread_info*)udata;
+
+	cf_topo_pin_to_cpu((cf_topo_cpu_index)cf_atomic32_incr(&shared->i_cpu));
+
+	as_namespace* ns = shared->ns;
+
+	cold_start_evict_info cb_info = {
+		.ns = ns,
+		.sets_not_evicting = shared->sets_not_evicting,
+		.num_evicted = 0,
+		.num_0_void_time = 0
+	};
+
+	for (;;) {
+		int pid = (int)cf_atomic32_incr(&shared->pid);
+
+		if (pid >= AS_PARTITIONS) {
+			break;
+		}
+
+		// Don't bother with partition reservations - it's startup.
+		cb_info.p_partition = &ns->partitions[pid];
+		as_index_reduce_live(cb_info.p_partition->vp, cold_start_evict_reduce_cb, &cb_info);
+	}
+
+	cf_atomic32_add(&shared->total_evicted, cb_info.num_evicted);
+	cf_atomic32_add(&shared->total_0_void_time, cb_info.num_0_void_time);
+
+	return NULL;
+}
+
+//------------------------------------------------
+// Get the cold start histogram's TTL range.
+//
+// TODO - ttl_range to 32 bits?
+static uint64_t
+get_cold_start_ttl_range(as_namespace* ns, uint32_t now)
+{
+	// Largest void-time recorded across all partitions.
+	uint64_t latest = 0;
+
+	for (int pid = 0; pid < AS_PARTITIONS; pid++) {
+		uint64_t candidate = cf_atomic64_get(ns->partitions[pid].max_void_time);
+
+		if (latest < candidate) {
+			latest = candidate;
+		}
+	}
+
+	// Use max-ttl to cap the namespace maximum void-time.
+	uint64_t cap = now + ns->max_ttl;
+
+	if (cap < latest) {
+		latest = cap;
+	}
+
+	// Convert to TTL - used for cold start histogram range.
+	if (latest <= now) {
+		return 0;
+	}
+
+	return latest - now;
+}
+
+//------------------------------------------------
+// Set cold start eviction threshold.
+//
+// Stores the threshold void-time in ns->cold_start_threshold_void_time and
+// returns the number of records below it. Returns 0, leaving the stored
+// threshold untouched, when nothing is below the threshold or when the
+// threshold would cover every eligible record.
+//
+static uint64_t
+set_cold_start_threshold(as_namespace* ns, linear_hist* hist)
+{
+	linear_hist_threshold threshold;
+	uint64_t subtotal = linear_hist_get_threshold_for_fraction(hist, ns->evict_tenths_pct, &threshold);
+	// A threshold value of 0xFFFFffff is treated as "all buckets selected".
+	bool all_buckets = threshold.value == 0xFFFFffff;
+
+	if (subtotal == 0) {
+		if (all_buckets) {
+			cf_warning(AS_NSUP, "{%s} cold start found no records eligible for eviction", ns->name);
+		}
+		else {
+			cf_warning(AS_NSUP, "{%s} cold start found no records below eviction void-time %u - threshold bucket %u, width %u sec, count %lu > target %lu (%.1f pct)",
+					ns->name, threshold.value, threshold.bucket_index,
+					threshold.bucket_width, threshold.bucket_count,
+					threshold.target_count, (float)ns->evict_tenths_pct / 10.0);
+		}
+
+		return 0;
+	}
+
+	if (all_buckets) {
+		cf_warning(AS_NSUP, "{%s} cold start would evict all %lu records eligible - not evicting!", ns->name, subtotal);
+		return 0;
+	}
+
+	cf_atomic32_set(&ns->cold_start_threshold_void_time, threshold.value);
+
+	return subtotal;
+}
+
+//------------------------------------------------
+// Cold start eviction, called by drv_ssd.c.
+// Returns false if a serious problem occurred and
+// we can't proceed.
+//
+// The whole function runs under ns->cold_start_evict_lock - every return path
+// below must release it.
+//
+bool
+as_cold_start_evict_if_needed(as_namespace* ns)
+{
+	pthread_mutex_lock(&ns->cold_start_evict_lock);
+
+	// Only go further than here once every EVAL_WRITE_STATE_FREQUENCY (1024)
+	// record add attempts.
+	if (ns->cold_start_record_add_count++ % EVAL_WRITE_STATE_FREQUENCY != 0) {
+		pthread_mutex_unlock(&ns->cold_start_evict_lock);
+		return true;
+	}
+
+	uint32_t now = as_record_void_time_get();
+
+	// Update threshold void-time if we're past it.
+	if (now > cf_atomic32_get(ns->cold_start_threshold_void_time)) {
+		cf_atomic32_set(&ns->cold_start_threshold_void_time, now);
+	}
+
+	// Are we out of control?
+	if (eval_stop_writes(ns)) {
+		cf_warning(AS_NSUP, "{%s} hit stop-writes limit", ns->name);
+		pthread_mutex_unlock(&ns->cold_start_evict_lock);
+		return false;
+	}
+
+	// If we don't need to evict, we're done.
+	if (! eval_hwm_breached(ns)) {
+		pthread_mutex_unlock(&ns->cold_start_evict_lock);
+		return true;
+	}
+
+	// We want to evict, but are we allowed to do so?
+	if (! g_config.nsup_startup_evict) {
+		cf_warning(AS_NSUP, "{%s} hwm breached but not allowed to evict", ns->name);
+		pthread_mutex_unlock(&ns->cold_start_evict_lock);
+		return true;
+	}
+
+	// We may evict - set up the cold start eviction histogram.
+	cf_info(AS_NSUP, "{%s} cold start building eviction histogram ...", ns->name);
+
+	uint32_t ttl_range = (uint32_t)get_cold_start_ttl_range(ns, now);
+	uint32_t n_buckets = MAX(ns->evict_hist_buckets, COLD_START_HIST_MIN_BUCKETS);
+
+	// Build a lookup, indexed by set-id, of the sets with eviction disabled.
+	// Set-ids are the vmap index plus one; slot 0 is never set here.
+	uint32_t num_sets = cf_vmapx_count(ns->p_sets_vmap);
+	bool sets_not_evicting[AS_SET_MAX_COUNT + 1];
+
+	memset(sets_not_evicting, 0, sizeof(sets_not_evicting));
+
+	for (uint32_t j = 0; j < num_sets; j++) {
+		uint32_t set_id = j + 1;
+		as_set* p_set;
+
+		if (cf_vmapx_get_by_index(ns->p_sets_vmap, j, (void**)&p_set) != CF_VMAPX_OK) {
+			cf_crash(AS_NSUP, "failed to get set index %u from vmap", j);
+		}
+
+		if (IS_SET_EVICTION_DISABLED(p_set)) {
+			sets_not_evicting[set_id] = true;
+		}
+	}
+
+	// Split these tasks across multiple threads.
+	uint32_t n_cpus = cf_topo_count_cpus();
+	pthread_t evict_threads[n_cpus];
+
+	// Reduce all partitions to build the eviction histogram.
+	// Each thread fills its own histogram; all threads share the pid counter,
+	// which starts at -1 so the first increment yields partition 0.
+	evict_prep_thread_info prep_thread_infos[n_cpus];
+	cf_atomic32 pid = -1;
+
+	for (uint32_t n = 0; n < n_cpus; n++) {
+		prep_thread_infos[n].ns = ns;
+		prep_thread_infos[n].p_pid = &pid;
+		prep_thread_infos[n].i_cpu = n;
+		prep_thread_infos[n].hist = linear_hist_create("thread-hist", now, ttl_range, n_buckets);
+		prep_thread_infos[n].sets_not_evicting = sets_not_evicting;
+
+		if (pthread_create(&evict_threads[n], NULL, run_cold_start_evict_prep, (void*)&prep_thread_infos[n]) != 0) {
+			cf_crash(AS_NSUP, "{%s} failed to create evict-prep thread %u", ns->name, n);
+		}
+	}
+
+	// Join every thread, merging histograms 1..n-1 into thread 0's histogram
+	// and destroying them as we go.
+	for (uint32_t n = 0; n < n_cpus; n++) {
+		pthread_join(evict_threads[n], NULL);
+
+		if (n == 0) {
+			continue;
+		}
+
+		linear_hist_merge(prep_thread_infos[0].hist, prep_thread_infos[n].hist);
+		linear_hist_destroy(prep_thread_infos[n].hist);
+	}
+	// Now we're single-threaded again.
+
+	// Calculate the eviction threshold.
+	uint64_t n_evictable = set_cold_start_threshold(ns, prep_thread_infos[0].hist);
+
+	linear_hist_destroy(prep_thread_infos[0].hist);
+
+	if (n_evictable == 0) {
+		cf_warning(AS_NSUP, "{%s} hwm breached but no records to evict", ns->name);
+		pthread_mutex_unlock(&ns->cold_start_evict_lock);
+		return true;
+	}
+
+	cf_info(AS_NSUP, "{%s} cold start found %lu records eligible for eviction, evict ttl %u", ns->name, n_evictable, cf_atomic32_get(ns->cold_start_threshold_void_time) - now);
+
+	// Reduce all partitions to evict based on the thresholds.
+	// All eviction threads share this one struct - the pid and i_cpu counters
+	// start at -1 so the first increment yields 0.
+	evict_thread_info thread_info = {
+		.ns = ns,
+		.pid = -1,
+		.i_cpu = -1,
+		.sets_not_evicting = sets_not_evicting,
+		.total_evicted = 0,
+		.total_0_void_time = 0
+	};
+
+	for (uint32_t n = 0; n < n_cpus; n++) {
+		if (pthread_create(&evict_threads[n], NULL, run_cold_start_evict, (void*)&thread_info) != 0) {
+			cf_crash(AS_NSUP, "{%s} failed to create evict thread %u", ns->name, n);
+		}
+	}
+
+	for (uint32_t n = 0; n < n_cpus; n++) {
+		pthread_join(evict_threads[n], NULL);
+	}
+	// Now we're single-threaded again.
+
+	cf_info(AS_NSUP, "{%s} cold start evicted %u records, found %u 0-void-time records", ns->name, thread_info.total_evicted, thread_info.total_0_void_time);
+
+	pthread_mutex_unlock(&ns->cold_start_evict_lock);
+	return true;
+}
+
+//
+// END - Eviction during cold start.
+//==========================================================
+
+//==========================================================
+// Temporary dangling prole garbage collection.
+//
+
+// Context handed to garbage_collect_reduce_cb().
+typedef struct garbage_collect_info_s {
+	as_namespace* ns;
+	as_index_tree* p_tree;
+	uint32_t now;
+	uint32_t num_deleted;
+} garbage_collect_info;
+
+// Per-record callback: deletes records that expired more than
+// g_config.prole_extra_ttl ago, and counts the deletions.
+static void
+garbage_collect_reduce_cb(as_index_ref* r_ref, void* udata)
+{
+	garbage_collect_info* p_info = (garbage_collect_info*)udata;
+	uint32_t void_time = r_ref->r->void_time;
+
+	// If we're past void-time plus safety margin, delete the record.
+	if (void_time != 0 && p_info->now > void_time + g_config.prole_extra_ttl) {
+		as_index_delete(p_info->p_tree, &r_ref->r->keyd);
+		p_info->num_deleted++;
+	}
+
+	as_record_done(r_ref, p_info->ns);
+}
+
+// Garbage collect the first non-master partition found after pid, wrapping
+// around at AS_PARTITIONS. Returns the id of the last partition examined, so
+// the caller can pass it back in to resume from there.
+static int
+garbage_collect_next_prole_partition(as_namespace* ns, int pid)
+{
+	as_partition_reservation rsv;
+
+	// Look for the next non-master partition past pid, but loop only once over
+	// all partitions.
+	for (int n = 0; n < AS_PARTITIONS; n++) {
+		// Increment pid and wrap if necessary.
+		if (++pid == AS_PARTITIONS) {
+			pid = 0;
+		}
+
+		// Note - may want a new method to get these under a single partition
+		// lock, but for now just do the two separate reserve calls.
+		if (as_partition_reserve_write(ns, pid, &rsv, NULL) == 0) {
+			// This is a master partition - continue.
+			as_partition_release(&rsv);
+		}
+		else {
+			as_partition_reserve(ns, pid, &rsv);
+
+			// This is a non-master partition - garbage collect and break.
+			garbage_collect_info cb_info;
+
+			cb_info.ns = ns;
+			cb_info.p_tree = rsv.tree;
+			cb_info.now = as_record_void_time_get();
+			cb_info.num_deleted = 0;
+
+			// Reduce the partition, deleting long-expired records.
+			as_index_reduce_live(rsv.tree, garbage_collect_reduce_cb, &cb_info);
+
+			if (cb_info.num_deleted != 0) {
+				cf_info(AS_NSUP, "namespace %s pid %d: %u expired non-masters",
+						ns->name, pid, cb_info.num_deleted);
+			}
+
+			as_partition_release(&rsv);
+
+			// Do only one partition per nsup loop.
+			break;
+		}
+	}
+
+	return pid;
+}
+
+//
+// END - Temporary dangling prole garbage collection.
+//==========================================================
+
+
+// Queue of record_delete_info items, drained by run_nsup_delete().
+static cf_queue* g_p_nsup_delete_q = NULL;
+
+// Current number of items in the delete queue, or 0 if the queue pointer is
+// still NULL.
+int
+as_nsup_queue_get_size()
+{
+	return g_p_nsup_delete_q ? cf_queue_sz(g_p_nsup_delete_q) : 0;
+}
+
+// Make sure a huge nsup deletion wave won't blow delete queue up.
+#define DELETE_Q_SAFETY_THRESHOLD 10000
+#define DELETE_Q_SAFETY_SLEEP_us 1000 // 1 millisecond
+
+// Wait for delete queue to clear.
+#define DELETE_Q_CLEAR_SLEEP_us 1000 // 1 millisecond
+
+// One queued deletion: the record's digest and the namespace it lives in.
+typedef struct record_delete_info_s {
+	as_namespace* ns;
+	cf_digest digest;
+} record_delete_info;
+
+
+//------------------------------------------------
+// Run thread to handle delete queue.
+//
+// Loops forever, blocking on the queue. The pv_data parameter is unused.
+//
+void*
+run_nsup_delete(void* pv_data)
+{
+	while (true) {
+		record_delete_info q_item;
+
+		if (CF_QUEUE_OK != cf_queue_pop(g_p_nsup_delete_q, (void*)&q_item, CF_QUEUE_FOREVER)) {
+			cf_crash(AS_NSUP, "nsup delete queue pop failed");
+		}
+
+		// Generate a delete transaction for this digest, and hand it to tsvc.
+
+		uint8_t info2 = AS_MSG_INFO2_WRITE | AS_MSG_INFO2_DELETE;
+
+		cl_msg *msgp = as_msg_create_internal(q_item.ns->name, &q_item.digest,
+				0, info2, 0);
+
+		as_transaction tr;
+		as_transaction_init_head(&tr, NULL, msgp);
+
+		as_transaction_set_msg_field_flag(&tr, AS_MSG_FIELD_TYPE_NAMESPACE);
+		as_transaction_set_msg_field_flag(&tr, AS_MSG_FIELD_TYPE_DIGEST_RIPE);
+		tr.origin = FROM_NSUP;
+		tr.start_time = cf_getns();
+
+		as_tsvc_enqueue(&tr);
+
+		// Throttle - don't overwhelm tsvc queue.
+		if (g_config.nsup_delete_sleep != 0) {
+			usleep(g_config.nsup_delete_sleep);
+		}
+	}
+
+	return NULL;
+}
+
+//------------------------------------------------
+// Queue a record for deletion.
+//
+// The digest is copied into the queue item, so the caller's digest need not
+// outlive this call.
+//
+static void
+queue_for_delete(as_namespace* ns, cf_digest* p_digest)
+{
+	record_delete_info q_item;
+
+	q_item.ns = ns; // not bothering with namespace reservation
+	q_item.digest = *p_digest;
+
+	cf_queue_push(g_p_nsup_delete_q, (void*)&q_item);
+}
+
+//------------------------------------------------
+// Insert data into object size histograms.
+//
+// Always inserts into the namespace-wide histogram, and also into the
+// record's set histogram if that one has been created.
+//
+static void
+add_to_obj_size_histograms(as_namespace* ns, as_index* r)
+{
+	uint32_t set_id = as_index_get_set_id(r);
+	linear_hist* set_obj_size_hist = ns->set_obj_size_hists[set_id];
+	uint64_t n_rblocks = r->n_rblocks;
+
+	linear_hist_insert_data_point(ns->obj_size_hist, n_rblocks);
+
+	if (set_obj_size_hist) {
+		linear_hist_insert_data_point(set_obj_size_hist, n_rblocks);
+	}
+}
+
+//------------------------------------------------
+// Insert data into TTL histograms.
+//
+// Always inserts into the namespace-wide histogram, and also into the
+// record's set histogram if that one has been created.
+//
+static void
+add_to_ttl_histograms(as_namespace* ns, as_index* r)
+{
+	uint32_t set_id = as_index_get_set_id(r);
+	linear_hist* set_ttl_hist = ns->set_ttl_hists[set_id];
+	uint32_t void_time = r->void_time;
+
+	linear_hist_insert_data_point(ns->ttl_hist, void_time);
+
+	if (set_ttl_hist) {
+		linear_hist_insert_data_point(set_ttl_hist, void_time);
+	}
+}
+
+//------------------------------------------------
+// Reduce callback prepares for eviction.
+// - builds object size, eviction & TTL histograms
+// - counts 0-void-time records
+//
+typedef struct evict_prep_info_s {
+	as_namespace* ns;
+	bool* sets_not_evicting;
+	uint64_t num_0_void_time;
+} evict_prep_info;
+
+// Per-record callback: every record goes into the object size histograms.
+// Records with a void-time also go into the TTL histograms and, unless their
+// set has eviction disabled, the eviction histogram. Records without one are
+// only counted.
+static void
+evict_prep_reduce_cb(as_index_ref* r_ref, void* udata)
+{
+	evict_prep_info* prep = (evict_prep_info*)udata;
+	as_index* rec = r_ref->r;
+	as_namespace* ns = prep->ns;
+	uint32_t set_id = as_index_get_set_id(rec);
+	uint32_t void_time = rec->void_time;
+
+	add_to_obj_size_histograms(ns, rec);
+
+	if (void_time == 0) {
+		prep->num_0_void_time++;
+	}
+	else {
+		bool set_may_evict = ! prep->sets_not_evicting[set_id];
+
+		if (set_may_evict) {
+			linear_hist_insert_data_point(ns->evict_hist, void_time);
+		}
+
+		add_to_ttl_histograms(ns, rec);
+	}
+
+	as_record_done(r_ref, ns);
+}
+
+//------------------------------------------------
+// Reduce callback evicts records.
+// - evicts based on general threshold
+// - does expiration on eviction-disabled sets
+//
+typedef struct evict_info_s {
+	as_namespace* ns;
+	uint32_t now;
+	bool* sets_not_evicting;
+	uint32_t evict_void_time;
+	uint64_t num_evicted;
+} evict_info;
+
+// Per-record callback: queues a record for deletion if it has a void-time and
+// either (eviction-disabled set) it has already expired, or (any other set)
+// its void-time is below the eviction threshold.
+static void
+evict_reduce_cb(as_index_ref* r_ref, void* udata)
+{
+	evict_info* evict = (evict_info*)udata;
+	as_index* rec = r_ref->r;
+	as_namespace* ns = evict->ns;
+	uint32_t set_id = as_index_get_set_id(rec);
+	uint32_t void_time = rec->void_time;
+
+	if (void_time != 0) {
+		bool remove = evict->sets_not_evicting[set_id] ?
+				evict->now > void_time :
+				void_time < evict->evict_void_time;
+
+		if (remove) {
+			queue_for_delete(ns, &rec->keyd);
+			evict->num_evicted++;
+		}
+	}
+
+	as_record_done(r_ref, ns);
+}
+
+//------------------------------------------------
+// Reduce callback expires records.
+// - does expiration
+// - builds object size & TTL histograms
+// - counts 0-void-time records
+//
+typedef struct expire_info_s {
+	as_namespace* ns;
+	uint32_t now;
+	uint64_t num_expired;
+	uint64_t num_0_void_time;
+} expire_info;
+
+// Per-record callback: queues already-expired records for deletion. Records
+// that are kept go into the object size histograms, and into the TTL
+// histograms too if they have a void-time; records without one are counted.
+static void
+expire_reduce_cb(as_index_ref* r_ref, void* udata)
+{
+	as_index* r = r_ref->r;
+	expire_info* p_info = (expire_info*)udata;
+	as_namespace* ns = p_info->ns;
+	uint32_t void_time = r->void_time;
+
+	if (void_time != 0) {
+		if (p_info->now > void_time) {
+			queue_for_delete(ns, &r->keyd);
+			p_info->num_expired++;
+		}
+		else {
+			add_to_obj_size_histograms(ns, r);
+			add_to_ttl_histograms(ns, r);
+		}
+	}
+	else {
+		add_to_obj_size_histograms(ns, r);
+		p_info->num_0_void_time++;
+	}
+
+	as_record_done(r_ref, ns);
+}
+
+//------------------------------------------------
+// Reduce all master partitions, using specified
+// functionality. Throttle to make sure deletions
+// generated by reducing each partition don't blow
+// up the delete queue.
+//
+// Partitions that cannot be reserved for write are
+// skipped. Each throttling sleep increments
+// *p_n_waits. The tag is only used for logging.
+//
+static void
+reduce_master_partitions(as_namespace* ns, as_index_reduce_fn cb, void* udata, uint32_t* p_n_waits, const char* tag)
+{
+	as_partition_reservation rsv;
+
+	for (int n = 0; n < AS_PARTITIONS; n++) {
+		if (as_partition_reserve_write(ns, n, &rsv, NULL) != 0) {
+			continue;
+		}
+
+		as_index_reduce_live(rsv.tree, cb, udata);
+
+		as_partition_release(&rsv);
+
+		// Let the delete thread catch up before reducing the next partition.
+		while (cf_queue_sz(g_p_nsup_delete_q) > DELETE_Q_SAFETY_THRESHOLD) {
+			usleep(DELETE_Q_SAFETY_SLEEP_us);
+			(*p_n_waits)++;
+		}
+
+		cf_debug(AS_NSUP, "{%s} %s done partition index %d, waits %u", ns->name, tag, n, *p_n_waits);
+	}
+}
+
+//------------------------------------------------
+// Lazily create and clear a set's size histogram.
+//
+static void
+clear_set_obj_size_hist(as_namespace* ns, uint32_t set_id)
+{
+	if (! ns->set_obj_size_hists[set_id]) {
+		char hist_name[HISTOGRAM_NAME_SIZE];
+
+		// Bounded write - the name is truncated rather than overrunning the
+		// buffer if it doesn't fit.
+		snprintf(hist_name, sizeof(hist_name), "%s set %u object size histogram", ns->name, set_id);
+		ns->set_obj_size_hists[set_id] = linear_hist_create(hist_name, 0, 0, OBJ_SIZE_HIST_NUM_BUCKETS);
+	}
+
+	linear_hist_clear(ns->set_obj_size_hists[set_id], 0, cf_atomic32_get(ns->obj_size_hist_max));
+}
+
+//------------------------------------------------
+// Lazily create and clear a set's TTL histogram.
+//
+static void
+clear_set_ttl_hist(as_namespace* ns, uint32_t set_id, uint32_t now, uint64_t ttl_range)
+{
+	if (! ns->set_ttl_hists[set_id]) {
+		char hist_name[HISTOGRAM_NAME_SIZE];
+
+		// Bounded write - the name is truncated rather than overrunning the
+		// buffer if it doesn't fit.
+		snprintf(hist_name, sizeof(hist_name), "%s set %u ttl histogram", ns->name, set_id);
+		ns->set_ttl_hists[set_id] = linear_hist_create(hist_name, 0, 0, TTL_HIST_NUM_BUCKETS);
+	}
+
+	linear_hist_clear(ns->set_ttl_hists[set_id], now, ttl_range);
+}
+
+//------------------------------------------------
+// Get the TTL range for histograms.
+//
+// Only partitions that can currently be reserved
+// for write contribute to the maximum.
+//
+// TODO - ttl_range to 32 bits?
+static uint64_t
+get_ttl_range(as_namespace* ns, uint32_t now)
+{
+	uint64_t max_master_void_time = 0;
+	as_partition_reservation rsv;
+
+	for (int n = 0; n < AS_PARTITIONS; n++) {
+		// The reservation is only used as a "can we write here" test, so it is
+		// released straight away.
+		if (as_partition_reserve_write(ns, n, &rsv, NULL) != 0) {
+			continue;
+		}
+
+		as_partition_release(&rsv);
+
+		uint64_t partition_max_void_time = cf_atomic64_get(ns->partitions[n].max_void_time);
+
+		if (partition_max_void_time > max_master_void_time) {
+			max_master_void_time = partition_max_void_time;
+		}
+	}
+
+	// Use max-ttl to cap the namespace maximum void-time.
+	uint64_t cap = now + ns->max_ttl;
+
+	if (max_master_void_time > cap) {
+		max_master_void_time = cap;
+	}
+
+	// Convert to TTL - used for histogram ranges.
+	return max_master_void_time > now ? max_master_void_time - now : 0;
+}
+
+//------------------------------------------------
+// Get general eviction threshold.
+// +static bool +get_threshold(as_namespace* ns, uint32_t* p_evict_void_time) +{ + linear_hist_threshold threshold; + uint64_t subtotal = linear_hist_get_threshold_for_fraction(ns->evict_hist, ns->evict_tenths_pct, &threshold); + bool all_buckets = threshold.value == 0xFFFFffff; + + *p_evict_void_time = threshold.value; + + if (subtotal == 0) { + if (all_buckets) { + cf_warning(AS_NSUP, "{%s} no records eligible for eviction", ns->name); + } + else { + cf_warning(AS_NSUP, "{%s} no records below eviction void-time %u - threshold bucket %u, width %u sec, count %lu > target %lu (%.1f pct)", + ns->name, threshold.value, threshold.bucket_index, + threshold.bucket_width, threshold.bucket_count, + threshold.target_count, (float)ns->evict_tenths_pct / 10.0); + } + + return false; + } + + if (all_buckets) { + cf_warning(AS_NSUP, "{%s} would evict all %lu records eligible - not evicting!", ns->name, subtotal); + return false; + } + + cf_info(AS_NSUP, "{%s} found %lu records eligible for eviction", ns->name, subtotal); + + return true; +} + +//------------------------------------------------ +// Stats per namespace at the end of an nsup lap. +// +static void +update_stats(as_namespace* ns, uint64_t n_master, uint64_t n_0_void_time, + uint64_t n_expired_objects, uint64_t n_evicted_objects, + uint32_t evict_ttl, uint32_t n_general_waits, uint32_t n_clear_waits, + uint64_t start_ms) +{ + ns->non_expirable_objects = n_0_void_time; + + cf_atomic64_add(&ns->n_expired_objects, n_expired_objects); + cf_atomic64_add(&ns->n_evicted_objects, n_evicted_objects); + + cf_atomic64_set(&ns->evict_ttl, evict_ttl); + + uint64_t total_duration_ms = cf_getms() - start_ms; + + ns->nsup_cycle_duration = (uint32_t)(total_duration_ms / 1000); + ns->nsup_cycle_sleep_pct = total_duration_ms == 0 ? 
//------------------------------------------------
// Namespace supervisor thread "run" function.
//
// Loops forever. Once per nsup period it walks every namespace and:
// - resets the namespace and per-set object size & TTL histograms
// - if a high-water-mark is breached, builds the eviction histogram and
//   evicts (or, failing that, only expires) records
// - otherwise only expires records
// - dumps & saves histograms, waits for the delete queue to drain, and
//   publishes the lap's stats
//
// 'arg' is unused. The trailing return is never reached.
//
void *
run_nsup(void *arg)
{
	// Garbage-collect long-expired proles, one partition per loop.
	// (Per-namespace cursor, -1 until the first partition has been done.)
	int prole_pids[g_config.n_namespaces];

	for (int n = 0; n < g_config.n_namespaces; n++) {
		prole_pids[n] = -1;
	}

	uint64_t last_time = cf_get_seconds();

	for ( ; ; ) {
		// Wake up every 1 second to check the nsup timeout.
		struct timespec delay = { 1, 0 };
		nanosleep(&delay, NULL);

		uint64_t curr_time = cf_get_seconds();

		if ((curr_time - last_time) < g_config.nsup_period) {
			continue; // period has not been reached for running eviction check
		}

		last_time = curr_time;

		// Iterate over every namespace.
		for (int i = 0; i < g_config.n_namespaces; i++) {
			as_namespace *ns = g_config.namespaces[i];

			uint64_t start_ms = cf_getms();

			cf_info(AS_NSUP, "{%s} nsup-start", ns->name);

			linear_hist_clear(ns->obj_size_hist, 0, cf_atomic32_get(ns->obj_size_hist_max));

			// The "now" used for all expiration and eviction.
			uint32_t now = as_record_void_time_get();

			// Get the histogram range - used by all histograms.
			uint32_t ttl_range = (uint32_t)get_ttl_range(ns, now);

			linear_hist_clear(ns->ttl_hist, now, ttl_range);

			uint64_t n_expired_records = 0;
			uint64_t n_0_void_time_records = 0;

			uint32_t num_sets = cf_vmapx_count(ns->p_sets_vmap);

			bool sets_protected = false;

			// Giving this max possible size to spare us checking each record's
			// set-id during index reduce.
			bool sets_not_evicting[AS_SET_MAX_COUNT + 1];

			memset(sets_not_evicting, 0, sizeof(sets_not_evicting));

			// Set-ids are vmap index + 1 - slot 0 of the per-set arrays is
			// never assigned here.
			for (uint32_t j = 0; j < num_sets; j++) {
				uint32_t set_id = j + 1;

				clear_set_obj_size_hist(ns, set_id);
				clear_set_ttl_hist(ns, set_id, now, ttl_range);

				as_set* p_set;

				if (cf_vmapx_get_by_index(ns->p_sets_vmap, j, (void**)&p_set) != CF_VMAPX_OK) {
					cf_crash(AS_NSUP, "failed to get set index %u from vmap", j);
				}

				if (IS_SET_EVICTION_DISABLED(p_set)) {
					sets_not_evicting[set_id] = true;
					sets_protected = true;
				}
			}

			uint64_t n_evicted_records = 0;
			uint32_t evict_ttl = 0;
			uint32_t n_general_waits = 0;

			// Check whether or not we need to do general eviction.

			if (eval_hwm_breached(ns)) {
				// Eviction is necessary.

				// NOTE(review): the object size and TTL histograms were
				// already cleared above - only evict_hist is newly reset here.
				linear_hist_clear(ns->obj_size_hist, 0, cf_atomic32_get(ns->obj_size_hist_max));
				linear_hist_reset(ns->evict_hist, now, ttl_range, ns->evict_hist_buckets);
				linear_hist_clear(ns->ttl_hist, now, ttl_range);

				for (uint32_t j = 0; j < num_sets; j++) {
					uint32_t set_id = j + 1;

					linear_hist_clear(ns->set_obj_size_hists[set_id], 0, cf_atomic32_get(ns->obj_size_hist_max));
					linear_hist_clear(ns->set_ttl_hists[set_id], now, ttl_range);
				}

				evict_prep_info cb_info1;

				memset(&cb_info1, 0, sizeof(cb_info1));
				cb_info1.ns = ns;
				cb_info1.sets_not_evicting = sets_not_evicting;

				// Reduce master partitions, building histograms to calculate
				// general eviction threshold.
				reduce_master_partitions(ns, evict_prep_reduce_cb, &cb_info1, &n_general_waits, "evict-prep");

				n_0_void_time_records = cb_info1.num_0_void_time;

				evict_info cb_info2;

				memset(&cb_info2, 0, sizeof(cb_info2));
				cb_info2.ns = ns;
				cb_info2.now = now;
				cb_info2.sets_not_evicting = sets_not_evicting;

				// Determine general eviction threshold.
				// (get_threshold() fills in evict_void_time even when it
				// returns false.)
				if (get_threshold(ns, &cb_info2.evict_void_time)) {
					// Save the eviction depth in the device header(s) so it can
					// be used to speed up cold start, etc.
					as_storage_save_evict_void_time(ns, cb_info2.evict_void_time);

					// Reduce master partitions, deleting records up to
					// threshold. (This automatically deletes expired records.)
					reduce_master_partitions(ns, evict_reduce_cb, &cb_info2, &n_general_waits, "evict");

					evict_ttl = cb_info2.evict_void_time - now;
					n_evicted_records = cb_info2.num_evicted;
				}
				else if (sets_protected || cb_info2.evict_void_time == now) {
					// Convert eviction into expiration.
					cb_info2.evict_void_time = now;

					// Reduce master partitions, deleting expired records,
					// including those in eviction-protected sets.
					reduce_master_partitions(ns, evict_reduce_cb, &cb_info2, &n_general_waits, "expire-protected-sets");

					// Count these as expired rather than evicted, since we can.
					n_expired_records = cb_info2.num_evicted;
				}

				// For now there's no get_info() call for evict_hist.
				//linear_hist_save_info(ns->evict_hist);
			}
			else {
				// Eviction is not necessary, only expiration.

				expire_info cb_info;

				memset(&cb_info, 0, sizeof(cb_info));
				cb_info.ns = ns;
				cb_info.now = now;

				// Reduce master partitions, deleting expired records.
				reduce_master_partitions(ns, expire_reduce_cb, &cb_info, &n_general_waits, "expire");

				n_expired_records = cb_info.num_expired;
				n_0_void_time_records = cb_info.num_0_void_time;
			}

			linear_hist_dump(ns->obj_size_hist);
			linear_hist_save_info(ns->obj_size_hist);
			linear_hist_dump(ns->ttl_hist);
			linear_hist_save_info(ns->ttl_hist);

			for (uint32_t j = 0; j < num_sets; j++) {
				uint32_t set_id = j + 1;

				linear_hist_dump(ns->set_obj_size_hists[set_id]);
				linear_hist_save_info(ns->set_obj_size_hists[set_id]);
				linear_hist_dump(ns->set_ttl_hists[set_id]);
				linear_hist_save_info(ns->set_ttl_hists[set_id]);
			}

			// Wait for delete queue to clear.
			uint32_t n_clear_waits = 0;

			while (cf_queue_sz(g_p_nsup_delete_q) > 0) {
				usleep(DELETE_Q_CLEAR_SLEEP_us);
				n_clear_waits++;
			}

			// Master object count is the TTL histogram total plus the records
			// with void-time 0, which are not in that histogram.
			update_stats(ns, linear_hist_get_total(ns->ttl_hist) + n_0_void_time_records, n_0_void_time_records,
					n_expired_records, n_evicted_records, evict_ttl,
					n_general_waits, n_clear_waits, start_ms);

			// Garbage-collect long-expired proles, one partition per loop.
			if (g_config.prole_extra_ttl != 0) {
				prole_pids[i] = garbage_collect_next_prole_partition(ns, prole_pids[i]);
			}
		}
	}

	return NULL;
}
+ uint32_t n_clear_waits = 0; + + while (cf_queue_sz(g_p_nsup_delete_q) > 0) { + usleep(DELETE_Q_CLEAR_SLEEP_us); + n_clear_waits++; + } + + update_stats(ns, linear_hist_get_total(ns->ttl_hist) + n_0_void_time_records, n_0_void_time_records, + n_expired_records, n_evicted_records, evict_ttl, + n_general_waits, n_clear_waits, start_ms); + + // Garbage-collect long-expired proles, one partition per loop. + if (g_config.prole_extra_ttl != 0) { + prole_pids[i] = garbage_collect_next_prole_partition(ns, prole_pids[i]); + } + } + } + + return NULL; +} + +//------------------------------------------------ +// Namespace stop-writes thread "run" function. +// +void * +run_stop_writes(void *arg) +{ + while (true) { + sleep(EVAL_STOP_WRITES_PERIOD); + + for (uint32_t ns_ix = 0; ns_ix < g_config.n_namespaces; ns_ix++) { + eval_stop_writes(g_config.namespaces[ns_ix]); + } + } + + return NULL; +} + +//------------------------------------------------ +// Start supervisor threads. +// +void +as_nsup_start() +{ + // Seed the random number generator. + srand(time(NULL)); + + // Create queue for nsup-generated deletions. + g_p_nsup_delete_q = cf_queue_create(sizeof(record_delete_info), true); + + cf_info(AS_NSUP, "starting namespace supervisor threads"); + + pthread_t thread; + pthread_attr_t attrs; + + pthread_attr_init(&attrs); + pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED); + + // Start thread to handle all nsup-generated deletions. + if (0 != pthread_create(&thread, &attrs, run_nsup_delete, NULL)) { + cf_crash(AS_NSUP, "nsup delete thread create failed"); + } + + // Start namespace supervisor thread to do expiration & eviction. + if (0 != pthread_create(&thread, &attrs, run_nsup, NULL)) { + cf_crash(AS_NSUP, "nsup thread create failed"); + } + + // Start thread to do stop-writes evaluation. 
+ if (0 != pthread_create(&thread, &attrs, run_stop_writes, NULL)) { + cf_crash(AS_NSUP, "nsup stop-writes thread create failed"); + } +} + + +//========================================================== +// Local helpers. +// + +static bool +eval_stop_writes(as_namespace *ns) +{ + // Compute the high-watermark. + uint64_t mem_stop_writes = (ns->memory_size * ns->stop_writes_pct) / 100; + + // Compute device available percent for namespace. + int device_avail_pct = 0; + + as_storage_stats(ns, &device_avail_pct, NULL); + + // Compute memory usage for namespace. + uint64_t index_sz = ns->n_objects * as_index_size_get(ns); + uint64_t tombstone_index_sz = ns->n_tombstones * as_index_size_get(ns); + uint64_t sindex_sz = ns->n_bytes_sindex_memory; + uint64_t data_in_memory_sz = ns->n_bytes_memory; + uint64_t memory_sz = index_sz + tombstone_index_sz + data_in_memory_sz + sindex_sz; + + // Possible reasons for eviction or stopping writes. + static const char* reasons[] = { + NULL, // 0x0 + "(memory)", // 0x1 + "(device-avail-pct)", // 0x2 + "(memory & device-avail-pct)", // 0x3 (0x1 | 0x2) + "(xdr-log)", // 0x4 + "(memory & xdr-log)", // 0x5 (0x1 | 0x4) + "(device-avail-pct & xdr-log)", // 0x6 (0x2 | 0x4) + "(memory & device-avail-pct & xdr-log)" // 0x7 (0x1 | 0x2 | 0x4) + }; + + // Check if the writes should be stopped. 
+ bool stop_writes = false; + uint32_t why_stopped = 0x0; + + if (memory_sz > mem_stop_writes) { + stop_writes = true; + why_stopped = 0x1; + } + + if (device_avail_pct < (int)ns->storage_min_avail_pct) { + stop_writes = true; + why_stopped |= 0x2; + } + + if (is_xdr_digestlog_low(ns)) { + stop_writes = true; + why_stopped |= 0x4; + } + + if (stop_writes) { + cf_warning(AS_NSUP, "{%s} breached stop-writes limit %s, memory sz:%lu (%lu + %lu) limit:%lu, disk avail-pct:%d", + ns->name, reasons[why_stopped], + memory_sz, index_sz, data_in_memory_sz, mem_stop_writes, + device_avail_pct); + } + else { + cf_debug(AS_NSUP, "{%s} stop-writes limit not breached, memory sz:%lu (%lu + %lu) limit:%lu, disk avail-pct:%d", + ns->name, + memory_sz, index_sz, data_in_memory_sz, mem_stop_writes, + device_avail_pct); + } + + cf_atomic32_set(&ns->stop_writes, stop_writes ? 1 : 0); + + return stop_writes; +} + +static bool +eval_hwm_breached(as_namespace *ns) +{ + // Compute the high-watermark - memory. + uint64_t mem_hwm = (ns->memory_size * ns->hwm_memory_pct) / 100; + + // Compute the high-watermark - disk. + uint64_t ssd_hwm = (ns->ssd_size * ns->hwm_disk_pct) / 100; + + // Compute disk usage for namespace. + uint64_t used_disk_sz = 0; + + as_storage_stats(ns, NULL, &used_disk_sz); + + // Compute memory usage for namespace. + uint64_t index_sz = ns->n_objects * as_index_size_get(ns); + uint64_t tombstone_index_sz = ns->n_tombstones * as_index_size_get(ns); + uint64_t sindex_sz = ns->n_bytes_sindex_memory; + uint64_t data_in_memory_sz = ns->n_bytes_memory; + uint64_t memory_sz = index_sz + tombstone_index_sz + data_in_memory_sz + sindex_sz; + + // Possible reasons for eviction. + // (We don't use all combinations, but in case we change our minds...) + static const char* reasons[] = { + NULL, "(memory)", "(disk)", "(memory & disk)" + }; + + // Check if either high water mark is breached. 
+ bool hwm_breached = false; + uint32_t how_breached = 0x0; + + if (memory_sz > mem_hwm) { + hwm_breached = true; + how_breached = 0x1; + } + + if (used_disk_sz > ssd_hwm) { + hwm_breached = true; + how_breached |= 0x2; + } + + if (hwm_breached) { + cf_warning(AS_NSUP, "{%s} breached eviction hwm %s, memory sz:%lu (%lu + %lu) hwm:%lu, disk sz:%lu hwm:%lu", + ns->name, reasons[how_breached], + memory_sz, index_sz, data_in_memory_sz, mem_hwm, + used_disk_sz, ssd_hwm); + } + else { + cf_debug(AS_NSUP, "{%s} neither eviction hwm breached, memory sz:%lu (%lu + %lu) hwm:%lu, disk sz:%lu hwm:%lu", + ns->name, + memory_sz, index_sz, data_in_memory_sz, mem_hwm, + used_disk_sz, ssd_hwm); + } + + cf_atomic32_set(&ns->hwm_breached, hwm_breached ? 1 : 0); + + return hwm_breached; +} diff --git a/as/src/base/thr_query.c b/as/src/base/thr_query.c new file mode 100644 index 00000000..e9973f7b --- /dev/null +++ b/as/src/base/thr_query.c @@ -0,0 +1,3383 @@ +/* + * thr_query.c + * + * Copyright (C) 2012-2015 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +/* + * This code is responsible for the query execution. Each query received + * query transaction for the query threads to execute. 
Query has two parts + * a) Generator : This queries the Aerospike Index B-tree and creates the digest list and + * queues it up for LOOKUP / UDF / AGGREGATION + * b) Aggregator : This does the required processing of the record and sends back + * the response to the clients. + * LOOKUP: Reads the record from the disk and, based on the + * records selected by the query, packs it into the buffer + * and returns it back to the client. + * UDF: Reads the record from the disk and, based on the + * query, applies the UDF and packs the result back into + * the buffer and returns it back to the client. + * AGGREGATION: Creates istream (on the digest list) and ostream ( + * over the network buffer) and applies aggregator + * functions. For a single query this can be called + * multiple times. The istream interface takes care + * of partition reservation / record opening / closing + * and object lock synchronization, all of which + * is driven by as_stream_read / as_stream_write from + * inside the aggregation UDF. ostream keeps sending the + * batched result to the client. + * + * Please note all these parts can either be performed under a single thread + * context or by different sets of threads. For a namespace with data on disk, + * I/O is performed separately in a different set of I/O pools. + * + * Flow of code looks like + * + * 1. thr_tsvc() + * + * ---------------------------------> query_generator + * / /|\ | + * as_query ----- | | qtr released + * (sets up qtr) \ qtr reserved | \|/ + * ----------------> g_query_q ------> query_th + * + * + * 2. Query Threads + * ---------------------------------> qwork_process + * / /|\ | + * query_generator -- | | qtr released + * (sets up qwork) \ qtr reserved | \|/ + * --------------> g_query_work_queue -> query_th + * + * + * + * 3. 
/*
 * Query Transaction State
 *
 * Life cycle of a query transaction, as enforced by the qtr_set_*() helpers
 * in this file. All reads and writes of the state go through qtr->slock.
 */
// **************************************************************************************************
typedef enum {
	AS_QTR_STATE_INIT     = 0, // not yet picked up - the only state qtr_set_running() accepts
	AS_QTR_STATE_RUNNING  = 1, // set by qtr_set_running()
	AS_QTR_STATE_ABORT    = 2, // set by qtr_set_abort(), from RUNNING or DONE
	AS_QTR_STATE_ERR      = 3, // set by qtr_set_err(), from RUNNING only
	AS_QTR_STATE_DONE     = 4, // set by qtr_set_done(), from RUNNING only
} qtr_state;
// **************************************************************************************************
/*
 * Query Transaction Type
 */
// **************************************************************************************************
typedef enum {
	QUERY_TYPE_LOOKUP  = 0,
	QUERY_TYPE_AGGR    = 1,
	QUERY_TYPE_UDF_BG  = 2,
	QUERY_TYPE_UDF_FG  = 3,

	QUERY_TYPE_UNKNOWN  = -1
} query_type;



/*
 * Query Transaction Structure
 *
 * NOTE: qtr_free() and qtr_alloc() reuse the first pointer-sized bytes of a
 * pooled object as the free-list "next" link, so the leading field is
 * overwritten while an object sits in the pool.
 */
// **************************************************************************************************
typedef struct as_query_transaction_s {

	/*
	 * MT (Read Only) No protection required
	 */
	/************************** Query Parameter ********************************/
	uint64_t                 trid;
	as_namespace           * ns;
	char                   * setname;
	as_sindex              * si;
	as_sindex_range        * srange;
	query_type               job_type;  // Job type [LOOKUP/AGG/UDF]
	bool                     no_bin_data;
	predexp_eval_t         * predexp_eval;
	cf_vector              * binlist;
	as_file_handle         * fd_h;      // ref counted nonetheless
	/************************** Run Time Data *********************************/
	bool                     blocking;
	uint32_t                 priority;
	uint64_t                 start_time;               // Start time
	uint64_t                 end_time;                 // timeout value

	/*
	 * MT (Single Writer / Single Threaded / Multiple Readers)
	 * Atomics or no Protection
	 */
	/****************** Stats (only generator) ***********************/
	uint64_t                 querying_ai_time_ns;  // Time spent by the query looking up secondary index trees.
	uint32_t                 n_digests;            // Digests picked from the secondary index,
												   // including record read
	bool                     short_running;
	bool                     track;

	/*
	 * MT (Multiple Writers)
	 * These fields need to be either atomic or protected by a lock.
	 */
	/****************** Stats (worker threads) ***********************/
	cf_atomic64              n_result_records;     // Number of records returned as result -
												   // if aggregation returns 1 record, the count
												   // is 1, irrespective of the number of records
												   // being touched.
	cf_atomic64              net_io_bytes;
	cf_atomic64              n_read_success;

	/********************** Query Progress ***********************************/
	cf_atomic32              n_qwork_active;
	cf_atomic32              n_io_outstanding;
	cf_atomic32              n_udf_tr_queued;    // Throttling: max in flight scan

	/********************* Net IO packet order *******************************/
	cf_atomic32              netio_push_seq;
	cf_atomic32              netio_pop_seq;

	/********************** IO Buf Builder ***********************************/
	pthread_mutex_t          buf_mutex;
	cf_buf_builder         * bb_r;
	/****************** Query State and Result Code **************************/
	pthread_mutex_t          slock;                // guards 'state' and 'result_code' (see qtr_lock())
	bool                     do_requeue;
	qtr_state                state;
	int                      result_code;

	/********************* Fields Not Memzeroed **********************
	 *
	 * Empirically, some of the following fields *still* require memzero
	 * initialization. Please test with a memset(qtr, 0xff, sizeof(*qtr))
	 * right after allocation before you initialize before moving them
	 * into the uninitialized section.
	 *
	 * NB: Read Only or Single threaded
	 */
	struct ai_obj            bkey;
	as_aggr_call             agg_call;             // Stream UDF Details
	iudf_origin              origin;               // Record UDF Details
	bool                     is_durable_delete;    // enterprise only
	as_sindex_qctx           qctx;                 // Secondary Index details
	as_partition_reservation * rsv;
} as_query_transaction;
// **************************************************************************************************
/*
 * Query Request Type
 */
// **************************************************************************************************
typedef enum {
	QUERY_WORK_TYPE_NONE   = -1,  // No type assigned - value given to pooled / freshly allocated work items
	QUERY_WORK_TYPE_LOOKUP =  0,  // Request for I/O
	QUERY_WORK_TYPE_AGG    =  1,  // Request for Aggregation
	QUERY_WORK_TYPE_UDF_BG =  2,  // Request for running UDF on query result
} query_work_type;
// **************************************************************************************************


/*
 * Query Request
 *
 * One unit of work handed out by the query work pool (see
 * qwork_poolrequest() / qwork_poolrelease()).
 */
// **************************************************************************************************
typedef struct query_work_s {
	query_work_type        type;            // what kind of processing is requested
	as_query_transaction * qtr;             // owning query transaction - zeroed when pooled
	cf_ll                * recl;
	uint64_t               queued_time_ns;
} query_work;
// **************************************************************************************************


/*
 * Job Monitoring
 */
// **************************************************************************************************
typedef struct query_jobstat_s {
	int               index;
	as_mon_jobstat ** jobstat;
	int               max_size;
} query_jobstat;
// **************************************************************************************************

/*
 * Skey list
 */
// **************************************************************************************************
typedef struct qtr_skey_s {
	as_query_transaction * qtr;
	as_sindex_key        * skey;
} qtr_skey;
// **************************************************************************************************
/*
 * Query Engine Global
 */
// **************************************************************************************************
static int              g_current_queries_count = 0;
static pthread_rwlock_t g_query_lock
						= PTHREAD_RWLOCK_WRITER_NONRECURSIVE_INITIALIZER_NP;
static cf_rchash      * g_query_job_hash = NULL;
// Buf Builder Pool
static cf_queue       * g_query_response_bb_pool  = 0;
static cf_queue       * g_query_qwork_pool        = 0;
// Free list of query transaction objects - head, length, and the mutex that
// qtr_alloc() / qtr_free() hold while touching them.
pthread_mutex_t         g_query_pool_mutex = PTHREAD_MUTEX_INITIALIZER;
as_query_transaction  * g_query_pool_head = NULL;
size_t                  g_query_pool_count = 0;
//
// GENERATOR
static pthread_t        g_query_threads[AS_QUERY_MAX_THREADS];
static pthread_attr_t   g_query_th_attr;
static cf_queue       * g_query_short_queue     = 0;
static cf_queue       * g_query_long_queue      = 0;
static cf_atomic32      g_query_threadcnt       = 0;

cf_atomic32             g_query_short_running   = 0;
cf_atomic32             g_query_long_running    = 0;

// I/O & AGGREGATOR
static pthread_t       g_query_worker_threads[AS_QUERY_MAX_WORKER_THREADS];
static pthread_attr_t  g_query_worker_th_attr;
static cf_queue      * g_query_work_queue    = 0;
static cf_atomic32     g_query_worker_threadcnt = 0;
// **************************************************************************************************

/*
 * Extern Functions
 */
// **************************************************************************************************

extern cf_vector * as_sindex_binlist_from_msg(as_namespace *ns, as_msg *msgp, int * numbins);

// **************************************************************************************************

/*
 * Forward Declaration
 */
// **************************************************************************************************

static void qtr_finish_work(as_query_transaction *qtr, cf_atomic32 *stat, char *fname, int lineno, bool release);

// **************************************************************************************************
/*
 * Histograms
 */
// **************************************************************************************************
histogram * query_txn_q_wait_hist;               // Histogram to track time spent in the transaction queue. If the
												 // transaction queue is backing up, it is busy. Check whether the
												 // queries in transaction are what is expected from the query side.
histogram * query_query_q_wait_hist;             // Histogram to track time spent waiting in the queue for a query thread.
												 // If the query queue is backing up, try increasing query threads in case
												 // the CPU is not fully utilized or the system is not I/O bound.
histogram * query_prepare_batch_hist;            // Histogram to track time spent preparing batches. If high, the secondary
												 // index is slow - check whether the batch is too big.
histogram * query_batch_io_q_wait_hist;          // Histogram to track time spent waiting in the queue for a worker thread.
histogram * query_batch_io_hist;                 // Histogram to track time spent doing I/O per batch. This includes
												 // priority based sleep after n units of work.
												 // For the above two: the query worker threads are busy - if not I/O bound,
												 // try bumping up the priority. Query threads may be yielding too much.
histogram * query_net_io_hist;                   // Histogram to track time spent sending results to the client. If high,
												 // there is a network problem or the client is too slow.
+ // or client too slow + +#define QUERY_HIST_INSERT_DATA_POINT(type, start_time_ns) \ +do { \ + if (g_config.query_enable_histogram && start_time_ns != 0) { \ + if (type) { \ + histogram_insert_data_point(type, start_time_ns); \ + } \ + } \ +} while(0); + +#define QUERY_HIST_INSERT_RAW(type, time_ns) \ +do { \ + if (g_config.query_enable_histogram && time_ns != 0) { \ + if (type) { \ + histogram_insert_raw(type, time_ns); \ + } \ + } \ +} while(0); + +// ************************************************************************************************** + + +/* + * Query Locks + */ +// ************************************************************************************************** +static void +qtr_lock(as_query_transaction *qtr) { + if (qtr) { + pthread_mutex_lock(&qtr->slock); + } +} +static void +qtr_unlock(as_query_transaction *qtr) { + if (qtr) { + pthread_mutex_unlock(&qtr->slock); + } +} +// ************************************************************************************************** + + +/* + * Query Transaction Pool + */ +// ************************************************************************************************** +static as_query_transaction * +qtr_alloc() +{ + pthread_mutex_lock(&g_query_pool_mutex); + + as_query_transaction * qtr; + + if (!g_query_pool_head) { + qtr = cf_rc_alloc(sizeof(as_query_transaction)); + } else { + qtr = g_query_pool_head; + g_query_pool_head = * (as_query_transaction **) qtr; + --g_query_pool_count; + cf_rc_reserve(qtr); + } + + pthread_mutex_unlock(&g_query_pool_mutex); + return qtr; +} + +static void +qtr_free(as_query_transaction * qtr) +{ + pthread_mutex_lock(&g_query_pool_mutex); + + if (g_query_pool_count >= AS_QUERY_MAX_QTR_POOL) { + cf_rc_free(qtr); + } + else { + // Use the initial location as a next pointer. 
+ * (as_query_transaction **) qtr = g_query_pool_head; + g_query_pool_head = qtr; + ++g_query_pool_count; + } + + pthread_mutex_unlock(&g_query_pool_mutex); +} +// ************************************************************************************************** + + +/* + * Bufbuilder buffer pool + */ +// ************************************************************************************************** +static int +bb_poolrelease(cf_buf_builder *bb_r) +{ + int ret = AS_QUERY_OK; + if ((cf_queue_sz(g_query_response_bb_pool) > g_config.query_bufpool_size) + || g_config.query_buf_size != cf_buf_builder_size(bb_r)) { + cf_detail(AS_QUERY, "Freed Buffer of Size %zu with", bb_r->alloc_sz + sizeof(as_msg)); + cf_buf_builder_free(bb_r); + } else { + cf_detail(AS_QUERY, "Pushed %p %"PRIu64" %d ", bb_r, g_config.query_buf_size, cf_buf_builder_size(bb_r)); + cf_queue_push(g_query_response_bb_pool, &bb_r); + } + return ret; +} + +static cf_buf_builder * +bb_poolrequest() +{ + cf_buf_builder *bb_r; + int rv = cf_queue_pop(g_query_response_bb_pool, &bb_r, CF_QUEUE_NOWAIT); + if (rv == CF_QUEUE_EMPTY) { + bb_r = cf_buf_builder_create_size(g_config.query_buf_size); + if (!bb_r) { + cf_crash(AS_QUERY, "Allocation Error in Buf builder Pool !!"); + } + } else if (rv == CF_QUEUE_OK) { + bb_r->used_sz = 0; + cf_detail(AS_QUERY, "Popped %p", bb_r); + } else { + cf_warning(AS_QUERY, "Failed to find response buffer in the pool%d", rv); + return NULL; + } + return bb_r; +}; +// ************************************************************************************************** + +/* + * Query Request Pool + */ +// ************************************************************************************************** +static int +qwork_poolrelease(query_work *qwork) +{ + if (!qwork) return AS_QUERY_OK; + qwork->qtr = 0; + qwork->type = QUERY_WORK_TYPE_NONE; + + int ret = AS_QUERY_OK; + if (cf_queue_sz(g_query_qwork_pool) < AS_QUERY_MAX_QREQ) { + cf_detail(AS_QUERY, "Pushed qwork %p", qwork); + 
cf_queue_push(g_query_qwork_pool, &qwork); + } else { + cf_detail(AS_QUERY, "Freed qwork %p", qwork); + cf_free(qwork); + } + if (ret != CF_QUEUE_OK) ret = AS_QUERY_ERR; + return ret; +} + +static query_work * +qwork_poolrequest() +{ + query_work *qwork = NULL; + int rv = cf_queue_pop(g_query_qwork_pool, &qwork, CF_QUEUE_NOWAIT); + if (rv == CF_QUEUE_EMPTY) { + qwork = cf_malloc(sizeof(query_work)); + memset(qwork, 0, sizeof(query_work)); + } else if (rv != CF_QUEUE_OK) { + cf_warning(AS_QUERY, "Failed to find query work in the pool"); + return NULL; + } + qwork->qtr = 0; + qwork->type = QUERY_WORK_TYPE_NONE; + return qwork; +}; +// ************************************************************************************************** + + +/* + * Query State set/get function + */ +// ************************************************************************************************** +static void +qtr_set_running(as_query_transaction *qtr) { + qtr_lock(qtr); + if (qtr->state == AS_QTR_STATE_INIT) { + qtr->state = AS_QTR_STATE_RUNNING; + } else { + cf_crash(AS_QUERY, "Invalid Query state %d while moving to running state ...", qtr->state); + } + qtr_unlock(qtr); +} + +/* + * Query in non init state (picked up by generator) means it is + * running. 
Could be RUNNING/ABORT/FAIL/DONE + */ +static bool +qtr_started(as_query_transaction *qtr) { + qtr_lock(qtr); + bool started = false; + if (qtr->state != AS_QTR_STATE_INIT) { + started = true; + } + qtr_unlock(qtr); + return started; +} + +static void +qtr_set_abort(as_query_transaction *qtr, int result_code, char *fname, int lineno) +{ + qtr_lock(qtr); + if (qtr->state == AS_QTR_STATE_RUNNING + || qtr->state == AS_QTR_STATE_DONE) { + cf_debug(AS_QUERY, "Query %p Aborted at %s:%d", qtr, fname, lineno); + qtr->state = AS_QTR_STATE_ABORT; + qtr->result_code = result_code; + } + qtr_unlock(qtr); +} + +static void +qtr_set_err(as_query_transaction *qtr, int result_code, char *fname, int lineno) +{ + qtr_lock(qtr); + if (qtr->state == AS_QTR_STATE_RUNNING) { + cf_debug(AS_QUERY, "Query %p Error at %s:%d", qtr, fname, lineno); + qtr->state = AS_QTR_STATE_ERR; + qtr->result_code = result_code; + } + qtr_unlock(qtr); +} + +static void +qtr_set_done(as_query_transaction *qtr, int result_code, char *fname, int lineno) +{ + qtr_lock(qtr); + if (qtr->state == AS_QTR_STATE_RUNNING) { + cf_debug(AS_QUERY, "Query %p Done at %s:%d", qtr, fname, lineno); + qtr->state = AS_QTR_STATE_DONE; + qtr->result_code = result_code; + } + qtr_unlock(qtr); +} + +static bool +qtr_failed(as_query_transaction *qtr) +{ + qtr_lock(qtr); + bool abort = false; + if ((qtr->state == AS_QTR_STATE_ABORT) + || (qtr->state == AS_QTR_STATE_ERR)) { + abort = true; + } + qtr_unlock(qtr); + return abort; +} + +static bool +qtr_is_abort(as_query_transaction *qtr) +{ + qtr_lock(qtr); + bool abort = false; + if (qtr->state == AS_QTR_STATE_ABORT) { + abort = true; + } + qtr_unlock(qtr); + return abort; +} + + +static bool +qtr_finished(as_query_transaction *qtr) +{ + qtr_lock(qtr); + bool finished = false; + if ((qtr->state == AS_QTR_STATE_DONE) + || (qtr->state == AS_QTR_STATE_ERR) + || (qtr->state == AS_QTR_STATE_ABORT)) { + finished = true; + } + qtr_unlock(qtr); + return finished; +} + +static void 
+query_check_timeout(as_query_transaction *qtr) +{ + if ((qtr) + && (qtr->end_time != 0) + && (cf_getns() > qtr->end_time)) { + cf_debug(AS_QUERY, "Query Timed-out %lu %lu", cf_getns(), qtr->end_time); + qtr_set_err(qtr, AS_PROTO_RESULT_FAIL_QUERY_TIMEOUT, __FILE__, __LINE__); + } +} +// ************************************************************************************************** + + +/* + * Query Destructor Function + */ +// ************************************************************************************************** +static void +query_release_prereserved_partitions(as_query_transaction * qtr) +{ + if (!qtr) { + cf_warning(AS_QUERY, "qtr is NULL"); + return; + } + if (qtr->qctx.partitions_pre_reserved) { + for (int i=0; iqctx.can_partition_query[i]) { + as_partition_release(&qtr->rsv[i]); + } + } + if (qtr->rsv) { + cf_free(qtr->rsv); + } + } +} + +/* + * NB: These stats come into picture only if query really started + * running. If it fails before even running it is accounted in + * fail + */ +static inline void +query_update_stats(as_query_transaction *qtr) +{ + uint64_t rows = cf_atomic64_get(qtr->n_result_records); + + switch (qtr->job_type) { + case QUERY_TYPE_LOOKUP: + if (qtr->state == AS_QTR_STATE_ABORT) { + cf_atomic64_incr(&qtr->ns->n_lookup_abort); + } else if (qtr->state == AS_QTR_STATE_ERR) { + cf_atomic64_incr(&(qtr->si->stats.lookup_errs)); + cf_atomic64_incr(&qtr->ns->n_lookup_errs); + } + if (!qtr_failed(qtr)) + cf_atomic64_incr(&qtr->ns->n_lookup_success); + cf_atomic64_incr(&qtr->si->stats.n_lookup); + cf_atomic64_add(&qtr->si->stats.lookup_response_size, qtr->net_io_bytes); + cf_atomic64_add(&qtr->si->stats.lookup_num_records, rows); + cf_atomic64_add(&qtr->ns->lookup_response_size, qtr->net_io_bytes); + cf_atomic64_add(&qtr->ns->lookup_num_records, rows); + break; + + case QUERY_TYPE_AGGR: + if (qtr->state == AS_QTR_STATE_ABORT) { + cf_atomic64_incr(&qtr->ns->n_agg_abort); + } else if (qtr->state == AS_QTR_STATE_ERR) { + 
cf_atomic64_incr(&(qtr->si->stats.agg_errs)); + cf_atomic64_incr(&qtr->ns->n_agg_errs); + } + if (!qtr_failed(qtr)) + cf_atomic64_incr(&qtr->ns->n_agg_success); + cf_atomic64_incr(&qtr->si->stats.n_aggregation); + cf_atomic64_add(&qtr->si->stats.agg_response_size, qtr->net_io_bytes); + cf_atomic64_add(&qtr->si->stats.agg_num_records, rows); + cf_atomic64_add(&qtr->ns->agg_response_size, qtr->net_io_bytes); + cf_atomic64_add(&qtr->ns->agg_num_records, rows); + break; + + case QUERY_TYPE_UDF_BG: + if (qtr_failed(qtr)) { + cf_atomic64_incr(&qtr->ns->n_query_udf_bg_failure); + } else { + cf_atomic64_incr(&qtr->ns->n_query_udf_bg_success); + } + break; + + default: + cf_crash(AS_QUERY, "Unknown Query Type !!"); + break; + } + + // Can't use macro that tr and rw use. + qtr->ns->query_hist_active = true; + cf_hist_track_insert_data_point(qtr->ns->query_hist, qtr->start_time); + + SINDEX_HIST_INSERT_DATA_POINT(qtr->si, query_hist, qtr->start_time); + + if (qtr->querying_ai_time_ns) { + QUERY_HIST_INSERT_RAW(query_prepare_batch_hist, qtr->querying_ai_time_ns); + } + + if (qtr->n_digests) { + SINDEX_HIST_INSERT_RAW(qtr->si, query_rcnt_hist, qtr->n_digests); + if (rows) { + // Can't use macro that tr and rw use. + qtr->ns->query_rec_count_hist_active = true; + histogram_insert_raw(qtr->ns->query_rec_count_hist, rows); + + SINDEX_HIST_INSERT_RAW(qtr->si, query_diff_hist, qtr->n_digests - rows); + } + } + + + + uint64_t query_stop_time = cf_getns(); + uint64_t elapsed_us = (query_stop_time - qtr->start_time) / 1000; + cf_detail(AS_QUERY, + "Total time elapsed %"PRIu64" us, %"PRIu64" of %d read operations avg latency %"PRIu64" us", + elapsed_us, rows, qtr->n_digests, rows > 0 ? 
elapsed_us / rows : 0); +} + +static void +query_run_teardown(as_query_transaction *qtr) +{ + query_update_stats(qtr); + + if (qtr->n_udf_tr_queued != 0) { + cf_warning(AS_QUERY, "QUEUED UDF not equal to zero when query transaction is done"); + } + + if (qtr->qctx.recl) { + cf_ll_reduce(qtr->qctx.recl, true /*forward*/, as_index_keys_ll_reduce_fn, NULL); + cf_free(qtr->qctx.recl); + qtr->qctx.recl = NULL; + } + + if (qtr->short_running) { + cf_atomic32_decr(&g_query_short_running); + } else { + cf_atomic32_decr(&g_query_long_running); + } + + // Release all the partitions + query_release_prereserved_partitions(qtr); + + + if (qtr->bb_r) { + bb_poolrelease(qtr->bb_r); + qtr->bb_r = NULL; + } + + pthread_mutex_destroy(&qtr->buf_mutex); +} + +static void +query_teardown(as_query_transaction *qtr) +{ + if (qtr->srange) as_sindex_range_free(&qtr->srange); + if (qtr->si) AS_SINDEX_RELEASE(qtr->si); + if (qtr->binlist) cf_vector_destroy(qtr->binlist); + if (qtr->setname) cf_free(qtr->setname); + if (qtr->predexp_eval) predexp_destroy(qtr->predexp_eval); + if (qtr->job_type == QUERY_TYPE_AGGR && qtr->agg_call.def.arglist) { + as_list_destroy(qtr->agg_call.def.arglist); + } + else if (qtr->job_type == QUERY_TYPE_UDF_BG) { + iudf_origin_destroy(&qtr->origin); + } + pthread_mutex_destroy(&qtr->slock); +} + +static void +query_release_fd(as_file_handle *fd_h, bool force_close) +{ + if (fd_h) { + fd_h->fh_info &= ~FH_INFO_DONOT_REAP; + fd_h->last_used = cf_getms(); + as_end_of_transaction(fd_h, force_close); + } +} + +static void +query_transaction_done(as_query_transaction *qtr) +{ + +#if defined(USE_SYSTEMTAP) + uint64_t nodeid = g_config.self_node; +#endif + + if (!qtr) + return; + + ASD_QUERY_TRANS_DONE(nodeid, qtr->trid, (void *) qtr); + + if (qtr_started(qtr)) { + query_run_teardown(qtr); + } + + + // if query is aborted force close connection. 
+ // Not to be reused + query_release_fd(qtr->fd_h, qtr_is_abort(qtr)); + qtr->fd_h = NULL; + query_teardown(qtr); + + ASD_QUERY_QTR_FREE(nodeid, qtr->trid, (void *) qtr); + + qtr_free(qtr); +} +// ************************************************************************************************** + + +/* + * Query Transaction Ref Counts + */ +// ************************************************************************************************** +int +qtr_release(as_query_transaction *qtr, char *fname, int lineno) +{ + if (qtr) { + int val = cf_rc_release(qtr); + if (val == 0) { + cf_detail(AS_QUERY, "Released qtr [%s:%d] %p %d ", fname, lineno, qtr, val); + query_transaction_done(qtr); + } + cf_detail(AS_QUERY, "Released qtr [%s:%d] %p %d ", fname, lineno, qtr, val); + } + return AS_QUERY_OK; +} + +static int +qtr_reserve(as_query_transaction *qtr, char *fname, int lineno) +{ + if (!qtr) { + return AS_QUERY_ERR; + } + int val = cf_rc_reserve(qtr); + cf_detail(AS_QUERY, "Reserved qtr [%s:%d] %p %d ", fname, lineno, qtr, val); + return AS_QUERY_OK; +} +// ************************************************************************************************** + + +/* + * Async Network IO Entry Point + */ +// ************************************************************************************************** +/* Call back function to determine if the IO should go ahead or not. + * Purpose + * 1. If our sequence number does not match requeue + * 2. If query aborted fail IO. + * 3. In all other cases let the IO go through. That would mean + * if IO is queued it will be done before the fin with error + * result_code is sent !! 
+ */ +int +query_netio_start_cb(void *udata, int seq) +{ + as_netio *io = (as_netio *)udata; + as_query_transaction *qtr = (as_query_transaction *)io->data; + cf_detail(AS_QUERY, "Netio Started_CB %d %d %d %d ", io->offset, io->seq, qtr->netio_pop_seq, qtr->state); + + // It is needed to send all the packets in sequence + // A packet can be requeued after being half sent. + if (seq > cf_atomic32_get(qtr->netio_pop_seq)) { + return AS_NETIO_CONTINUE; + } + + if (qtr_is_abort(qtr)) { + return AS_QUERY_ERR; + } + + return AS_NETIO_OK; +} + +/* + * The function after the IO on the network has been done. + * 1. If OK was done successfully bump up the sequence number and + * fix stats + * 2. Release the qtr if something fails ... which would trigger + * fin packet send and eventually free up qtr + * Abort it set if something goes wrong + */ +int +query_netio_finish_cb(void *data, int retcode) +{ + as_netio *io = (as_netio *)data; + cf_detail(AS_QUERY, "Query Finish Callback io seq %d with retCode %d", io->seq, retcode); + as_query_transaction *qtr = (as_query_transaction *)io->data; + if (qtr && (retcode != AS_NETIO_CONTINUE)) { + // If send success make stat is updated + if (retcode == AS_NETIO_OK) { + cf_atomic64_add(&qtr->net_io_bytes, io->bb_r->used_sz + 8); + } else { + qtr_set_abort(qtr, AS_PROTO_RESULT_FAIL_QUERY_NETIO_ERR, __FILE__, __LINE__); + } + QUERY_HIST_INSERT_DATA_POINT(query_net_io_hist, io->start_time); + + // Undo the increment from query_netio(). Cannot reach zero here: the + // increment owned by the transaction will only be undone after all netio + // is complete. + cf_rc_release(io->fd_h); + io->fd_h = NULL; + bb_poolrelease(io->bb_r); + + cf_atomic32_incr(&qtr->netio_pop_seq); + + qtr_finish_work(qtr, &qtr->n_io_outstanding, __FILE__, __LINE__, true); + } + return retcode; +} + +#define MAX_OUTSTANDING_IO_REQ 2 +static int +query_netio_wait(as_query_transaction *qtr) +{ + return (cf_atomic32_get(qtr->n_io_outstanding) > MAX_OUTSTANDING_IO_REQ) ? 
AS_QUERY_ERR : AS_QUERY_OK; +} + +// Returns AS_NETIO_OK always +static int +query_netio(as_query_transaction *qtr) +{ +#if defined(USE_SYSTEMTAP) + uint64_t nodeid = g_config.self_node; +#endif + + ASD_QUERY_NETIO_STARTING(nodeid, qtr->trid); + + as_netio io; + + io.finish_cb = query_netio_finish_cb; + io.start_cb = query_netio_start_cb; + + qtr_reserve(qtr, __FILE__, __LINE__); + io.data = qtr; + + io.bb_r = qtr->bb_r; + qtr->bb_r = NULL; + + cf_rc_reserve(qtr->fd_h); + io.fd_h = qtr->fd_h; + + io.offset = 0; + + cf_atomic32_incr(&qtr->n_io_outstanding); + io.seq = cf_atomic32_incr(&qtr->netio_push_seq); + io.start_time = cf_getns(); + + int ret = as_netio_send(&io, false, qtr->blocking); + qtr->bb_r = bb_poolrequest(); + cf_buf_builder_reserve(&qtr->bb_r, 8, NULL); + + ASD_QUERY_NETIO_FINISHED(nodeid, qtr->trid); + + return ret; +} +// ************************************************************************************************** + + +/* + * Query Reservation Abstraction + */ +// ************************************************************************************************** +// Returns NULL if partition with is 'pid' is not query-able Else +// if all the partitions are reserved upfront returns the rsv used for reserving the partition +// else reserves the partition and returns rsv +as_partition_reservation * +query_reserve_partition(as_namespace * ns, as_query_transaction * qtr, uint32_t pid, as_partition_reservation * rsv) +{ + if (qtr->qctx.partitions_pre_reserved) { + if (!qtr->qctx.can_partition_query[pid]) { + cf_debug(AS_QUERY, "Getting digest in rec list which do not belong to query-able partition."); + return NULL; + } + return &qtr->rsv[pid]; + } + + // Works for scan aggregation + if (!rsv) { + cf_warning(AS_QUERY, "rsv is null while reserving partition."); + return NULL; + } + + if (0 != as_partition_reserve_query(ns, pid, rsv)) { + return NULL; + } + + return rsv; +} + +void +query_release_partition(as_query_transaction * qtr, 
as_partition_reservation * rsv) +{ + if (!qtr->qctx.partitions_pre_reserved) { + as_partition_release(rsv); + } +} + +// Pre reserves query-able partitions +void +as_query_pre_reserve_partitions(as_query_transaction * qtr) +{ + if (!qtr) { + cf_warning(AS_QUERY, "qtr is NULL"); + return; + } + if (qtr->qctx.partitions_pre_reserved) { + qtr->rsv = cf_malloc(sizeof(as_partition_reservation) * AS_PARTITIONS); + as_partition_prereserve_query(qtr->ns, qtr->qctx.can_partition_query, qtr->rsv); + } else { + qtr->rsv = NULL; + } +} + +// ************************************************************************************************** + + +/* + * Query tracking + */ +// ************************************************************************************************** +// Put qtr in a global hash +static int +hash_put_qtr(as_query_transaction * qtr) +{ + if (!qtr->track) { + return AS_QUERY_CONTINUE; + } + + int rc = cf_rchash_put_unique(g_query_job_hash, &qtr->trid, sizeof(qtr->trid), qtr); + if (rc) { + cf_warning(AS_SINDEX, "QTR Put in hash failed with error %d", rc); + } + + return rc; +} + +// Get Qtr from global hash +static int +hash_get_qtr(uint64_t trid, as_query_transaction ** qtr) +{ + int rv = cf_rchash_get(g_query_job_hash, &trid, sizeof(trid), (void **) qtr); + if (CF_RCHASH_OK != rv) { + cf_info(AS_SINDEX, "Query job with transaction id [%"PRIu64"] does not exist", trid ); + } + return rv; +} + +// Delete Qtr from global hash +static int +hash_delete_qtr(as_query_transaction *qtr) +{ + if (!qtr->track) { + return AS_QUERY_CONTINUE; + } + + int rv = cf_rchash_delete(g_query_job_hash, &qtr->trid, sizeof(qtr->trid)); + if (CF_RCHASH_OK != rv) { + cf_warning(AS_SINDEX, "Failed to delete qtr from query hash."); + } + return rv; +} +// If any query run from more than g_config.query_untracked_time_ms +// we are going to track it +// else no. 
+int +hash_track_qtr(as_query_transaction *qtr) +{ + if (!qtr->track) { + if ((cf_getns() - qtr->start_time) > (g_config.query_untracked_time_ms * 1000000)) { + qtr->track = true; + qtr_reserve(qtr, __FILE__, __LINE__); + int ret = hash_put_qtr(qtr); + if (ret != 0 && ret != AS_QUERY_CONTINUE) { + // track should be disabled otherwise at the + // qtr cleanup stage some other qtr with the same + // trid can get cleaned up. + qtr->track = false; + qtr_release(qtr, __FILE__, __LINE__); + return AS_QUERY_ERR; + } + } + } + return AS_QUERY_OK; +} +// ************************************************************************************************** + + + +/* + * Query Request IO functions + */ +// ************************************************************************************************** +/* + * Function query_add_response + * + * Returns - + * AS_QUERY_OK - On success. + * AS_QUERY_ERR - On failure. + * + * Notes - + * Basic query call back function. Fills up the client response buffer; + * sends out buffer and then + * reinitializes the buf for the next set of requests, + * In case buffer is full Bail out quick if unable to send response back to client + * + * On success, qtr->n_result_records is incremented by 1. + * + * Synchronization - + * Takes a lock over qtr->buf + */ +static int +query_add_response(void *void_qtr, as_storage_rd *rd) +{ + as_query_transaction *qtr = (as_query_transaction *)void_qtr; + + // TODO - check and handle error result (< 0 - drive IO) explicitly? 
+ size_t msg_sz = (size_t)as_msg_make_response_bufbuilder(NULL, rd, + qtr->no_bin_data, true, true, qtr->binlist); + int ret = 0; + + pthread_mutex_lock(&qtr->buf_mutex); + cf_buf_builder *bb_r = qtr->bb_r; + if (bb_r == NULL) { + // Assert that query is aborted if bb_r is found to be null + pthread_mutex_unlock(&qtr->buf_mutex); + return AS_QUERY_ERR; + } + + if (msg_sz > (bb_r->alloc_sz - bb_r->used_sz) && bb_r->used_sz != 0) { + query_netio(qtr); + } + + int32_t result = as_msg_make_response_bufbuilder(&qtr->bb_r, rd, + qtr->no_bin_data, true, true, qtr->binlist); + + if (result < 0) { + ret = result; + cf_warning(AS_QUERY, "Weird there is space but still the packing failed " + "available = %zd msg size = %zu", + bb_r->alloc_sz - bb_r->used_sz, msg_sz); + } + cf_atomic64_incr(&qtr->n_result_records); + pthread_mutex_unlock(&qtr->buf_mutex); + return ret; +} + + +static int +query_add_fin(as_query_transaction *qtr) +{ + +#if defined(USE_SYSTEMTAP) + uint64_t nodeid = g_config.self_node; +#endif + cf_detail(AS_QUERY, "Adding fin %p", qtr); + uint8_t *b; + // in case of aborted query, the bb_r is already released + if (qtr->bb_r == NULL) { + // Assert that query is aborted if bb_r is found to be null + return AS_QUERY_ERR; + } + cf_buf_builder_reserve(&qtr->bb_r, sizeof(as_msg), &b); + + ASD_QUERY_ADDFIN(nodeid, qtr->trid); + // set up the header + uint8_t *buf = b; + as_msg *msgp = (as_msg *) buf; + msgp->header_sz = sizeof(as_msg); + msgp->info1 = 0; + msgp->info2 = 0; + msgp->info3 = AS_MSG_INFO3_LAST; + msgp->unused = 0; + msgp->result_code = qtr->result_code; + msgp->generation = 0; + msgp->record_ttl = 0; + msgp->n_fields = 0; + msgp->n_ops = 0; + msgp->transaction_ttl = 0; + as_msg_swap_header(msgp); + return AS_QUERY_OK; +} + +static int +query_send_fin(as_query_transaction *qtr) { + // Send out the final data back + if (qtr->fd_h) { + query_add_fin(qtr); + query_netio(qtr); + } + return AS_QUERY_OK; +} + +static void 
+query_send_bg_udf_response(as_transaction *tr) +{ + cf_detail(AS_QUERY, "Send Fin for Background UDF"); + bool force_close = ! as_msg_send_fin(&tr->from.proto_fd_h->sock, AS_PROTO_RESULT_OK); + query_release_fd(tr->from.proto_fd_h, force_close); + tr->from.proto_fd_h = NULL; +} + +static bool +query_match_integer_fromval(as_query_transaction * qtr, as_val *v, as_sindex_key *skey) +{ + as_sindex_bin_data *start = &qtr->srange->start; + as_sindex_bin_data *end = &qtr->srange->end; + + if ((AS_PARTICLE_TYPE_INTEGER != as_sindex_pktype(qtr->si->imd)) + || (AS_PARTICLE_TYPE_INTEGER != start->type) + || (AS_PARTICLE_TYPE_INTEGER != end->type)) { + cf_debug(AS_QUERY, "query_record_matches: Type mismatch %d!=%d!=%d!=%d binname=%s index=%s", + AS_PARTICLE_TYPE_INTEGER, start->type, end->type, as_sindex_pktype(qtr->si->imd), + qtr->si->imd->bname, qtr->si->imd->iname); + return false; + } + as_integer * i = as_integer_fromval(v); + int64_t value = as_integer_get(i); + if (skey->key.int_key != value) { + cf_debug(AS_QUERY, "query_record_matches: sindex key does " + "not matches bin value in record. 
skey %ld bin value %ld", skey->key.int_key, value); + return false; + } + + return true; +} + +static bool +query_match_string_fromval(as_query_transaction * qtr, as_val *v, as_sindex_key *skey) +{ + as_sindex_bin_data *start = &qtr->srange->start; + as_sindex_bin_data *end = &qtr->srange->end; + + if ((AS_PARTICLE_TYPE_STRING != as_sindex_pktype(qtr->si->imd)) + || (AS_PARTICLE_TYPE_STRING != start->type) + || (AS_PARTICLE_TYPE_STRING != end->type)) { + cf_debug(AS_QUERY, "query_record_matches: Type mismatch %d!=%d!=%d!=%d binname=%s index=%s", + AS_PARTICLE_TYPE_STRING, start->type, end->type, as_sindex_pktype(qtr->si->imd), + qtr->si->imd->bname, qtr->si->imd->iname); + return false; + } + + char * str_val = as_string_get(as_string_fromval(v)); + cf_digest str_digest; + cf_digest_compute(str_val, strlen(str_val), &str_digest); + + if (memcmp(&str_digest, &skey->key.str_key, AS_DIGEST_KEY_SZ)) { + return false; + } + return true; +} + +static bool +query_match_geojson_fromval(as_query_transaction * qtr, as_val *v, as_sindex_key *skey) +{ + as_sindex_bin_data *start = &qtr->srange->start; + as_sindex_bin_data *end = &qtr->srange->end; + + if ((AS_PARTICLE_TYPE_GEOJSON != as_sindex_pktype(qtr->si->imd)) + || (AS_PARTICLE_TYPE_GEOJSON != start->type) + || (AS_PARTICLE_TYPE_GEOJSON != end->type)) { + cf_debug(AS_QUERY, "query_record_matches: Type mismatch %d!=%d!=%d!=%d binname=%s index=%s", + AS_PARTICLE_TYPE_GEOJSON, start->type, end->type, + as_sindex_pktype(qtr->si->imd), qtr->si->imd->bname, + qtr->si->imd->iname); + return false; + } + + return as_particle_geojson_match_asval(v, qtr->srange->cellid, + qtr->srange->region, qtr->ns->geo2dsphere_within_strict); +} + +// If the value matches foreach should stop iterating the +bool +query_match_mapkeys_foreach(const as_val * key, const as_val * val, void * udata) +{ + qtr_skey * q_s = (qtr_skey *)udata; + switch (key->type) { + case AS_STRING: + // If matches return false + return 
!query_match_string_fromval(q_s->qtr, (as_val *)key, q_s->skey); + case AS_INTEGER: + // If matches return false + return !query_match_integer_fromval(q_s->qtr,(as_val *) key, q_s->skey); + case AS_GEOJSON: + // If matches return false + return !query_match_geojson_fromval(q_s->qtr,(as_val *) key, q_s->skey); + default: + // All others don't match + return true; + } +} + +static bool +query_match_mapvalues_foreach(const as_val * key, const as_val * val, void * udata) +{ + qtr_skey * q_s = (qtr_skey *)udata; + switch (val->type) { + case AS_STRING: + // If matches return false + return !query_match_string_fromval(q_s->qtr, (as_val *)val, q_s->skey); + case AS_INTEGER: + // If matches return false + return !query_match_integer_fromval(q_s->qtr, (as_val *)val, q_s->skey); + case AS_GEOJSON: + // If matches return false + return !query_match_geojson_fromval(q_s->qtr, (as_val *)val, q_s->skey); + default: + // All others don't match + return true; + } +} + +static bool +query_match_listele_foreach(as_val * val, void * udata) +{ + qtr_skey * q_s = (qtr_skey *)udata; + switch (val->type) { + case AS_STRING: + // If matches return false + return !query_match_string_fromval(q_s->qtr, val, q_s->skey); + case AS_INTEGER: + // If matches return false + return !query_match_integer_fromval(q_s->qtr, val, q_s->skey); + case AS_GEOJSON: + // If matches return false + return !query_match_geojson_fromval(q_s->qtr, val, q_s->skey); + default: + // All others don't match + return true; + } +} +/* + * Validate record based on its content and query make sure it indeed should + * be selected. Secondary index does lazy delete for the entries for the record + * for which data is on ssd. See sindex design doc for details. Hence it is + * possible that it returns digest for which record may have changed. Do the + * validation before returning the row. 
+ */ +static bool +query_record_matches(as_query_transaction *qtr, as_storage_rd *rd, as_sindex_key * skey) +{ + // TODO: Add counters and make sure it is not a performance hit + as_sindex_bin_data *start = &qtr->srange->start; + as_sindex_bin_data *end = &qtr->srange->end; + + //TODO: Make it more general to support sindex over multiple bins + as_bin * b = as_bin_get_by_id(rd, qtr->si->imd->binid); + + if (!b) { + cf_debug(AS_QUERY , "as_query_record_validation: " + "Bin name %s not found ", qtr->si->imd->bname); + // Possible bin may not be there anymore classic case of + // bin delete. + return false; + } + uint8_t type = as_bin_get_particle_type(b); + + // If the bin is of type cdt, we need to see if anyone of the value within cdt + // matches the query. + // This can be performance hit for big list and maps. + as_val * res_val = NULL; + as_val * val = NULL; + bool matches = false; + bool from_cdt = false; + switch (type) { + case AS_PARTICLE_TYPE_INTEGER : { + if ((type != as_sindex_pktype(qtr->si->imd)) + || (type != start->type) + || (type != end->type)) { + cf_debug(AS_QUERY, "query_record_matches: Type mismatch %d!=%d!=%d!=%d binname=%s index=%s", + type, start->type, end->type, as_sindex_pktype(qtr->si->imd), + qtr->si->imd->bname, qtr->si->imd->iname); + matches = false; + break; + } + + int64_t i = as_bin_particle_integer_value(b); + if (skey->key.int_key != i) { + cf_debug(AS_QUERY, "query_record_matches: sindex key does " + "not matches bin value in record. 
bin value %ld skey value %ld", i, skey->key.int_key); + matches = false; + break; + } + matches = true; + break; + } + case AS_PARTICLE_TYPE_STRING : { + if ((type != as_sindex_pktype(qtr->si->imd)) + || (type != start->type) + || (type != end->type)) { + cf_debug(AS_QUERY, "query_record_matches: Type mismatch %d!=%d!=%d!=%d binname=%s index=%s", + type, start->type, end->type, as_sindex_pktype(qtr->si->imd), + qtr->si->imd->bname, qtr->si->imd->iname); + matches = false; + break; + } + + char * buf; + uint32_t psz = as_bin_particle_string_ptr(b, &buf); + cf_digest bin_digest; + cf_digest_compute(buf, psz, &bin_digest); + if (memcmp(&skey->key.str_key, &bin_digest, AS_DIGEST_KEY_SZ)) { + matches = false; + break; + } + matches = true; + break; + } + case AS_PARTICLE_TYPE_GEOJSON : { + if ((type != as_sindex_pktype(qtr->si->imd)) + || (type != start->type) + || (type != end->type)) { + cf_debug(AS_QUERY, "as_query_record_matches: Type mismatch %d!=%d!=%d!=%d binname=%s index=%s", + type, start->type, end->type, as_sindex_pktype(qtr->si->imd), + qtr->si->imd->bname, qtr->si->imd->iname); + return false; + } + + bool iswithin = as_particle_geojson_match(b->particle, + qtr->srange->cellid, qtr->srange->region, + qtr->ns->geo2dsphere_within_strict); + + // We either found a valid point or a false positive. 
+ if (iswithin) { + cf_atomic64_incr(&qtr->ns->geo_region_query_points); + } + else { + cf_atomic64_incr(&qtr->ns->geo_region_query_falsepos); + } + + return iswithin; + } + case AS_PARTICLE_TYPE_MAP : { + val = as_bin_particle_to_asval(b); + res_val = as_sindex_extract_val_from_path(qtr->si->imd, val); + if (!res_val) { + matches = false; + break; + } + from_cdt = true; + break; + } + case AS_PARTICLE_TYPE_LIST : { + val = as_bin_particle_to_asval(b); + res_val = as_sindex_extract_val_from_path(qtr->si->imd, val); + if (!res_val) { + matches = false; + break; + } + from_cdt = true; + break; + } + default: { + break; + } + } + + if (from_cdt) { + if (res_val->type == AS_INTEGER) { + // Defensive check. + if (qtr->si->imd->itype == AS_SINDEX_ITYPE_DEFAULT) { + matches = query_match_integer_fromval(qtr, res_val, skey); + } + else { + matches = false; + } + } + else if (res_val->type == AS_STRING) { + // Defensive check. + if (qtr->si->imd->itype == AS_SINDEX_ITYPE_DEFAULT) { + matches = query_match_string_fromval(qtr, res_val, skey); + } + else { + matches = false; + } + } + else if (res_val->type == AS_MAP) { + qtr_skey q_s; + q_s.qtr = qtr; + q_s.skey = skey; + // Defensive check. 
+ if (qtr->si->imd->itype == AS_SINDEX_ITYPE_MAPKEYS) { + as_map * map = as_map_fromval(res_val); + matches = !as_map_foreach(map, query_match_mapkeys_foreach, &q_s); + } + else if (qtr->si->imd->itype == AS_SINDEX_ITYPE_MAPVALUES){ + as_map * map = as_map_fromval(res_val); + matches = !as_map_foreach(map, query_match_mapvalues_foreach, &q_s); + } + else { + matches = false; + } + } + else if (res_val->type == AS_LIST) { + qtr_skey q_s; + q_s.qtr = qtr; + q_s.skey = skey; + + // Defensive check + if (qtr->si->imd->itype == AS_SINDEX_ITYPE_LIST) { + as_list * list = as_list_fromval(res_val); + matches = !as_list_foreach(list, query_match_listele_foreach, &q_s); + } + else { + matches = false; + } + } + } + + if (val) { + as_val_destroy(val); + } + return matches; +} + + + +static int +query_io(as_query_transaction *qtr, cf_digest *dig, as_sindex_key * skey) +{ +#if defined(USE_SYSTEMTAP) + uint64_t nodeid = g_config.self_node; +#endif + + as_namespace * ns = qtr->ns; + as_partition_reservation rsv_stack; + as_partition_reservation * rsv = &rsv_stack; + + // We make sure while making digest list that current partition is query-able + // Attempt the query reservation here as well. If this partition is not + // query-able anymore then no need to return anything + // Since we are reserving all the partitions upfront, this is a defensive check + uint32_t pid = as_partition_getid(dig); + rsv = query_reserve_partition(ns, qtr, pid, rsv); + if (!rsv) { + return AS_QUERY_OK; + } + + ASD_QUERY_IO_STARTING(nodeid, qtr->trid); + + as_index_ref r_ref; + r_ref.skip_lock = false; + int rec_rv = as_record_get_live(rsv->tree, dig, &r_ref, ns); + + if (rec_rv == 0) { + as_index *r = r_ref.r; + + predexp_args_t predargs = { .ns = ns, .md = r, .vl = NULL, .rd = NULL }; + + if (qtr->predexp_eval && + ! 
predexp_matches_metadata(qtr->predexp_eval, &predargs)) { + as_record_done(&r_ref, ns); + goto CLEANUP; + } + + // check to see this isn't a record waiting to die + if (as_record_is_doomed(r, ns)) { + as_record_done(&r_ref, ns); + cf_debug(AS_QUERY, + "build_response: record expired. treat as not found"); + // Not sending error message to client as per the agreement + // that server will never send a error result code to the query client. + goto CLEANUP; + } + + // make sure it's brought in from storage if necessary + as_storage_rd rd; + as_storage_record_open(ns, r, &rd); + qtr->n_read_success += 1; + + // TODO - even if qtr->no_bin_data is true, we still read bins in order + // to check via query_record_matches() below. If sindex evolves to not + // have to do that, optimize this case and bypass reading bins. + + as_storage_rd_load_n_bins(&rd); // TODO - handle error returned + + // Note: This array must stay in scope until the response + // for this record has been built, since in the get + // data w/ record on device case, it's copied by + // reference directly into the record descriptor! + as_bin stack_bins[rd.ns->storage_data_in_memory ? 0 : rd.n_bins]; + + // Figure out which bins you want - for now, all + as_storage_rd_load_bins(&rd, stack_bins); // TODO - handle error returned + rd.n_bins = as_bin_inuse_count(&rd); + + // Now we have a record. + predargs.rd = &rd; + + if (qtr->predexp_eval && + ! 
predexp_matches_record(qtr->predexp_eval, &predargs)) { + as_storage_record_close(&rd); + as_record_done(&r_ref, ns); + goto CLEANUP; + } + + // Call Back + if (!query_record_matches(qtr, &rd, skey)) { + as_storage_record_close(&rd); + as_record_done(&r_ref, ns); + query_release_partition(qtr, rsv); + cf_atomic64_incr(&g_stats.query_false_positives); + ASD_QUERY_IO_NOTMATCH(nodeid, qtr->trid); + return AS_QUERY_OK; + } + + int ret = query_add_response(qtr, &rd); + if (ret != 0) { + as_storage_record_close(&rd); + as_record_done(&r_ref, ns); + qtr_set_err(qtr, AS_PROTO_RESULT_FAIL_QUERY_CBERROR, __FILE__, __LINE__); + query_release_partition(qtr, rsv); + ASD_QUERY_IO_ERROR(nodeid, qtr->trid); + return AS_QUERY_ERR; + } + as_storage_record_close(&rd); + as_record_done(&r_ref, ns); + } else { + // What do we do about empty records? + // 1. Should gin up an empty record + // 2. Current error is returned back to the client. + cf_detail(AS_QUERY, "query_generator: " + "as_record_get returned %d : key %"PRIx64, rec_rv, + *(uint64_t *)dig); + } +CLEANUP : + query_release_partition(qtr, rsv); + + ASD_QUERY_IO_FINISHED(nodeid, qtr->trid); + + return AS_QUERY_OK; +} +// ************************************************************************************************** + +/* + * Query Aggregation Request Workhorse Function + */ +// ************************************************************************************************** +static int +query_add_val_response(void *void_qtr, const as_val *val, bool success) +{ + as_query_transaction *qtr = (as_query_transaction *)void_qtr; + if (!qtr) { + return AS_QUERY_ERR; + } + + uint32_t msg_sz = as_particle_asval_client_value_size(val); + if (0 == msg_sz) { + cf_warning(AS_PROTO, "particle to buf: could not copy data!"); + } + + pthread_mutex_lock(&qtr->buf_mutex); + cf_buf_builder *bb_r = qtr->bb_r; + if (bb_r == NULL) { + // Assert that query is aborted if bb_r is found to be null + pthread_mutex_unlock(&qtr->buf_mutex); + return 
AS_QUERY_ERR; + } + + if (msg_sz > (bb_r->alloc_sz - bb_r->used_sz) && bb_r->used_sz != 0) { + query_netio(qtr); + } + + as_msg_make_val_response_bufbuilder(val, &qtr->bb_r, msg_sz, success); + cf_atomic64_incr(&qtr->n_result_records); + + pthread_mutex_unlock(&qtr->buf_mutex); + return 0; +} + + +static void +query_add_result(char *res, as_query_transaction *qtr, bool success) +{ + const as_val * v = (as_val *) as_string_new (res, false); + query_add_val_response((void *) qtr, v, success); + as_val_destroy(v); +} + + +static int +query_process_aggreq(query_work *qagg) +{ + as_query_transaction *qtr = qagg->qtr; + if (!qtr) { + return AS_QUERY_ERR; + } + + if (!cf_ll_size(qagg->recl)) { + return AS_QUERY_ERR; + } + + as_result *res = as_result_new(); + int ret = as_aggr_process(qtr->ns, &qtr->agg_call, qagg->recl, (void *)qtr, res); + + if (ret != 0) { + char *rs = as_module_err_string(ret); + if (res->value != NULL) { + as_string * lua_s = as_string_fromval(res->value); + char * lua_err = (char *) as_string_tostring(lua_s); + if (lua_err != NULL) { + int l_rs_len = strlen(rs); + rs = cf_realloc(rs,l_rs_len + strlen(lua_err) + 4); + sprintf(&rs[l_rs_len]," : %s",lua_err); + } + } + query_add_result(rs, qtr, false); + cf_free(rs); + } + as_result_destroy(res); + return ret; +} +// ************************************************************************************************** + + +/* + * Aggregation HOOKS + */ +// ************************************************************************************************** +as_stream_status +agg_ostream_write(void *udata, as_val *v) +{ + as_query_transaction *qtr = (as_query_transaction *)udata; + if (!v) { + return AS_STREAM_OK; + } + int ret = AS_STREAM_OK; + if (query_add_val_response((void *)qtr, v, true)) { + ret = AS_STREAM_ERR; + } + as_val_destroy(v); + return ret; +} + +static as_partition_reservation * +agg_reserve_partition(void *udata, as_namespace *ns, uint32_t pid, as_partition_reservation *rsv) +{ + return 
query_reserve_partition(ns, (as_query_transaction *)udata, pid, rsv); +} + +static void +agg_release_partition(void *udata, as_partition_reservation *rsv) +{ + query_release_partition((as_query_transaction *)udata, rsv); +} + +static void +agg_set_error(void * udata, int err) +{ + qtr_set_err((as_query_transaction *)udata, AS_PROTO_RESULT_FAIL_QUERY_CBERROR, __FILE__, __LINE__); +} + +// true if matches +static bool +agg_record_matches(void *udata, udf_record *urecord, void *key_data) +{ + as_query_transaction * qtr = (as_query_transaction*)udata; + as_sindex_key *skey = (void *)key_data; + qtr->n_read_success++; + if (query_record_matches(qtr, urecord->rd, skey) == false) { + cf_atomic64_incr(&g_stats.query_false_positives); // PUT IT INSIDE PRE_CHECK + return false; + } + return true; +} + +const as_aggr_hooks query_aggr_hooks = { + .ostream_write = agg_ostream_write, + .set_error = agg_set_error, + .ptn_reserve = agg_reserve_partition, + .ptn_release = agg_release_partition, + .pre_check = agg_record_matches +}; +// ************************************************************************************************** + + + + + +/* + * Query Request UDF functions + */ +// ************************************************************************************************** +// NB: Caller holds a write hash lock _BE_CAREFUL_ if you intend to take +// lock inside this function +int +query_udf_bg_tr_complete(void *udata, int retcode) +{ + as_query_transaction *qtr = (as_query_transaction *)udata; + if (!qtr) { + cf_warning(AS_QUERY, "Complete called with invalid job id"); + return AS_QUERY_ERR; + } + + qtr_finish_work(qtr, &qtr->n_udf_tr_queued, __FILE__, __LINE__, true); + return AS_QUERY_OK; +} + +// Creates a internal transaction for per record UDF execution triggered +// from inside generator. The generator could be scan job generating digest +// or query generating digest. 
+int +query_udf_bg_tr_start(as_query_transaction *qtr, cf_digest *keyd) +{ + if (qtr->origin.predexp) { + as_partition_reservation rsv_stack; + as_partition_reservation *rsv = &rsv_stack; + uint32_t pid = as_partition_getid(keyd); + + if (! (rsv = query_reserve_partition(qtr->ns, qtr, pid, rsv))) { + return AS_QUERY_OK; + } + + as_index_ref r_ref; + r_ref.skip_lock = false; + + if (as_record_get_live(rsv->tree, keyd, &r_ref, qtr->ns) != 0) { + query_release_partition(qtr, rsv); + return AS_QUERY_OK; + } + + predexp_args_t predargs = { + .ns = qtr->ns, .md = r_ref.r, .vl = NULL, .rd = NULL + }; + + if (qtr->origin.predexp && + ! predexp_matches_metadata(qtr->origin.predexp, &predargs)) { + as_record_done(&r_ref, qtr->ns); + query_release_partition(qtr, rsv); + return AS_QUERY_OK; + } + + as_record_done(&r_ref, qtr->ns); + query_release_partition(qtr, rsv); + } + + as_transaction tr; + + as_transaction_init_iudf(&tr, qtr->ns, keyd, &qtr->origin, qtr->is_durable_delete); + + qtr_reserve(qtr, __FILE__, __LINE__); + cf_atomic32_incr(&qtr->n_udf_tr_queued); + + as_tsvc_enqueue(&tr); + + return AS_QUERY_OK; +} + +static int +query_process_udfreq(query_work *qudf) +{ + int ret = AS_QUERY_OK; + cf_ll_element * ele = NULL; + cf_ll_iterator * iter = NULL; + as_query_transaction *qtr = qudf->qtr; + if (!qtr) return AS_QUERY_ERR; + cf_detail(AS_QUERY, "Performing UDF"); + iter = cf_ll_getIterator(qudf->recl, true /*forward*/); + if (!iter) { + ret = AS_QUERY_ERR; + qtr_set_err(qtr, AS_SINDEX_ERR_NO_MEMORY, __FILE__, __LINE__); + goto Cleanup; + } + + while ((ele = cf_ll_getNext(iter))) { + as_index_keys_ll_element * node; + node = (as_index_keys_ll_element *) ele; + as_index_keys_arr * keys_arr = node->keys_arr; + if (!keys_arr) { + continue; + } + node->keys_arr = NULL; + + for (int i = 0; i < keys_arr->num; i++) { + + while (cf_atomic32_get(qtr->n_udf_tr_queued) >= (AS_QUERY_MAX_UDF_TRANSACTIONS * (qtr->priority / 10 + 1))) { + usleep(g_config.query_sleep_us); + 
query_check_timeout(qtr); + if (qtr_failed(qtr)) { + ret = AS_QUERY_ERR; + goto Cleanup; + } + } + + if (AS_QUERY_ERR == query_udf_bg_tr_start(qtr, &keys_arr->pindex_digs[i])) { + as_index_keys_release_arr_to_queue(keys_arr); + ret = AS_QUERY_ERR; + goto Cleanup; + } + } + as_index_keys_release_arr_to_queue(keys_arr); + } +Cleanup: + if (iter) { + cf_ll_releaseIterator(iter); + iter = NULL; + } + return ret; +} +// ************************************************************************************************** + + + + +static int +query_process_ioreq(query_work *qio) +{ + +#if defined(USE_SYSTEMTAP) + uint64_t nodeid = g_config.self_node; +#endif + + as_query_transaction *qtr = qio->qtr; + if (!qtr) { + return AS_QUERY_ERR; + } + + ASD_QUERY_IOREQ_STARTING(nodeid, qtr->trid); + + cf_ll_element * ele = NULL; + cf_ll_iterator * iter = NULL; + + cf_detail(AS_QUERY, "Performing IO"); + uint64_t time_ns = 0; + if (g_config.query_enable_histogram || qtr->si->enable_histogram) { + time_ns = cf_getns(); + } + iter = cf_ll_getIterator(qio->recl, true /*forward*/); + if (!iter) { + cf_crash(AS_QUERY, "Cannot allocate iterator... 
out of memory !!"); + } + + while ((ele = cf_ll_getNext(iter))) { + as_index_keys_ll_element * node; + node = (as_index_keys_ll_element *) ele; + as_index_keys_arr *keys_arr = node->keys_arr; + if (!keys_arr) { + continue; + } + node->keys_arr = NULL; + for (int i = 0; i < keys_arr->num; i++) { + if (AS_QUERY_OK != query_io(qtr, &keys_arr->pindex_digs[i], &keys_arr->sindex_keys[i])) { + as_index_keys_release_arr_to_queue(keys_arr); + goto Cleanup; + } + + int64_t nresults = cf_atomic64_get(qtr->n_result_records); + if (nresults > 0 && (nresults % qtr->priority == 0)) + { + usleep(g_config.query_sleep_us); + query_check_timeout(qtr); + if (qtr_failed(qtr)) { + as_index_keys_release_arr_to_queue(keys_arr); + goto Cleanup; + } + } + } + as_index_keys_release_arr_to_queue(keys_arr); + } +Cleanup: + + if (iter) { + cf_ll_releaseIterator(iter); + iter = NULL; + } + QUERY_HIST_INSERT_DATA_POINT(query_batch_io_hist, time_ns); + SINDEX_HIST_INSERT_DATA_POINT(qtr->si, query_batch_io, time_ns); + + ASD_QUERY_IOREQ_FINISHED(nodeid, qtr->trid); + + return AS_QUERY_OK; +} + +// ************************************************************************************************** + + +/* + * Query Request Processing + */ +// ************************************************************************************************** +static int +qwork_process(query_work *qworkp) +{ + QUERY_HIST_INSERT_DATA_POINT(query_batch_io_q_wait_hist, qworkp->queued_time_ns); + cf_detail(AS_QUERY, "Processing Request %d", qworkp->type); + if (qtr_failed(qworkp->qtr)) { + return AS_QUERY_ERR; + } + int ret = AS_QUERY_OK; + switch (qworkp->type) { + case QUERY_WORK_TYPE_LOOKUP: + ret = query_process_ioreq(qworkp); + break; + case QUERY_WORK_TYPE_UDF_BG: // Does it need different call ?? + ret = query_process_udfreq(qworkp); + break; + case QUERY_WORK_TYPE_AGG: + ret = query_process_aggreq(qworkp); + break; + default: + cf_warning(AS_QUERY, "Unsupported query type %d.. 
Dropping it", qworkp->type); + break; + } + return ret; +} + +static void +qwork_setup(query_work *qworkp, as_query_transaction *qtr) +{ + qtr_reserve(qtr, __FILE__, __LINE__); + qworkp->qtr = qtr; + qworkp->recl = qtr->qctx.recl; + qtr->qctx.recl = NULL; + qworkp->queued_time_ns = cf_getns(); + qtr->n_digests += qtr->qctx.n_bdigs; + qtr->qctx.n_bdigs = 0; + + switch (qtr->job_type) { + case QUERY_TYPE_LOOKUP: + qworkp->type = QUERY_WORK_TYPE_LOOKUP; + break; + case QUERY_TYPE_AGGR: + qworkp->type = QUERY_WORK_TYPE_AGG; + break; + case QUERY_TYPE_UDF_BG: + qworkp->type = QUERY_WORK_TYPE_UDF_BG; + break; + default: + cf_crash(AS_QUERY, "Unknown Query Type !!"); + } +} + +static void +qwork_teardown(query_work *qworkp) +{ + if (qworkp->recl) { + cf_ll_reduce(qworkp->recl, true /*forward*/, as_index_keys_ll_reduce_fn, NULL); + cf_free(qworkp->recl); + qworkp->recl = NULL; + } + qtr_release(qworkp->qtr, __FILE__, __LINE__); + qworkp->qtr = NULL; +} +// ************************************************************************************************** + + +void * +qwork_th(void *q_to_wait_on) +{ + unsigned int thread_id = cf_atomic32_incr(&g_query_worker_threadcnt); + cf_detail(AS_QUERY, "Created Query Worker Thread %d", thread_id); + query_work * qworkp = NULL; + int ret = AS_QUERY_OK; + + while (1) { + // Kill self if thread id is greater than that of number of configured + // Config change should be flag for quick check + if (thread_id > g_config.query_worker_threads) { + pthread_rwlock_rdlock(&g_query_lock); + if (thread_id > g_config.query_worker_threads) { + cf_atomic32_decr(&g_query_worker_threadcnt); + pthread_rwlock_unlock(&g_query_lock); + cf_detail(AS_QUERY, "Query Worker thread %d exited", thread_id); + return NULL; + } + pthread_rwlock_unlock(&g_query_lock); + } + if (cf_queue_pop(g_query_work_queue, &qworkp, CF_QUEUE_FOREVER) != 0) { + cf_crash(AS_QUERY, "Failed to pop from Query worker queue."); + } + cf_detail(AS_QUERY, "Popped I/O work [%p,%p]", qworkp, 
qworkp->qtr); + + ret = qwork_process(qworkp); + + as_query_transaction *qtr = qworkp->qtr; + if ((ret != AS_QUERY_OK) && !qtr_failed(qtr)) { + cf_warning(AS_QUERY, "Request processing failed but query is not qtr_failed .... ret %d", ret); + } + qtr_finish_work(qtr, &qtr->n_qwork_active, __FILE__, __LINE__, false); + qwork_teardown(qworkp); + qwork_poolrelease(qworkp); + } + + return NULL; +} + +/* + * Query Generator + */ +// ************************************************************************************************** +/* + * Function query_get_nextbatch + * + * Notes- + * Function generates the next batch of digest list after looking up + * secondary index tree. The function populates qctx->recl with the + * digest list. + * + * Returns + * AS_QUERY_OK: If the batch is full qctx->n_bdigs == qctx->bsize. The caller + * then processes the batch and reset the qctx->recl and qctx->n_bdigs. + * + * AS_QUERY_CONTINUE: If the caller should continue calling this function. + * + * AS_QUERY_ERR: In case of error + */ +int +query_get_nextbatch(as_query_transaction *qtr) +{ + int ret = AS_QUERY_OK; + as_sindex *si = qtr->si; + as_sindex_qctx *qctx = &qtr->qctx; + uint64_t time_ns = 0; + if (g_config.query_enable_histogram + || qtr->si->enable_histogram) { + time_ns = cf_getns(); + } + + as_sindex_range *srange = &qtr->srange[qctx->range_index]; + + if (qctx->pimd_idx == -1) { + if (!srange->isrange) { + qctx->pimd_idx = ai_btree_key_hash_from_sbin(si->imd, &srange->start); + } else { + qctx->pimd_idx = 0; + } + } + + if (!qctx->recl) { + qctx->recl = cf_malloc(sizeof(cf_ll)); + cf_ll_init(qctx->recl, as_index_keys_ll_destroy_fn, false /*no lock*/); + qctx->n_bdigs = 0; + } else { + // Following condition may be true if the + // query has moved from short query pool to + // long running query pool + if (qctx->n_bdigs >= qctx->bsize) + return ret; + } + + // Query Aerospike Index + int qret = as_sindex_query(qtr->si, srange, &qtr->qctx); + cf_detail(AS_QUERY, "start %ld 
end %ld @ %d pimd found %"PRIu64, srange->start.u.i64, srange->end.u.i64, qctx->pimd_idx, qctx->n_bdigs); + + qctx->new_ibtr = false; + if (qret < 0) { // [AS_SINDEX_OK, AS_SINDEX_CONTINUE] -> OK + qtr_set_err(qtr, as_sindex_err_to_clienterr(qret, __FILE__, __LINE__), __FILE__, __LINE__); + ret = AS_QUERY_ERR; + goto batchout; + } + + if (time_ns) { + if (g_config.query_enable_histogram) { + qtr->querying_ai_time_ns += cf_getns() - time_ns; + } else if (qtr->si->enable_histogram) { + SINDEX_HIST_INSERT_DATA_POINT(qtr->si, query_batch_lookup, time_ns); + } + } + if (qctx->n_bdigs < qctx->bsize) { + qctx->new_ibtr = true; + qctx->nbtr_done = false; + qctx->pimd_idx++; + cf_detail(AS_QUERY, "All the Data finished moving to next tree %d", qctx->pimd_idx); + if (!srange->isrange) { + qtr->result_code = AS_PROTO_RESULT_OK; + ret = AS_QUERY_DONE; + goto batchout; + } + if (qctx->pimd_idx == si->imd->nprts) { + + // Geospatial queries need to search multiple ranges. The + // srange object is a vector of MAX_REGION_CELLS elements. + // We iterate over ranges until we encounter an empty + // srange (num_binval == 0). 
+ // + if (qctx->range_index == (MAX_REGION_CELLS - 1) || + qtr->srange[qctx->range_index+1].num_binval == 0) { + qtr->result_code = AS_PROTO_RESULT_OK; + ret = AS_QUERY_DONE; + goto batchout; + } + qctx->range_index++; + qctx->pimd_idx = -1; + } + ret = AS_QUERY_CONTINUE; + goto batchout; + } +batchout: + return ret; +} + + +/* + * Phase II setup just after the generator picks up query for + * the first time + */ +static int +query_run_setup(as_query_transaction *qtr) +{ + +#if defined(USE_SYSTEMTAP) + uint64_t nodeid = g_config.self_node; +#endif + + QUERY_HIST_INSERT_DATA_POINT(query_query_q_wait_hist, qtr->start_time); + cf_atomic64_set(&qtr->n_result_records, 0); + qtr->track = false; + qtr->querying_ai_time_ns = 0; + qtr->n_io_outstanding = 0; + qtr->netio_push_seq = 0; + qtr->netio_pop_seq = 1; + qtr->blocking = false; + pthread_mutex_init(&qtr->buf_mutex, NULL); + + // Aerospike Index object initialization + qtr->result_code = AS_PROTO_RESULT_OK; + + // Initialize qctx + // start with the threshold value + qtr->qctx.bsize = g_config.query_threshold; + qtr->qctx.new_ibtr = true; + qtr->qctx.nbtr_done = false; + qtr->qctx.pimd_idx = -1; + qtr->qctx.recl = NULL; + qtr->qctx.n_bdigs = 0; + qtr->qctx.range_index = 0; + qtr->qctx.partitions_pre_reserved = g_config.partitions_pre_reserved; + qtr->qctx.bkey = &qtr->bkey; + init_ai_obj(qtr->qctx.bkey); + bzero(&qtr->qctx.bdig, sizeof(cf_digest)); + // Populate all the paritions for which this partition is query-able + as_query_pre_reserve_partitions(qtr); + + qtr->priority = g_config.query_priority; + qtr->bb_r = bb_poolrequest(); + cf_buf_builder_reserve(&qtr->bb_r, 8, NULL); + + qtr_set_running(qtr); + cf_atomic64_incr(&qtr->ns->query_short_reqs); + cf_atomic32_incr(&g_query_short_running); + + // This needs to be distant from the initialization of nodeid to + // workaround a lame systemtap/compiler interaction. 
+ ASD_QUERY_INIT(nodeid, qtr->trid); + + return AS_QUERY_OK; +} + +static int +query_qtr_enqueue(as_query_transaction *qtr, bool is_requeue) +{ + uint64_t limit = 0; + uint32_t size = 0; + cf_queue * q; + cf_atomic64 * queue_full_err; + if (qtr->short_running) { + limit = g_config.query_short_q_max_size; + size = cf_atomic32_get(g_query_short_running); + q = g_query_short_queue; + queue_full_err = &qtr->ns->query_short_queue_full; + } + else { + limit = g_config.query_long_q_max_size; + size = cf_atomic32_get(g_query_long_running); + q = g_query_long_queue; + queue_full_err = &qtr->ns->query_long_queue_full; + } + + // Allow requeue without limit check, to cover for dynamic + // config change while query + if (!is_requeue && (size > limit)) { + cf_atomic64_incr(queue_full_err); + return AS_QUERY_ERR; + } else { + cf_queue_push(q, &qtr); + cf_detail(AS_QUERY, "Logged query "); + } + + return AS_QUERY_OK; +} + +int +query_requeue(as_query_transaction *qtr) +{ + int ret = AS_QUERY_OK; + if (query_qtr_enqueue(qtr, true) != 0) { + cf_warning(AS_QUERY, "Queuing Error... 
continue!!"); + qtr_set_err(qtr, AS_PROTO_RESULT_FAIL_QUERY_QUEUEFULL, __FILE__, __LINE__); + ret = AS_QUERY_ERR; + } else { + cf_detail(AS_QUERY, "Query Queued Due to Network"); + ret = AS_QUERY_OK; + } + return ret; +} + +static void +qtr_finish_work(as_query_transaction *qtr, cf_atomic32 *stat, char *fname, int lineno, bool release) +{ + qtr_lock(qtr); + uint32_t val = cf_atomic32_decr(stat); + if ((val == 0) && qtr->do_requeue) { + query_requeue(qtr); + cf_detail(AS_QUERY, "(%s:%d) Job Requeued %p", fname, lineno, qtr); + qtr->do_requeue = false; + } + qtr_unlock(qtr); + if (release) { + qtr_release(qtr, fname, lineno); + } +} + +// +// 0: Successfully requeued +// -1: Query Err +// 1: Not requeued continue +// 2: Query finished +// +static int +query_qtr_check_and_requeue(as_query_transaction *qtr) +{ + bool do_enqueue = false; + // Step 1: If the query batch is done then wait for number of outstanding qwork to + // finish. This may slow down query responses get the better model + if (qtr_finished(qtr)) { + if ((cf_atomic32_get(qtr->n_qwork_active) == 0) + && (cf_atomic32_get(qtr->n_io_outstanding) == 0) + && (cf_atomic32_get(qtr->n_udf_tr_queued) == 0)) { + cf_detail(AS_QUERY, "Request is finished"); + return AS_QUERY_DONE; + } + do_enqueue = true; + cf_detail(AS_QUERY, "Request not finished qwork(%d) io(%d)", cf_atomic32_get(qtr->n_qwork_active), cf_atomic32_get(qtr->n_io_outstanding)); + } + + // Step 2: Client is slow requeue + if (query_netio_wait(qtr) != AS_QUERY_OK) { + do_enqueue = true; + } + + // Step 3: Check to see if this is long running query. This is determined by + // checking number of records read. Please note that it makes sure the false + // entries in secondary index does not effect this decision. All short running + // queries perform I/O in the batch thread context. 
+ if ((cf_atomic64_get(qtr->n_result_records) >= g_config.query_threshold) + && qtr->short_running) { + qtr->short_running = false; + // Change batch size to the long running job batch size value + qtr->qctx.bsize = g_config.query_bsize; + cf_atomic32_decr(&g_query_short_running); + cf_atomic32_incr(&g_query_long_running); + cf_atomic64_incr(&qtr->ns->query_long_reqs); + cf_atomic64_decr(&qtr->ns->query_short_reqs); + cf_detail(AS_QUERY, "Query Queued Into Long running thread pool %ld %d", cf_atomic64_get(qtr->n_result_records), qtr->short_running); + do_enqueue = true; + } + + if (do_enqueue) { + int ret = AS_QUERY_OK; + qtr_lock(qtr); + if ((cf_atomic32_get(qtr->n_qwork_active) != 0) + || (cf_atomic32_get(qtr->n_io_outstanding) != 0) + || (cf_atomic32_get(qtr->n_udf_tr_queued) != 0)) { + cf_detail(AS_QUERY, "Job Setup for Requeue %p", qtr); + + // Release of one of the above will perform requeue... look for + // qtr_finish_work(); + qtr->do_requeue = true; + ret = AS_QUERY_OK; + } else { + ret = query_requeue(qtr); + } + qtr_unlock(qtr); + return ret; + } + + return AS_QUERY_CONTINUE; +} +static bool +query_process_inline(as_query_transaction *qtr) +{ + if ( g_config.query_req_in_query_thread + || (cf_atomic32_get((qtr)->n_qwork_active) > g_config.query_req_max_inflight) + || (qtr && qtr->short_running) + || (qtr && qtr_finished(qtr))) { + return true; + } + else { + return false; + } +} +/* + * Process the query work either inilne or pass it on to the + * worker thread + * + * Returns + * -1 : Fail + * 0 : Success + */ +static int +qtr_process(as_query_transaction *qtr) +{ + if (query_process_inline(qtr)) { + query_work qwork; + qwork_setup(&qwork, qtr); + + int ret = qwork_process(&qwork); + + qwork_teardown(&qwork); + return ret; + + } else { + query_work *qworkp = qwork_poolrequest(); + if (!qworkp) { + cf_warning(AS_QUERY, "Could not allocate query " + "request structure .. out of memory .. 
Aborting !!!"); + return AS_QUERY_ERR; + } + // Successfully queued + cf_atomic32_incr(&qtr->n_qwork_active); + qwork_setup(qworkp, qtr); + cf_queue_push(g_query_work_queue, &qworkp); + + } + return AS_QUERY_OK; +} + +static int +query_check_bound(as_query_transaction *qtr) +{ + if (cf_atomic64_get(qtr->n_result_records) > g_config.query_rec_count_bound) { + return AS_QUERY_ERR; + } + return AS_QUERY_OK; +} +/* + * Function query_generator + * + * Does the following + * 1. Calls the sindex layer for fetching digest list + * 2. If short running query performs I/O inline and for long running query + * queues it up for work threads to execute. + * 3. If the query is short_running and has hit threshold. Requeue it for + * long running generator threads + * + * Returns - + * Nothing, sets the qtr status accordingly + */ +static void +query_generator(as_query_transaction *qtr) +{ +#if defined(USE_SYSTEMTAP) + uint64_t nodeid = g_config.self_node; + uint64_t trid = qtr->trid; + size_t nrecs = 0; +#endif + + // Query can get requeue for many different reason. Check if it is + // already started before indulging in act to setting it up for run + if (!qtr_started(qtr)) { + query_run_setup(qtr); + } + + int loop = 0; + while (true) { + + // Step 1: Check for requeue + int ret = query_qtr_check_and_requeue(qtr); + if (ret == AS_QUERY_ERR) { + cf_warning(AS_QUERY, "Unexpected requeue failure .. shutdown connection.. 
abort!!"); + qtr_set_abort(qtr, AS_PROTO_RESULT_FAIL_QUERY_NETIO_ERR, __FILE__, __LINE__); + break; + } else if (ret == AS_QUERY_DONE) { + break; + } else if (ret == AS_QUERY_OK) { + return; + } + // Step 2: Check for timeout + query_check_timeout(qtr); + if (qtr_failed(qtr)) { + qtr_set_err(qtr, AS_PROTO_RESULT_FAIL_QUERY_TIMEOUT, __FILE__, __LINE__); + continue; + } + // Step 3: Conditionally track + if (hash_track_qtr(qtr)) { + qtr_set_err(qtr, AS_PROTO_RESULT_FAIL_QUERY_DUPLICATE, __FILE__, __LINE__); + continue; + } + + // Step 4: If needs user based abort + if (query_check_bound(qtr)) { + qtr_set_err(qtr, AS_PROTO_RESULT_FAIL_QUERY_USERABORT, __FILE__, __LINE__); + continue; + } + + // Step 5: Get Next Batch + loop++; + int qret = query_get_nextbatch(qtr); + + cf_detail(AS_QUERY, "Loop=%d, Selected=%"PRIu64", ret=%d", loop, qtr->qctx.n_bdigs, qret); + switch (qret) { + case AS_QUERY_OK: + case AS_QUERY_DONE: + break; + case AS_QUERY_ERR: + continue; + case AS_QUERY_CONTINUE: + continue; + default: + cf_warning(AS_QUERY, "Unexpected return type"); + continue; + } + + if (qret == AS_QUERY_DONE) { + // In case all physical tree is done return. if not range loop + // till less than batch size results are returned +#if defined(USE_SYSTEMTAP) + nrecs = qtr->n_result_records; +#endif + qtr_set_done(qtr, AS_PROTO_RESULT_OK, __FILE__, __LINE__); + } + + // Step 6: Prepare Query Request either to process inline or for + // queueing up for offline processing + if (qtr_process(qtr)) { + qtr_set_err(qtr, AS_PROTO_RESULT_FAIL_QUERY_CBERROR, __FILE__, __LINE__); + continue; + } + } + + if (!qtr_is_abort(qtr)) { + // Send the fin packet in it is NOT a shutdown + query_send_fin(qtr); + } + // deleting it from the global hash. + hash_delete_qtr(qtr); + qtr_release(qtr, __FILE__, __LINE__); + ASD_QUERY_DONE(nodeid, trid, nrecs); +} + +/* + * Function as_query_worker + * + * Notes - + * Process one queue's Query requests. 
+ * - Immediately fail if query has timed out + * - Maximum queries that can be served is number of threads + * + * Releases the qtr, which will call as_query_trasaction_done + * + * Synchronization - + * Takes a global query lock while + */ +void* +query_th(void* q_to_wait_on) +{ + cf_queue * query_queue = (cf_queue*)q_to_wait_on; + unsigned int thread_id = cf_atomic32_incr(&g_query_threadcnt); + cf_detail(AS_QUERY, "Query Thread Created %d", thread_id); + as_query_transaction *qtr = NULL; + + while (1) { + // Kill self if thread id is greater than that of number of configured + // thread + if (thread_id > g_config.query_threads) { + pthread_rwlock_rdlock(&g_query_lock); + if (thread_id > g_config.query_threads) { + cf_atomic32_decr(&g_query_threadcnt); + pthread_rwlock_unlock(&g_query_lock); + cf_detail(AS_QUERY, "Query thread %d exited", thread_id); + return NULL; + } + pthread_rwlock_unlock(&g_query_lock); + } + if (cf_queue_pop(query_queue, &qtr, CF_QUEUE_FOREVER) != 0) { + cf_crash(AS_QUERY, "Failed to pop from Query worker queue."); + } + + query_generator(qtr); + } + return AS_QUERY_OK; +} + +/* + * Parse the UDF OP type to find what type of UDF this is or otherwise not even + * UDF + */ +query_type +query_get_type(as_transaction* tr) +{ + if (! as_transaction_is_udf(tr)) { + return QUERY_TYPE_LOOKUP; + } + + as_msg_field *udf_op_f = as_transaction_has_udf_op(tr) ? + as_msg_field_get(&tr->msgp->msg, AS_MSG_FIELD_TYPE_UDF_OP) : NULL; + + if (udf_op_f && *udf_op_f->data == (uint8_t)AS_UDF_OP_AGGREGATE) { + return QUERY_TYPE_AGGR; + } + + if (udf_op_f && *udf_op_f->data == (uint8_t)AS_UDF_OP_BACKGROUND) { + return QUERY_TYPE_UDF_BG; + } +/* + if (udf_op_f && *udf_op_f->data == (uint8_t)AS_UDF_OP_FOREGROUND) { + return QUERY_TYPE_UDF_FG; + } +*/ + return QUERY_TYPE_UNKNOWN; +} + +/* + * Function aggr_query_init + */ +int +aggr_query_init(as_aggr_call * call, as_transaction *tr) +{ + if (! 
udf_def_init_from_msg(&call->def, tr)) { + return AS_QUERY_ERR; + } + + call->aggr_hooks = &query_aggr_hooks; + return AS_QUERY_OK; +} + +static int +query_setup_udf_call(as_query_transaction *qtr, as_transaction *tr) +{ + switch (qtr->job_type) { + case QUERY_TYPE_LOOKUP: + cf_atomic64_incr(&qtr->ns->n_lookup); + break; + case QUERY_TYPE_AGGR: + if (aggr_query_init(&qtr->agg_call, tr) != AS_QUERY_OK) { + tr->result_code = AS_PROTO_RESULT_FAIL_PARAMETER; + return AS_QUERY_ERR; + } + cf_atomic64_incr(&qtr->ns->n_aggregation); + break; + case QUERY_TYPE_UDF_BG: + if (! udf_def_init_from_msg(&qtr->origin.def, tr)) { + tr->result_code = AS_PROTO_RESULT_FAIL_PARAMETER; + return AS_QUERY_ERR; + } + break; + default: + cf_crash(AS_QUERY, "Invalid QUERY TYPE %d !!!", qtr->job_type); + break; + } + return AS_QUERY_OK; +} + +static void +query_setup_fd(as_query_transaction *qtr, as_transaction *tr) +{ + switch (qtr->job_type) { + case QUERY_TYPE_LOOKUP: + case QUERY_TYPE_AGGR: + qtr->fd_h = tr->from.proto_fd_h; + qtr->fd_h->fh_info |= FH_INFO_DONOT_REAP; + break; + case QUERY_TYPE_UDF_BG: + qtr->fd_h = NULL; + break; + default: + cf_crash(AS_QUERY, "Invalid QUERY TYPE %d !!!", qtr->job_type); + break; + } +} +/* + * Phase I query setup which happens just before query is queued for generator + * Populates valid qtrp in case of success and NULL in case of failure. + * All the query related parsing code sits here + * + * Returns: + * AS_QUERY_OK in case of successful + * AS_QUERY_DONE in case nothing to be like scan on non-existent set + * AS_QUERY_ERR in case of parsing failure + * + */ +static int +query_setup(as_transaction *tr, as_namespace *ns, as_query_transaction **qtrp) +{ + +#if defined(USE_SYSTEMTAP) + uint64_t nodeid = g_config.self_node; + uint64_t trid = tr ? 
as_transaction_trid(tr) : 0; +#endif + + int rv = AS_QUERY_ERR; + *qtrp = NULL; + + ASD_QUERY_STARTING(nodeid, trid); + + uint64_t start_time = cf_getns(); + as_sindex *si = NULL; + cf_vector *binlist = 0; + as_sindex_range *srange = 0; + predexp_eval_t *predexp_eval = NULL; + char *setname = NULL; + as_query_transaction *qtr = NULL; + + bool has_sindex = as_sindex_ns_has_sindex(ns); + if (!has_sindex) { + tr->result_code = AS_PROTO_RESULT_FAIL_INDEX_NOTFOUND; + cf_debug(AS_QUERY, "No Secondary Index on namespace %s", ns->name); + goto Cleanup; + } + + as_msg *m = &tr->msgp->msg; + + // TODO - still lots of redundant msg field parsing (e.g. for set) - fix. + if ((si = as_sindex_from_msg(ns, m)) == NULL) { + cf_debug(AS_QUERY, "No Index Defined in the Query"); + } + + ASD_SINDEX_MSGRANGE_STARTING(nodeid, trid); + int ret = as_sindex_rangep_from_msg(ns, m, &srange); + if (AS_QUERY_OK != ret) { + cf_debug(AS_QUERY, "Could not instantiate index range metadata... " + "Err, %s", as_sindex_err_str(ret)); + tr->result_code = as_sindex_err_to_clienterr(ret, __FILE__, __LINE__); + goto Cleanup; + } + + ASD_SINDEX_MSGRANGE_FINISHED(nodeid, trid); + // get optional set + as_msg_field *sfp = as_transaction_has_set(tr) ? + as_msg_field_get(m, AS_MSG_FIELD_TYPE_SET) : NULL; + + if (sfp) { + uint32_t setname_len = as_msg_field_get_value_sz(sfp); + + if (setname_len >= AS_SET_NAME_MAX_SIZE) { + cf_warning(AS_QUERY, "set name too long"); + tr->result_code = AS_PROTO_RESULT_FAIL_PARAMETER; + goto Cleanup; + } + + if (setname_len != 0) { + setname = cf_strndup((const char *)sfp->data, setname_len); + } + } + + if (si) { + + if (! 
as_sindex_can_query(si)) { + tr->result_code = as_sindex_err_to_clienterr( + AS_SINDEX_ERR_NOT_READABLE, __FILE__, __LINE__); + goto Cleanup; + } + } else { + // Look up sindex by bin in the query in case not + // specified in query + si = as_sindex_from_range(ns, setname, srange); + } + + if (as_transaction_has_predexp(tr)) { + as_msg_field * pfp = as_msg_field_get(m, AS_MSG_FIELD_TYPE_PREDEXP); + predexp_eval = predexp_build(pfp); + if (! predexp_eval) { + cf_warning(AS_QUERY, "Failed to build predicate expression"); + tr->result_code = AS_PROTO_RESULT_FAIL_PARAMETER; + goto Cleanup; + } + } + + int numbins = 0; + // Populate binlist to be Projected by the Query + binlist = as_sindex_binlist_from_msg(ns, m, &numbins); + + // If anyone of the bin in the bin is bad, fail the query + if (numbins != 0 && !binlist) { + tr->result_code = AS_PROTO_RESULT_FAIL_INDEX_GENERIC; + goto Cleanup; + } + + if (!has_sindex || !si) { + tr->result_code = AS_PROTO_RESULT_FAIL_INDEX_NOTFOUND; + goto Cleanup; + } + + // quick check if there is any data with the certain set name + if (setname && as_namespace_get_set_id(ns, setname) == INVALID_SET_ID) { + cf_info(AS_QUERY, "Query on non-existent set %s", setname); + tr->result_code = AS_PROTO_RESULT_OK; + rv = AS_QUERY_DONE; + goto Cleanup; + } + cf_detail(AS_QUERY, "Query on index %s ", + ((as_sindex_metadata *)si->imd)->iname); + + query_type qtype = query_get_type(tr); + if (qtype == QUERY_TYPE_UNKNOWN) { + tr->result_code = AS_PROTO_RESULT_FAIL_PARAMETER; + rv = AS_QUERY_ERR; + goto Cleanup; + } + + if (qtype == QUERY_TYPE_AGGR && as_transaction_has_predexp(tr)) { + cf_warning(AS_QUERY, "aggregation queries do not support predexp filters"); + tr->result_code = AS_PROTO_RESULT_FAIL_UNSUPPORTED_FEATURE; + rv = AS_QUERY_ERR; + goto Cleanup; + } + + ASD_QUERY_QTRSETUP_STARTING(nodeid, trid); + qtr = qtr_alloc(); + if (!qtr) { + tr->result_code = AS_PROTO_RESULT_FAIL_UNKNOWN; + goto Cleanup; + } + ASD_QUERY_QTR_ALLOC(nodeid, trid, (void 
*) qtr); + // Be aware of the size of qtr + // Memset it partial + memset(qtr, 0, offsetof(as_query_transaction, bkey)); + + ASD_QUERY_QTRSETUP_FINISHED(nodeid, trid); + + qtr->ns = ns; + qtr->job_type = qtype; + + if (query_setup_udf_call(qtr, tr)) { + rv = AS_QUERY_ERR; + cf_free(qtr); + goto Cleanup; + } + + query_setup_fd(qtr, tr); + + if (qtr->job_type == QUERY_TYPE_LOOKUP) { + qtr->predexp_eval = predexp_eval; + qtr->no_bin_data = (m->info1 & AS_MSG_INFO1_GET_NO_BINS) != 0; + } + else if (qtr->job_type == QUERY_TYPE_UDF_BG) { + qtr->origin.predexp = predexp_eval; + qtr->origin.cb = query_udf_bg_tr_complete; + qtr->origin.udata = (void *)qtr; + qtr->is_durable_delete = as_transaction_is_durable_delete(tr); + } + + // Consume everything from tr rest will be picked up in init + qtr->trid = as_transaction_trid(tr); + qtr->setname = setname; + qtr->si = si; + qtr->srange = srange; + qtr->binlist = binlist; + qtr->start_time = start_time; + qtr->end_time = tr->end_time; + qtr->rsv = NULL; + + rv = AS_QUERY_OK; + + pthread_mutex_init(&qtr->slock, NULL); + qtr->state = AS_QTR_STATE_INIT; + qtr->do_requeue = false; + qtr->short_running = true; + + *qtrp = qtr; + return rv; + +Cleanup: + // Pre Query Setup Failure + if (setname) cf_free(setname); + if (si) AS_SINDEX_RELEASE(si); + if (predexp_eval) predexp_destroy(predexp_eval); + if (srange) as_sindex_range_free(&srange); + if (binlist) cf_vector_destroy(binlist); + return rv; +} + +/* + * Arguments - + * tr - transaction coming from the client. + * + * Returns - + * AS_QUERY_OK - on success. Responds, frees msgp and proto_fd + * AS_QUERY_ERR - on failure. That means the query was not even started. + * frees msgp, response is responsibility of caller + * + * Notes - + * Allocates and reserves the qtr if query_in_transaction_thr + * is set to false or data is in not in memory. + * Has the responsibility to free tr->msgp. 
+ * Either call query_transaction_done or Cleanup to free the msgp + */ +int +as_query(as_transaction *tr, as_namespace *ns) +{ + if (tr) { + QUERY_HIST_INSERT_DATA_POINT(query_txn_q_wait_hist, tr->start_time); + } + + as_query_transaction *qtr; + int rv = query_setup(tr, ns, &qtr); + + if (rv == AS_QUERY_DONE) { + // Send FIN packet to client to ignore this. + bool force_close = ! as_msg_send_fin(&tr->from.proto_fd_h->sock, AS_PROTO_RESULT_OK); + query_release_fd(tr->from.proto_fd_h, force_close); + tr->from.proto_fd_h = NULL; // Paranoid + return AS_QUERY_OK; + } else if (rv == AS_QUERY_ERR) { + // tsvc takes care of managing fd + return AS_QUERY_ERR; + } + + if (g_config.query_in_transaction_thr) { + if (qtr->job_type == QUERY_TYPE_UDF_BG) { + query_send_bg_udf_response(tr); + } + query_generator(qtr); + } else { + if (query_qtr_enqueue(qtr, false)) { + // This error will be accounted by thr_tsvc layer. Thus + // reset fd_h before calling qtr release, and let the + // transaction handler deal with the failure. + qtr->fd_h = NULL; + qtr_release(qtr, __FILE__, __LINE__); + tr->result_code = AS_PROTO_RESULT_FAIL_QUERY_QUEUEFULL; + return AS_QUERY_ERR; + } + // Respond after queuing is successfully. + if (qtr->job_type == QUERY_TYPE_UDF_BG) { + query_send_bg_udf_response(tr); + } + } + + // Query engine will reply to queued query as needed. 
+ tr->from.proto_fd_h = NULL; + return AS_QUERY_OK; +} +// ************************************************************************************************** + + +/* + * Query Utility and Monitoring functions + */ +// ************************************************************************************************** + +// Find matching trid and kill the query +int +as_query_kill(uint64_t trid) +{ + as_query_transaction *qtr; + int rv = hash_get_qtr(trid, &qtr); + + if (rv != AS_QUERY_OK) { + cf_warning(AS_QUERY, "Cannot kill query with trid [%"PRIu64"]", trid); + } else { + qtr_set_abort(qtr, AS_PROTO_RESULT_FAIL_QUERY_USERABORT, __FILE__, __LINE__); + rv = AS_QUERY_OK; + qtr_release(qtr, __FILE__, __LINE__); + } + + return rv; +} + +// Find matching trid and set priority +int +as_query_set_priority(uint64_t trid, uint32_t priority) +{ + as_query_transaction *qtr; + int rv = hash_get_qtr(trid, &qtr); + + if (rv != AS_QUERY_OK) { + cf_warning(AS_QUERY, "Cannot set priority for query with trid [%"PRIu64"]", trid); + } else { + uint32_t old_priority = qtr->priority; + qtr->priority = priority; + cf_info(AS_QUERY, "Query priority changed from %d to %d", old_priority, priority); + rv = AS_QUERY_OK; + qtr_release(qtr, __FILE__, __LINE__); + } + return rv; +} + +int +as_query_list_job_reduce_fn(const void *key, uint32_t keylen, void *object, void *udata) +{ + as_query_transaction * qtr = (as_query_transaction*)object; + cf_dyn_buf * db = (cf_dyn_buf*) udata; + + cf_dyn_buf_append_string(db, "trid="); + cf_dyn_buf_append_uint64(db, qtr->trid); + cf_dyn_buf_append_string(db, ":job_type="); + cf_dyn_buf_append_int(db, qtr->job_type); + cf_dyn_buf_append_string(db, ":n_result_records="); + cf_dyn_buf_append_uint64(db, cf_atomic_int_get(qtr->n_result_records)); + cf_dyn_buf_append_string(db, ":run_time="); + cf_dyn_buf_append_uint64(db, (cf_getns() - qtr->start_time) / 1000); + cf_dyn_buf_append_string(db, ":state="); + if(qtr_failed(qtr)) { + cf_dyn_buf_append_string(db, 
"ABORTED"); + } else { + cf_dyn_buf_append_string(db, "RUNNING"); + } + cf_dyn_buf_append_string(db, ";"); + return AS_QUERY_OK; +} + +// Lists thr current running queries +int +as_query_list(char *name, cf_dyn_buf *db) +{ + uint32_t size = cf_rchash_get_size(g_query_job_hash); + // No elements in the query job hash, return failure + if (!size) { + cf_dyn_buf_append_string(db, "No running queries"); + } + // Else go through all the jobs in the hash and list their statistics + else { + cf_rchash_reduce(g_query_job_hash, as_query_list_job_reduce_fn, db); + cf_dyn_buf_chomp(db); + } + return AS_QUERY_OK; +} + + +// query module to monitor +void +as_query_fill_jobstat(as_query_transaction *qtr, as_mon_jobstat *stat) +{ + stat->trid = qtr->trid; + stat->cpu = 0; // not implemented + stat->run_time = (cf_getns() - qtr->start_time) / 1000000; + stat->recs_read = qtr->n_read_success; + stat->net_io_bytes = qtr->net_io_bytes; + stat->priority = qtr->priority; + + // Not implemented: + stat->progress_pct = 0; + stat->time_since_done = 0; + stat->job_type[0] = '\0'; + + strcpy(stat->ns, qtr->ns->name); + + if (qtr->setname) { + strcpy(stat->set, qtr->setname); + } else { + strcpy(stat->set, "NULL"); + } + + strcpy(stat->status, "active"); + + char *specific_data = stat->jdata; + sprintf(specific_data, ":sindex-name=%s:", qtr->si->imd->iname); +} + +/* + * Populates the as_mon_jobstat and returns to mult-key lookup monitoring infrastructure. + * Serves as a callback function + * + * Returns - + * NULL - In case of failure. + * as_mon_jobstat - On success. 
+ */ +as_mon_jobstat * +as_query_get_jobstat(uint64_t trid) +{ + as_mon_jobstat *stat; + as_query_transaction *qtr; + int rv = hash_get_qtr(trid, &qtr); + + if (rv != AS_QUERY_OK) { + cf_warning(AS_MON, "No query was found with trid [%"PRIu64"]", trid); + stat = NULL; + } + else { + stat = cf_malloc(sizeof(as_mon_jobstat)); + as_query_fill_jobstat(qtr, stat); + qtr_release(qtr, __FILE__, __LINE__); + } + return stat; +} + + +int +as_mon_query_jobstat_reduce_fn(const void *key, uint32_t keylen, void *object, void *udata) +{ + as_query_transaction * qtr = (as_query_transaction*)object; + query_jobstat *job_pool = (query_jobstat*) udata; + + if ( job_pool->index >= job_pool->max_size) return AS_QUERY_OK; + as_mon_jobstat * stat = *(job_pool->jobstat); + stat = stat + job_pool->index; + as_query_fill_jobstat(qtr, stat); + (job_pool->index)++; + return AS_QUERY_OK; +} + +as_mon_jobstat * +as_query_get_jobstat_all(int * size) +{ + *size = cf_rchash_get_size(g_query_job_hash); + if(*size == 0) return AS_QUERY_OK; + + as_mon_jobstat * job_stats; + query_jobstat job_pool; + + job_stats = (as_mon_jobstat *) cf_malloc(sizeof(as_mon_jobstat) * (*size)); + job_pool.jobstat = &job_stats; + job_pool.index = 0; + job_pool.max_size = *size; + cf_rchash_reduce(g_query_job_hash, as_mon_query_jobstat_reduce_fn, &job_pool); + *size = job_pool.index; + return job_stats; +} + +void +as_query_histogram_dumpall() +{ + if (g_config.query_enable_histogram == false) + { + return; + } + + if (query_txn_q_wait_hist) { + histogram_dump(query_txn_q_wait_hist); + } + if (query_query_q_wait_hist) { + histogram_dump(query_query_q_wait_hist); + } + if (query_prepare_batch_hist) { + histogram_dump(query_prepare_batch_hist); + } + if (query_batch_io_q_wait_hist) { + histogram_dump(query_batch_io_q_wait_hist); + } + if (query_batch_io_hist) { + histogram_dump(query_batch_io_hist); + } + if (query_net_io_hist) { + histogram_dump(query_net_io_hist); + } +} + + +/* + * Query Subsystem Initialization 
function + */ +// ************************************************************************************************** +void +as_query_gconfig_default(as_config *c) +{ + // NB: Do not change query_threads default to odd. as_query_reinit code cannot + // handle it. Code to handle it is unnecessarily complicated code, hence opted + // to make the default value even. + c->query_threads = 6; + c->query_worker_threads = 15; + c->query_priority = 10; + c->query_sleep_us = 1; + c->query_bsize = QUERY_BATCH_SIZE; + c->query_in_transaction_thr = 0; + c->query_req_max_inflight = AS_QUERY_MAX_QREQ_INFLIGHT; + c->query_bufpool_size = AS_QUERY_MAX_BUFS; + c->query_short_q_max_size = AS_QUERY_MAX_SHORT_QUEUE_SZ; + c->query_long_q_max_size = AS_QUERY_MAX_LONG_QUEUE_SZ; + c->query_buf_size = AS_QUERY_BUF_SIZE; + c->query_threshold = 10; // threshold after which the query is considered long running + // no reason for choosing 10 + c->query_rec_count_bound = UINT64_MAX; // Unlimited + c->query_req_in_query_thread = 0; + c->query_untracked_time_ms = AS_QUERY_UNTRACKED_TIME; + + c->partitions_pre_reserved = false; +} + + +void +as_query_init() +{ + g_current_queries_count = 0; + cf_detail(AS_QUERY, "Initialize %d Query Worker threads.", g_config.query_threads); + + // global job hash to keep track of the query job + cf_rchash_create(&g_query_job_hash, cf_rchash_fn_u32, NULL, sizeof(uint64_t), 64, CF_RCHASH_MANY_LOCK); + + // I/O threads + g_query_qwork_pool = cf_queue_create(sizeof(query_work *), true); + g_query_response_bb_pool = cf_queue_create(sizeof(void *), true); + g_query_work_queue = cf_queue_create(sizeof(query_work *), true); + + // Create the query worker threads detached so we don't need to join with them. 
+ if (pthread_attr_init(&g_query_worker_th_attr)) { + cf_crash(AS_SINDEX, "failed to initialize the query worker thread attributes"); + } + if (pthread_attr_setdetachstate(&g_query_worker_th_attr, PTHREAD_CREATE_DETACHED)) { + cf_crash(AS_SINDEX, "failed to set the query worker thread attributes to the detached state"); + } + int max = g_config.query_worker_threads; + for (int i = 0; i < max; i++) { + pthread_create(&g_query_worker_threads[i], &g_query_worker_th_attr, + qwork_th, (void*)g_query_work_queue); + } + + g_query_short_queue = cf_queue_create(sizeof(as_query_transaction *), true); + g_query_long_queue = cf_queue_create(sizeof(as_query_transaction *), true); + + // Create the query threads detached so we don't need to join with them. + if (pthread_attr_init(&g_query_th_attr)) { + cf_crash(AS_SINDEX, "failed to initialize the query thread attributes"); + } + if (pthread_attr_setdetachstate(&g_query_th_attr, PTHREAD_CREATE_DETACHED)) { + cf_crash(AS_SINDEX, "failed to set the query thread attributes to the detached state"); + } + + max = g_config.query_threads; + for (int i = 0; i < max; i += 2) { + if (pthread_create(&g_query_threads[i], &g_query_th_attr, + query_th, (void*)g_query_short_queue) + || pthread_create(&g_query_threads[i + 1], &g_query_th_attr, + query_th, (void*)g_query_long_queue)) { + cf_crash(AS_QUERY, "Failed to create query transaction threads for query short queue"); + } + } + + char hist_name[64]; + + sprintf(hist_name, "query_txn_q_wait_us"); + query_txn_q_wait_hist = histogram_create(hist_name, HIST_MICROSECONDS); + + sprintf(hist_name, "query_query_q_wait_us"); + query_query_q_wait_hist = histogram_create(hist_name, HIST_MICROSECONDS); + + sprintf(hist_name, "query_prepare_batch_us"); + query_prepare_batch_hist = histogram_create(hist_name, HIST_MICROSECONDS); + + sprintf(hist_name, "query_batch_io_q_wait_us"); + query_batch_io_q_wait_hist = histogram_create(hist_name, HIST_MICROSECONDS); + + sprintf(hist_name, "query_batch_io_us"); + 
query_batch_io_hist = histogram_create(hist_name, HIST_MICROSECONDS); + + sprintf(hist_name, "query_net_io_us"); + query_net_io_hist = histogram_create(hist_name, HIST_MICROSECONDS); + + g_config.query_enable_histogram = false; +} + +/* + * Description - + * It tries to set the query_worker_threads to the given value. + * + * Synchronization - + * Takes a global query lock to protect the config of + * + * Arguments - + * set_size - Value which one want to assign to query_threads. + * + * Returns - + * AS_QUERY_OK - On successful resize of query threads. + * AS_QUERY_ERR - Either the set_size exceeds AS_QUERY_MAX_THREADS + * OR Query threads were not initialized on the first place. + */ +int +as_query_worker_reinit(int set_size, int *actual_size) +{ + if (set_size > AS_QUERY_MAX_WORKER_THREADS) { + cf_warning(AS_QUERY, "Cannot increase query threads more than %d", + AS_QUERY_MAX_WORKER_THREADS); + //unlock + return AS_QUERY_ERR; + } + + pthread_rwlock_wrlock(&g_query_lock); + // Add threads if count is increased + int i = cf_atomic32_get(g_query_worker_threadcnt); + g_config.query_worker_threads = set_size; + if (set_size > g_query_worker_threadcnt) { + for (; i < set_size; i++) { + cf_detail(AS_QUERY, "Creating thread %d", i); + if (0 != pthread_create(&g_query_worker_threads[i], &g_query_worker_th_attr, + qwork_th, (void*)g_query_work_queue)) { + break; + } + } + g_config.query_worker_threads = i; + } + *actual_size = g_config.query_worker_threads; + + pthread_rwlock_unlock(&g_query_lock); + + return AS_QUERY_OK; +} + +/* + * Description - + * It tries to set the query_threads to the given value. + * + * Synchronization - + * Takes a global query lock to protect the config of + * + * Arguments - + * set_size - Value which one want to assign to query_threads. + * + * Returns - + * AS_QUERY_OK - On successful resize of query threads. + * AS_QUERY_ERR - Either the set_size exceeds AS_QUERY_MAX_THREADS + * OR Query threads were not initialized on the first place. 
+ */ +int +as_query_reinit(int set_size, int *actual_size) +{ + if (set_size > AS_QUERY_MAX_THREADS) { + cf_warning(AS_QUERY, "Cannot increase query threads more than %d", + AS_QUERY_MAX_THREADS); + return AS_QUERY_ERR; + } + + pthread_rwlock_wrlock(&g_query_lock); + // Add threads if count is increased + int i = cf_atomic32_get(g_query_threadcnt); + + // make it multiple of 2 + if (set_size % 2 != 0) + set_size++; + + g_config.query_threads = set_size; + if (set_size > g_query_threadcnt) { + for (; i < set_size; i++) { + cf_detail(AS_QUERY, "Creating thread %d", i); + if (0 != pthread_create(&g_query_threads[i], &g_query_th_attr, + query_th, (void*)g_query_short_queue)) { + break; + } + i++; + if (0 != pthread_create(&g_query_threads[i], &g_query_th_attr, + query_th, (void*)g_query_long_queue)) { + break; + } + } + g_config.query_threads = i; + } + *actual_size = g_config.query_threads; + + pthread_rwlock_unlock(&g_query_lock); + + return AS_QUERY_OK; +} +// ************************************************************************************************** diff --git a/as/src/base/thr_sindex.c b/as/src/base/thr_sindex.c new file mode 100644 index 00000000..a838ee7d --- /dev/null +++ b/as/src/base/thr_sindex.c @@ -0,0 +1,841 @@ +/* + * thr_sindex.c + * + * Copyright (C) 2012-2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. 
+ * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + + /* + * SYNOPSIS + * This file implements supporting threads for the secondary index implementation. + * Currently following two main threads are implemented here + * + * - Secondary index gc thread which walks sweeps through secondary indexes + * and cleanup the stale entries by looking up digest in the primary index. + * + * - Secondary index thread which cleans up secondary index entry for a particular + * partitions + * + */ + +#include "base/thr_sindex.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "citrusleaf/alloc.h" +#include "citrusleaf/cf_atomic.h" +#include "citrusleaf/cf_clock.h" +#include "citrusleaf/cf_ll.h" +#include "citrusleaf/cf_queue.h" + +#include "ai_obj.h" +#include "ai_btree.h" +#include "fault.h" +#include "shash.h" + +#include "base/cfg.h" +#include "base/datamodel.h" +#include "base/index.h" +#include "base/job_manager.h" +#include "base/monitor.h" +#include "base/secondary_index.h" +#include "base/stats.h" +#include "fabric/partition.h" + + +int as_sbld_build(as_sindex* si); + +// All this is global because Aerospike Index is single threaded +pthread_rwlock_t g_sindex_rwlock = PTHREAD_RWLOCK_INITIALIZER; +pthread_rwlock_t g_ai_rwlock = PTHREAD_RWLOCK_INITIALIZER; +pthread_t g_sindex_populate_th; +pthread_t g_sindex_destroy_th; +pthread_t g_sindex_gc_th; + +cf_queue *g_sindex_populate_q; +cf_queue *g_sindex_destroy_q; +cf_queue *g_sindex_populateall_done_q; +cf_queue *g_q_objs_to_defrag; +bool g_sindex_boot_done; + +typedef struct as_sindex_set_s { + as_namespace * ns; + as_set * set; +} as_sindex_set; + +int +ll_sindex_gc_reduce_fn(cf_ll_element *ele, void *udata) +{ + return CF_LL_REDUCE_DELETE; +} + +void +as_sindex_gc_release_gc_arr_to_queue(void *v) +{ + objs_to_defrag_arr *dt = (objs_to_defrag_arr *)v; + if 
(cf_queue_sz(g_q_objs_to_defrag) < SINDEX_GC_QUEUE_HIGHWATER) { + cf_queue_push(g_q_objs_to_defrag, &dt); + } + else { + cf_free(dt); + } +} + +void +ll_sindex_gc_destroy_fn(cf_ll_element *ele) +{ + ll_sindex_gc_element * node = (ll_sindex_gc_element *) ele; + if (node) { + as_sindex_gc_release_gc_arr_to_queue((void *)(node->objs_to_defrag)); + cf_free(node); + } +} + +objs_to_defrag_arr * +as_sindex_gc_get_defrag_arr(void) +{ + objs_to_defrag_arr *dt; + if (cf_queue_pop(g_q_objs_to_defrag, &dt, CF_QUEUE_NOWAIT) == CF_QUEUE_EMPTY) { + dt = cf_malloc(sizeof(objs_to_defrag_arr)); + } + dt->num = 0; + return dt; +} + +// Main thread which looks at the request of the populating index +void * +as_sindex__populate_fn(void *param) +{ + while(1) { + as_sindex *si; + cf_queue_pop(g_sindex_populate_q, &si, CF_QUEUE_FOREVER); + // TODO should check flag under a lock + // conflict with as_sindex_repair + if (si->flag & AS_SINDEX_FLAG_POPULATING) { + // Earlier job to populate index is still going on, push it back + // into the queue to look at it later. this is problem only when + // there are multiple populating threads currently there is only 1. + cf_queue_push(g_sindex_populate_q, &si); + } else { + cf_debug(AS_SINDEX, "Populating index %s", si->imd->iname); + // should set under a lock + si->flag |= AS_SINDEX_FLAG_POPULATING; + si->stats.recs_pending = si->ns->n_objects; + as_sbld_build(si); + } + } + return NULL; +} + + +// Main thread which looks at the request of the destroy of index +void * +as_sindex__destroy_fn(void *param) +{ + while(1) { + as_sindex *si; + cf_queue_pop(g_sindex_destroy_q, &si, CF_QUEUE_FOREVER); + + SINDEX_GWLOCK(); + cf_assert((si->state == AS_SINDEX_DESTROY), + AS_SINDEX, " Invalid state %d at cleanup expected %d for %p and %s", si->state, AS_SINDEX_DESTROY, si, (si) ? ((si->imd) ? 
si->imd->iname : NULL) : NULL); + int rv = as_sindex__delete_from_set_binid_hash(si->ns, si->imd); + if (rv) { + cf_warning(AS_SINDEX, "Delete from set_binid hash fails with error %d", rv); + } + // Free entire usage counter before tree destroy + cf_atomic64_sub(&si->ns->n_bytes_sindex_memory, + ai_btree_get_isize(si->imd) + ai_btree_get_nsize(si->imd)); + + // Cache the ibtr pointers + uint16_t nprts = si->imd->nprts; + struct btree *ibtr[nprts]; + for (int i = 0; i < nprts; i++) { + as_sindex_pmetadata *pimd = &si->imd->pimd[i]; + ibtr[i] = pimd->ibtr; + ai_btree_reset_pimd(pimd); + } + + as_sindex_destroy_pmetadata(si); + si->state = AS_SINDEX_INACTIVE; + si->flag = 0; + + si->ns->sindex_cnt--; + + if (si->imd->set) { + as_set *p_set = as_namespace_get_set_by_name(si->ns, si->imd->set); + p_set->n_sindexes--; + } else { + si->ns->n_setless_sindexes--; + } + + as_sindex_metadata *imd = si->imd; + si->imd = NULL; + + char iname[AS_ID_INAME_SZ]; + memset(iname, 0, AS_ID_INAME_SZ); + snprintf(iname, strlen(imd->iname) + 1, "%s", imd->iname); + cf_shash_delete(si->ns->sindex_iname_hash, (void *)iname); + + + as_namespace *ns = si->ns; + si->ns = NULL; + si->simatch = -1; + + as_sindex_metadata *recreate_imd = NULL; + if (si->recreate_imd) { + recreate_imd = si->recreate_imd; + si->recreate_imd = NULL; + } + + // remember this is going to release the write lock + // of meta-data first. 
This is the only special case + // where both GLOCK and LOCK is called together + SINDEX_GWUNLOCK(); + + // Destroy cached ibtr pointer + for (int i = 0; i < imd->nprts; i++) { + ai_btree_delete_ibtr(ibtr[i]); + } + as_sindex_imd_free(imd); + cf_rc_free(imd); + + if (recreate_imd) { + as_sindex_create(ns, recreate_imd); + as_sindex_imd_free(recreate_imd); + cf_rc_free(recreate_imd); + } + } + return NULL; +} + +void +as_sindex_update_gc_stat(as_sindex *si, uint64_t r, uint64_t start_time_ms) +{ + cf_atomic64_add(&si->stats.n_deletes, r); + cf_atomic64_add(&si->stats.n_objects, -r); + cf_atomic64_add(&si->stats.n_defrag_records, r); + cf_atomic64_add(&si->stats.defrag_time, cf_getms() - start_time_ms); +} + +typedef struct gc_stat_s { + uint64_t processed; + uint64_t found; + uint64_t deleted; + uint64_t creation_time; + uint64_t deletion_time; +} gc_stat; + +typedef struct gc_ctx_s { + uint32_t ns_id; + as_sindex *si; + uint16_t pimd_idx; + + // stat + gc_stat stat; + + // config + uint64_t start_time; + uint32_t gc_max_rate; +} gc_ctx; + +typedef struct gc_offset_s { + ai_obj i_col; + uint64_t pos; // uint actually + bool done; +} gc_offset; + +static bool +can_gc_si(as_sindex *si, uint16_t pimd_idx) +{ + if (! as_sindex_isactive(si)) { + return false; + } + + if (si->state == AS_SINDEX_DESTROY) { + return false; + } + + // pimd_idx we are iterating does not + // exist in this sindex. + if (pimd_idx >= si->imd->nprts) { + return false; + } + + return true; +} + +static bool +gc_getnext_si(gc_ctx *ctx) +{ + int16_t si_idx; + as_namespace *ns = g_config.namespaces[ctx->ns_id]; + + // From previous si_idx or 0 + if (ctx->si) { + si_idx = ctx->si->simatch; + AS_SINDEX_RELEASE(ctx->si); + ctx->si = NULL; + } else { + si_idx = -1; + } + + SINDEX_GRLOCK(); + + while (true) { + + si_idx++; + if (si_idx == AS_SINDEX_MAX) { + SINDEX_GRUNLOCK(); + return false; + } + + as_sindex *si = &ns->sindex[si_idx]; + + if (! 
can_gc_si(si, ctx->pimd_idx)) { + continue; + } + + AS_SINDEX_RESERVE(si); + ctx->si = si; + SINDEX_GRUNLOCK(); + return true; + } +} + +static void +gc_print_ctx(gc_ctx *ctx) +{ + cf_detail(AS_SINDEX, "%s %s[%d]", g_config.namespaces[ctx->ns_id]->name, + ctx->si ? ctx->si->imd->iname : "NULL", ctx->pimd_idx); +} + +// TODO - Find the correct values +#define CREATE_LIST_PER_ITERATION_LIMIT 10000 +#define PROCESS_LIST_PER_ITERATION_LIMIT 10 + +// true if tree is done +// false if more in tree +static bool +gc_create_list(as_sindex *si, as_sindex_pmetadata *pimd, cf_ll *gc_list, + gc_offset *offsetp, gc_stat *statp) +{ + uint64_t processed = 0; + uint64_t found = 0; + uint64_t limit_per_iteration = CREATE_LIST_PER_ITERATION_LIMIT; + + uint64_t start_time = cf_getms(); + + PIMD_RLOCK(&pimd->slock); + as_sindex_status ret = ai_btree_build_defrag_list(si->imd, pimd, + &offsetp->i_col, &offsetp->pos, limit_per_iteration, + &processed, &found, gc_list); + + PIMD_RUNLOCK(&pimd->slock); + + statp->creation_time += (cf_getms() - start_time); + statp->processed += processed; + statp->found += found; + + if (ret == AS_SINDEX_DONE) { + offsetp->done = true; + } + + if (ret == AS_SINDEX_ERR) { + return false; + } + + return true; +} + +static void +gc_process_list(as_sindex *si, as_sindex_pmetadata *pimd, cf_ll *gc_list, + gc_offset *offsetp, gc_stat *statp) +{ + uint64_t deleted = 0; + uint64_t start_time = cf_getms(); + uint64_t limit_per_iteration = PROCESS_LIST_PER_ITERATION_LIMIT; + + bool more = true; + + while (more) { + + PIMD_WLOCK(&pimd->slock); + more = ai_btree_defrag_list(si->imd, pimd, gc_list, + limit_per_iteration, &deleted); + PIMD_WUNLOCK(&pimd->slock); + } + + // Update secondary index object count + // statistics aggressively. 
+ as_sindex_update_gc_stat(si, deleted, start_time); + + statp->deletion_time = cf_getms() - start_time; + statp->deleted += deleted; +} + +static void +gc_throttle(gc_ctx *ctx) +{ + while (true) { + uint64_t expected_processed = + (cf_get_seconds() - ctx->start_time) * ctx->gc_max_rate; + + // processed less than expected + // no throttling needed. + if (ctx->stat.processed <= expected_processed) { + break; + } + + usleep(10000); // 10 ms + } +} + +static void +do_gc(gc_ctx *ctx) +{ + // SKEY + Digest offset + gc_offset offset; + init_ai_obj(&offset.i_col); + offset.pos = 0; + offset.done = false; + + as_sindex *si = ctx->si; + as_sindex_pmetadata *pimd = &si->imd->pimd[ctx->pimd_idx]; + + cf_ll gc_list; + cf_ll_init(&gc_list, &ll_sindex_gc_destroy_fn, false); + + while (true) { + + if (! gc_create_list(si, pimd, &gc_list, &offset, &ctx->stat)) { + break; + } + + if (cf_ll_size(&gc_list) > 0) { + gc_process_list(si, pimd, &gc_list, &offset, &ctx->stat); + cf_ll_reduce(&gc_list, true /*forward*/, ll_sindex_gc_reduce_fn, NULL); + } + + if (offset.done) { + break; + } + } + + cf_ll_reduce(&gc_list, true /*forward*/, ll_sindex_gc_reduce_fn, NULL); +} + +static void +update_gc_stat(gc_stat *statp) +{ + g_stats.sindex_gc_objects_validated += statp->processed; + g_stats.sindex_gc_garbage_found += statp->found; + g_stats.sindex_gc_garbage_cleaned += statp->deleted; + g_stats.sindex_gc_list_deletion_time += statp->deletion_time; + g_stats.sindex_gc_list_creation_time += statp->creation_time; +} + +void * +as_sindex__gc_fn(void *udata) +{ + while (! g_sindex_boot_done) { + sleep(10); + continue; + } + + cf_debug(AS_SINDEX, "Secondary index gc thread started !!"); + + uint64_t last_time = cf_get_seconds(); + + for ( ; ; ) { + // Wake up every 1 second to check the gc timeout. 
+ struct timespec delay = { 1, 0 }; + nanosleep(&delay, NULL); + + uint64_t curr_time = cf_get_seconds(); + + if ((curr_time - last_time) < g_config.sindex_gc_period) { + continue; // period has not been reached for running gc check + } + + last_time = curr_time; + + for (int i = 0; i < g_config.n_namespaces; i++) { + + as_namespace *ns = g_config.namespaces[i]; + + if (ns->sindex_cnt == 0) { + continue; + } + + cf_info(AS_NSUP, "{%s} sindex-gc start", ns->name); + + uint64_t start_time_ms = cf_getms(); + + // gc_max_rate change at the namespace boundary + gc_ctx ctx = { + .ns_id = i, + .si = NULL, + .stat = { 0 }, + .start_time = cf_get_seconds(), + .gc_max_rate = g_config.sindex_gc_max_rate + }; + + // Give one pimd quata of chance for every sindex + // in a namespace in round robin manner. + for (uint16_t pimd_idx = 0; pimd_idx < MAX_PARTITIONS_PER_INDEX; + pimd_idx++) { + + ctx.pimd_idx = pimd_idx; + + while (gc_getnext_si(&ctx)) { + gc_print_ctx(&ctx); + do_gc(&ctx); + + // throttle after every quota (1 pimd) + gc_throttle(&ctx); + } + } + + cf_info(AS_NSUP, "{%s} sindex-gc: Processed: %ld, found:%ld, deleted: %ld: Total time: %ld ms", + ns->name, ctx.stat.processed, ctx.stat.found, ctx.stat.deleted, + cf_getms() - start_time_ms); + + update_gc_stat(&ctx.stat); + } + } +} + + +/* + * Secondary index main gc thread, it keeps watching out for request to + * the gc, Client API to set up aerospike facing meta data for the secondary index + * and setting all the initial things + * + * Parameter: + * sindex_metadata: (in/out) Index meta-data structure + * + * Caller: + * aerospike + * Return: + * 0: On success + * -1: On failure + * Synchronization: + * Acquires the meta lock. + */ +void +as_sindex_thr_init() +{ + // Thread request read lock on this recursively could possibly cause deadlock. 
Caller + // should be careful with that + pthread_rwlockattr_t rwattr; + if (!g_q_objs_to_defrag) { + g_q_objs_to_defrag = cf_queue_create(sizeof(void *), true); + } + if (0 != pthread_rwlockattr_init(&rwattr)) + cf_crash(AS_SINDEX, "pthread_rwlockattr_init: %s", cf_strerror(errno)); + if (0 != pthread_rwlockattr_setkind_np(&rwattr, PTHREAD_RWLOCK_PREFER_WRITER_NONRECURSIVE_NP)) + cf_crash( AS_SINDEX, "pthread_rwlockattr_setkind_np: %s", cf_strerror(errno)); + + // Aerospike Index Metadata lock + if (0 != pthread_rwlock_init(&g_ai_rwlock, &rwattr)) { + cf_crash(AS_SINDEX, " Could not create secondary index ddl mutex "); + } + + // Sindex Metadata lock + if (0 != pthread_rwlock_init(&g_sindex_rwlock, &rwattr)) { + cf_crash(AS_SINDEX, " Could not create secondary index ddl mutex "); + } + + g_sindex_populate_q = cf_queue_create(sizeof(as_sindex *), true); + if (0 != pthread_create(&g_sindex_populate_th, 0, as_sindex__populate_fn, 0)) { + cf_crash(AS_SINDEX, " Could not create sindex populate thread "); + } + + g_sindex_destroy_q = cf_queue_create(sizeof(as_sindex *), true); + if (0 != pthread_create(&g_sindex_destroy_th, 0, as_sindex__destroy_fn, 0)) { + cf_crash(AS_SINDEX, " Could not create sindex destroy thread "); + } + + if (0 != pthread_create(&g_sindex_gc_th, 0, as_sindex__gc_fn, 0)) { + cf_crash(AS_SINDEX, " Could not create sindex gc thread "); + } + + g_sindex_populateall_done_q = cf_queue_create(sizeof(int), true); + // At the beginning it is false. It is set to true when all the sindex + // are populated. + g_sindex_boot_done = false; +} + + +//============================================================================== +// Secondary index builder. 
+// + +// sbld_job - derived class header: +typedef struct sbld_job_s { + // Base object must be first: + as_job _base; + + // Derived class data: + as_sindex* si; + + char* si_name; + cf_atomic64 n_reduced; +} sbld_job; + +sbld_job* sbld_job_create(as_namespace* ns, uint16_t set_id, as_sindex* si); + +// as_job_manager instance for secondary index builder: +static as_job_manager g_sbld_manager; + + +//------------------------------------------------ +// Sindex builder public API. +// + +void +as_sbld_init() +{ + // TODO - config for max done? + // Initialize with maximum threads since first use is always build-all at + // startup. The thread pool will be down-sized right after that. + as_job_manager_init(&g_sbld_manager, UINT_MAX, 100, MAX_SINDEX_BUILDER_THREADS); +} + +int +as_sbld_build(as_sindex* si) +{ + as_sindex_metadata *imd = si->imd; + as_namespace *ns = as_namespace_get_byname(imd->ns_name); + + if (! ns) { + cf_warning(AS_SINDEX, "sindex build %s ns %s - unrecognized namespace", imd->iname, imd->ns_name); + as_sindex_populate_done(si); + AS_SINDEX_RELEASE(si); + return -1; + } + + uint16_t set_id = INVALID_SET_ID; + + if (imd->set && (set_id = as_namespace_get_set_id(ns, imd->set)) == INVALID_SET_ID) { + cf_info(AS_SINDEX, "sindex build %s ns %s - set %s not found - assuming empty", imd->iname, imd->ns_name, imd->set); + as_sindex_populate_done(si); + AS_SINDEX_RELEASE(si); + return -3; + } + + sbld_job* job = sbld_job_create(ns, set_id, si); + + // Can't fail for this kind of job. + as_job_manager_start_job(&g_sbld_manager, (as_job*)job); + + return 0; +} + +void +as_sbld_build_all(as_namespace* ns) +{ + sbld_job* job = sbld_job_create(ns, INVALID_SET_ID, NULL); + + // Can't fail for this kind of job. 
+ as_job_manager_start_job(&g_sbld_manager, (as_job*)job); +} + +void +as_sbld_resize_thread_pool(uint32_t n_threads) +{ + as_job_manager_resize_thread_pool(&g_sbld_manager, n_threads); +} + +int +as_sbld_list(char* name, cf_dyn_buf* db) +{ + as_mon_info_cmd(AS_MON_MODULES[SBLD_MOD], NULL, 0, 0, db); + return 0; +} + +as_mon_jobstat* +as_sbld_get_jobstat(uint64_t trid) +{ + return as_job_manager_get_job_info(&g_sbld_manager, trid); +} + +as_mon_jobstat* +as_sbld_get_jobstat_all(int* size) +{ + return as_job_manager_get_info(&g_sbld_manager, size); +} + +int +as_sbld_abort(uint64_t trid) +{ + return as_job_manager_abort_job(&g_sbld_manager, trid) ? 0 : -1; +} + + +//------------------------------------------------ +// sbld_job derived class implementation. +// + +void sbld_job_slice(as_job* _job, as_partition_reservation* rsv); +void sbld_job_finish(as_job* _job); +void sbld_job_destroy(as_job* _job); +void sbld_job_info(as_job* _job, as_mon_jobstat* stat); + +const as_job_vtable sbld_job_vtable = { + sbld_job_slice, + sbld_job_finish, + sbld_job_destroy, + sbld_job_info +}; + +void sbld_job_reduce_cb(as_index_ref* r_ref, void* udata); + +// +// sbld_job creation. +// + +sbld_job* +sbld_job_create(as_namespace* ns, uint16_t set_id, as_sindex* si) +{ + sbld_job* job = cf_malloc(sizeof(sbld_job)); + + as_job_init((as_job*)job, &sbld_job_vtable, &g_sbld_manager, + RSV_MIGRATE, 0, ns, set_id, AS_JOB_PRIORITY_MEDIUM); + + job->si = si; + job->si_name = si ? cf_strdup(si->imd->iname) : NULL; + job->n_reduced = 0; + + return job; +} + +// +// sbld_job mandatory as_job interface. 
+// + +void +sbld_job_slice(as_job* _job, as_partition_reservation* rsv) +{ + as_index_reduce_live(rsv->tree, sbld_job_reduce_cb, (void*)_job); +} + +void +sbld_job_finish(as_job* _job) +{ + sbld_job* job = (sbld_job*)_job; + + as_sindex_ticker_done(_job->ns, job->si, _job->start_ms); + + if (job->si) { + as_sindex_populate_done(job->si); + job->si->stats.loadtime = cf_getms() - _job->start_ms; + AS_SINDEX_RELEASE(job->si); + } + else { + as_sindex_boot_populateall_done(_job->ns); + } +} + +void +sbld_job_destroy(as_job* _job) +{ + sbld_job* job = (sbld_job*)_job; + + if (job->si_name) { + cf_free(job->si_name); + } +} + +void +sbld_job_info(as_job* _job, as_mon_jobstat* stat) +{ + sbld_job* job = (sbld_job*)_job; + + if (job->si_name) { + strcpy(stat->job_type, "sindex-build"); + + char *extra = stat->jdata + strlen(stat->jdata); + + sprintf(extra, ":sindex-name=%s", job->si_name); + } + else { + strcpy(stat->job_type, "sindex-build-all"); + } +} + +// +// sbld_job utilities. +// + +void +sbld_job_reduce_cb(as_index_ref* r_ref, void* udata) +{ + as_job* _job = (as_job*)udata; + sbld_job* job = (sbld_job*)_job; + as_namespace* ns = _job->ns; + + if (_job->abandoned != 0) { + as_record_done(r_ref, ns); + return; + } + + if (job->si) { + cf_atomic64_decr(&job->si->stats.recs_pending); + } + + as_sindex_ticker(ns, job->si, cf_atomic64_incr(&job->n_reduced), _job->start_ms); + + as_index *r = r_ref->r; + + if ((_job->set_id != INVALID_SET_ID && _job->set_id != as_index_get_set_id(r)) || + as_record_is_doomed(r, ns)) { + as_record_done(r_ref, ns); + return; + } + + as_storage_rd rd; + as_storage_record_open(ns, r, &rd); + as_storage_rd_load_n_bins(&rd); // TODO - handle error returned + as_bin stack_bins[rd.ns->storage_data_in_memory ? 
0 : rd.n_bins]; + as_storage_rd_load_bins(&rd, stack_bins); // TODO - handle error returned + + if (job->si) { + if (as_sindex_put_rd(job->si, &rd)) { + as_record_done(r_ref, ns); + as_job_manager_abandon_job(_job->mgr, _job, AS_JOB_FAIL_UNKNOWN); + return; + } + } + else { + as_sindex_putall_rd(ns, &rd); + } + + as_storage_record_close(&rd); + as_record_done(r_ref, ns); + + cf_atomic64_incr(&_job->n_records_read); +} diff --git a/as/src/base/thr_tsvc.c b/as/src/base/thr_tsvc.c new file mode 100644 index 00000000..50ddc4a3 --- /dev/null +++ b/as/src/base/thr_tsvc.c @@ -0,0 +1,580 @@ +/* + * thr_tsvc.c + * + * Copyright (C) 2008-2014 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +//========================================================== +// Includes. 
+// + +#include "base/thr_tsvc.h" + +#include +#include +#include +#include +#include +#include +#include + +#include "citrusleaf/alloc.h" +#include "citrusleaf/cf_atomic.h" +#include "citrusleaf/cf_clock.h" +#include "citrusleaf/cf_digest.h" +#include "citrusleaf/cf_queue.h" + +#include "fault.h" +#include "hardware.h" +#include "node.h" + +#include "base/cfg.h" +#include "base/datamodel.h" +#include "base/proto.h" +#include "base/scan.h" +#include "base/secondary_index.h" +#include "base/security.h" +#include "base/stats.h" +#include "base/thr_batch.h" +#include "base/transaction.h" +#include "base/transaction_policy.h" +#include "base/xdr_serverside.h" +#include "fabric/fabric.h" +#include "fabric/partition.h" +#include "fabric/partition_balance.h" +#include "storage/storage.h" +#include "transaction/delete.h" +#include "transaction/proxy.h" +#include "transaction/re_replicate.h" +#include "transaction/read.h" +#include "transaction/udf.h" +#include "transaction/write.h" + + +//========================================================== +// Globals. +// + +static cf_queue* g_transaction_queues[MAX_TRANSACTION_QUEUES] = { NULL }; + +// Track number of threads for each queue independently. +static uint32_t g_queues_n_threads[MAX_TRANSACTION_QUEUES] = { 0 }; + +// It's ok for this to not be atomic - might not round-robin perfectly, but will +// be cache friendly. +static uint32_t g_current_q = 0; + + +//========================================================== +// Forward declarations. +// + +void tsvc_add_threads(uint32_t qid, uint32_t n_threads); +void tsvc_remove_threads(uint32_t qid, uint32_t n_threads); +void *run_tsvc(void *arg); + + +//========================================================== +// Inlines & macros. 
+// + +static inline bool +should_security_check_data_op(const as_transaction *tr) +{ + return tr->origin == FROM_CLIENT || tr->origin == FROM_BATCH; +} + +static inline bool +read_would_duplicate_resolve(const as_namespace* ns, const as_msg* m) +{ + return READ_CONSISTENCY_LEVEL(ns, *m) == AS_READ_CONSISTENCY_LEVEL_ALL; +} + +static const char* +write_type_tag(const as_transaction *tr) +{ + return as_transaction_is_delete(tr) ? "delete" : + (as_transaction_is_udf(tr) ? "udf" : "write"); +} + +static inline void +detail_unique_client_rw(const as_transaction *tr, bool is_write) +{ + if (tr->origin == FROM_CLIENT) { + cf_detail_digest(AS_RW_CLIENT, &tr->keyd, "{%s} client %s %s ", + tr->rsv.ns->name, tr->from.proto_fd_h->client, + is_write ? write_type_tag(tr) : "read"); + } +} + + +//========================================================== +// Public API. +// + +void +as_tsvc_init() +{ + cf_info(AS_TSVC, "%u transaction queues: starting %u threads per queue", + g_config.n_transaction_queues, + g_config.n_transaction_threads_per_queue); + + // Create the transaction queues. + for (uint32_t qid = 0; qid < g_config.n_transaction_queues; qid++) { + g_transaction_queues[qid] = + cf_queue_create(AS_TRANSACTION_HEAD_SIZE, true); + } + + // Start all the transaction threads. + for (uint32_t qid = 0; qid < g_config.n_transaction_queues; qid++) { + tsvc_add_threads(qid, g_config.n_transaction_threads_per_queue); + } +} + + +// Decide which queue to use, and enqueue transaction. +void +as_tsvc_enqueue(as_transaction *tr) +{ + uint32_t qid; + + if (g_config.auto_pin == CF_TOPO_AUTO_PIN_NONE || + g_config.n_namespaces_not_inlined == 0) { + cf_debug(AS_TSVC, "no CPU pinning - dispatching transaction round-robin"); + // Transaction can go on any queue - distribute evenly. 
+ qid = (g_current_q++) % g_config.n_transaction_queues; + } + else { + qid = cf_topo_current_cpu(); + cf_debug(AS_TSVC, "transaction on CPU %u", qid); + } + + cf_queue_push(g_transaction_queues[qid], tr); +} + + +// Triggered via dynamic configuration change. +void +as_tsvc_set_threads_per_queue(uint32_t target_n_threads) +{ + for (uint32_t qid = 0; qid < g_config.n_transaction_queues; qid++) { + uint32_t current_n_threads = g_queues_n_threads[qid]; + + if (target_n_threads > current_n_threads) { + tsvc_add_threads(qid, target_n_threads - current_n_threads); + } + else { + tsvc_remove_threads(qid, current_n_threads - target_n_threads); + } + } + + g_config.n_transaction_threads_per_queue = target_n_threads; +} + + +// Total transactions currently queued, for ticker and info statistics. +int +as_tsvc_queue_get_size() +{ + int current_total = 0; + + for (uint32_t qid = 0; qid < g_config.n_transaction_queues; qid++) { + current_total += cf_queue_sz(g_transaction_queues[qid]); + } + + return current_total; +} + + +// Handle the transaction, including proxy to another node if necessary. +void +as_tsvc_process_transaction(as_transaction *tr) +{ + if (tr->msgp->proto.type == PROTO_TYPE_INTERNAL_XDR) { + as_xdr_read_txn(tr); + return; + } + + int rv; + bool free_msgp = true; + cl_msg *msgp = tr->msgp; + as_msg *m = &msgp->msg; + + as_transaction_init_body(tr); + + // Check that the socket is authenticated. + if (tr->origin == FROM_CLIENT) { + uint8_t result = as_security_check(tr->from.proto_fd_h, PERM_NONE); + + if (result != AS_PROTO_RESULT_OK) { + as_security_log(tr->from.proto_fd_h, result, PERM_NONE, NULL, NULL); + as_transaction_error(tr, NULL, (uint32_t)result); + goto Cleanup; + } + } + + // All transactions must have a namespace. + as_msg_field *nf = as_msg_field_get(m, AS_MSG_FIELD_TYPE_NAMESPACE); + + if (! 
nf) { + cf_warning(AS_TSVC, "no namespace in protocol request"); + as_transaction_error(tr, NULL, AS_PROTO_RESULT_FAIL_NAMESPACE); + goto Cleanup; + } + + as_namespace *ns = as_namespace_get_bymsgfield(nf); + + if (! ns) { + uint32_t ns_sz = as_msg_field_get_value_sz(nf); + CF_ZSTR_DEFINE(ns_name, AS_ID_NAMESPACE_SZ, nf->data, ns_sz); + + cf_warning(AS_TSVC, "unknown namespace %s (%u) in protocol request - check configuration file", + ns_name, ns_sz); + + as_transaction_error(tr, NULL, AS_PROTO_RESULT_FAIL_NAMESPACE); + goto Cleanup; + } + + // Have we finished the very first partition balance? + if (! as_partition_balance_is_init_resolved()) { + cf_debug(AS_TSVC, "rejecting transaction - initial partition balance unresolved"); + as_transaction_error(tr, NULL, AS_PROTO_RESULT_FAIL_UNAVAILABLE); + // Note that we forfeited namespace info above so scan & query don't get + // counted as single-record error. + goto Cleanup; + } + + //------------------------------------------------------ + // Multi-record transaction. + // + + if (as_transaction_is_multi_record(tr)) { + if (m->transaction_ttl != 0) { + // Old batch and queries may specify transaction_ttl, but don't use + // g_config.transaction_max_ns as a default. Assuming specified TTL + // is large enough that it's not worth checking for timeout here. + tr->end_time = tr->start_time + + ((uint64_t)m->transaction_ttl * 1000000); + } + + if (as_transaction_is_batch_direct(tr)) { + // Old batch. + if (! as_security_check_data_op(tr, ns, PERM_READ)) { + as_multi_rec_transaction_error(tr, tr->result_code); + goto Cleanup; + } + + if ((rv = as_batch_direct_queue_task(tr, ns)) != 0) { + as_multi_rec_transaction_error(tr, rv); + cf_atomic64_incr(&g_stats.batch_errors); + } + } + else if (as_transaction_is_query(tr)) { + // Query. + cf_atomic64_incr(&ns->query_reqs); + + if (! as_security_check_data_op(tr, ns, + as_transaction_is_udf(tr) ? 
PERM_UDF_QUERY : PERM_QUERY)) { + as_multi_rec_transaction_error(tr, tr->result_code); + goto Cleanup; + } + + if (as_query(tr, ns) != 0) { + cf_atomic64_incr(&ns->query_fail); + as_multi_rec_transaction_error(tr, tr->result_code); + } + } + else { + // Scan. + if (! as_security_check_data_op(tr, ns, + as_transaction_is_udf(tr) ? PERM_UDF_SCAN : PERM_SCAN)) { + as_multi_rec_transaction_error(tr, tr->result_code); + goto Cleanup; + } + + if ((rv = as_scan(tr, ns)) != 0) { + as_multi_rec_transaction_error(tr, rv); + } + } + + goto Cleanup; + } + + //------------------------------------------------------ + // Single-record transaction. + // + + // Calculate end_time based on message transaction TTL. May be recalculating + // for re-queued transactions, but nice if end_time not copied on/off queue. + if (m->transaction_ttl != 0) { + tr->end_time = tr->start_time + + ((uint64_t)m->transaction_ttl * 1000000); + } + else { + // Incorporate g_config.transaction_max_ns if appropriate. + // TODO - should g_config.transaction_max_ns = 0 be special? + tr->end_time = tr->start_time + g_config.transaction_max_ns; + } + + // Did the transaction time out while on the queue? + if (cf_getns() > tr->end_time) { + cf_debug(AS_TSVC, "transaction timed out in queue"); + as_transaction_error(tr, ns, AS_PROTO_RESULT_FAIL_TIMEOUT); + goto Cleanup; + } + + // All single-record transactions must have a digest, or a key from which + // to calculate it. + if (as_transaction_has_digest(tr)) { + // Modern client - just copy digest into tr. + + as_msg_field *df = as_msg_field_get(m, AS_MSG_FIELD_TYPE_DIGEST_RIPE); + uint32_t digest_sz = as_msg_field_get_value_sz(df); + + if (digest_sz != sizeof(cf_digest)) { + cf_warning(AS_TSVC, "digest msg field size %u", digest_sz); + as_transaction_error(tr, ns, AS_PROTO_RESULT_FAIL_PARAMETER); + goto Cleanup; + } + + tr->keyd = *(cf_digest *)df->data; + } + else if (! 
as_transaction_is_batch_sub(tr)) { + // Old client - calculate digest from key & set, directly into tr. + + as_msg_field *kf = as_msg_field_get(m, AS_MSG_FIELD_TYPE_KEY); + uint32_t key_sz = as_msg_field_get_value_sz(kf); + + as_msg_field *sf = as_transaction_has_set(tr) ? + as_msg_field_get(m, AS_MSG_FIELD_TYPE_SET) : NULL; + uint32_t set_sz = sf ? as_msg_field_get_value_sz(sf) : 0; + + cf_digest_compute2(sf->data, set_sz, kf->data, key_sz, &tr->keyd); + } + // else - batch sub-transactions already (and only) have digest in tr. + + // Process the transaction. + + bool is_write = (m->info2 & AS_MSG_INFO2_WRITE) != 0; + bool is_read = (m->info1 & AS_MSG_INFO1_READ) != 0; + // Both can be set together, but is_write puts us on the 'write path' - + // write reservation, replica writes, etc. Writes quickly get split into + // write, delete, or UDF after the reservation. + + uint32_t pid = as_partition_getid(&tr->keyd); + cf_node dest; + + if (is_write) { + if (should_security_check_data_op(tr) && + ! as_security_check_data_op(tr, ns, PERM_WRITE)) { + as_transaction_error(tr, ns, tr->result_code); + goto Cleanup; + } + + rv = as_partition_reserve_write(ns, pid, &tr->rsv, &dest); + } + else if (is_read) { + if (should_security_check_data_op(tr) && + ! as_security_check_data_op(tr, ns, PERM_READ)) { + as_transaction_error(tr, ns, tr->result_code); + goto Cleanup; + } + + rv = as_partition_reserve_read(ns, pid, &tr->rsv, + read_would_duplicate_resolve(ns, m), &dest); + } + else { + cf_warning(AS_TSVC, "transaction is neither read nor write - unexpected"); + as_transaction_error(tr, ns, AS_PROTO_RESULT_FAIL_PARAMETER); + goto Cleanup; + } + + if (rv == -2) { + // Partition is unavailable. + as_transaction_error(tr, ns, AS_PROTO_RESULT_FAIL_UNAVAILABLE); + goto Cleanup; + } + + if (dest == 0) { + cf_crash(AS_TSVC, "invalid destination while reserving partition"); + } + + if (rv == 0) { + // <><><><><><> Reservation Succeeded <><><><><><> + + if (! 
as_transaction_is_restart(tr)) { + tr->benchmark_time = 0; + detail_unique_client_rw(tr, is_write); + } + + transaction_status status; + + if (is_write) { + if (as_transaction_is_delete(tr)) { + status = as_delete_start(tr); + } + else if (tr->origin == FROM_IUDF || as_transaction_is_udf(tr)) { + status = as_udf_start(tr); + } + else if (tr->origin == FROM_RE_REPL) { + status = as_re_replicate_start(tr); + } + else { + status = as_write_start(tr); + } + } + else { + status = as_read_start(tr); + } + + switch (status) { + case TRANS_DONE_ERROR: + case TRANS_DONE_SUCCESS: + // Done, response already sent - free msg & release reservation. + as_partition_release(&tr->rsv); + break; + case TRANS_IN_PROGRESS: + // Don't free msg or release reservation - both owned by rw_request. + free_msgp = false; + break; + case TRANS_WAITING: + // Will be re-queued - don't free msg, but release reservation. + free_msgp = false; + as_partition_release(&tr->rsv); + break; + default: + cf_crash(AS_TSVC, "invalid transaction status %d", status); + break; + } + } + else { + // <><><><><><> Reservation Failed <><><><><><> + + switch (tr->origin) { + case FROM_CLIENT: + case FROM_BATCH: + as_proxy_divert(dest, tr, ns); + // CLIENT: fabric owns msgp, BATCH: it's shared, don't free it. 
+ free_msgp = false; + break; + case FROM_PROXY: + as_proxy_return_to_sender(tr, ns); + tr->from.proxy_node = 0; // pattern, not needed + break; + case FROM_IUDF: + tr->from.iudf_orig->cb(tr->from.iudf_orig->udata, + AS_PROTO_RESULT_FAIL_UNKNOWN); + tr->from.iudf_orig = NULL; // pattern, not needed + break; + case FROM_NSUP: + break; + case FROM_RE_REPL: + tr->from.re_repl_orig_cb(tr); + tr->from.re_repl_orig_cb = NULL; // pattern, not needed + break; + default: + cf_crash(AS_PROTO, "unexpected transaction origin %u", tr->origin); + break; + } + } + +Cleanup: + + if (free_msgp && tr->origin != FROM_BATCH) { + cf_free(msgp); + } +} // end process_transaction() + + +//========================================================== +// Local helpers. +// + +void +tsvc_add_threads(uint32_t qid, uint32_t n_threads) +{ + pthread_t thread; + pthread_attr_t attrs; + + pthread_attr_init(&attrs); + pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED); + + for (uint32_t n = 0; n < n_threads; n++) { + if (pthread_create(&thread, &attrs, run_tsvc, + (void*)(uint64_t)qid) == 0) { + g_queues_n_threads[qid]++; + } + else { + cf_warning(AS_TSVC, "tsvc queue %u failed thread create", qid); + } + } +} + + +void +tsvc_remove_threads(uint32_t qid, uint32_t n_threads) +{ + as_transaction death_tr = { .msgp = NULL }; + + for (uint32_t n = 0; n < n_threads; n++) { + // Send terminator (transaction with NULL msgp). + cf_queue_push(g_transaction_queues[qid], &death_tr); + g_queues_n_threads[qid]--; + } +} + + +// Service transactions - arg is the queue we're to service. 
+void * +run_tsvc(void *arg) +{ + uint32_t qid = (uint32_t)(uint64_t)arg; + + if (g_config.auto_pin != CF_TOPO_AUTO_PIN_NONE && + g_config.n_namespaces_not_inlined != 0) { + cf_detail(AS_TSVC, "pinning thread to CPU %u", qid); + cf_topo_pin_to_cpu((cf_topo_cpu_index)qid); + } + + cf_queue *q = g_transaction_queues[qid]; + + while (true) { + as_transaction tr; + + if (cf_queue_pop(q, &tr, CF_QUEUE_FOREVER) != CF_QUEUE_OK) { + cf_crash(AS_TSVC, "unable to pop from transaction queue"); + } + + if (! tr.msgp) { + break; // thread termination via configuration change + } + + cf_debug(AS_TSVC, "running on CPU %hu", cf_topo_current_cpu()); + + if (g_config.svc_benchmarks_enabled && + tr.benchmark_time != 0 && ! as_transaction_is_restart(&tr)) { + histogram_insert_data_point(g_stats.svc_queue_hist, + tr.benchmark_time); + } + + as_tsvc_process_transaction(&tr); + } + + return NULL; +} diff --git a/as/src/base/ticker.c b/as/src/base/ticker.c new file mode 100644 index 00000000..ba38fc3b --- /dev/null +++ b/as/src/base/ticker.c @@ -0,0 +1,919 @@ +/* + * ticker.c + * + * Copyright (C) 2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +//========================================================== +// Includes. 
+// + +#include "base/ticker.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "citrusleaf/alloc.h" +#include "citrusleaf/cf_atomic.h" +#include "citrusleaf/cf_clock.h" + +#include "dynbuf.h" +#include "fault.h" +#include "hist.h" +#include "hist_track.h" +#include "meminfo.h" + +#include "base/cfg.h" +#include "base/datamodel.h" +#include "base/index.h" +#include "base/secondary_index.h" +#include "base/stats.h" +#include "base/thr_info.h" +#include "base/thr_sindex.h" +#include "base/thr_tsvc.h" +#include "fabric/clustering.h" +#include "fabric/exchange.h" +#include "fabric/fabric.h" +#include "fabric/hb.h" +#include "fabric/partition.h" +#include "fabric/skew_monitor.h" +#include "storage/storage.h" +#include "transaction/proxy.h" +#include "transaction/rw_request_hash.h" + + +//========================================================== +// Forward declarations. +// + +extern int as_nsup_queue_get_size(); +extern bool g_shutdown_started; + +void* run_ticker(void* arg); +void log_ticker_frame(uint64_t delta_time); + +void log_line_clock(); +void log_line_system_memory(); +void log_line_in_progress(); +void log_line_fds(); +void log_line_heartbeat(); +void log_fabric_rate(uint64_t delta_time); +void log_line_early_fail(); +void log_line_batch_index(); + +void log_line_objects(as_namespace* ns, uint64_t n_objects, + repl_stats* mp); +void log_line_tombstones(as_namespace* ns, uint64_t n_tombstones, + repl_stats* mp); +void log_line_appeals(as_namespace* ns); +void log_line_migrations(as_namespace* ns); +void log_line_memory_usage(as_namespace* ns, size_t total_mem, size_t index_mem, + size_t sindex_mem, size_t data_mem); +void log_line_device_usage(as_namespace* ns); + +void log_line_client(as_namespace* ns); +void log_line_xdr_client(as_namespace* ns); +void log_line_batch_sub(as_namespace* ns); +void log_line_scan(as_namespace* ns); +void log_line_query(as_namespace* ns); +void 
log_line_udf_sub(as_namespace* ns); +void log_line_retransmits(as_namespace* ns); +void log_line_re_repl(as_namespace* ns); +void log_line_special_errors(as_namespace* ns); + +void dump_global_histograms(); +void dump_namespace_histograms(as_namespace* ns); + + +//========================================================== +// Public API. +// + +void +as_ticker_start() +{ + pthread_t thread; + pthread_attr_t attrs; + + pthread_attr_init(&attrs); + pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED); + + if (pthread_create(&thread, &attrs, run_ticker, NULL) != 0) { + cf_crash(AS_INFO, "failed to create ticker thread"); + } +} + + +//========================================================== +// Local helpers. +// + +void* +run_ticker(void* arg) +{ + uint64_t last_time = cf_getns(); + + while (true) { + // Wake up every 1 second to check the ticker interval. + struct timespec delay = { 1, 0 }; + nanosleep(&delay, NULL); + + uint64_t curr_time = cf_getns(); + uint64_t delta_time = curr_time - last_time; + + if (delta_time < (uint64_t)g_config.ticker_interval * 1000000000) { + continue; // period has not been reached for showing a frame + } + + last_time = curr_time; + + // Reduce likelihood of ticker frames showing after shutdown signal. 
+ if (g_shutdown_started) { + break; + } + + log_ticker_frame(delta_time); + } + + return NULL; +} + + +void +log_ticker_frame(uint64_t delta_time) +{ + cf_info(AS_INFO, "NODE-ID %lx CLUSTER-SIZE %u", + g_config.self_node, + as_exchange_cluster_size() + ); + + log_line_clock(); + log_line_system_memory(); + log_line_in_progress(); + log_line_fds(); + log_line_heartbeat(); + log_fabric_rate(delta_time); + log_line_early_fail(); + log_line_batch_index(); + + dump_global_histograms(); + + size_t total_ns_memory_inuse = 0; + + for (int i = 0; i < g_config.n_namespaces; i++) { + as_namespace* ns = g_config.namespaces[i]; + + uint64_t n_objects = ns->n_objects; + uint64_t n_tombstones = ns->n_tombstones; + + size_t index_mem = as_index_size_get(ns) * (n_objects + n_tombstones); + size_t sindex_mem = ns->n_bytes_sindex_memory; + size_t data_mem = ns->n_bytes_memory; + size_t total_mem = index_mem + sindex_mem + data_mem; + + total_ns_memory_inuse += total_mem; + + repl_stats mp; + as_partition_get_replica_stats(ns, &mp); + + log_line_objects(ns, n_objects, &mp); + log_line_tombstones(ns, n_tombstones, &mp); + log_line_appeals(ns); + log_line_migrations(ns); + log_line_memory_usage(ns, total_mem, index_mem, sindex_mem, data_mem); + log_line_device_usage(ns); + + log_line_client(ns); + log_line_xdr_client(ns); + log_line_batch_sub(ns); + log_line_scan(ns); + log_line_query(ns); + log_line_udf_sub(ns); + log_line_retransmits(ns); + log_line_re_repl(ns); + log_line_special_errors(ns); + + dump_namespace_histograms(ns); + } + + if (g_config.fabric_dump_msgs) { + as_fabric_msg_queue_dump(); + } + + cf_dump_ticker_cache(); +} + + +void +log_line_clock() +{ + cf_dyn_buf_define_size(outliers_db, 17 * AS_CLUSTER_SZ); + uint32_t num_outliers = as_skew_monitor_outliers_append(&outliers_db); + + if (num_outliers != 0) { + cf_dyn_buf_append_char(&outliers_db, 0); + + cf_info(AS_INFO, " cluster-clock: skew-ms %lu outliers (%s)", + as_skew_monitor_skew(), + outliers_db.buf + ); + } + 
else { + cf_info(AS_INFO, " cluster-clock: skew-ms %lu", + as_skew_monitor_skew() + ); + } + + cf_dyn_buf_free(&outliers_db); +} + + +void +log_line_system_memory() +{ + uint64_t freemem; + int freepct; + bool swapping; + + cf_meminfo(NULL, &freemem, &freepct, &swapping); + + size_t allocated_kbytes; + size_t active_kbytes; + size_t mapped_kbytes; + double efficiency_pct; + + cf_alloc_heap_stats(&allocated_kbytes, &active_kbytes, &mapped_kbytes, + &efficiency_pct, NULL); + + cf_info(AS_INFO, " system-memory: free-kbytes %lu free-pct %d%s heap-kbytes (%lu,%lu,%lu) heap-efficiency-pct %.1lf", + freemem / 1024, + freepct, + swapping ? " SWAPPING!" : "", + allocated_kbytes, active_kbytes, mapped_kbytes, + efficiency_pct + ); +} + + +void +log_line_in_progress() +{ + cf_info(AS_INFO, " in-progress: tsvc-q %d info-q %d nsup-delete-q %d rw-hash %u proxy-hash %u tree-gc-q %d", + as_tsvc_queue_get_size(), + as_info_queue_get_size(), + as_nsup_queue_get_size(), + rw_request_hash_count(), + as_proxy_hash_count(), + as_index_tree_gc_queue_size() + ); +} + + +void +log_line_fds() +{ + uint64_t n_proto_fds_opened = g_stats.proto_connections_opened; + uint64_t n_proto_fds_closed = g_stats.proto_connections_closed; + uint64_t n_hb_fds_opened = g_stats.heartbeat_connections_opened; + uint64_t n_hb_fds_closed = g_stats.heartbeat_connections_closed; + uint64_t n_fabric_fds_opened = g_stats.fabric_connections_opened; + uint64_t n_fabric_fds_closed = g_stats.fabric_connections_closed; + + uint64_t n_proto_fds_open = n_proto_fds_opened - n_proto_fds_closed; + uint64_t n_hb_fds_open = n_hb_fds_opened - n_hb_fds_closed; + uint64_t n_fabric_fds_open = n_fabric_fds_opened - n_fabric_fds_closed; + + cf_info(AS_INFO, " fds: proto (%lu,%lu,%lu) heartbeat (%lu,%lu,%lu) fabric (%lu,%lu,%lu)", + n_proto_fds_open, n_proto_fds_opened, n_proto_fds_closed, + n_hb_fds_open, n_hb_fds_opened, n_hb_fds_closed, + n_fabric_fds_open, n_fabric_fds_opened, n_fabric_fds_closed + ); +} + + +void 
+log_line_heartbeat() +{ + cf_info(AS_INFO, " heartbeat-received: self %lu foreign %lu", + g_stats.heartbeat_received_self, g_stats.heartbeat_received_foreign + ); +} + + +void +log_fabric_rate(uint64_t delta_time) +{ + fabric_rate rate = { { 0 } }; + + as_fabric_rate_capture(&rate); + + uint64_t dt_sec = delta_time / 1000000000; + + if (dt_sec < 1) { + dt_sec = 1; + } + + g_stats.fabric_bulk_s_rate = rate.s_bytes[AS_FABRIC_CHANNEL_BULK] / dt_sec; + g_stats.fabric_bulk_r_rate = rate.r_bytes[AS_FABRIC_CHANNEL_BULK] / dt_sec; + g_stats.fabric_ctrl_s_rate = rate.s_bytes[AS_FABRIC_CHANNEL_CTRL] / dt_sec; + g_stats.fabric_ctrl_r_rate = rate.r_bytes[AS_FABRIC_CHANNEL_CTRL] / dt_sec; + g_stats.fabric_meta_s_rate = rate.s_bytes[AS_FABRIC_CHANNEL_META] / dt_sec; + g_stats.fabric_meta_r_rate = rate.r_bytes[AS_FABRIC_CHANNEL_META] / dt_sec; + g_stats.fabric_rw_s_rate = rate.s_bytes[AS_FABRIC_CHANNEL_RW] / dt_sec; + g_stats.fabric_rw_r_rate = rate.r_bytes[AS_FABRIC_CHANNEL_RW] / dt_sec; + + cf_info(AS_INFO, " fabric-bytes-per-second: bulk (%lu,%lu) ctrl (%lu,%lu) meta (%lu,%lu) rw (%lu,%lu)", + g_stats.fabric_bulk_s_rate, g_stats.fabric_bulk_r_rate, + g_stats.fabric_ctrl_s_rate, g_stats.fabric_ctrl_r_rate, + g_stats.fabric_meta_s_rate, g_stats.fabric_meta_r_rate, + g_stats.fabric_rw_s_rate, g_stats.fabric_rw_r_rate + ); +} + + +void +log_line_early_fail() +{ + uint64_t n_demarshal = g_stats.n_demarshal_error; + uint64_t n_tsvc_client = g_stats.n_tsvc_client_error; + uint64_t n_tsvc_batch_sub = g_stats.n_tsvc_batch_sub_error; + uint64_t n_tsvc_udf_sub = g_stats.n_tsvc_udf_sub_error; + + if ((n_demarshal | + n_tsvc_client | + n_tsvc_batch_sub | + n_tsvc_udf_sub) == 0) { + return; + } + + cf_info(AS_INFO, " early-fail: demarshal %lu tsvc-client %lu tsvc-batch-sub %lu tsvc-udf-sub %lu", + n_demarshal, + n_tsvc_client, + n_tsvc_batch_sub, + n_tsvc_udf_sub + ); +} + + +void +log_line_batch_index() +{ + uint64_t n_complete = g_stats.batch_index_complete; + uint64_t n_error = 
g_stats.batch_index_errors; + uint64_t n_timeout = g_stats.batch_index_timeout; + + if ((n_complete | n_error | n_timeout) == 0) { + return; + } + + cf_info(AS_INFO, " batch-index: batches (%lu,%lu,%lu)", + n_complete, n_error, n_timeout + ); +} + + +void +log_line_objects(as_namespace* ns, uint64_t n_objects, repl_stats* mp) +{ + // TODO - show if all 0's ??? + cf_info(AS_INFO, "{%s} objects: all %lu master %lu prole %lu non-replica %lu", + ns->name, + n_objects, + mp->n_master_objects, + mp->n_prole_objects, + mp->n_non_replica_objects + ); +} + + +void +log_line_tombstones(as_namespace* ns, uint64_t n_tombstones, repl_stats* mp) +{ + if ((n_tombstones | + mp->n_master_tombstones | + mp->n_prole_tombstones | + mp->n_non_replica_tombstones) == 0) { + return; + } + + cf_info(AS_INFO, "{%s} tombstones: all %lu master %lu prole %lu non-replica %lu", + ns->name, + n_tombstones, + mp->n_master_tombstones, + mp->n_prole_tombstones, + mp->n_non_replica_tombstones + ); +} + + +void +log_line_appeals(as_namespace* ns) +{ + int64_t remaining_tx = (int64_t)ns->appeals_tx_remaining; + int64_t active_tx = (int64_t)ns->appeals_tx_active; + int64_t active_rx = (int64_t)ns->appeals_rx_active; + + if (remaining_tx > 0 || active_tx > 0 || active_rx > 0) { + cf_info(AS_INFO, "{%s} appeals: remaining-tx %ld active (%ld,%ld)", + ns->name, + remaining_tx, active_tx, active_rx + ); + } +} + + +void +log_line_migrations(as_namespace* ns) +{ + int64_t initial_tx = (int64_t)ns->migrate_tx_partitions_initial; + int64_t initial_rx = (int64_t)ns->migrate_rx_partitions_initial; + int64_t remaining_tx = (int64_t)ns->migrate_tx_partitions_remaining; + int64_t remaining_rx = (int64_t)ns->migrate_rx_partitions_remaining; + int64_t initial = initial_tx + initial_rx; + int64_t remaining = remaining_tx + remaining_rx; + + if (initial > 0 && remaining > 0) { + float complete_pct = (1 - ((float)remaining / (float)initial)) * 100; + + cf_info(AS_INFO, "{%s} migrations: remaining (%ld,%ld,%ld) active 
(%ld,%ld,%ld) complete-pct %0.2f", + ns->name, + remaining_tx, remaining_rx, ns->migrate_signals_remaining, + ns->migrate_tx_partitions_active, ns->migrate_rx_partitions_active, ns->migrate_signals_active, + complete_pct + ); + } + else { + cf_info(AS_INFO, "{%s} migrations: complete", ns->name); + } +} + + +void +log_line_memory_usage(as_namespace* ns, size_t total_mem, size_t index_mem, + size_t sindex_mem, size_t data_mem) +{ + double mem_used_pct = (double)(total_mem * 100) / (double)ns->memory_size; + + if (ns->storage_data_in_memory) { + cf_info(AS_INFO, "{%s} memory-usage: total-bytes %lu index-bytes %lu sindex-bytes %lu data-bytes %lu used-pct %.2lf", + ns->name, + total_mem, + index_mem, + sindex_mem, + data_mem, + mem_used_pct + ); + } + else { + cf_info(AS_INFO, "{%s} memory-usage: total-bytes %lu index-bytes %lu sindex-bytes %lu used-pct %.2lf", + ns->name, + total_mem, + index_mem, + sindex_mem, + mem_used_pct + ); + } +} + + +void +log_line_device_usage(as_namespace* ns) +{ + if (ns->storage_type != AS_STORAGE_ENGINE_SSD) { + return; + } + + int available_pct; + uint64_t inuse_disk_bytes; + as_storage_stats(ns, &available_pct, &inuse_disk_bytes); + + if (ns->storage_data_in_memory) { + cf_info(AS_INFO, "{%s} device-usage: used-bytes %lu avail-pct %d", + ns->name, + inuse_disk_bytes, + available_pct + ); + } + else { + uint32_t n_reads_from_cache = ns->n_reads_from_cache; + uint32_t n_total_reads = ns->n_reads_from_device + n_reads_from_cache; + + cf_atomic32_set(&ns->n_reads_from_device, 0); + cf_atomic32_set(&ns->n_reads_from_cache, 0); + + ns->cache_read_pct = + (float)(100 * n_reads_from_cache) / + (float)(n_total_reads == 0 ? 
1 : n_total_reads); + + cf_info(AS_INFO, "{%s} device-usage: used-bytes %lu avail-pct %d cache-read-pct %.2f", + ns->name, + inuse_disk_bytes, + available_pct, + ns->cache_read_pct + ); + } +} + + +void +log_line_client(as_namespace* ns) +{ + uint64_t n_tsvc_error = ns->n_client_tsvc_error; + uint64_t n_tsvc_timeout = ns->n_client_tsvc_timeout; + uint64_t n_proxy_complete = ns->n_client_proxy_complete; + uint64_t n_proxy_error = ns->n_client_proxy_error; + uint64_t n_proxy_timeout = ns->n_client_proxy_timeout; + uint64_t n_read_success = ns->n_client_read_success; + uint64_t n_read_error = ns->n_client_read_error; + uint64_t n_read_timeout = ns->n_client_read_timeout; + uint64_t n_read_not_found = ns->n_client_read_not_found; + uint64_t n_write_success = ns->n_client_write_success; + uint64_t n_write_error = ns->n_client_write_error; + uint64_t n_write_timeout = ns->n_client_write_timeout; + uint64_t n_delete_success = ns->n_client_delete_success; + uint64_t n_delete_error = ns->n_client_delete_error; + uint64_t n_delete_timeout = ns->n_client_delete_timeout; + uint64_t n_delete_not_found = ns->n_client_delete_not_found; + uint64_t n_udf_complete = ns->n_client_udf_complete; + uint64_t n_udf_error = ns->n_client_udf_error; + uint64_t n_udf_timeout = ns->n_client_udf_timeout; + uint64_t n_lang_read_success = ns->n_client_lang_read_success; + uint64_t n_lang_write_success = ns->n_client_lang_write_success; + uint64_t n_lang_delete_success = ns->n_client_lang_delete_success; + uint64_t n_lang_error = ns->n_client_lang_error; + + if ((n_tsvc_error | n_tsvc_timeout | + n_proxy_complete | n_proxy_error | n_proxy_timeout | + n_read_success | n_read_error | n_read_timeout | n_read_not_found | + n_write_success | n_write_error | n_write_timeout | + n_delete_success | n_delete_error | n_delete_timeout | n_delete_not_found | + n_udf_complete | n_udf_error | n_udf_timeout | + n_lang_read_success | n_lang_write_success | n_lang_delete_success | n_lang_error) == 0) { + return; + 
} + + cf_info(AS_INFO, "{%s} client: tsvc (%lu,%lu) proxy (%lu,%lu,%lu) read (%lu,%lu,%lu,%lu) write (%lu,%lu,%lu) delete (%lu,%lu,%lu,%lu) udf (%lu,%lu,%lu) lang (%lu,%lu,%lu,%lu)", + ns->name, + n_tsvc_error, n_tsvc_timeout, + n_proxy_complete, n_proxy_error, n_proxy_timeout, + n_read_success, n_read_error, n_read_timeout, n_read_not_found, + n_write_success, n_write_error, n_write_timeout, + n_delete_success, n_delete_error, n_delete_timeout, n_delete_not_found, + n_udf_complete, n_udf_error, n_udf_timeout, + n_lang_read_success, n_lang_write_success, n_lang_delete_success, n_lang_error + ); +} + + +void +log_line_xdr_client(as_namespace* ns) +{ + uint64_t n_write_success = ns->n_xdr_write_success; + uint64_t n_write_error = ns->n_xdr_write_error; + uint64_t n_write_timeout = ns->n_xdr_write_timeout; + uint64_t n_delete_success = ns->n_xdr_delete_success; + uint64_t n_delete_error = ns->n_xdr_delete_error; + uint64_t n_delete_timeout = ns->n_xdr_delete_timeout; + uint64_t n_delete_not_found = ns->n_xdr_delete_not_found; + + if ((n_write_success | n_write_error | n_write_timeout | + n_delete_success | n_delete_error | n_delete_timeout | n_delete_not_found) == 0) { + return; + } + + cf_info(AS_INFO, "{%s} xdr-client: write (%lu,%lu,%lu) delete (%lu,%lu,%lu,%lu)", + ns->name, + n_write_success, n_write_error, n_write_timeout, + n_delete_success, n_delete_error, n_delete_timeout, n_delete_not_found + ); +} + + +void +log_line_batch_sub(as_namespace* ns) +{ + uint64_t n_tsvc_error = ns->n_batch_sub_tsvc_error; + uint64_t n_tsvc_timeout = ns->n_batch_sub_tsvc_timeout; + uint64_t n_proxy_complete = ns->n_batch_sub_proxy_complete; + uint64_t n_proxy_error = ns->n_batch_sub_proxy_error; + uint64_t n_proxy_timeout = ns->n_batch_sub_proxy_timeout; + uint64_t n_read_success = ns->n_batch_sub_read_success; + uint64_t n_read_error = ns->n_batch_sub_read_error; + uint64_t n_read_timeout = ns->n_batch_sub_read_timeout; + uint64_t n_read_not_found = 
ns->n_batch_sub_read_not_found; + + if ((n_tsvc_error | n_tsvc_timeout | + n_proxy_complete | n_proxy_error | n_proxy_timeout | + n_read_success | n_read_error | n_read_timeout | n_read_not_found) == 0) { + return; + } + + cf_info(AS_INFO, "{%s} batch-sub: tsvc (%lu,%lu) proxy (%lu,%lu,%lu) read (%lu,%lu,%lu,%lu)", + ns->name, + n_tsvc_error, n_tsvc_timeout, + n_proxy_complete, n_proxy_error, n_proxy_timeout, + n_read_success, n_read_error, n_read_timeout, n_read_not_found + ); +} + + +void +log_line_scan(as_namespace* ns) +{ + uint64_t n_basic_complete = ns->n_scan_basic_complete; + uint64_t n_basic_error = ns->n_scan_basic_error; + uint64_t n_basic_abort = ns->n_scan_basic_abort; + uint64_t n_aggr_complete = ns->n_scan_aggr_complete; + uint64_t n_aggr_error = ns->n_scan_aggr_error; + uint64_t n_aggr_abort = ns->n_scan_aggr_abort; + uint64_t n_udf_bg_complete = ns->n_scan_udf_bg_complete; + uint64_t n_udf_bg_error = ns->n_scan_udf_bg_error; + uint64_t n_udf_bg_abort = ns->n_scan_udf_bg_abort; + + if ((n_basic_complete | n_basic_error | n_basic_abort | + n_aggr_complete | n_aggr_error | n_aggr_abort | + n_udf_bg_complete | n_udf_bg_error | n_udf_bg_abort) == 0) { + return; + } + + cf_info(AS_INFO, "{%s} scan: basic (%lu,%lu,%lu) aggr (%lu,%lu,%lu) udf-bg (%lu,%lu,%lu)", + ns->name, + n_basic_complete, n_basic_error, n_basic_abort, + n_aggr_complete, n_aggr_error, n_aggr_abort, + n_udf_bg_complete, n_udf_bg_error, n_udf_bg_abort + ); +} + + +void +log_line_query(as_namespace* ns) +{ + uint64_t n_basic_success = ns->n_lookup_success; + uint64_t n_basic_failure = ns->n_lookup_errs + ns->n_lookup_abort; + uint64_t n_aggr_success = ns->n_agg_success; + uint64_t n_aggr_failure = ns->n_agg_errs + ns->n_agg_abort; + uint64_t n_udf_bg_success = ns->n_query_udf_bg_success; + uint64_t n_udf_bg_failure = ns->n_query_udf_bg_failure; + + if ((n_basic_success | n_basic_failure | + n_aggr_success | n_aggr_failure | + n_udf_bg_success | n_udf_bg_failure) == 0) { + return; + } + + 
cf_info(AS_INFO, "{%s} query: basic (%lu,%lu) aggr (%lu,%lu) udf-bg (%lu,%lu)", + ns->name, + n_basic_success, n_basic_failure, + n_aggr_success, n_aggr_failure, + n_udf_bg_success, n_udf_bg_failure + ); +} + + +void +log_line_udf_sub(as_namespace* ns) +{ + uint64_t n_tsvc_error = ns->n_udf_sub_tsvc_error; + uint64_t n_tsvc_timeout = ns->n_udf_sub_tsvc_timeout; + uint64_t n_udf_complete = ns->n_udf_sub_udf_complete; + uint64_t n_udf_error = ns->n_udf_sub_udf_error; + uint64_t n_udf_timeout = ns->n_udf_sub_udf_timeout; + uint64_t n_lang_read_success = ns->n_udf_sub_lang_read_success; + uint64_t n_lang_write_success = ns->n_udf_sub_lang_write_success; + uint64_t n_lang_delete_success = ns->n_udf_sub_lang_delete_success; + uint64_t n_lang_error = ns->n_udf_sub_lang_error; + + if ((n_tsvc_error | n_tsvc_timeout | + n_udf_complete | n_udf_error | n_udf_timeout | + n_lang_read_success | n_lang_write_success | n_lang_delete_success | n_lang_error) == 0) { + return; + } + + cf_info(AS_INFO, "{%s} udf-sub: tsvc (%lu,%lu) udf (%lu,%lu,%lu) lang (%lu,%lu,%lu,%lu)", + ns->name, + n_tsvc_error, n_tsvc_timeout, + n_udf_complete, n_udf_error, n_udf_timeout, + n_lang_read_success, n_lang_write_success, n_lang_delete_success, n_lang_error + ); +} + + +void +log_line_retransmits(as_namespace* ns) +{ + uint64_t n_migrate_record_retransmits = ns->migrate_record_retransmits; + uint64_t n_client_read_dup_res = ns->n_retransmit_client_read_dup_res; + uint64_t n_client_write_dup_res = ns->n_retransmit_client_write_dup_res; + uint64_t n_client_write_repl_write = ns->n_retransmit_client_write_repl_write; + uint64_t n_client_delete_dup_res = ns->n_retransmit_client_delete_dup_res; + uint64_t n_client_delete_repl_write = ns->n_retransmit_client_delete_repl_write; + uint64_t n_client_udf_dup_res = ns->n_retransmit_client_udf_dup_res; + uint64_t n_client_udf_repl_write = ns->n_retransmit_client_udf_repl_write; + uint64_t n_batch_sub_dup_res = ns->n_retransmit_batch_sub_dup_res; + uint64_t 
n_udf_sub_dup_res = ns->n_retransmit_udf_sub_dup_res; + uint64_t n_udf_sub_repl_write = ns->n_retransmit_udf_sub_repl_write; + + if ((n_migrate_record_retransmits | + n_client_read_dup_res | + n_client_write_dup_res | n_client_write_repl_write | + n_client_delete_dup_res | n_client_delete_repl_write | + n_client_udf_dup_res | n_client_udf_repl_write | + n_batch_sub_dup_res | + n_udf_sub_dup_res | n_udf_sub_repl_write) == 0) { + return; + } + + cf_info(AS_INFO, "{%s} retransmits: migration %lu client-read %lu client-write (%lu,%lu) client-delete (%lu,%lu) client-udf (%lu,%lu) batch-sub %lu udf-sub (%lu,%lu)", + ns->name, + n_migrate_record_retransmits, + n_client_read_dup_res, + n_client_write_dup_res, n_client_write_repl_write, + n_client_delete_dup_res, n_client_delete_repl_write, + n_client_udf_dup_res, n_client_udf_repl_write, + n_batch_sub_dup_res, + n_udf_sub_dup_res, n_udf_sub_repl_write + ); +} + + +void +log_line_re_repl(as_namespace* ns) +{ + uint64_t n_re_repl_success = ns->n_re_repl_success; + uint64_t n_re_repl_error = ns->n_re_repl_error; + uint64_t n_re_repl_timeout = ns->n_re_repl_timeout; + + if ((n_re_repl_success | n_re_repl_error | n_re_repl_timeout) == 0) { + return; + } + + cf_info(AS_INFO, "{%s} re-repl: all-triggers (%lu,%lu,%lu)", + ns->name, + n_re_repl_success, n_re_repl_error, n_re_repl_timeout + ); +} + + +void +log_line_special_errors(as_namespace* ns) +{ + uint64_t n_fail_key_busy = ns->n_fail_key_busy; + uint64_t n_fail_record_too_big = ns->n_fail_record_too_big; + + if ((n_fail_key_busy | + n_fail_record_too_big) == 0) { + return; + } + + cf_info(AS_INFO, "{%s} special-errors: key-busy %lu record-too-big %lu", + ns->name, + n_fail_key_busy, + n_fail_record_too_big + ); +} + + +void +dump_global_histograms() +{ + if (g_stats.batch_index_hist_active) { + histogram_dump(g_stats.batch_index_hist); + } + + if (g_config.info_hist_enabled) { + histogram_dump(g_stats.info_hist); + } + + if (g_config.svc_benchmarks_enabled) { + 
histogram_dump(g_stats.svc_demarshal_hist); + histogram_dump(g_stats.svc_queue_hist); + } + + if (g_config.fabric_benchmarks_enabled) { + histogram_dump(g_stats.fabric_send_init_hists[AS_FABRIC_CHANNEL_BULK]); + histogram_dump(g_stats.fabric_send_fragment_hists[AS_FABRIC_CHANNEL_BULK]); + histogram_dump(g_stats.fabric_recv_fragment_hists[AS_FABRIC_CHANNEL_BULK]); + histogram_dump(g_stats.fabric_recv_cb_hists[AS_FABRIC_CHANNEL_BULK]); + histogram_dump(g_stats.fabric_send_init_hists[AS_FABRIC_CHANNEL_CTRL]); + histogram_dump(g_stats.fabric_send_fragment_hists[AS_FABRIC_CHANNEL_CTRL]); + histogram_dump(g_stats.fabric_recv_fragment_hists[AS_FABRIC_CHANNEL_CTRL]); + histogram_dump(g_stats.fabric_recv_cb_hists[AS_FABRIC_CHANNEL_CTRL]); + histogram_dump(g_stats.fabric_send_init_hists[AS_FABRIC_CHANNEL_META]); + histogram_dump(g_stats.fabric_send_fragment_hists[AS_FABRIC_CHANNEL_META]); + histogram_dump(g_stats.fabric_recv_fragment_hists[AS_FABRIC_CHANNEL_META]); + histogram_dump(g_stats.fabric_recv_cb_hists[AS_FABRIC_CHANNEL_META]); + histogram_dump(g_stats.fabric_send_init_hists[AS_FABRIC_CHANNEL_RW]); + histogram_dump(g_stats.fabric_send_fragment_hists[AS_FABRIC_CHANNEL_RW]); + histogram_dump(g_stats.fabric_recv_fragment_hists[AS_FABRIC_CHANNEL_RW]); + histogram_dump(g_stats.fabric_recv_cb_hists[AS_FABRIC_CHANNEL_RW]); + } + + as_query_histogram_dumpall(); +} + + +void +dump_namespace_histograms(as_namespace* ns) +{ + if (ns->read_hist_active) { + cf_hist_track_dump(ns->read_hist); + } + + if (ns->read_benchmarks_enabled) { + histogram_dump(ns->read_start_hist); + histogram_dump(ns->read_restart_hist); + histogram_dump(ns->read_dup_res_hist); + histogram_dump(ns->read_repl_ping_hist); + histogram_dump(ns->read_local_hist); + histogram_dump(ns->read_response_hist); + } + + if (ns->write_hist_active) { + cf_hist_track_dump(ns->write_hist); + } + + if (ns->write_benchmarks_enabled) { + histogram_dump(ns->write_start_hist); + histogram_dump(ns->write_restart_hist); + 
histogram_dump(ns->write_dup_res_hist); + histogram_dump(ns->write_master_hist); + histogram_dump(ns->write_repl_write_hist); + histogram_dump(ns->write_response_hist); + } + + if (ns->udf_hist_active) { + cf_hist_track_dump(ns->udf_hist); + } + + if (ns->udf_benchmarks_enabled) { + histogram_dump(ns->udf_start_hist); + histogram_dump(ns->udf_restart_hist); + histogram_dump(ns->udf_dup_res_hist); + histogram_dump(ns->udf_master_hist); + histogram_dump(ns->udf_repl_write_hist); + histogram_dump(ns->udf_response_hist); + } + + if (ns->query_hist_active) { + cf_hist_track_dump(ns->query_hist); + } + + if (ns->query_rec_count_hist_active) { + histogram_dump(ns->query_rec_count_hist); + } + + if (ns->proxy_hist_enabled) { + histogram_dump(ns->proxy_hist); + } + + if (ns->batch_sub_benchmarks_enabled) { + histogram_dump(ns->batch_sub_start_hist); + histogram_dump(ns->batch_sub_restart_hist); + histogram_dump(ns->batch_sub_dup_res_hist); + histogram_dump(ns->batch_sub_repl_ping_hist); + histogram_dump(ns->batch_sub_read_local_hist); + histogram_dump(ns->batch_sub_response_hist); + } + + if (ns->udf_sub_benchmarks_enabled) { + histogram_dump(ns->udf_sub_start_hist); + histogram_dump(ns->udf_sub_restart_hist); + histogram_dump(ns->udf_sub_dup_res_hist); + histogram_dump(ns->udf_sub_master_hist); + histogram_dump(ns->udf_sub_repl_write_hist); + histogram_dump(ns->udf_sub_response_hist); + } + + if (ns->re_repl_hist_active) { + histogram_dump(ns->re_repl_hist); + } + + if (ns->storage_benchmarks_enabled) { + as_storage_ticker_stats(ns); + } + + as_sindex_histogram_dumpall(ns); +} diff --git a/as/src/base/transaction.c b/as/src/base/transaction.c new file mode 100644 index 00000000..b1f6d294 --- /dev/null +++ b/as/src/base/transaction.c @@ -0,0 +1,480 @@ +/* + * transaction.c + * + * Copyright (C) 2008-2015 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. 
+ * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +/* + * Operations on transactions + */ + +#include "base/transaction.h" + +#include +#include +#include +#include + +#include "citrusleaf/alloc.h" +#include "citrusleaf/cf_atomic.h" +#include "citrusleaf/cf_clock.h" +#include "citrusleaf/cf_digest.h" + +#include "fault.h" +#include "socket.h" + +#include "base/batch.h" +#include "base/datamodel.h" +#include "base/proto.h" +#include "base/scan.h" +#include "base/security.h" +#include "base/stats.h" +#include "base/thr_demarshal.h" +#include "fabric/partition.h" +#include "transaction/proxy.h" +#include "transaction/rw_request.h" +#include "transaction/rw_utils.h" +#include "transaction/udf.h" + + +void +as_transaction_init_head(as_transaction *tr, cf_digest *keyd, cl_msg *msgp) +{ + tr->msgp = msgp; + tr->msg_fields = 0; + + tr->origin = 0; + tr->from_flags = 0; + + tr->from.any = NULL; + tr->from_data.any = 0; + + tr->keyd = keyd ? 
*keyd : cf_digest_zero; + + tr->start_time = 0; + tr->benchmark_time = 0; +} + +void +as_transaction_init_body(as_transaction *tr) +{ + AS_PARTITION_RESERVATION_INIT(tr->rsv); + + tr->end_time = 0; + tr->result_code = AS_PROTO_RESULT_OK; + tr->flags = 0; + tr->generation = 0; + tr->void_time = 0; + tr->last_update_time = 0; +} + +void +as_transaction_copy_head(as_transaction *to, const as_transaction *from) +{ + to->msgp = from->msgp; + to->msg_fields = from->msg_fields; + + to->origin = from->origin; + to->from_flags = from->from_flags; + + to->from.any = from->from.any; + to->from_data.any = from->from_data.any; + + to->keyd = from->keyd; + + to->start_time = from->start_time; + to->benchmark_time = from->benchmark_time; +} + +void +as_transaction_init_from_rw(as_transaction *tr, rw_request *rw) +{ + as_transaction_init_head_from_rw(tr, rw); + // Note - we don't clear rw->msgp, destructor will free it. + + as_partition_reservation_copy(&tr->rsv, &rw->rsv); + // Note - destructor will still release the reservation. + + tr->end_time = rw->end_time; + tr->result_code = rw->result_code; + tr->flags = rw->flags; + tr->generation = rw->generation; + tr->void_time = rw->void_time; + tr->last_update_time = rw->last_update_time; +} + +void +as_transaction_init_head_from_rw(as_transaction *tr, rw_request *rw) +{ + tr->msgp = rw->msgp; + tr->msg_fields = rw->msg_fields; + tr->origin = rw->origin; + tr->from_flags = rw->from_flags; + tr->from.any = rw->from.any; + tr->from_data.any = rw->from_data.any; + tr->keyd = rw->keyd; + tr->start_time = rw->start_time; + tr->benchmark_time = rw->benchmark_time; + + rw->from.any = NULL; + // Note - we don't clear rw->msgp, destructor will free it. 
+} + +bool +as_transaction_set_msg_field_flag(as_transaction *tr, uint8_t type) +{ + switch (type) { + case AS_MSG_FIELD_TYPE_NAMESPACE: + tr->msg_fields |= AS_MSG_FIELD_BIT_NAMESPACE; + break; + case AS_MSG_FIELD_TYPE_SET: + tr->msg_fields |= AS_MSG_FIELD_BIT_SET; + break; + case AS_MSG_FIELD_TYPE_KEY: + tr->msg_fields |= AS_MSG_FIELD_BIT_KEY; + break; + case AS_MSG_FIELD_TYPE_DIGEST_RIPE: + tr->msg_fields |= AS_MSG_FIELD_BIT_DIGEST_RIPE; + break; + case AS_MSG_FIELD_TYPE_DIGEST_RIPE_ARRAY: + tr->msg_fields |= AS_MSG_FIELD_BIT_DIGEST_RIPE_ARRAY; + break; + case AS_MSG_FIELD_TYPE_TRID: + tr->msg_fields |= AS_MSG_FIELD_BIT_TRID; + break; + case AS_MSG_FIELD_TYPE_SCAN_OPTIONS: + tr->msg_fields |= AS_MSG_FIELD_BIT_SCAN_OPTIONS; + break; + case AS_MSG_FIELD_TYPE_SOCKET_TIMEOUT: + tr->msg_fields |= AS_MSG_FIELD_BIT_SOCKET_TIMEOUT; + break; + case AS_MSG_FIELD_TYPE_INDEX_NAME: + tr->msg_fields |= AS_MSG_FIELD_BIT_INDEX_NAME; + break; + case AS_MSG_FIELD_TYPE_INDEX_RANGE: + tr->msg_fields |= AS_MSG_FIELD_BIT_INDEX_RANGE; + break; + case AS_MSG_FIELD_TYPE_INDEX_TYPE: + tr->msg_fields |= AS_MSG_FIELD_BIT_INDEX_TYPE; + break; + case AS_MSG_FIELD_TYPE_UDF_FILENAME: + tr->msg_fields |= AS_MSG_FIELD_BIT_UDF_FILENAME; + break; + case AS_MSG_FIELD_TYPE_UDF_FUNCTION: + tr->msg_fields |= AS_MSG_FIELD_BIT_UDF_FUNCTION; + break; + case AS_MSG_FIELD_TYPE_UDF_ARGLIST: + tr->msg_fields |= AS_MSG_FIELD_BIT_UDF_ARGLIST; + break; + case AS_MSG_FIELD_TYPE_UDF_OP: + tr->msg_fields |= AS_MSG_FIELD_BIT_UDF_OP; + break; + case AS_MSG_FIELD_TYPE_QUERY_BINLIST: + tr->msg_fields |= AS_MSG_FIELD_BIT_QUERY_BINLIST; + break; + case AS_MSG_FIELD_TYPE_BATCH: // shouldn't get here - batch parent handles this + tr->msg_fields |= AS_MSG_FIELD_BIT_BATCH; + break; + case AS_MSG_FIELD_TYPE_BATCH_WITH_SET: // shouldn't get here - batch parent handles this + tr->msg_fields |= AS_MSG_FIELD_BIT_BATCH_WITH_SET; + break; + case AS_MSG_FIELD_TYPE_PREDEXP: + tr->msg_fields |= AS_MSG_FIELD_BIT_PREDEXP; + break; + 
default: + return false; + } + + return true; +} + +bool +as_transaction_prepare(as_transaction *tr, bool swap) +{ + uint64_t size = tr->msgp->proto.sz; + + if (size < sizeof(as_msg)) { + cf_warning(AS_PROTO, "proto body size %lu smaller than as_msg", size); + return false; + } + + // The proto data is not smaller than an as_msg - safe to swap header. + as_msg *m = &tr->msgp->msg; + + if (swap) { + as_msg_swap_header(m); + } + + uint8_t* p_end = (uint8_t*)m + size; + uint8_t* p_read = m->data; + + // Parse and swap fields first. + for (uint16_t n = 0; n < m->n_fields; n++) { + if (p_read + sizeof(as_msg_field) > p_end) { + cf_warning(AS_PROTO, "incomplete as_msg_field"); + return false; + } + + as_msg_field* p_field = (as_msg_field*)p_read; + + if (swap) { + as_msg_swap_field(p_field); + } + + p_read = as_msg_field_skip(p_field); + + if (! p_read) { + cf_warning(AS_PROTO, "bad as_msg_field"); + return false; + } + + if (p_read > p_end) { + cf_warning(AS_PROTO, "incomplete as_msg_field value"); + return false; + } + + // Store which message fields are present - prevents lots of re-parsing. + if (! as_transaction_set_msg_field_flag(tr, p_field->type)) { + cf_debug(AS_PROTO, "skipping as_msg_field type %u", p_field->type); + } + } + + // Parse and swap bin-ops, if any. + for (uint16_t n = 0; n < m->n_ops; n++) { + if (p_read + sizeof(as_msg_op) > p_end) { + cf_warning(AS_PROTO, "incomplete as_msg_op"); + return false; + } + + as_msg_op* op = (as_msg_op*)p_read; + + if (swap) { + as_msg_swap_op(op); + } + + p_read = as_msg_op_skip(op); + + if (! p_read) { + cf_warning(AS_PROTO, "bad as_msg_op"); + return false; + } + + if (p_read > p_end) { + cf_warning(AS_PROTO, "incomplete as_msg_op data"); + return false; + } + } + + if (p_read != p_end) { + cf_warning(AS_PROTO, "extra bytes follow fields and bin-ops"); + return false; + } + + return true; +} + +// Initialize an internal UDF transaction (for a UDF scan/query). 
Allocates a +// message with namespace and digest - no set for now, since these transactions +// won't get security checked, and they can't create a record. +void +as_transaction_init_iudf(as_transaction *tr, as_namespace *ns, cf_digest *keyd, + iudf_origin* iudf_orig, bool is_durable_delete) +{ + uint8_t info2 = AS_MSG_INFO2_WRITE | + (is_durable_delete ? AS_MSG_INFO2_DURABLE_DELETE : 0); + + cl_msg *msgp = as_msg_create_internal(ns->name, keyd, 0, info2, 0); + + as_transaction_init_head(tr, NULL, msgp); + + as_transaction_set_msg_field_flag(tr, AS_MSG_FIELD_TYPE_NAMESPACE); + as_transaction_set_msg_field_flag(tr, AS_MSG_FIELD_TYPE_DIGEST_RIPE); + + tr->origin = FROM_IUDF; + tr->from.iudf_orig = iudf_orig; + + // Do this last, to exclude the setup time in this function. + tr->start_time = cf_getns(); +} + +void +as_transaction_demarshal_error(as_transaction* tr, uint32_t error_code) +{ + as_msg_send_reply(tr->from.proto_fd_h, error_code, 0, 0, NULL, NULL, 0, NULL, 0); + tr->from.proto_fd_h = NULL; + + cf_free(tr->msgp); + tr->msgp = NULL; + + cf_atomic64_incr(&g_stats.n_demarshal_error); +} + +#define UPDATE_ERROR_STATS(name) \ + if (ns) { \ + if (error_code == AS_PROTO_RESULT_FAIL_TIMEOUT) { \ + cf_atomic64_incr(&ns->n_##name##_tsvc_timeout); \ + } \ + else { \ + cf_atomic64_incr(&ns->n_##name##_tsvc_error); \ + } \ + } \ + else { \ + cf_atomic64_incr(&g_stats.n_tsvc_##name##_error); \ + } + +void +as_transaction_error(as_transaction* tr, as_namespace* ns, uint32_t error_code) +{ + if (error_code == 0) { + cf_warning(AS_PROTO, "converting error code 0 to 1 (unknown)"); + error_code = AS_PROTO_RESULT_FAIL_UNKNOWN; + } + + // The 'from' checks below are unnecessary, only paranoia. 
+ switch (tr->origin) { + case FROM_CLIENT: + if (tr->from.proto_fd_h) { + as_msg_send_reply(tr->from.proto_fd_h, error_code, 0, 0, NULL, NULL, 0, NULL, as_transaction_trid(tr)); + tr->from.proto_fd_h = NULL; // pattern, not needed + } + UPDATE_ERROR_STATS(client); + break; + case FROM_PROXY: + if (tr->from.proxy_node != 0) { + as_proxy_send_response(tr->from.proxy_node, tr->from_data.proxy_tid, error_code, 0, 0, NULL, NULL, 0, NULL, as_transaction_trid(tr)); + tr->from.proxy_node = 0; // pattern, not needed + } + break; + case FROM_BATCH: + if (tr->from.batch_shared) { + as_batch_add_error(tr->from.batch_shared, tr->from_data.batch_index, error_code); + tr->from.batch_shared = NULL; // pattern, not needed + tr->msgp = NULL; // pattern, not needed + } + UPDATE_ERROR_STATS(batch_sub); + break; + case FROM_IUDF: + if (tr->from.iudf_orig) { + tr->from.iudf_orig->cb(tr->from.iudf_orig->udata, error_code); + tr->from.iudf_orig = NULL; // pattern, not needed + } + UPDATE_ERROR_STATS(udf_sub); + break; + case FROM_NSUP: + break; + case FROM_RE_REPL: + if (tr->from.re_repl_orig_cb) { + tr->result_code = error_code; + tr->from.re_repl_orig_cb(tr); + tr->from.re_repl_orig_cb = NULL; // pattern, not needed + } + // Re-replications take care of stats independently. + break; + default: + cf_crash(AS_PROTO, "unexpected transaction origin %u", tr->origin); + break; + } +} + +// TODO - temporary, until scan & query can do their own synchronous failure +// responses. (Here we forfeit namespace info and add to global-scope error.) 
+void +as_multi_rec_transaction_error(as_transaction* tr, uint32_t error_code) +{ + if (error_code == 0) { + cf_warning(AS_PROTO, "converting error code 0 to 1 (unknown)"); + error_code = AS_PROTO_RESULT_FAIL_UNKNOWN; + } + + switch (tr->origin) { + case FROM_CLIENT: + if (tr->from.proto_fd_h) { + as_msg_send_reply(tr->from.proto_fd_h, error_code, 0, 0, NULL, NULL, 0, NULL, as_transaction_trid(tr)); + tr->from.proto_fd_h = NULL; // pattern, not needed + } + cf_atomic64_incr(&g_stats.n_tsvc_client_error); + break; + default: + cf_crash(AS_PROTO, "unexpected transaction origin %u", tr->origin); + break; + } +} + +// Helper to release transaction file handles. +void +as_release_file_handle(as_file_handle *proto_fd_h) +{ + int rc = cf_rc_release(proto_fd_h); + + if (rc > 0) { + return; + } + else if (rc < 0) { + cf_warning(AS_PROTO, "release file handle: negative ref-count %d", rc); + return; + } + + cf_socket_close(&proto_fd_h->sock); + cf_socket_term(&proto_fd_h->sock); + proto_fd_h->fh_info &= ~FH_INFO_DONOT_REAP; + + if (proto_fd_h->proto) { + as_proto *p = proto_fd_h->proto; + + if ((p->version != PROTO_VERSION) || (p->type >= PROTO_TYPE_MAX)) { + cf_warning(AS_PROTO, "release file handle: bad proto buf, corruption"); + } + else { + cf_free(proto_fd_h->proto); + proto_fd_h->proto = NULL; + } + } + + if (proto_fd_h->security_filter) { + as_security_filter_destroy(proto_fd_h->security_filter); + proto_fd_h->security_filter = NULL; + } + + cf_rc_free(proto_fd_h); + cf_atomic64_incr(&g_stats.proto_connections_closed); +} + +void +as_end_of_transaction(as_file_handle *proto_fd_h, bool force_close) +{ + thr_demarshal_rearm(proto_fd_h); + + if (force_close) { + cf_socket_shutdown(&proto_fd_h->sock); + } + + as_release_file_handle(proto_fd_h); +} + +void +as_end_of_transaction_ok(as_file_handle *proto_fd_h) +{ + as_end_of_transaction(proto_fd_h, false); +} + +void +as_end_of_transaction_force_close(as_file_handle *proto_fd_h) +{ + as_end_of_transaction(proto_fd_h, true); 
+} diff --git a/as/src/base/truncate.c b/as/src/base/truncate.c new file mode 100644 index 00000000..10c00044 --- /dev/null +++ b/as/src/base/truncate.c @@ -0,0 +1,621 @@ +/* + * truncate.c + * + * Copyright (C) 2017 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +//========================================================== +// Includes. +// + +#include "base/truncate.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "citrusleaf/cf_atomic.h" +#include "citrusleaf/cf_clock.h" + +#include "fault.h" +#include "shash.h" +#include "vmapx.h" + +#include "base/datamodel.h" +#include "base/index.h" +#include "base/system_metadata.h" +#include "transaction/rw_utils.h" + + +//========================================================== +// Typedefs & constants. +// + +typedef struct truncate_reduce_cb_info_s { + as_namespace* ns; + as_index_tree* tree; + int64_t n_deleted; +} truncate_reduce_cb_info; + +static const uint32_t NUM_TRUNCATE_THREADS = 4; + +// Truncate system metadata module name. +const char AS_TRUNCATE_MODULE[] = "truncate"; +#define TRUNCATE_MODULE ((char*)AS_TRUNCATE_MODULE) +// TODO - change smd API to take const char* module names? 
+
+// Includes 1 for delimiter and 1 for null-terminator.
+#define TRUNCATE_KEY_SIZE (AS_ID_NAMESPACE_SZ + AS_SET_NAME_MAX_SIZE)
+
+// System metadata key format token.
+#define TOK_DELIMITER ('|')
+
+// Detect excessive clock skew for warning purposes only.
+static const uint64_t WARN_CLOCK_SKEW_MS = 1000UL * 5;
+
+
+//==========================================================
+// Globals.
+//
+
+static cf_shash* g_truncate_filter_hash = NULL;
+static bool g_truncate_smd_loaded = false;
+
+
+//==========================================================
+// Forward declarations.
+//
+
+bool filter_hash_put(const as_smd_item_t* item);
+void filter_hash_delete(const as_smd_item_t* item);
+
+bool truncate_smd_conflict_cb(char* module, as_smd_item_t* existing_item, as_smd_item_t* new_item, void* udata);
+int truncate_smd_accept_cb(char* module, as_smd_item_list_t* items, void* udata, uint32_t accept_opt);
+int truncate_smd_can_accept_cb(char* module, as_smd_item_t *item, void *udata);
+
+void truncate_action_do(as_namespace* ns, const char* set_name, uint64_t lut);
+void truncate_action_undo(as_namespace* ns, const char* set_name);
+void truncate_all(as_namespace* ns);
+void* run_truncate(void* arg);
+void truncate_finish(as_namespace* ns);
+void truncate_reduce_cb(as_index_ref* r_ref, void* udata);
+
+
+//==========================================================
+// Inlines & macros.
+//
+
+// Parse an SMD item's value (a decimal string) as a last-update-time.
+static inline uint64_t
+lut_from_smd(const as_smd_item_t* item)
+{
+	return strtoul(item->value, NULL, 10);
+}
+
+// Compose the SMD key "ns-name|set-name" (or just "ns-name" if set_name is
+// NULL) into smd_key, which must have room for TRUNCATE_KEY_SIZE bytes.
+// Returns false, writing nothing, if the composed key would not fit.
+static inline bool
+smd_key_build(char* smd_key, const char* ns_name, const char* set_name)
+{
+	size_t ns_len = strlen(ns_name);
+	size_t set_len = set_name ? strlen(set_name) : 0;
+
+	// Need room for ns-name, optional delimiter + set-name, null-terminator.
+	if (ns_len + (set_name ? 1 + set_len : 0) >= TRUNCATE_KEY_SIZE) {
+		return false;
+	}
+
+	strcpy(smd_key, ns_name);
+
+	if (set_name) {
+		char* p_write = smd_key + ns_len;
+
+		*p_write++ = TOK_DELIMITER;
+		strcpy(p_write, set_name);
+	}
+
+	return true;
+}
+
+
+//==========================================================
+// Public API.
+//
+
+// Per-namespace initialization - truncate state starts idle.
+void
+as_truncate_init(as_namespace* ns)
+{
+	truncate_startup_hash_init(ns);
+
+	ns->truncate.state = TRUNCATE_IDLE;
+	pthread_mutex_init(&ns->truncate.state_lock, 0);
+}
+
+
+// Create the filter hash, register the truncate SMD module, and block until
+// truncate_smd_accept_cb() has seen AS_SMD_ACCEPT_OPT_CREATE and set the
+// loaded flag.
+void
+as_truncate_init_smd()
+{
+	// Create the global filter shash used on the SMD principal.
+	g_truncate_filter_hash = cf_shash_create(cf_shash_fn_zstr,
+			TRUNCATE_KEY_SIZE, sizeof(truncate_hval),
+			1024 * g_config.n_namespaces, 0);
+
+	// Register the system metadata custom callbacks.
+	if (as_smd_create_module(TRUNCATE_MODULE,
+			NULL, NULL,
+			truncate_smd_conflict_cb, NULL,
+			truncate_smd_accept_cb, NULL,
+			truncate_smd_can_accept_cb, NULL) != 0) {
+		cf_crash(AS_TRUNCATE, "truncate init - failed smd create module");
+	}
+
+	// Poll at 1 ms intervals until the accept callback flags SMD as loaded.
+	while (! g_truncate_smd_loaded) {
+		usleep(1000);
+	}
+}
+
+
+// Issue a truncate command for a namespace (set_name NULL) or a set.
+//
+// lut_str, if not NULL, is the threshold last-update-time as a string of UTC
+// nanoseconds; if NULL, the threshold is "now". Returns false (nothing is
+// broadcast) if the names don't fit an SMD key, or if the specified lut
+// converts to 0 or lies in the future.
+//
+// SMD key is "ns-name|set-name" or "ns-name".
+// SMD value is last-update-time as decimal string.
+bool
+as_truncate_cmd(const char* ns_name, const char* set_name, const char* lut_str)
+{
+	char smd_key[TRUNCATE_KEY_SIZE];
+
+	// Never let over-long caller-supplied names overrun the key buffer.
+	if (! smd_key_build(smd_key, ns_name, set_name)) {
+		cf_warning(AS_TRUNCATE, "{%s|%s} truncate command name too long",
+				ns_name, set_name ? set_name : "");
+		return false;
+	}
+
+	uint64_t now = cf_clepoch_milliseconds();
+	uint64_t lut;
+
+	if (lut_str) {
+		uint64_t utc_nanosec = strtoul(lut_str, NULL, 0);
+
+		// Last update time as human-readable UTC seconds.
+		// TODO - make generic utility?
+		char utc_sec[64] = { 0 };
+		time_t utc_time = utc_nanosec / 1000000000;
+		struct tm utc_tm;
+
+		if (cf_fault_is_using_local_time()) {
+			localtime_r(&utc_time, &utc_tm);
+			strftime(utc_sec, sizeof(utc_sec), "%b %d %Y %T GMT%z", &utc_tm);
+		}
+		else {
+			gmtime_r(&utc_time, &utc_tm);
+			strftime(utc_sec, sizeof(utc_sec), "%b %d %Y %T %Z", &utc_tm);
+		}
+
+		lut = cf_clepoch_ms_from_utc_ns(utc_nanosec);
+
+		if (lut == 0) {
+			cf_warning(AS_TRUNCATE, "command lut %s (%s) would truncate to 0",
+					lut_str, utc_sec);
+			return false;
+		}
+
+		if (lut > now) {
+			cf_warning(AS_TRUNCATE, "command lut %s (%s) is in the future",
+					lut_str, utc_sec);
+			return false;
+		}
+
+		cf_info(AS_TRUNCATE, "{%s} got command to truncate to %s (%lu)",
+				smd_key, utc_sec, lut);
+	}
+	else {
+		// Use a last-update-time threshold of now.
+		lut = now;
+
+		cf_info(AS_TRUNCATE, "{%s} got command to truncate to now (%lu)",
+				smd_key, lut);
+	}
+
+	char smd_value[13 + 1]; // 0xFFffffFFFF (40 bits) is 13 decimal characters
+
+	// Bounded write - the buffer is sized for a 40-bit value only.
+	snprintf(smd_value, sizeof(smd_value), "%lu", lut);
+
+	// Broadcast the truncate command to all nodes (including this one).
+	as_smd_set_metadata(TRUNCATE_MODULE, smd_key, smd_value);
+
+	return true;
+}
+
+
+// Issue a truncate-undo command for a namespace (set_name NULL) or a set.
+// Logs a warning and does nothing if the names don't fit an SMD key.
+//
+// SMD key is "ns-name|set-name" or "ns-name".
+void
+as_truncate_undo_cmd(const char* ns_name, const char* set_name)
+{
+	char smd_key[TRUNCATE_KEY_SIZE];
+
+	// Never let over-long caller-supplied names overrun the key buffer.
+	if (! smd_key_build(smd_key, ns_name, set_name)) {
+		cf_warning(AS_TRUNCATE, "{%s|%s} truncate-undo command name too long",
+				ns_name, set_name ? set_name : "");
+		return;
+	}
+
+	cf_info(AS_TRUNCATE, "{%s} got command to undo truncate", smd_key);
+
+	// Broadcast the truncate-undo command to all nodes (including this one).
+	as_smd_delete_metadata(TRUNCATE_MODULE, smd_key);
+}
+
+
+// Would a record written right now fall before the namespace's truncate
+// threshold, or that of the set with the given id (if that set exists)?
+bool
+as_truncate_now_is_truncated(struct as_namespace_s* ns, uint16_t set_id)
+{
+	uint64_t now = cf_clepoch_milliseconds();
+
+	if (now < ns->truncate.lut) {
+		return true;
+	}
+
+	as_set* p_set = as_namespace_get_set_by_id(ns, set_id);
+
+	return p_set ? now < p_set->truncate_lut : false;
+}
+
+
+// Is the record's last-update-time before the namespace's truncate threshold,
+// or before that of the record's set (if it has one)?
+bool
+as_truncate_record_is_truncated(const as_record* r, as_namespace* ns)
+{
+	if (r->last_update_time < ns->truncate.lut) {
+		return true;
+	}
+
+	as_set* p_set = as_namespace_get_record_set(ns, r);
+
+	return p_set ? r->last_update_time < p_set->truncate_lut : false;
+}
+
+
+//==========================================================
+// Local helpers - generic.
+//
+
+// Record item's lut in the filter hash if the key is new or the lut is newer
+// than the stored one. Returns true if the hash was updated, false if the lut
+// was not newer or the key is too long for the hash.
+bool
+filter_hash_put(const as_smd_item_t* item)
+{
+	char hkey[TRUNCATE_KEY_SIZE] = { 0 }; // pad for consistent shash key
+
+	// The key comes from SMD, not from this module - never overrun hkey.
+	if (strlen(item->key) >= sizeof(hkey)) {
+		cf_warning(AS_TRUNCATE, "filter-hash put - smd key too long");
+		return false;
+	}
+
+	strcpy(hkey, item->key);
+
+	truncate_hval new_hval = { .lut = lut_from_smd(item) };
+	truncate_hval ex_hval;
+
+	if (cf_shash_get(g_truncate_filter_hash, hkey, &ex_hval) != CF_SHASH_OK ||
+			new_hval.lut > ex_hval.lut) {
+		cf_shash_put(g_truncate_filter_hash, hkey, &new_hval);
+
+		return true;
+	}
+
+	// This is normal on principal, from truncate_smd_accept_cb().
+	cf_detail(AS_TRUNCATE, "{%s} truncate lut %lu <= filter lut %lu", item->key,
+			(uint64_t)new_hval.lut, (uint64_t)ex_hval.lut);
+
+	return false;
+}
+
+
+// Remove item's key from the filter hash, warning if it can't be removed.
+void
+filter_hash_delete(const as_smd_item_t* item)
+{
+	char hkey[TRUNCATE_KEY_SIZE] = { 0 }; // pad for consistent shash key
+
+	// The key comes from SMD, not from this module - never overrun hkey.
+	if (strlen(item->key) >= sizeof(hkey)) {
+		cf_warning(AS_TRUNCATE, "filter-hash delete - smd key too long");
+		return;
+	}
+
+	strcpy(hkey, item->key);
+
+	if (cf_shash_delete(g_truncate_filter_hash, hkey) != CF_SHASH_OK) {
+		cf_warning(AS_TRUNCATE, "{%s} failed filter-hash delete", item->key);
+	}
+}
+
+
+//==========================================================
+// Local helpers - SMD callbacks.
+//
+
+// Returns true if the existing item's lut is at least the new item's lut.
+bool
+truncate_smd_conflict_cb(char* module, as_smd_item_t* existing_item,
+		as_smd_item_t* new_item, void* udata)
+{
+	return lut_from_smd(existing_item) >= lut_from_smd(new_item);
+}
+
+
+// Apply a list of SMD items: "set" items become truncate actions, "delete"
+// items become truncate-undo actions. An AS_SMD_ACCEPT_OPT_CREATE call only
+// flags SMD as loaded. Always returns 0.
+int
+truncate_smd_accept_cb(char* module, as_smd_item_list_t* items, void* udata,
+		uint32_t accept_opt)
+{
+	if ((accept_opt & AS_SMD_ACCEPT_OPT_CREATE) != 0) {
+		g_truncate_smd_loaded = true;
+		return 0;
+	}
+
+	bool is_merge = (accept_opt & AS_SMD_ACCEPT_OPT_MERGE) != 0;
+
+	for (int i = 0; i < (int)items->num_items; i++) {
+		as_smd_item_t* item = items->item[i];
+
+		// A key that can't fit "ns-name|set-name" can't name anything valid.
+		if (strlen(item->key) >= TRUNCATE_KEY_SIZE) {
+			cf_warning(AS_TRUNCATE, "smd accept cb - key too long");
+			continue;
+		}
+
+		if (item->action == AS_SMD_ACTION_SET) {
+			// If we're here via SMD API command (as opposed to via merge), SMD
+			// principal's hash will already have this item - ignore filter
+			// result, let as_set/as_namespace cached value do the filtering.
+			if (! filter_hash_put(item) && is_merge) {
+				continue;
+			}
+		}
+		else if (item->action == AS_SMD_ACTION_DELETE) {
+			filter_hash_delete(item);
+		}
+		else {
+			cf_warning(AS_TRUNCATE, "smd accept cb - unknown action");
+			continue;
+		}
+
+		// Split the key at the delimiter (if any) into ns-name and set-name.
+		const char* ns_name = item->key;
+		const char* tok = strchr(ns_name, TOK_DELIMITER);
+
+		uint32_t ns_len = tok ? (uint32_t)(tok - ns_name) : strlen(ns_name);
+		as_namespace* ns = as_namespace_get_bybuf((uint8_t*)ns_name, ns_len);
+
+		if (! ns) {
+			cf_detail(AS_TRUNCATE, "skipping invalid ns");
+			continue;
+		}
+
+		const char* set_name = tok ? tok + 1 : NULL;
+
+		if (item->action == AS_SMD_ACTION_SET) {
+			uint64_t lut = lut_from_smd(item);
+
+			if (g_truncate_smd_loaded) {
+				truncate_action_do(ns, set_name, lut);
+			}
+			else {
+				truncate_action_startup(ns, set_name, lut);
+			}
+		}
+		else {
+			truncate_action_undo(ns, set_name);
+		}
+	}
+
+	return 0;
+}
+
+
+// Decide whether an SMD item may be accepted. Returns 0 to accept, -1 to
+// reject. "Set" items are accepted only if they advance the filter hash.
+int
+truncate_smd_can_accept_cb(char* module, as_smd_item_t* item, void* udata)
+{
+	// Reject keys that could never be stored in the filter hash.
+	if (strlen(item->key) >= TRUNCATE_KEY_SIZE) {
+		cf_warning(AS_TRUNCATE, "smd can accept cb - key too long");
+		return -1;
+	}
+
+	if (item->action == AS_SMD_ACTION_SET) {
+		if (filter_hash_put(item)) {
+			return 0;
+		}
+
+		cf_info(AS_TRUNCATE, "{%s} ignoring redundant truncate lut", item->key);
+
+		return -1;
+	}
+	else if (item->action == AS_SMD_ACTION_DELETE) {
+		return 0;
+	}
+	else {
+		cf_warning(AS_TRUNCATE, "smd can accept cb - unknown action");
+		return -1;
+	}
+}
+
+
+//==========================================================
+// Local helpers - SMD callbacks' helpers.
+//
+
+// Advance the truncate threshold of a set (or of the namespace if set_name is
+// NULL) to lut, then start a truncate run or flag the running one to restart.
+// Does nothing if the set doesn't exist or lut doesn't advance the threshold.
+void
+truncate_action_do(as_namespace* ns, const char* set_name, uint64_t lut)
+{
+	uint64_t now = cf_clepoch_milliseconds();
+
+	if (lut > now + WARN_CLOCK_SKEW_MS) {
+		cf_warning(AS_TRUNCATE, "lut is %lu ms in the future - clock skew?",
+				lut - now);
+	}
+
+	if (set_name) {
+		as_set* p_set = as_namespace_get_set_by_name(ns, set_name);
+
+		if (! p_set) {
+			cf_info(AS_TRUNCATE, "{%s|%s} truncate for nonexistent set",
+					ns->name, set_name);
+			return;
+		}
+
+		if (lut <= p_set->truncate_lut) {
+			cf_info(AS_TRUNCATE, "{%s|%s} truncate lut %lu <= vmap lut %lu",
+					ns->name, set_name, lut, p_set->truncate_lut);
+			return;
+		}
+
+		cf_info(AS_TRUNCATE, "{%s|%s} truncating to %lu", ns->name, set_name,
+				lut);
+
+		p_set->truncate_lut = lut;
+	}
+	else {
+		if (lut <= ns->truncate.lut) {
+			cf_info(AS_TRUNCATE, "{%s} truncate lut %lu <= ns lut %lu",
+					ns->name, lut, ns->truncate.lut);
+			return;
+		}
+
+		cf_info(AS_TRUNCATE, "{%s} truncating to %lu", ns->name, lut);
+
+		ns->truncate.lut = lut;
+	}
+
+	// Truncate to new last-update-time.
+
+	pthread_mutex_lock(&ns->truncate.state_lock);
+
+	switch (ns->truncate.state) {
+	case TRUNCATE_IDLE:
+		cf_info(AS_TRUNCATE, "{%s} starting truncate", ns->name);
+		truncate_all(ns);
+		break;
+	case TRUNCATE_RUNNING:
+		cf_info(AS_TRUNCATE, "{%s} flagging truncate to restart", ns->name);
+		ns->truncate.state = TRUNCATE_RESTART;
+		break;
+	case TRUNCATE_RESTART:
+		cf_info(AS_TRUNCATE, "{%s} truncate already will restart", ns->name);
+		break;
+	default:
+		cf_crash(AS_TRUNCATE, "bad truncate state %d", ns->truncate.state);
+		break;
+	}
+
+	pthread_mutex_unlock(&ns->truncate.state_lock);
+}
+
+
+// Reset the truncate threshold of a set (or of the namespace if set_name is
+// NULL) to 0. Does nothing if the set doesn't exist.
+void
+truncate_action_undo(as_namespace* ns, const char* set_name)
+{
+	if (set_name) {
+		as_set* p_set = as_namespace_get_set_by_name(ns, set_name);
+
+		if (! p_set) {
+			cf_info(AS_TRUNCATE, "{%s|%s} undo truncate for nonexistent set",
+					ns->name, set_name);
+			return;
+		}
+
+		cf_info(AS_TRUNCATE, "{%s|%s} undoing truncate - was to %lu", ns->name,
+				set_name, p_set->truncate_lut);
+
+		p_set->truncate_lut = 0;
+	}
+	else {
+		cf_info(AS_TRUNCATE, "{%s} undoing truncate - was to %lu", ns->name,
+				ns->truncate.lut);
+
+		ns->truncate.lut = 0;
+	}
+}
+
+
+// Called under truncate lock.
+void
+truncate_all(as_namespace* ns)
+{
+	// TODO - skipping sindex deletion shortcut - can't do that if we want to
+	// keep writing through set truncates. Is this ok?
+
+	ns->truncate.state = TRUNCATE_RUNNING;
+	cf_atomic32_set(&ns->truncate.n_threads_running, NUM_TRUNCATE_THREADS);
+	// Start at -1 so the first increment in run_truncate() yields pid 0.
+	cf_atomic32_set(&ns->truncate.pid, -1);
+
+	cf_atomic64_set(&ns->truncate.n_records_this_run, 0);
+
+	pthread_t thread;
+	pthread_attr_t attrs;
+
+	pthread_attr_init(&attrs);
+	pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED);
+
+	for (uint32_t i = 0; i < NUM_TRUNCATE_THREADS; i++) {
+		if (pthread_create(&thread, &attrs, run_truncate, (void*)ns) != 0) {
+			cf_crash(AS_TRUNCATE, "failed to create truncate thread");
+			// TODO - be forgiving? Is there any point?
+		}
+	}
+
+	// Threads keep their own copy of the attributes - release ours.
+	pthread_attr_destroy(&attrs);
+}
+
+
+// Truncate thread body. Threads share ns->truncate.pid as a work counter -
+// each claims the next partition id until all AS_PARTITIONS are taken.
+void*
+run_truncate(void* arg)
+{
+	as_namespace* ns = (as_namespace*)arg;
+	uint32_t pid;
+
+	while ((pid = (uint32_t)cf_atomic32_incr(&ns->truncate.pid)) <
+			AS_PARTITIONS) {
+		as_partition_reservation rsv;
+		as_partition_reserve(ns, pid, &rsv);
+
+		// Unnamed members (including n_deleted) are zero-initialized.
+		truncate_reduce_cb_info cb_info = { .ns = ns, .tree = rsv.tree };
+
+		as_index_reduce(rsv.tree, truncate_reduce_cb, (void*)&cb_info);
+		as_partition_release(&rsv);
+
+		cf_atomic64_add(&ns->truncate.n_records_this_run, cb_info.n_deleted);
+	}
+
+	truncate_finish(ns);
+
+	return NULL;
+}
+
+
+// Called by every truncate thread as it exits. The last one to finish logs
+// the totals and, under the state lock, either returns the state to idle or
+// starts another run if a restart was flagged meanwhile.
+void
+truncate_finish(as_namespace* ns)
+{
+	if (cf_atomic32_decr(&ns->truncate.n_threads_running) == 0) {
+		pthread_mutex_lock(&ns->truncate.state_lock);
+
+		ns->truncate.n_records += ns->truncate.n_records_this_run;
+
+		cf_info(AS_TRUNCATE, "{%s} truncated records (%lu,%lu)", ns->name,
+				ns->truncate.n_records_this_run, ns->truncate.n_records);
+
+		switch (ns->truncate.state) {
+		case TRUNCATE_RUNNING:
+			cf_info(AS_TRUNCATE, "{%s} done truncate", ns->name);
+			ns->truncate.state = TRUNCATE_IDLE;
+			break;
+		case TRUNCATE_RESTART:
+			cf_info(AS_TRUNCATE, "{%s} restarting truncate", ns->name);
+			truncate_all(ns);
+			break;
+		case TRUNCATE_IDLE:
+		default:
+			cf_crash(AS_TRUNCATE, "bad truncate state %d", ns->truncate.state);
+			break;
+		}
+
+		pthread_mutex_unlock(&ns->truncate.state_lock);
+	}
+}
+
+
+// Index reduce callback - delete the record if its last-update-time is before
+// the namespace's truncate threshold or its set's truncate threshold. Counts
+// deletions in the callback info, and always releases the record reference.
+void
+truncate_reduce_cb(as_index_ref* r_ref, void* udata)
+{
+	as_record* r = r_ref->r;
+	truncate_reduce_cb_info* cb_info = (truncate_reduce_cb_info*)udata;
+	as_namespace* ns = cb_info->ns;
+
+	// Delete records not updated since the namespace's threshold.
+	if (r->last_update_time < ns->truncate.lut) {
+		cb_info->n_deleted++;
+		record_delete_adjust_sindex(r, ns);
+		as_index_delete(cb_info->tree, &r->keyd);
+		as_record_done(r_ref, ns);
+		return;
+	}
+
+	as_set* p_set = as_namespace_get_record_set(ns, r);
+
+	// Delete records not updated since their set's threshold last-update-time.
+	if (p_set && r->last_update_time < p_set->truncate_lut) {
+		cb_info->n_deleted++;
+		record_delete_adjust_sindex(r, ns);
+		as_index_delete(cb_info->tree, &r->keyd);
+	}
+
+	as_record_done(r_ref, ns);
+}
diff --git a/as/src/base/truncate_ce.c b/as/src/base/truncate_ce.c
new file mode 100644
index 00000000..25a21562
--- /dev/null
+++ b/as/src/base/truncate_ce.c
@@ -0,0 +1,62 @@
+/*
+ * truncate_ce.c
+ *
+ * Copyright (C) 2017 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program.  If not, see http://www.gnu.org/licenses/
+ */
+
+//==========================================================
+// Includes.
+//
+
+#include "base/truncate.h"
+
+#include "base/datamodel.h"
+
+
+//==========================================================
+// Public API.
+//
+
+// Intentionally empty in this file.
+// NOTE(review): presumably the enterprise build supplies a real
+// implementation in place of this stub - confirm.
+void
+as_truncate_done_startup(as_namespace* ns)
+{
+}
+
+
+// Intentionally empty in this file.
+// NOTE(review): presumably the enterprise build supplies a real
+// implementation in place of this stub - confirm.
+void
+as_truncate_list_cenotaphs(as_namespace* ns)
+{
+}
+
+
+//==========================================================
+// Private API - for enterprise separation only.
+//
+
+// Intentionally empty in this file - called from as_truncate_init().
+void
+truncate_startup_hash_init(as_namespace* ns)
+{
+}
+
+
+// Intentionally empty in this file - called from truncate_smd_accept_cb() for
+// "set" items that arrive before SMD is flagged as loaded.
+void
+truncate_action_startup(as_namespace* ns, const char* set_name, uint64_t lut)
+{
+}
+
diff --git a/as/src/base/udf_aerospike.c b/as/src/base/udf_aerospike.c
new file mode 100644
index 00000000..d543bea4
--- /dev/null
+++ b/as/src/base/udf_aerospike.c
@@ -0,0 +1,971 @@
+/*
+ * udf_aerospike.c
+ *
+ * Copyright (C) 2012-2016 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program.  If not, see http://www.gnu.org/licenses/
+ */
+
+#include "base/udf_aerospike.h"
+
+// NOTE(review): the header names on the next five lines appear to be missing
+// (possibly lost when this patch was extracted) - confirm against the
+// repository before applying.
+#include
+#include
+#include
+#include
+#include
+
+#include "aerospike/as_aerospike.h"
+#include "aerospike/as_boolean.h"
+#include "aerospike/as_buffer.h"
+#include "aerospike/as_bytes.h"
+#include "aerospike/as_integer.h"
+#include "aerospike/as_msgpack.h"
+#include "aerospike/as_serializer.h"
+#include "aerospike/as_string.h"
+#include "aerospike/as_val.h"
+#include "citrusleaf/cf_clock.h"
+
+#include "fault.h"
+
+#include "base/datamodel.h"
+#include "base/index.h"
+#include "base/secondary_index.h"
+#include "base/transaction.h"
+#include "base/truncate.h"
+#include "base/udf_record.h"
+#include "base/xdr_serverside.h"
+#include "storage/storage.h"
+#include "transaction/rw_utils.h"
+#include "transaction/udf.h"
+
+
+// Forward declaration - used by udf_aerospike_rec_create() before definition.
+static int udf_aerospike_rec_remove(const as_aerospike *, const as_rec *);
+/*
+ * Internal Function: udf_aerospike_delbin
+ *
+ * Parameters:
+ * 		r 		- udf_record to be manipulated
+ * 		bname 	- name of the bin to be deleted
+ *
+ * Return value:
+ * 		0  on success
+ * 	   -1  on failure
+ *
+ * Description:
+ * 		The function deletes the bin with the name
+ * 		passed in as parameter. The as_bin_destroy function
+ * 		which is called here, only frees the data and
+ * 		the bin is marked as not in use. The bin can then be reused later.
+ *
+ * 		Synchronization : object lock acquired by the transaction thread executing UDF.
+ * 		Partition reservation takes place just before the transaction starts executing
+ * 		( look for as_partition_reserve_udf in thr_tsvc.c )
+ *
+ * 		Callers:
+ * 		udf_aerospike__apply_update_atomic
+ * 		In this function, if it fails at the time of update, the record is set
+ * 		to rollback all the updates till this point. The case where it fails in
+ * 		rollback is not handled.
+ *
+ * 		Side Notes:
+ * 		i.	write_to_device will be set to true on a successful bin destroy.
+ * 		If all the updates from udf_aerospike__apply_update_atomic (including this) are
+ * 		successful, the record will be written to disk and reopened so that the rest of
+ * 		sets of updates can be applied.
+ *
+ * 		ii.	If delete from sindex fails, we do not handle it.
+ */
+static int
+udf_aerospike_delbin(udf_record * urecord, const char * bname)
+{
+	as_storage_rd *rd = urecord->rd;
+	as_namespace *ns = rd->ns;
+
+	// Reject a missing name. Single-bin namespaces require the empty name,
+	// all others require a non-empty one.
+	if (bname == NULL) {
+		cf_warning(AS_UDF, "udf_aerospike_delbin: Invalid Parameters: [Invalid bin name supplied]... Fail");
+		return -1;
+	}
+
+	bool name_is_empty = bname[0] == 0;
+
+	if (ns->single_bin ? ! name_is_empty : name_is_empty) {
+		cf_warning(AS_UDF, "udf_aerospike_delbin: Invalid Parameters: [Invalid bin name supplied]... Fail");
+		return -1;
+	}
+
+	// The name must fit a bin id before we try to look it up.
+	if (strlen(bname) >= AS_ID_BIN_SZ) {
+		// Can't read bin if name too large.
+		cf_warning(AS_UDF, "udf_aerospike_delbin: Invalid Parameters [bin name(%s) too big]... Fail", bname);
+		return -1;
+	}
+
+	// Nothing to delete if the record has no such bin.
+	as_bin * bin = as_bin_get(rd, bname);
+
+	if (bin == NULL) {
+		cf_debug(AS_UDF, "udf_aerospike_delbin: Invalid Operation [Bin name(%s) not found of delete]... Fail", bname);
+		return -1;
+	}
+
+	const char * set_name = as_index_get_set_name(rd->r, ns);
+
+	bool has_sindex = record_has_sindex(rd->r, ns);
+	SINDEX_BINS_SETUP(sbins, ns->sindex_cnt);
+	as_sindex * si_arr[ns->sindex_cnt];
+	int n_si = 0;
+	int n_sbins = 0;
+
+	// Collect the secondary index entries this bin's removal invalidates.
+	if (has_sindex) {
+		n_si = as_sindex_arr_lookup_by_set_binid_lockfree(ns, set_name, bin->id, &si_arr[0]);
+		n_sbins = as_sindex_sbins_from_bin(ns, set_name, bin, sbins, AS_SINDEX_OP_DELETE);
+	}
+
+	int32_t bin_ix = as_bin_get_index(rd, bname);
+
+	if (bin_ix == -1) {
+		cf_warning(AS_UDF, "udf_aerospike_delbin: Internal Error [Deleting non-existing bin %s]... Fail", bname);
+	}
+	else {
+		// Update the secondary indexes first, then destroy the bin itself.
+		if (has_sindex && n_sbins > 0) {
+			urecord->tr->flags |= AS_TRANSACTION_FLAG_SINDEX_TOUCHED;
+			as_sindex_update_by_sbin(ns, as_index_get_set_name(rd->r, ns), sbins, n_sbins, &rd->r->keyd);
+		}
+
+		as_bin_destroy(rd, bin_ix);
+	}
+
+	if (has_sindex) {
+		as_sindex_sbin_freeall(sbins, n_sbins);
+		as_sindex_release_arr(si_arr, n_si);
+	}
+
+	return 0;
+}
+/*
+ * Internal function: udf__aerospike_get_particle_buf
+ *
+ * Parameters:
+ * 		r 		-- udf_record_bin for which particle buf is requested
+ * 		type    -- bin type
+ * 		pbytes  -- current space required
+ *
+ * Return value:
+ * 		NULL on failure
+ * 		valid buf pointer success
+ *
+ * Description:
+ * 		The function find space on preallocated particle_data for requested size.
+ * 		In case it is found it tries to allocate space for bin independently.
+ * 		Return back the pointer to the offset on preallocated particle_data or newly
+ * 		allocated space.
+ *
+ * 		Return NULL if both fails
+ *
+ *  	Note: ubin->particle_buf will be set if new per bin memory is allocated.
+ *
+ * 		Callers:
+ * 		udf_aerospike_setbin
+ */
+uint8_t *
+udf__aerospike_get_particle_buf(udf_record *urecord, udf_record_bin *ubin, uint32_t pbytes)
+{
+	// A particle can never be bigger than one write block.
+	if (pbytes > urecord->rd->ns->storage_write_block_size) {
+		cf_warning(AS_UDF, "udf__aerospike_get_particle_buf: Invalid Operation [Bin %s data too big size=%u]... Fail", ubin->name, pbytes);
+		return NULL;
+	}
+
+	// A bin that already owns a private buffer keeps using it.
+	if (ubin->particle_buf) {
+		return ubin->particle_buf;
+	}
+
+	// Otherwise carve the space out of the record's preallocated buffer,
+	// as long as the request ends strictly before the end of that buffer.
+	if ((urecord->cur_particle_data + pbytes) < urecord->end_particle_data) {
+		uint8_t *carved = urecord->cur_particle_data;
+
+		urecord->cur_particle_data += pbytes;
+		return carved;
+	}
+
+	// No room left in the preallocated buffer. This may happen if the user
+	// keeps doing lots of execute-updates, exhausting the buffer. Fall back
+	// to a private, write-block-sized buffer for this bin - but only if some
+	// space was actually requested. After this point the record size check
+	// will trip instead of the code where the bin value is set.
+	if (pbytes == 0) {
+		return NULL;
+	}
+
+	ubin->particle_buf = cf_malloc(urecord->rd->ns->storage_write_block_size);
+
+	return ubin->particle_buf;
+}
+/*
+ * Internal function: udf_aerospike_setbin
+ *
+ * Parameters:
+ * 		urecord -- udf_record to be manipulated
+ * 		offset 	-- offset of udf bin in updates array
+ * 		bname 	-- name of the bin to be set
+ * 		val		-- value to be updated with
+ *
+ * Return value:
+ * 		0  on success
+ * 	   -1  on invalid bin name, or if the bin can't be found or created
+ * 	   -3  if val can't be converted to a particle
+ * 	   -4  if the particle can't be stored
+ *
+ * Description:
+ * 		The function sets the bin with the name
+ * 		passed in as parameter to the value, passed as the last parameter.
+ * 		Before updating the bin, it is checked if the value can fit in the storage
+ *
+ * 		Synchronization : object lock acquired by the transaction thread executing UDF.
+ * 		Partition reservation takes place just before the transaction starts executing
+ * 		( look for as_partition_reserve_udf in thr_tsvc.c )
+ *
+ * 		Callers:
+ * 		udf_aerospike__apply_update_atomic
+ * 		In this function, if it fails at the time of update, the record is set
+ * 		to rollback all the updates till this point. The case where it fails in
+ * 		rollback is not handled.
+ *
+ * 		Side Notes:
+ * 		i.	write_to_device will be set to true on a successful bin update.
+ * 		If all the updates from udf_aerospike__apply_update_atomic (including this) are
+ * 		successful, the record will be written to disk and reopened so that the rest of
+ * 		sets of updates can be applied.
+ *
+ * 		ii.	If put in sindex fails, we do not handle it.
+ *
+ * 		TODO make sure anything goes into setbin only if the bin value is
+ * 		          changed
+ */
+static int
+udf_aerospike_setbin(udf_record * urecord, int offset, const char * bname, const as_val * val)
+{
+	as_storage_rd *rd = urecord->rd;
+	as_namespace *ns = rd->ns;
+
+	// Reject a missing name. Single-bin namespaces require the empty name,
+	// all others require a non-empty one.
+	if (bname == NULL) {
+		cf_warning(AS_UDF, "udf_aerospike_setbin: Invalid Parameters: [Invalid bin name supplied]... Fail");
+		return -1;
+	}
+
+	bool name_is_empty = bname[0] == 0;
+
+	if (ns->single_bin ? ! name_is_empty : name_is_empty) {
+		cf_warning(AS_UDF, "udf_aerospike_setbin: Invalid Parameters: [Invalid bin name supplied]... Fail");
+		return -1;
+	}
+
+	// The value must map to a real particle type.
+	if (as_particle_type_from_asval(val) == AS_PARTICLE_TYPE_NULL) {
+		cf_warning(AS_UDF, "udf_aerospike_setbin: [%s] called with unusable as_val", bname);
+		return -3;
+	}
+
+	uint8_t type = as_val_type(val);
+
+	as_bin * bin = as_bin_get_or_create(rd, bname);
+
+	if (bin == NULL) {
+		cf_warning(AS_UDF, "udf_aerospike_setbin: Internal Error [Bin %s not found.. Possibly ran out of bins]... Fail", bname);
+		return -1;
+	}
+
+	// Twice the sindex count - room for both the delete entries (old value)
+	// and the insert entries (new value).
+	bool has_sindex = record_has_sindex(rd->r, ns);
+	SINDEX_BINS_SETUP(sbins, 2 * ns->sindex_cnt);
+	as_sindex * si_arr[2 * ns->sindex_cnt];
+	int n_sbins = 0;
+	int n_si = 0;
+	const char * set_name = as_index_get_set_name(rd->r, ns);
+
+	// Capture the sindex entries for the bin's current value, to be deleted.
+	if (has_sindex) {
+		n_si += as_sindex_arr_lookup_by_set_binid_lockfree(ns, set_name, bin->id, &si_arr[n_si]);
+		n_sbins += as_sindex_sbins_from_bin(ns, set_name, bin, &sbins[n_sbins], AS_SINDEX_OP_DELETE);
+	}
+
+	// we know we are doing an update now, make sure there is particle data,
+	// set to be 1 wblock size now @TODO!
+	int rv = 0;
+
+	cf_detail(AS_UDF, "udf_setbin: bin %s type %d ", bname, type );
+
+	if (! ns->storage_data_in_memory) {
+		// Data not in memory - build the particle in a UDF-owned buffer.
+		uint32_t size = as_particle_size_from_asval(val);
+		uint8_t *particle_buf = udf__aerospike_get_particle_buf(urecord, &urecord->updates[offset], size);
+
+		if (particle_buf == NULL) {
+			cf_warning(AS_UDF, "udf_aerospike_setbin: [%s] failed to get space for particle size %u", bname, size);
+			rv = -4;
+		}
+		else {
+			as_bin_particle_stack_from_asval(bin, particle_buf, val);
+		}
+	}
+	else if (as_bin_particle_replace_from_asval(bin, val) != 0) {
+		cf_warning(AS_UDF, "udf_aerospike_setbin: [%s] failed to replace particle", bname);
+		rv = -4;
+	}
+
+	// Without secondary indexes there is nothing more to do.
+	if (! has_sindex) {
+		return rv;
+	}
+
+	// The bin update failed - drop what was collected, leave sindex alone.
+	if (rv != 0) {
+		if (n_sbins > 0) {
+			as_sindex_sbin_freeall(sbins, n_sbins);
+		}
+
+		as_sindex_release_arr(si_arr, n_si);
+		return rv;
+	}
+
+	// Add the sindex entries for the bin's new value, then apply them all.
+	n_si += as_sindex_arr_lookup_by_set_binid_lockfree(ns, set_name, bin->id, &si_arr[n_si]);
+	n_sbins += as_sindex_sbins_from_bin(ns, set_name, bin, &sbins[n_sbins], AS_SINDEX_OP_INSERT);
+
+	if (n_sbins > 0) {
+		urecord->tr->flags |= AS_TRANSACTION_FLAG_SINDEX_TOUCHED;
+		as_sindex_update_by_sbin(ns, as_index_get_set_name(rd->r, ns), sbins, n_sbins, &rd->r->keyd);
+		as_sindex_sbin_freeall(sbins, n_sbins);
+	}
+
+	as_sindex_release_arr(si_arr, n_si);
+
+	return rv;
+} // end udf_aerospike_setbin()
+
+/*
+ * Check and validate parameter before performing operation
+ *
+ * return:
+ *      UDF_ERR * in case of failure
+ *      0 in case of success
+ */
+static int
+udf_aerospike_param_check(const as_aerospike *as, const as_rec *rec, char *fname, int lineno)
+{
+	if (as == NULL) {
+		cf_debug(AS_UDF, "Invalid Parameters: aerospike=%p", as);
+		return UDF_ERR_INTERNAL_PARAMETER;
+	}
+
+	// The record check's result (0 on success) is the overall result.
+	return udf_record_param_check(rec, fname, lineno);
+}
+
+/*
+ * Internal function: udf_aerospike__apply_update_atomic
+ *
+ * Parameters:
+ * 		rec --	udf_record to be updated
+ *
+ * Return Values:
+ * 		 0 success
+ * 		-1 failure
+ *
+ * Description:
+ * 		This function applies all the updates atomically. That is,
+ * 		if one of the bin update/delete/create fails, the entire function
+ * 		will fail. If the nth update fails, all the n-1 updates are rolled
+ * 		back to their initial values
+ *
+ * Special Notes:
+ * 		i. The basic checks of bin name being too long or if there is enough space
+ * 		on the disk for the bin values is done before allocating space for any
+ * 		of the bins.
+ *
+ * 		ii. If one of the updates to be rolled back is a bin creation,
+ * 		udf_aerospike_delbin is called. This will not free up the bin metadata.
+ * 		So there will be a small memory mismatch b/w replica (which did not get the
+ * 		record at all and hence no memory is accounted) and the master will be seen.
+ * 		To avoid such cases, we are doing checks upfront.
+ *
+ * 		Callers:
+ * 		udf_aerospike__execute_updates
+ * 		In this function, if udf_aerospike__apply_update_atomic fails, the record
+ * 		is not committed to the storage. On success, record is closed which commits to
+ * 		the storage and reopened for the next set of udf updates.
+ * 		The return value from udf_aerospike__apply_update_atomic is passed on to the
+ * 		callers of this function.
+ */
+int
+udf_aerospike__apply_update_atomic(udf_record *urecord)
+{
+	int rc = 0;
+	int failmax = 0;
+	int new_bins = 0;	// How many new bins have to be created in this update
+	as_storage_rd * rd = urecord->rd;
+	as_namespace * ns = rd->ns;
+	bool has_sindex = record_has_sindex(rd->r, ns);
+	bool is_record_dirty = false;
+
+	// Tracks whether SINDEX_GRLOCK() was actually taken. The Rollback path
+	// can be reached before the lock is acquired (bin limit check below), and
+	// must not release a lock that isn't held.
+	bool sindex_locked = false;
+
+	// This will iterate over all the updates and apply them to storage.
+	// The items will remain, and be used as cache values. If an error
+	// occurred during setbin(), we rollback all the operation which
+	// is and return failure
+	cf_detail(AS_UDF, "execute updates: %d updates", urecord->nupdates);
+
+	// loop twice to make sure the updates are performed first so in case
+	// something wrong it can be rolled back. The deletes will go through
+	// successfully generally.
+
+	// In first iteration, just calculate how many new bins need to be created
+	for(uint32_t i = 0; i < urecord->nupdates; i++ ) {
+		if ( urecord->updates[i].dirty ) {
+			char * k = urecord->updates[i].name;
+			if ( k != NULL ) {
+				if ( !as_bin_get(rd, k) ) {
+					new_bins++;
+				}
+			}
+		}
+	}
+	// Free bins - total bins not in use in the record
+	// Delta bins - new bins that need to be created
+	int inuse_bins = as_bin_inuse_count(rd);
+	int free_bins  = rd->n_bins - inuse_bins;
+	int delta_bins = new_bins - free_bins;
+	cf_detail(AS_UDF, "Total bins %d, In use bins %d, Free bins %d , New bins %d, Delta bins %d",
+			  rd->n_bins, as_bin_inuse_count(urecord->rd), free_bins, new_bins, delta_bins);
+
+	// Check bin usage limit. Note - failmax is still 0 here, so Rollback has
+	// no updates to undo, and the sindex lock has not been taken yet.
+	if ((inuse_bins + new_bins > UDF_RECORD_BIN_ULIMIT) ||
+			(urecord->flag & UDF_RECORD_FLAG_TOO_MANY_BINS)) {
+		cf_warning(AS_UDF, "bin limit of %d for UDF exceeded: %d bins in use, %d bins free, %s%d new bins needed",
+				(int)UDF_RECORD_BIN_ULIMIT, inuse_bins, free_bins,
+				(urecord->flag & UDF_RECORD_FLAG_TOO_MANY_BINS) ? ">" : "", new_bins);
+		goto Rollback;
+	}
+
+	// Allocate space for all the new bins that need to be created beforehand
+	if (delta_bins > 0 && rd->ns->storage_data_in_memory && ! rd->ns->single_bin) {
+		as_bin_allocate_bin_space(rd, delta_bins);
+	}
+
+	// For data not in memory, lazily create the write-block-sized buffer
+	// that particles are built in.
+	if (!rd->ns->storage_data_in_memory && !urecord->particle_data) {
+		urecord->particle_data = cf_malloc(rd->ns->storage_write_block_size);
+		urecord->cur_particle_data = urecord->particle_data;
+		urecord->end_particle_data = urecord->particle_data + rd->ns->storage_write_block_size;
+	}
+
+	if (has_sindex) {
+		SINDEX_GRLOCK();
+		sindex_locked = true;
+	}
+
+	// In second iteration apply updates.
+	for(uint32_t i = 0; i < urecord->nupdates; i++ ) {
+		urecord->updates[i].oldvalue = NULL;
+		if ( urecord->updates[i].dirty && rc == 0) {
+
+			char *      k = urecord->updates[i].name;
+			as_val *    v = urecord->updates[i].value;
+
+			if ( k != NULL ) {
+				if ( v == NULL || v->type == AS_NIL ) {
+					// if the value is NIL, then do a delete
+					cf_detail(AS_UDF, "execute update: position %d deletes bin %s", i, k);
+					urecord->updates[i].oldvalue = udf_record_storage_get(urecord, k);
+					// Only case delete fails if bin is not found that is
+					// as good as delete. Ignore return code !!
+					udf_aerospike_delbin(urecord, k);
+
+					if (urecord->dirty != NULL) {
+						xdr_fill_dirty_bins(urecord->dirty);
+					}
+				}
+				else {
+					// otherwise, it is a set
+					cf_detail(AS_UDF, "execute update: position %d sets bin %s", i, k);
+					urecord->updates[i].oldvalue = udf_record_storage_get(urecord, k);
+					rc = udf_aerospike_setbin(urecord, i, k, v);
+					if (rc) {
+						// This update never took effect - drop its saved old
+						// value, and roll back only the updates before it.
+						if (urecord->updates[i].oldvalue) {
+							as_val_destroy(urecord->updates[i].oldvalue);
+							urecord->updates[i].oldvalue = NULL;
+						}
+						failmax = i;
+						goto Rollback;
+					}
+
+					if (urecord->dirty != NULL) {
+						xdr_add_dirty_bin(ns, urecord->dirty, k, strlen(k));
+					}
+				}
+			}
+
+			is_record_dirty = true;
+		}
+	}
+
+	{
+		// This is _NOT_ for writing to the storage but for simply performing sizing
+		// calculation. If we know the upper bounds of size of rec_props.. we could
+		// avoid this work and check with that much correction ...
+		//
+		// See
+		//  - udf_rw_post_processing for building rec_props for replication
+		//  - udf_record_close for building rec_props for writing it to storage
+		size_t  rec_props_data_size = as_storage_record_rec_props_size(rd);
+		uint8_t rec_props_data[rec_props_data_size];
+		if (rec_props_data_size > 0) {
+			as_storage_record_set_rec_props(rd, rec_props_data);
+		}
+
+		// Any failure from here on rolls back every update.
+
+		if (! as_storage_record_size_and_check(rd)) {
+			cf_warning(AS_UDF, "record failed storage size check, will not be updated");
+			failmax = (int)urecord->nupdates;
+			goto Rollback;
+		}
+
+		if (rd->ns->clock_skew_stop_writes) {
+			failmax = (int)urecord->nupdates;
+			goto Rollback;
+		}
+
+		if (cf_atomic32_get(rd->ns->stop_writes) == 1) {
+			cf_warning(AS_UDF, "UDF failed by stop-writes, record will not be updated");
+			failmax = (int)urecord->nupdates;
+			goto Rollback;
+		}
+
+		if (! as_storage_has_space(rd->ns)) {
+			cf_warning(AS_UDF, "drives full, record will not be updated");
+			failmax = (int)urecord->nupdates;
+			goto Rollback;
+		}
+
+		if (! is_valid_ttl(rd->ns, urecord->tr->msgp->msg.record_ttl)) {
+			cf_warning(AS_UDF, "invalid ttl %u", urecord->tr->msgp->msg.record_ttl);
+			failmax = (int)urecord->nupdates;
+			goto Rollback;
+		}
+	}
+
+	if (sindex_locked) {
+		SINDEX_GRUNLOCK();
+	}
+
+	// If there were updates do miscellaneous successful commit
+	// tasks
+	if (is_record_dirty
+			|| (urecord->flag & UDF_RECORD_FLAG_METADATA_UPDATED)) {
+		urecord->flag |= UDF_RECORD_FLAG_HAS_UPDATES; // will write to storage
+	}
+
+	// Clean up oldvalue cache and reset dirty. All the changes made
+	// here has made to the particle buffer. Nothing will now be backed out.
+	for (uint32_t i = 0; i < urecord->nupdates; i++) {
+		udf_record_bin * bin = &urecord->updates[i];
+		if (bin->oldvalue != NULL ) {
+			as_val_destroy(bin->oldvalue);
+			bin->oldvalue = NULL;
+		}
+		bin->dirty = false;
+	}
+	return rc;
+
+Rollback:
+	cf_debug(AS_UDF, "Rollback Called: failmax %d", failmax);
+	// Restore the first failmax updates to their saved old values.
+	for (int i = 0; i < failmax; i++) {
+		if (urecord->updates[i].dirty) {
+			char *      k = urecord->updates[i].name;
+			// Pick the oldvalue for rollback
+			as_val *    v = urecord->updates[i].oldvalue;
+			if ( k != NULL ) {
+				if ( v == NULL || v->type == AS_NIL ) {
+					// if the value is NIL, then do a delete
+					cf_detail(AS_UDF, "execute rollback: position %d deletes bin %s", i, k);
+					rc = udf_aerospike_delbin(urecord, k);
+				}
+				else {
+					// otherwise, it is a set
+					cf_detail(AS_UDF, "execute rollback: position %d sets bin %s", i, k);
+					rc = udf_aerospike_setbin(urecord, i, k, v);
+					if (rc) {
+						cf_warning(AS_UDF, "Rollback failed .. not good ... !!");
+					}
+				}
+			}
+			if (v) {
+				as_val_destroy(v);
+				cf_debug(AS_UDF, "ROLLBACK as_val_destroy()");
+			}
+		}
+	}
+
+	if (is_record_dirty && urecord->dirty != NULL) {
+		xdr_clear_dirty_bins(urecord->dirty);
+	}
+
+	// Release the sindex lock only if this call actually acquired it.
+	if (sindex_locked) {
+		SINDEX_GRUNLOCK();
+	}
+
+	// Reset the flat size in case the stuff is backedout !!! it should not
+	// fail in the backout code ...
+	if (! as_storage_record_size_and_check(rd)) {
+		cf_warning(AS_UDF, "Does not fit even after rollback... it is trouble");
+	}
+
+	// Do not clean up the cache in case of failure
+	return -1;
+}
+
+/*
+ * Internal function: udf_aerospike_execute_updates
+ *
+ * Parameters:
+ * 		rec - udf record to be updated
+ *
+ * Return values
+ * 		 0 on success
+ *		-1 on failure
+ *
+ * Description:
+ * 		Execute set of udf_record updates. If these updates are successfully
+ * 		applied atomically, the storage record is closed (committed to the disk)
+ * 		and reopened. The cache is freed up at the end.
+ *
+ * Callers:
+ *		udf_aerospike_rec_create, interface func - aerospike:create(r)
+ *		udf_aerospike_rec_update, interface func - aerospike:update(r)
+ *		udf_aerospike__execute_updates is the key function which is executed in these
+ *		functions. The return value is directly passed on to the lua.
+ */
+int
+udf_aerospike__execute_updates(udf_record * urecord)
+{
+	as_storage_rd *rd = urecord->rd;
+
+	// Nothing queued and no metadata change - trivially successful.
+	if (urecord->nupdates == 0 &&
+			! (urecord->flag & UDF_RECORD_FLAG_METADATA_UPDATED)) {
+		cf_detail(AS_UDF, "No Update when execute update is called");
+		return 0;
+	}
+
+	// Refuse to write when the record was not flagged as updatable. The
+	// original author notes this is a paranoia check for queries and scans.
+	if ((urecord->flag & UDF_RECORD_FLAG_ALLOW_UPDATES) == 0) {
+		cf_warning(AS_UDF, "Udf: execute updates: allow updates false; FAIL");
+		return -1;
+	}
+
+	// All-or-nothing commit of the queued updates.
+	int status = udf_aerospike__apply_update_atomic(urecord);
+
+	// For in-memory, multi-bin namespaces, resize the bin space by the
+	// difference between the bins in use and the bins allocated.
+	if (rd->ns != NULL && rd->ns->storage_data_in_memory && ! rd->ns->single_bin) {
+		int32_t in_use = (int32_t)as_bin_inuse_count(rd);
+		int32_t bin_delta = in_use - (int32_t)rd->n_bins;
+
+		if (bin_delta != 0) {
+			as_bin_allocate_bin_space(rd, bin_delta);
+		}
+	}
+
+	return status;
+}
+
+// Hook: forwards destruction to as_aerospike_destroy().
+static void
+udf_aerospike_destroy(as_aerospike * as)
+{
+	as_aerospike_destroy(as);
+}
+
+// Hook: returns cf_clock_getabsolute(); the as_aerospike argument is unused.
+static cf_clock
+udf_aerospike_get_current_time(const as_aerospike * as)
+{
+	(void)as;
+	return cf_clock_getabsolute();
+}
+
+/**
+ * aerospike::create(record)
+ * Function: udf_aerospike_rec_create
+ *
+ * Parameters:
+ * 		as - as_aerospike
+ *		rec - as_rec
+ *
+ * Return Values:
+ * 		1 if record is being read or on a create, it already exists
+ * 		o/w return value of udf_aerospike__execute_updates
+ *
+ * Description:
+ * 		Create a new record in local storage.
+ * 		The record will only be created if it does not exist.
+ * 		This assumes the record has a digest that is valid for local storage.
+ *
+ * Synchronization : object lock acquired by the transaction thread executing UDF.
+ * 		Partition reservation takes place just before the transaction starts executing
+ * 		( look for as_partition_reserve_udf in thr_tsvc.c )
+ *
+ * Callers:
+ * 		lua interfacing function, mod_lua_aerospike_rec_create
+ * 		The return value of udf_aerospike_rec_create is pushed on to the lua stack
+ *
+ * Notes:
+ * 		On success UDF_RECORD_FLAG_OPEN and UDF_RECORD_FLAG_STORAGE_OPEN are set
+ * 		on the udf_record.
+ * 		4 is returned when the set name cannot be applied, when the record
+ * 		would be truncated, or when the message key cannot be stored. In those
+ * 		cases the freshly created index entry is deleted again.
+ * 		A negative value from as_record_get_create is returned unchanged.
+*/
+static int
+udf_aerospike_rec_create(const as_aerospike * as, const as_rec * rec)
+{
+	int ret = udf_aerospike_param_check(as, rec, __FILE__, __LINE__);
+	if (ret) {
+		return ret;
+	}
+
+	udf_record * urecord = (udf_record *) as_rec_source(rec);
+
+	// make sure record isn't already successfully read
+	if ((urecord->flag & UDF_RECORD_FLAG_OPEN) != 0) {
+		if (as_bin_inuse_has(urecord->rd)) {
+			cf_detail(AS_UDF, "udf_aerospike_rec_create: Record Already Exists");
+			return 1;
+		}
+		// else - binless record ok...
+
+		// The record is open but has no bins in use: apply the pending
+		// updates to it, and remove it again if that fails.
+		if ((ret = udf_aerospike__execute_updates(urecord)) != 0) {
+			cf_warning(AS_UDF, "udf_aerospike_rec_create: failure executing record updates");
+			udf_aerospike_rec_remove(as, rec);
+		}
+
+		return ret;
+	}
+
+	as_transaction *tr = urecord->tr;
+	as_index_ref *r_ref = urecord->r_ref;
+	as_storage_rd *rd = urecord->rd;
+	as_index_tree *tree = tr->rsv.tree;
+
+	// make sure we got the record as a create
+	int rv = as_record_get_create(tree, &tr->keyd, r_ref, tr->rsv.ns);
+	cf_detail_digest(AS_UDF, &tr->keyd, "Creating Record ");
+
+	// rv 0 means record exists, 1 means create, < 0 means fail
+	// TODO: Verify correct result codes.
+	if (rv == 1) {
+		// Record created.
+	} else if (rv == 0) {
+		// If it's an expired or truncated record, pretend it's a fresh create.
+		if (as_record_is_doomed(r_ref->r, tr->rsv.ns)) {
+			as_record_rescue(r_ref, tr->rsv.ns);
+		} else {
+			cf_warning(AS_UDF, "udf_aerospike_rec_create: Record Already Exists 2");
+			as_record_done(r_ref, tr->rsv.ns);
+			// DO NOT change it has special meaning for caller
+			return 1;
+		}
+	} else if (rv < 0) {
+		// No as_record_done() here - this path returns without releasing r_ref.
+		cf_warning(AS_UDF, "udf_aerospike_rec_create: Record Open Failed with rv=%d", rv);
+		return rv;
+	}
+
+	// Associates the set name with the storage rec and index
+	if (tr->msgp) {
+		// Set the set name to index and close record if the setting the set name
+		// is not successful
+		int rv_set = as_transaction_has_set(tr) ?
+				set_set_from_msg(r_ref->r, tr->rsv.ns, &tr->msgp->msg) : 0;
+		if (rv_set != 0) {
+			cf_warning(AS_UDF, "udf_aerospike_rec_create: Failed to set setname");
+			// Undo the create: drop the index entry, then release the record.
+			as_index_delete(tree, &tr->keyd);
+			as_record_done(r_ref, tr->rsv.ns);
+			return 4;
+		}
+
+		// Don't write record if it would be truncated.
+		if (as_truncate_now_is_truncated(tr->rsv.ns, as_index_get_set_id(r_ref->r))) {
+			as_index_delete(tree, &tr->keyd);
+			as_record_done(r_ref, tr->rsv.ns);
+			return 4;
+		}
+	}
+
+	// open up storage
+	as_storage_record_create(tr->rsv.ns, r_ref->r, rd);
+
+	// If the message has a key, apply it to the record.
+	if (! get_msg_key(tr, rd)) {
+		cf_warning(AS_UDF, "udf_aerospike_rec_create: Can't store key");
+		// Storage is open at this point, so it is closed before the index
+		// entry is deleted and the record released.
+		as_storage_record_close(rd);
+		as_index_delete(tree, &tr->keyd);
+		as_record_done(r_ref, tr->rsv.ns);
+		return 4;
+	}
+
+	// if multibin storage, we will use urecord->stack_bins, so set the size appropriately
+	if (rd->ns->single_bin) {
+		rd->n_bins = 1;
+	}
+	else if (! rd->ns->storage_data_in_memory) {
+		rd->n_bins = sizeof(urecord->stack_bins) / sizeof(as_bin);
+	}
+
+	// side effect: will set the unused bins to properly unused
+	as_storage_rd_load_bins(rd, urecord->stack_bins); // TODO - handle error returned
+
+	int rc = udf_aerospike__execute_updates(urecord);
+
+	if (rc != 0) {
+		// Creating the udf record failed, destroy the as_record
+		cf_warning(AS_UDF, "udf_aerospike_rec_create: failure executing record updates (%d)", rc);
+		udf_record_close(urecord); // handles particle data and cache only
+		as_storage_record_close(rd);
+		as_index_delete(tree, &tr->keyd);
+		as_record_done(r_ref, tr->rsv.ns);
+		return rc;
+	}
+
+	// Success...
+
+	urecord->flag |= UDF_RECORD_FLAG_OPEN | UDF_RECORD_FLAG_STORAGE_OPEN;
+
+	return 0;
+}
+
+/**
+ * aerospike::update(record)
+ * Function: udf_aerospike_rec_update
+ *
+ * Parameters:
+ *
+ * Return Values:
+ * 		-2 if record does not exist
+ * 		o/w return value of udf_aerospike__execute_updates
+ *
+ * Description:
+ * 		Updates an existing record in local storage.
+ * 		The record will only be updated if it exists.
+ *
+ * Synchronization : object lock acquired by the transaction thread executing UDF.
+ * 		Partition reservation takes place just before the transaction starts executing
+ * 		( look for as_partition_reserve_udf in thr_tsvc.c )
+ *
+ * Callers:
+ * 		lua interfacing function, mod_lua_aerospike_rec_update
+ * 		The return value of udf_aerospike_rec_update is pushed on to the lua stack
+ *
+ * Notes:
+ * 		If the record does not exist or is not read by anyone yet, we cannot
+ * 		carry on with the update. 'exists' and 'set' are set to false on record
+ * 		init or record remove.
+*/
+static int
+udf_aerospike_rec_update(const as_aerospike * as, const as_rec * rec)
+{
+	int ret = udf_aerospike_param_check(as, rec, __FILE__, __LINE__);
+
+	if (ret != 0) {
+		return ret;
+	}
+
+	udf_record * urecord = (udf_record *) as_rec_source(rec);
+
+	// Both the record and its storage must already be open.
+	if (urecord == NULL
+			|| (urecord->flag & UDF_RECORD_FLAG_STORAGE_OPEN) == 0
+			|| (urecord->flag & UDF_RECORD_FLAG_OPEN) == 0) {
+		cf_warning(AS_UDF, "Record not found to be open while updating urecord flag=%d", urecord ? urecord->flag : -1);
+		return -2;
+	}
+
+	cf_detail_digest(AS_UDF, &urecord->rd->r->keyd, "Executing Updates");
+
+	ret = udf_aerospike__execute_updates(urecord);
+
+	// Only negative results are logged; the value is returned either way.
+	if (ret < 0) {
+		cf_warning(AS_UDF, "udf_aerospike_rec_update: failure executing record updates (%d)", ret);
+	}
+
+	return ret;
+}
+
+/**
+ * Function udf_aerospike_rec_exists
+ *
+ * Parameters:
+ *
+ * Return Values:
+ * 		1 if record exists
+ * 		0 o/w
+ * 		(a non-zero result of udf_aerospike_param_check is returned as is)
+ *
+ * Description:
+ * 		Check to see if the record exists
+ */
+static int
+udf_aerospike_rec_exists(const as_aerospike * as, const as_rec * rec)
+{
+	int ret = udf_aerospike_param_check(as, rec, __FILE__, __LINE__);
+
+	if (ret != 0) {
+		return ret;
+	}
+
+	udf_record * urecord = (udf_record *) as_rec_source(rec);
+
+	if (urecord == NULL) {
+		return 0;
+	}
+
+	// "Exists" means the record has been opened.
+	return (urecord->flag & UDF_RECORD_FLAG_OPEN) != 0 ? 1 : 0;
+}
+
+/*
+ * Function: udf_aerospike_rec_remove
+ *
+ * Parameters:
+ *
+ * Return Values:
+ *		1 if record does not exist
+ * 		0 on success
+ *
+ * Description:
+ * 		Removes an existing record from local storage.
+ * 		The record will only be removed if it exists.
+ */
+static int
+udf_aerospike_rec_remove(const as_aerospike * as, const as_rec * rec)
+{
+	int ret = udf_aerospike_param_check(as, rec, __FILE__, __LINE__);
+
+	if (ret != 0) {
+		return ret;
+	}
+
+	udf_record * urecord = (udf_record *) as_rec_source(rec);
+
+	// Only an open record can be removed.
+	if (urecord == NULL || (urecord->flag & UDF_RECORD_FLAG_OPEN) == 0) {
+		return 1;
+	}
+
+	as_storage_rd* rd = urecord->rd;
+
+	// In-memory, multi-bin namespaces: adjust the secondary index first.
+	if (rd->ns->storage_data_in_memory && ! rd->ns->single_bin) {
+		delete_adjust_sindex(rd);
+	}
+
+	as_record_destroy_bins(rd);
+
+	// ... and release the bin space once the bins are destroyed.
+	if (rd->ns->storage_data_in_memory && ! rd->ns->single_bin) {
+		as_record_free_bin_space(rd->r);
+		rd->bins = NULL;
+		rd->n_bins = 0;
+	}
+
+	if (urecord->particle_data != NULL) {
+		cf_free(urecord->particle_data);
+		urecord->particle_data = NULL;
+	}
+
+	udf_record_cache_free(urecord);
+
+	// Flag the record as having updates.
+	urecord->flag |= UDF_RECORD_FLAG_HAS_UPDATES;
+
+	return 0;
+}
+
+/**
+ * Writes a log message via cf_fault_event at the given level, attributed to
+ * the given file and line. Always returns 0.
+ */
+static int
+udf_aerospike_log(const as_aerospike * a, const char * file, const int line, const int lvl, const char * msg)
+{
+	(void)a;
+	cf_fault_event(AS_UDF, lvl, file, line, "%s", (char *) msg);
+	return 0;
+}
+
+// Hook table wiring the functions above into as_aerospike.
+const as_aerospike_hooks udf_aerospike_hooks = {
+	.rec_create = udf_aerospike_rec_create,
+	.rec_update = udf_aerospike_rec_update,
+	.rec_remove = udf_aerospike_rec_remove,
+	.rec_exists = udf_aerospike_rec_exists,
+	.log = udf_aerospike_log,
+	.get_current_time = udf_aerospike_get_current_time,
+	.destroy = udf_aerospike_destroy
+};
diff --git a/as/src/base/udf_arglist.c b/as/src/base/udf_arglist.c
new file mode 100644
index 00000000..e3b79b63
--- /dev/null
+++ b/as/src/base/udf_arglist.c
@@ -0,0 +1,81 @@
+/*
+ * udf_arglist.c
+ *
+ * Copyright (C) 2012-2016 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+#include "aerospike/as_list.h"
+#include "aerospike/as_list_iterator.h"
+#include "aerospike/as_msgpack.h"
+
+#include "base/proto.h"
+#include "base/udf_arglist.h"
+
+/******************************************************************************
+ * STATIC FUNCTIONS
+ ******************************************************************************/
+
+static bool udf_arglist_foreach(const as_list *, as_list_foreach_callback, void *);
+static as_val *udf_arglist_get(const as_list *, const uint32_t idx);
+
+/******************************************************************************
+ * VARIABLES
+ ******************************************************************************/
+
+// Only element access (get) and iteration (foreach) are implemented for UDF
+// argument lists; every other list operation is left NULL.
+const as_list_hooks udf_arglist_hooks = {
+	.destroy = NULL,
+	.hashcode = NULL,
+	.size = NULL,
+	.append = NULL,
+	.prepend = NULL,
+	.get = udf_arglist_get,
+	.set = NULL,
+	.head = NULL,
+	.tail = NULL,
+	.drop = NULL,
+	.take = NULL,
+	.foreach = udf_arglist_foreach,
+	.iterator_init = NULL,
+	.iterator_new = NULL
+};
+
+/******************************************************************************
+ * FUNCTIONS
+ ******************************************************************************/
+
+/*
+ * Calls callback(value, context) for each element of l, in iterator order.
+ *
+ * Iteration stops as soon as the callback returns false.
+ *
+ * Returns true if every element was visited (including when l is NULL),
+ * false if the callback stopped the iteration early.
+ */
+static bool udf_arglist_foreach(const as_list * l, as_list_foreach_callback callback, void * context) {
+	if (! l) {
+		return true;
+	}
+
+	as_list_iterator list_iter;
+	as_iterator* iter = (as_iterator*) &list_iter;
+	bool completed = true;
+
+	as_list_iterator_init(&list_iter, l);
+
+	while (as_iterator_has_next(iter)) {
+		const as_val* v = as_iterator_next(iter);
+
+		if (! callback((as_val *) v, context)) {
+			completed = false;
+			break;
+		}
+	}
+
+	// The iterator is destroyed on both the normal and the early-exit path.
+	as_iterator_destroy(iter);
+
+	return completed;
+}
+
+/*
+ * Returns the element at position idx of l, via as_list_get().
+ *
+ * NOTE(review): udf_arglist_hooks.get points at this function. If
+ * as_list_get() dispatches through the list's hooks, this call would recurse
+ * into itself - confirm how as_list_get() resolves for this list type.
+ */
+static as_val *udf_arglist_get(const as_list * l, const uint32_t idx) {
+	return as_list_get(l, idx);
+}
+
diff --git a/as/src/base/udf_cask.c b/as/src/base/udf_cask.c
new file mode 100644
index 00000000..137cb9b3
--- /dev/null
+++ b/as/src/base/udf_cask.c
@@ -0,0 +1,745 @@
+/*
+ * udf_cask.c
+ *
+ * Copyright (C) 2012-2014 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program.
If not, see http://www.gnu.org/licenses/
+ */
+
+#include "base/udf_cask.h"
+
+// NOTE(review): the system #include lines in this file appear without their
+// header names in this copy of the patch - restore them from the upstream file.
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "jansson.h"
+
+#include "aerospike/as_module.h"
+#include "aerospike/mod_lua.h"
+#include "citrusleaf/alloc.h"
+#include "citrusleaf/cf_b64.h"
+#include "citrusleaf/cf_crypto.h"
+
+#include "dynbuf.h"
+#include "fault.h"
+
+#include "base/cfg.h"
+#include "base/thr_info.h"
+#include "base/system_metadata.h"
+#include
+
+char udf_smd_module_name[] = "UDF";
+char *as_udf_type_name[] = {"LUA", 0};
+
+static bool g_udf_smd_loaded = false;
+
+// Size of every path buffer handed to file_resolve().
+#define UDF_FILEPATH_SIZE 256
+
+static int file_read(char *, uint8_t **, size_t *, unsigned char *);
+static int file_write(char *, uint8_t *, size_t, unsigned char *);
+static int file_remove(char *);
+static int file_generation(char *, uint8_t *, size_t, unsigned char *);
+
+/*
+ * Builds "<mod-lua user path>/<filename>[ext]" into filepath.
+ *
+ * filepath must be at least UDF_FILEPATH_SIZE bytes. ext may be NULL.
+ *
+ * Returns 0 on success. Returns -1 and leaves filepath empty if the user path
+ * or filename is missing, or if the result does not fit - a truncated path
+ * would name a different file, so it is never handed back.
+ */
+static inline int file_resolve(char * filepath, char * filename, char * ext) {
+	char * user_path = g_config.mod_lua.user_path;
+
+	filepath[0] = '\0';
+
+	if (! user_path || ! filename) {
+		cf_warning(AS_UDF, "could not resolve udf file path: missing user path or filename");
+		return -1;
+	}
+
+	int n = snprintf(filepath, UDF_FILEPATH_SIZE, "%s/%s%s", user_path, filename, ext ? ext : "");
+
+	if (n < 0 || n >= UDF_FILEPATH_SIZE) {
+		cf_warning(AS_UDF, "could not resolve udf file path: path too long for %s", filename);
+		filepath[0] = '\0';
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Reads the named file from the user path.
+ *
+ * On success *content is a cf_malloc'd buffer holding the base64 encoding of
+ * the file (the caller frees it), *content_len is its size, and hash receives
+ * the value produced by file_generation().
+ *
+ * Returns 0 on success, 1 if the file cannot be resolved or opened, 2 if it
+ * is empty. On failure *content is NULL and *content_len is 0.
+ */
+static int file_read(char * filename, uint8_t ** content, size_t * content_len, unsigned char * hash) {
+	char filepath[UDF_FILEPATH_SIZE] = {0};
+	char line[1024] = {0};
+
+	*content = NULL;
+	*content_len = 0;
+
+	if (file_resolve(filepath, filename, NULL) != 0) {
+		return 1;
+	}
+
+	FILE *file = fopen(filepath, "r");
+
+	if (! file) {
+		return 1;
+	}
+
+	cf_dyn_buf_define(buf);
+
+	while (fgets(line, sizeof(line), file) != NULL) {
+		cf_dyn_buf_append_string(&buf, line);
+	}
+
+	fclose(file);
+
+	if (buf.used_sz == 0) {
+		cf_dyn_buf_free(&buf);
+		return 2;
+	}
+
+	uint32_t src_len = (uint32_t)buf.used_sz;
+	char *src = cf_dyn_buf_strdup(&buf);
+
+	// The copy is all that is needed from here on - release the dyn buf so
+	// that it is not leaked.
+	cf_dyn_buf_free(&buf);
+
+	file_generation(filepath, (uint8_t *)src, src_len, hash);
+
+	uint32_t out_size = cf_b64_encoded_len(src_len);
+
+	*content = (uint8_t *)cf_malloc(out_size);
+	*content_len = out_size;
+
+	cf_b64_encode((const uint8_t*)src, src_len, (char*)(*content));
+
+	cf_free(src);
+
+	return 0;
+}
+
+/*
+ * Writes content_len bytes of content to the named file in the user path,
+ * replacing any existing file, then fills hash via file_generation().
+ *
+ * Returns 0 on success, -1 if the path cannot be resolved, the file cannot be
+ * opened, the content is empty, or the data is not written completely.
+ */
+static int file_write(char * filename, uint8_t * content, size_t content_len, unsigned char * hash) {
+	char filepath[UDF_FILEPATH_SIZE] = {0};
+
+	if (file_resolve(filepath, filename, NULL) != 0) {
+		return -1;
+	}
+
+	FILE *file = fopen(filepath, "w");
+
+	if (file == NULL) {
+		cf_warning(AS_UDF, "could not open udf put to %s: %s", filepath, cf_strerror(errno));
+		return -1;
+	}
+
+	size_t written = fwrite(content, sizeof(char), content_len, file);
+
+	// fwrite() reports the number of items written. Anything short of
+	// content_len is a failure, and so is empty content.
+	if (written == 0 || written != content_len) {
+		cf_warning(AS_UDF, "could not write file %s: %zu", filepath, written);
+		fclose(file);
+		return -1;
+	}
+
+	// Buffered data may only hit the file here, so check the close too.
+	if (fclose(file) != 0) {
+		cf_warning(AS_UDF, "could not write file %s: %s", filepath, cf_strerror(errno));
+		return -1;
+	}
+
+	file_generation(filepath, content, content_len, hash);
+
+	return 0;
+}
+
+/*
+ * Unlinks the named file from the user path. A failing unlink() is ignored.
+ *
+ * Returns 0, or -1 if the path cannot be resolved (nothing is unlinked then).
+ */
+static int file_remove(char * filename) {
+	char filepath[UDF_FILEPATH_SIZE] = {0};
+
+	if (file_resolve(filepath, filename, NULL) != 0) {
+		return -1;
+	}
+
+	unlink(filepath);
+
+	return 0;
+}
+
+/*
+ * Stores the base64 encoded SHA-1 of content in hash, NUL terminated.
+ * hash must have room for cf_b64_encoded_len(SHA_DIGEST_LENGTH) + 1 bytes.
+ * filename is not used. Always returns 0.
+ */
+static int file_generation(char * filename, uint8_t * content, size_t content_len, unsigned char * hash) {
+	unsigned char sha1[SHA_DIGEST_LENGTH] = {0};
+
+	(void)filename;
+
+	SHA1((const unsigned char *) content, (unsigned long) content_len, sha1);
+	cf_b64_encode(sha1, SHA_DIGEST_LENGTH, (char*)hash);
+	hash[cf_b64_encoded_len(SHA_DIGEST_LENGTH)] = 0;
+
+	return 0;
+}
+
+// Returns the index of type in as_udf_type_name, or -1 if it is not listed.
+static int udf_type_getid(char *type) {
+	for (int index = 0; as_udf_type_name[index]; index++) {
+		if (strcmp(type, as_udf_type_name[index]) == 0) {
+			return index;
+		}
+	}
+
+	return -1;
+}
+
+/*
+ * Type for user data passed to the get metadata callback.
+ */
+typedef struct udf_get_data_s {
+	cf_dyn_buf *db; // DynBuf for output.
+	pthread_cond_t *cv; // Condition variable for signaling callback completion.
+	pthread_mutex_t *mt; // Mutex protecting the condition variable.
+	bool done; // Has the callback finished?
+} udf_get_data_t;
+
+/*
+ * UDF SMD get metadata items callback.
+ *
+ * Appends one "filename=<key>,hash=<sha1 hex of value>,type=<type>;" entry per
+ * item to the caller's dyn buf, then marks the request done and signals the
+ * caller's condition variable while holding its mutex.
+ *
+ * Returns the result of pthread_cond_signal().
+ */
+static int udf_cask_get_metadata_cb(char *module, as_smd_item_list_t *items, void *udata)
+{
+	udf_get_data_t *get_data = (udf_get_data_t *) udata;
+	cf_dyn_buf *out = get_data->db;
+
+	// Raw digest, and the hex string of it that goes to the client.
+	unsigned char digest[SHA_DIGEST_LENGTH];
+	unsigned char digest_hex[CF_SHA_HEX_BUFF_LEN];
+
+	// Every module is reported as LUA for now.
+	uint8_t udf_type = AS_UDF_TYPE_LUA;
+
+	for (int index = 0; index < items->num_items; index++) {
+		as_smd_item_t *item = items->item[index];
+
+		cf_debug(AS_UDF, "UDF metadata item[%d]: module \"%s\" ; key \"%s\" ; value \"%s\" ; generation %u ; timestamp %lu",
+				index, item->module_name, item->key, item->value, item->generation, item->timestamp);
+
+		cf_dyn_buf_append_string(out, "filename=");
+		cf_dyn_buf_append_buf(out, (uint8_t *)item->key, strlen(item->key));
+		cf_dyn_buf_append_string(out, ",");
+
+		SHA1((uint8_t *)item->value, strlen(item->value), digest);
+		cf_convert_sha1_to_hex(digest, digest_hex);
+
+		cf_dyn_buf_append_string(out, "hash=");
+		cf_dyn_buf_append_buf(out, digest_hex, CF_SHA_HEX_BUFF_LEN);
+		cf_dyn_buf_append_string(out, ",type=");
+		cf_dyn_buf_append_string(out, as_udf_type_name[udf_type]);
+		cf_dyn_buf_append_string(out, ";");
+	}
+
+	pthread_mutex_lock(get_data->mt);
+
+	get_data->done = true;
+
+	int rv = pthread_cond_signal(get_data->cv);
+
+	if (rv != 0) {
+		cf_warning(AS_UDF, "pthread_cond_signal failed (rv %d)", rv);
+	}
+
+	pthread_mutex_unlock(get_data->mt);
+
+	return rv;
+}
+
+/*
+ * Implementation of the "udf-list" Info. Command.
+ *
+ * Asks SMD for the UDF module's metadata and blocks until the callback has
+ * written the listing into out. Returns 0 on success, otherwise the failing
+ * return value of as_smd_get_metadata() or pthread_cond_wait().
+ */
+int udf_cask_info_list(char *name, cf_dyn_buf *out)
+{
+	cf_debug(AS_UDF, "UDF CASK INFO LIST");
+
+	pthread_mutex_t done_mutex = PTHREAD_MUTEX_INITIALIZER;
+	pthread_cond_t done_cond = PTHREAD_COND_INITIALIZER;
+
+	udf_get_data_t get_data = {
+		.db = out,
+		.cv = &done_cond,
+		.mt = &done_mutex,
+		.done = false
+	};
+
+	// Held across the request; the callback takes this mutex before it sets
+	// 'done' and signals.
+	pthread_mutex_lock(&done_mutex);
+
+	int rv = as_smd_get_metadata(udf_smd_module_name, "", udf_cask_get_metadata_cb, &get_data);
+
+	if (rv != 0) {
+		cf_warning(AS_UDF, "failed to get UDF metadata (rv %d)", rv);
+	}
+	else {
+		// Re-check 'done' after every wakeup to ride out spurious ones.
+		while (! get_data.done) {
+			rv = pthread_cond_wait(&done_cond, &done_mutex);
+
+			if (rv != 0) {
+				cf_warning(AS_UDF, "pthread_cond_wait failed (rv %d)", rv);
+				break;
+			}
+		}
+	}
+
+	pthread_mutex_unlock(&done_mutex);
+
+	pthread_mutex_destroy(&done_mutex);
+	pthread_cond_destroy(&done_cond);
+
+	return rv;
+}
+
+/*
+ * Reading local directory to get specific module item's contents.
+ * In future if needed we can change this to reading from smd metadata.
+ *
+ * Writes "gen=<hash>;type=<type>;content=<base64>;" to out on success, or an
+ * "error=..." token. Always returns 0.
+ */
+int udf_cask_info_get(char *name, char * params, cf_dyn_buf * out) {
+
+	int resp = 0;
+	char filename[128] = {0};
+	int filename_len = sizeof(filename);
+	uint8_t * content = NULL;
+	size_t content_len = 0;
+	unsigned char content_gen[256] = {0};
+	uint8_t udf_type = AS_UDF_TYPE_LUA;
+
+	cf_debug(AS_INFO, "UDF CASK INFO GET");
+
+	// get (required) script filename
+	if ( as_info_parameter_get(params, "filename", filename, &filename_len) ) {
+		cf_info(AS_INFO, "invalid or missing filename");
+		cf_dyn_buf_append_string(out, "error=invalid_filename");
+		return 0;
+	}
+
+	// read the script from filesystem, under the mod-lua read lock
+	mod_lua_rdlock(&mod_lua);
+	resp = file_read(filename, &content, &content_len, content_gen);
+	mod_lua_unlock(&mod_lua);
+
+	switch ( resp ) {
+		case 0 : {
+			// put back the result
+			cf_dyn_buf_append_string(out, "gen=");
+			cf_dyn_buf_append_string(out, (char *) content_gen);
+			cf_dyn_buf_append_string(out, ";type=");
+			cf_dyn_buf_append_string(out, as_udf_type_name[udf_type]);
+			cf_dyn_buf_append_string(out, ";content=");
+			cf_dyn_buf_append_buf(out, content, content_len);
+			cf_dyn_buf_append_string(out, ";");
+			break;
+		}
+		case 1 : {
+			cf_dyn_buf_append_string(out, "error=not_found");
+			break;
+		}
+		case 2 : {
+			cf_dyn_buf_append_string(out, "error=empty");
+			break;
+		}
+		default : {
+			cf_dyn_buf_append_string(out, "error=unknown_error");
+			break;
+		}
+	}
+
+	if ( content ) {
+		cf_free(content);
+		content = NULL;
+	}
+
+	return 0;
+}
+
+// An info put call will call system metadata
+//
+// Data is reflected into json as an object with the following fields
+// which can be added to later if necessary, for example, instead of using
+// the specific data, it could include the URL to the data
+//
+// key - name of the UDF file
+//
+// content64 - base64 encoded data
+// type - language to execute
+// name - repetition of the name, same as the key
+//
+// Request problems are reported to the client as an "error=..." token in out
+// with a return value of 0. -1 is returned when the JSON object cannot be
+// built or serialized, or when SMD rejects the metadata.
+
+int udf_cask_info_put(char *name, char * params, cf_dyn_buf * out) {
+
+	cf_debug(AS_INFO, "UDF CASK INFO PUT");
+
+	int rc = 0;
+	char filename[128] = {0};
+	int filename_len = sizeof(filename);
+	// Content_len from the client and its expected size
+	char content_len[32] = {0};
+	int clen = sizeof(content_len);
+	// Udf content from the client and its expected length
+	char *udf_content = NULL;
+	int udf_content_len = 0;
+	// Udf type from the client and its expected size
+	char type[8] = {0};
+	int type_len = sizeof(type);
+
+	// get (required) script filename
+	char *tmp_char;
+
+	if ( as_info_parameter_get(params, "filename", filename, &filename_len)
+			|| !(tmp_char = strchr(filename, '.')) // No extension in filename
+			|| tmp_char == filename // '.' at the beginning of filename
+			|| strlen (tmp_char) <= 1) { // '.' in filename, but no extension e.g. "abc."
+		cf_info(AS_INFO, "invalid or missing filename");
+		cf_dyn_buf_append_string(out, "error=invalid_filename");
+		return 0;
+	}
+
+	if ( as_info_parameter_get(params, "content-len", content_len, &(clen)) ) {
+		cf_info(AS_INFO, "invalid or missing content-len");
+		cf_dyn_buf_append_string(out, "error=invalid_content_len");
+		return 0;
+	}
+
+	if ( as_info_parameter_get(params, "udf-type", type, &type_len) ) {
+		// Replace with DEFAULT IS LUA
+		strcpy(type, as_udf_type_name[0]);
+	}
+
+	// check type field
+	if (-1 == udf_type_getid(type)) {
+		cf_info(AS_INFO, "invalid or missing udf-type : %s not valid", type);
+		cf_dyn_buf_append_string(out, "error=invalid_udf_type");
+		return 0;
+	}
+
+	// The client-supplied length sizes the allocation below, so it must be
+	// positive and no larger than the base64 size of the biggest UDF accepted
+	// further down.
+	int declared_len = atoi(content_len);
+
+	if (declared_len <= 0 ||
+			(uint32_t)declared_len > cf_b64_encoded_len(MAX_UDF_CONTENT_LENGTH)) {
+		cf_info(AS_INFO, "invalid or missing content-len");
+		cf_dyn_buf_append_string(out, "error=invalid_content_len");
+		return 0;
+	}
+
+	// get b64 encoded script
+	udf_content_len = declared_len + 1;
+	udf_content = (char *) cf_malloc(udf_content_len);
+
+	// get (required) script content - base64 encoded here.
+	if ( as_info_parameter_get(params, "content", udf_content, &(udf_content_len)) ) {
+		cf_info(AS_UDF, "invalid content");
+		cf_dyn_buf_append_string(out, "error=invalid_content");
+		cf_free(udf_content);
+		return 0;
+	}
+
+	// base 64 decode it
+	uint32_t encoded_len = strlen(udf_content);
+	uint32_t decoded_len = cf_b64_decoded_buf_size(encoded_len) + 1;
+
+	// Don't allow UDF file size > 1MB
+	if ( decoded_len > MAX_UDF_CONTENT_LENGTH) {
+		cf_info(AS_INFO, "lua file size:%d > 1MB", decoded_len);
+		cf_dyn_buf_append_string(out, "error=invalid_udf_content_len, lua file size > 1MB");
+		cf_free(udf_content);
+		return 0;
+	}
+
+	char * decoded_str = cf_malloc(decoded_len);
+
+	if ( ! cf_b64_validate_and_decode(udf_content, encoded_len, (uint8_t*)decoded_str, &decoded_len) ) {
+		cf_info(AS_UDF, "invalid base64 content %s", filename);
+		cf_dyn_buf_append_string(out, "error=invalid_base64_content");
+		cf_free(decoded_str);
+		cf_free(udf_content);
+		return 0;
+	}
+
+	decoded_str[decoded_len] = '\0';
+
+	// The decoded script is only needed to validate it; what gets stored is
+	// the base64 form in udf_content.
+	as_module_error err;
+	rc = as_module_validate(&mod_lua, NULL, filename, decoded_str, decoded_len, &err);
+
+	cf_free(decoded_str);
+	decoded_str = NULL;
+	decoded_len = 0;
+
+	if ( rc ) {
+		cf_warning(AS_UDF, "udf-put: compile error: [%s:%d] %s", err.file, err.line, err.message);
+		cf_dyn_buf_append_string(out, "error=compile_error");
+		cf_dyn_buf_append_string(out, ";file=");
+		cf_dyn_buf_append_string(out, err.file);
+		cf_dyn_buf_append_string(out, ";line=");
+		cf_dyn_buf_append_uint32(out, err.line);
+
+		// The message is sent base64 encoded.
+		uint32_t message_len = strlen(err.message);
+		uint32_t enc_message_len = cf_b64_encoded_len(message_len);
+		char enc_message[enc_message_len];
+
+		cf_b64_encode((const uint8_t*)err.message, message_len, enc_message);
+
+		cf_dyn_buf_append_string(out, ";message=");
+		cf_dyn_buf_append_buf(out, (uint8_t *)enc_message, enc_message_len);
+
+		cf_free(udf_content);
+		return 0;
+	}
+
+	// Create an empty JSON object
+	json_t *udf_obj = 0;
+	if (!(udf_obj = json_object())) {
+		cf_warning(AS_UDF, "failed to create JSON array for receiving UDF");
+		cf_free(udf_content);
+		return -1;
+	}
+	int e = 0;
+	e += json_object_set_new(udf_obj, "content64", json_string(udf_content));
+	e += json_object_set_new(udf_obj, "type", json_string(type));
+	e += json_object_set_new(udf_obj, "name", json_string(filename));
+
+	cf_free(udf_content);
+
+	if (e) {
+		cf_warning(AS_UDF, "could not encode UDF object, error %d", e);
+		json_decref(udf_obj);
+		return(-1);
+	}
+	// make it into a string, yet another buffer copy
+	char *udf_obj_str = json_dumps(udf_obj, 0/*flags*/);
+	json_decref(udf_obj);
+	udf_obj = 0;
+
+	// json_dumps() returns NULL on failure - do not hand that to SMD.
+	if (! udf_obj_str) {
+		cf_warning(AS_UDF, "could not serialize UDF object for %s", filename);
+		return(-1);
+	}
+
+	cf_debug(AS_UDF, "created json object %s", udf_obj_str);
+
+	// how do I know whether to call create or add?
+	e = as_smd_set_metadata(udf_smd_module_name, filename, udf_obj_str);
+	if (e) {
+		cf_warning(AS_UDF, "could not add UDF metadata, error %d", e);
+		cf_free(udf_obj_str);
+		return(-1);
+	}
+
+	cf_info(AS_UDF, "UDF module '%s' (%s/%s) registered", filename, g_config.mod_lua.user_path, filename);
+
+	// free the metadata
+	cf_free(udf_obj_str);
+	udf_obj_str = 0;
+
+	return 0;
+}
+
+/*
+ * Implementation of UDF removal: checks that the named file exists in the
+ * user path, then deletes its SMD metadata.
+ *
+ * Writes "ok" to out and returns 0 on success. A missing or invalid filename
+ * is reported as "error=invalid_filename" with return value 0. Returns -1 if
+ * no user path is configured or the file is not found (the latter also
+ * writes "error=file_not_found").
+ */
+int udf_cask_info_remove(char *name, char * params, cf_dyn_buf * out) {
+
+	char filename[128] = {0};
+	int filename_len = sizeof(filename);
+	char file_path[1024] = {0};
+	struct stat buf;
+
+	cf_debug(AS_INFO, "UDF CASK INFO REMOVE");
+
+	// get (required) script filename
+	if ( as_info_parameter_get(params, "filename", filename, &filename_len) ) {
+		cf_info(AS_UDF, "invalid or missing filename");
+		cf_dyn_buf_append_string(out, "error=invalid_filename");
+		// Stop here - carrying on would stat the user path itself and
+		// delete metadata for an empty key.
+		return 0;
+	}
+
+	// now check if such a file-name exists :
+	if (!g_config.mod_lua.user_path)
+	{
+		return -1;
+	}
+
+	snprintf(file_path, sizeof(file_path), "%s/%s", g_config.mod_lua.user_path, filename);
+
+	cf_debug(AS_INFO, " Lua file removal full-path is : %s \n", file_path);
+
+	if (stat(file_path, &buf) != 0) {
+		cf_info(AS_UDF, "failed to read file from : %s, error : %s", file_path, cf_strerror(errno));
+		cf_dyn_buf_append_string(out, "error=file_not_found");
+		return -1;
+	}
+
+	as_smd_delete_metadata(udf_smd_module_name, filename);
+
+	// this is what an error would look like
+	// cf_dyn_buf_append_string(out, "error=");
+	// cf_dyn_buf_append_int(out, resp);
+
+	cf_dyn_buf_append_string(out, "ok");
+
+	cf_info(AS_UDF, "UDF module '%s' (%s) removed", filename, file_path);
+
+	return 0;
+}
+
+/*
+ * Clear out the Lua cache.
+ * Sends AS_MODULE_EVENT_CLEAR_CACHE to mod_lua under its write lock, writes
+ * "ok" to out and returns 0.
+ */
+int udf_cask_info_clear_cache(char *name, char * params, cf_dyn_buf * out)
+{
+	cf_debug(AS_INFO, "UDF CASK INFO CLEAR CACHE");
+
+	mod_lua_wrlock(&mod_lua);
+
+	as_module_event e = {
+		.type = AS_MODULE_EVENT_CLEAR_CACHE
+	};
+	as_module_update(&mod_lua, &e);
+
+	mod_lua_unlock(&mod_lua);
+
+	cf_dyn_buf_append_string(out, "ok");
+
+	return 0;
+}
+
+/**
+ * (Re-)Configure UDF modules
+ * Passes g_config.mod_lua to as_module_configure(). Always returns 0.
+ */
+int udf_cask_info_configure(char *name, char * params, cf_dyn_buf * buf) {
+	as_module_configure(&mod_lua, &g_config.mod_lua);
+	return 0;
+}
+
+//
+// take a current list and return the new list
+// Validates that items are correct? or is that done with the add?
+// How do you signal that there are no changes between the current list and the new list?
+
+int
+udf_cask_smd_merge_fn (char *module, as_smd_item_list_t **item_list_out, as_smd_item_list_t **item_lists_in, size_t num_lists, void *udata)
+{
+	cf_debug(AS_UDF, "UDF CASK merge function");
+
+	// (For now, just send back an empty metadata item list.)
+	*item_list_out = as_smd_item_list_create(0);
+
+	return(0);
+}
+
+// This function must take the current "view of the world" and
+// make the local store the same as that.
+//
+// On the module-creation callback (AS_SMD_ACCEPT_OPT_CREATE) it only marks
+// the UDF SMD module as loaded. Otherwise each item is applied in turn: a SET
+// writes the decoded script to the user path and tells mod_lua the file was
+// added; a DELETE unlinks the file and tells mod_lua it was removed. Items
+// that cannot be applied are logged and skipped. Always returns 0.
+
+int
+udf_cask_smd_accept_fn(char *module, as_smd_item_list_t *items, void *udata, uint32_t accept_opt)
+{
+	if (accept_opt & AS_SMD_ACCEPT_OPT_CREATE) {
+		cf_debug(AS_UDF, "(doing nothing in UDF accept cb for module creation)");
+		g_udf_smd_loaded = true;
+		return 0;
+	}
+
+	cf_debug(AS_UDF, "UDF CASK accept fn : n items %zu", items->num_items);
+
+	for (size_t i = 0; i < items->num_items ; i++) {
+
+		as_smd_item_t *item = items->item[i];
+
+		if (item->action == AS_SMD_ACTION_SET) {
+
+			json_error_t json_error;
+			json_t *item_obj = json_loads(item->value, 0 /*flags*/, &json_error);
+			if (!item_obj) {
+				cf_warning(AS_UDF, "failed to parse UDF \"%s\" with JSON error: %s ; source: %s ; line: %d ; column: %d ; position: %d",
+						item->key, json_error.text, json_error.source, json_error.line, json_error.column, json_error.position);
+				continue;
+			}
+
+			/*item->key is name */
+			json_t *content64_obj = json_object_get(item_obj, "content64");
+			const char *content64_str = json_string_value(content64_obj);
+
+			// json_string_value() yields NULL when "content64" is missing
+			// or is not a string - skip the item instead of crashing.
+			if (! content64_str) {
+				cf_warning(AS_UDF, "UDF \"%s\" has no content64 string, will not register", item->key);
+				json_decref(item_obj);
+				continue;
+			}
+
+			// base 64 decode it
+			uint32_t encoded_len = strlen(content64_str);
+			uint32_t decoded_len = cf_b64_decoded_buf_size(encoded_len) + 1;
+			char *content_str = cf_malloc(decoded_len);
+
+			if (! cf_b64_validate_and_decode(content64_str, encoded_len, (uint8_t*)content_str, &decoded_len)) {
+				cf_info(AS_UDF, "invalid script on accept, will not register %s", item->key);
+				cf_free(content_str);
+				json_decref(item_obj);
+				continue;
+			}
+
+			content_str[decoded_len] = 0;
+
+			cf_debug(AS_UDF, "pushing to %s, %d bytes [%s]", item->key, decoded_len, content_str);
+
+			// The file write and the cache update happen under one write lock.
+			mod_lua_wrlock(&mod_lua);
+
+			// content_gen receives the hash produced by file_write(); it is
+			// not used afterwards.
+			unsigned char content_gen[256] = {0};
+			int e = file_write(item->key, (uint8_t *) content_str, decoded_len, content_gen);
+			cf_free(content_str);
+			json_decref(item_obj);
+			if ( e ) {
+				mod_lua_unlock(&mod_lua);
+				cf_info(AS_UDF, "invalid script on accept, will not register %s", item->key);
+				continue;
+			}
+			// Update the cache
+			as_module_event ame = {
+				.type = AS_MODULE_EVENT_FILE_ADD,
+				.data.filename = item->key
+			};
+			as_module_update(&mod_lua, &ame);
+			mod_lua_unlock(&mod_lua);
+		}
+		else if (item->action == AS_SMD_ACTION_DELETE) {
+			cf_debug(AS_UDF, "received DELETE SMD action %d key %s", item->action, item->key);
+
+			mod_lua_wrlock(&mod_lua);
+			file_remove(item->key);
+
+			// fixes potential cache issues
+			as_module_event e = {
+				.type = AS_MODULE_EVENT_FILE_REMOVE,
+				.data.filename = item->key
+			};
+			as_module_update(&mod_lua, &e);
+
+			mod_lua_unlock(&mod_lua);
+
+		}
+		else {
+			cf_info(AS_UDF, "received unknown SMD action %d", item->action);
+		}
+	}
+
+	return(0);
+}
+
+
+/*
+ * Start-up initialization of the UDF cask: removes every entry found in the
+ * mod-lua user path, creates the UDF SMD module with udf_cask_smd_accept_fn
+ * as its accept callback, then polls until that callback reports the module
+ * as loaded. A missing user path, an unreadable directory or a failed module
+ * creation ends in cf_crash().
+ */
+void
+udf_cask_init()
+{
+	// Have to delete the existing files in the user path on startup
+	struct dirent * entry = NULL;
+	// opendir(NULL) seg-faults
+	if (!g_config.mod_lua.user_path)
+	{
+		cf_crash(AS_UDF, "cask init: null mod-lua user-path");
+	}
+
+	DIR *dir = opendir(g_config.mod_lua.user_path);
+	if ( dir == 0 ) {
+		cf_crash(AS_UDF, "cask init: could not open udf directory %s: %s", g_config.mod_lua.user_path, cf_strerror(errno));
+	}
+	while ( (entry = readdir(dir))) {
+		// readdir also reads "." and ".." entries.
+		if (strcmp(entry->d_name, ".") && strcmp(entry->d_name, ".."))
+		{
+			char fn[1024];
+			snprintf(fn, sizeof(fn), "%s/%s", g_config.mod_lua.user_path, entry->d_name);
+			int rem_rv = remove(fn);
+			if (rem_rv != 0) {
+				cf_warning(AS_UDF, "Failed to remove the file %s. Error %d", fn, errno);
+			}
+		}
+	}
+	closedir(dir);
+
+	// as_smd_create_module(udf_smd_module_name, udf_cask_smd_merge_fn, 0, udf_cask_smd_accept_fn, 0);
+	// take the default merge function
+	if (as_smd_create_module(udf_smd_module_name, 0, 0, 0, 0, udf_cask_smd_accept_fn, 0, 0, 0)) {
+		cf_crash(AS_UDF, "failed to create SMD module \"%s\"", udf_smd_module_name);
+	}
+
+	// Wait for the accept callback to set g_udf_smd_loaded.
+	while (! g_udf_smd_loaded) {
+		usleep(1000);
+	}
+
+	// there may be existing data. Read it and populate the local file system.
+}
diff --git a/as/src/base/udf_memtracker.c b/as/src/base/udf_memtracker.c
new file mode 100644
index 00000000..aceaded4
--- /dev/null
+++ b/as/src/base/udf_memtracker.c
@@ -0,0 +1,105 @@
+/*
+ * udf_memtracker.c
+ *
+ * Copyright (C) 2012-2014 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program.
If not, see http://www.gnu.org/licenses/ + */ + + +#include + +#include "fault.h" + +#include "base/udf_memtracker.h" + + +/***************************************************************************** + * STATIC FUNCTIONS + *****************************************************************************/ + +static pthread_key_t modules_tlskey = 0; +static as_memtracker g_udf_memtracker; +static int +udf_memtracker_generic(mem_tracker *mt, const uint32_t num_bytes, memtracker_op op) +{ + if (!mt || !mt->udata || !mt->cb) { + return false; + } + + mt->cb(mt, num_bytes, op); + if (op == MEM_RESERVE) { + cf_detail(AS_UDF, "%ld: Memory Tracker %p reserved = %d (bytes)", + pthread_self(), mt, num_bytes); + } else if (op == MEM_RELEASE) { + cf_detail(AS_UDF, "%ld: Memory Tracker %p released = %d (bytes)", + pthread_self(), mt, num_bytes); + } else { + cf_detail(AS_UDF, "%ld: Memory Tracker %p reset", + pthread_self(), mt); + } + return 0; +} + +void +udf_memtracker_setup(mem_tracker *mt) +{ + pthread_setspecific(modules_tlskey, mt); + cf_detail(AS_UDF, "%ld: Memory Tracker %p set", pthread_self(), mt); +} + +void +udf_memtracker_cleanup() +{ + pthread_setspecific(modules_tlskey, NULL); + cf_detail(AS_UDF, "%ld: Memory Tracker reset", pthread_self()); +} + +static bool +udf_memtracker_reset(const as_memtracker *as_mt) { + mem_tracker *mt = (mem_tracker *)pthread_getspecific(modules_tlskey); + return udf_memtracker_generic(mt, 0, MEM_RESET); + +} + +static bool +udf_memtracker_reserve(const as_memtracker *as_mt, const uint32_t num_bytes) +{ + mem_tracker *mt = (mem_tracker *)pthread_getspecific(modules_tlskey); + return udf_memtracker_generic(mt, num_bytes, MEM_RESERVE); +} + +static bool +udf_memtracker_release(const as_memtracker *as_mt, const uint32_t num_bytes) +{ + mem_tracker *mt = (mem_tracker *)pthread_getspecific(modules_tlskey); + return udf_memtracker_generic(mt, num_bytes, MEM_RELEASE); +} + +static const as_memtracker_hooks udf_memtracker_hooks = { + .destroy = 
NULL, + .reserve = udf_memtracker_reserve, + .release = udf_memtracker_release, + .reset = udf_memtracker_reset +}; + +as_memtracker * +udf_memtracker_init() +{ + as_memtracker_init(&g_udf_memtracker, NULL, &udf_memtracker_hooks); + return &g_udf_memtracker; +} diff --git a/as/src/base/udf_record.c b/as/src/base/udf_record.c new file mode 100644 index 00000000..2a740e9a --- /dev/null +++ b/as/src/base/udf_record.c @@ -0,0 +1,959 @@ +/* + * udf_record.c + * + * Copyright (C) 2012-2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#include "base/udf_record.h" + +#include +#include +#include +#include + +#include "aerospike/as_rec.h" +#include "aerospike/as_val.h" +#include "citrusleaf/alloc.h" +#include "citrusleaf/cf_atomic.h" +#include "citrusleaf/cf_byte_order.h" +#include "citrusleaf/cf_clock.h" + +#include "fault.h" + +#include "base/cfg.h" +#include "base/datamodel.h" +#include "base/index.h" +#include "base/rec_props.h" +#include "base/transaction.h" +#include "storage/storage.h" +#include "transaction/rw_utils.h" +#include "transaction/udf.h" + + +/* + * Function: Open storage record for passed in udf record + * also set up flag like exists / read et al. 
+ * + * Parameters: + * urec : UDF record + * + * Return value : 0 on success + * -1 if the record's bin count exceeds the UDF limit + * + * Callers: + * udf_record_open + * + * Note: There are no checks, so the caller has to make sure that all + * protections are taken and all checks are done. + * + * Side effect: + * Counters will be reset + * flag will be set + * bins will be opened + */ +int +udf_storage_record_open(udf_record *urecord) +{ + cf_debug_digest(AS_UDF, &urecord->tr->keyd, "[ENTER] Opening record key:"); + as_storage_rd *rd = urecord->rd; + as_index *r = urecord->r_ref->r; + as_transaction *tr = urecord->tr; + + as_storage_record_open(tr->rsv.ns, r, rd); + + // Deal with delete durability (enterprise only). + if ((urecord->flag & UDF_RECORD_FLAG_ALLOW_UPDATES) != 0 && + set_delete_durablility(tr, rd) != 0) { + as_storage_record_close(rd); + return -1; + } + + as_storage_rd_load_n_bins(rd); // TODO - handle error returned + + if (rd->n_bins > UDF_RECORD_BIN_ULIMIT) { + cf_warning(AS_UDF, "record has too many bins (%d) for UDF processing", rd->n_bins); + as_storage_record_close(rd); + return -1; + } + + // if multibin storage, we will use urecord->stack_bins, so set the size appropriately + if ( ! tr->rsv.ns->storage_data_in_memory && ! 
tr->rsv.ns->single_bin ) { + rd->n_bins = sizeof(urecord->stack_bins) / sizeof(as_bin); + } + + as_storage_rd_load_bins(rd, urecord->stack_bins); // TODO - handle error returned + urecord->starting_memory_bytes = as_storage_record_get_n_bytes_memory(rd); + + as_storage_record_get_key(rd); + + urecord->flag |= UDF_RECORD_FLAG_STORAGE_OPEN; + + cf_detail_digest(AS_UDF, &tr->keyd, "Storage Open: Rec(%p) flag(%x) Digest:", urecord, urecord->flag); + return 0; +} + +/* + * Function: Close storage record if it open and also set flags + * + * Parameters: + * urec : UDF record + * + * Return value : 0 in case storage was open + * 1 in case storage was not open + * + * Callers: + * udf_record_close + * + * Side effect: + * flag will be reset + * bins will be closed + */ +int +udf_storage_record_close(udf_record *urecord) +{ + if (urecord->flag & UDF_RECORD_FLAG_STORAGE_OPEN) { + as_index_ref *r_ref = urecord->r_ref; + as_storage_rd *rd = urecord->rd; + + // In case allow update is not set .. the record has been opened for + // the aggregation. Do not do any rec property update. + // Pick info from index and put it in storage record. + size_t rec_props_data_size = as_storage_record_rec_props_size(rd); + uint8_t rec_props_data[rec_props_data_size]; + if (urecord->flag & UDF_RECORD_FLAG_ALLOW_UPDATES) { + if (rec_props_data_size > 0) { + as_storage_record_set_rec_props(rd, rec_props_data); + } + } + + bool has_bins = as_bin_inuse_has(rd); + + if (r_ref) { + if (urecord->flag & UDF_RECORD_FLAG_HAS_UPDATES) { + as_storage_record_write(rd); + urecord->flag &= ~UDF_RECORD_FLAG_HAS_UPDATES; // TODO - necessary? + } + + if (! has_bins) { + write_delete_record(r_ref->r, urecord->tr->rsv.tree); + } + + as_storage_record_close(rd); + } else { + // Should never happen. 
+ cf_warning(AS_UDF, "Unexpected Internal Error (null r_ref)"); + } + + urecord->flag &= ~UDF_RECORD_FLAG_STORAGE_OPEN; + cf_detail_digest(AS_UDF, &urecord->tr->keyd, "Storage Close:: Rec(%p) Flag(%x) Digest:", + urecord, urecord->flag ); + return 0; + } else { + return 1; + } +} + +/* + * Function: Open storage record for passed in udf record + * also set up flag like exists / read et al. + * Does as_record_get as well if it is not done yet. + * + * Parameters: + * urec : UDF record + * + * Return value : + * 0 in case record is successfully read + * -1 in case record is not found + * -2 in case record is found but has expired + * + * Callers: + * query_agg_istream_read + */ +int +udf_record_open(udf_record * urecord) +{ + cf_debug_digest(AS_UDF, &urecord->tr->keyd, "[ENTER] Opening record key:"); + if (urecord->flag & UDF_RECORD_FLAG_STORAGE_OPEN) { + cf_info(AS_UDF, "Record already open"); + return 0; + } + as_transaction *tr = urecord->tr; + as_index_ref *r_ref = urecord->r_ref; + as_index_tree *tree = tr->rsv.tree; + + int rec_rv = 0; + if (!(urecord->flag & UDF_RECORD_FLAG_OPEN)) { + cf_detail(AS_UDF, "Opening Record"); + rec_rv = as_record_get_live(tree, &tr->keyd, r_ref, tr->rsv.ns); + } + + if (!rec_rv) { + as_index *r = r_ref->r; + // check to see this isn't an expired record waiting to die + if (as_record_is_doomed(r, tr->rsv.ns)) { + as_record_done(r_ref, tr->rsv.ns); + cf_detail(AS_UDF, "udf_record_open: Record has expired cannot read"); + rec_rv = -2; + } else { + urecord->flag |= UDF_RECORD_FLAG_OPEN; + urecord->flag |= UDF_RECORD_FLAG_PREEXISTS; + cf_detail_digest(AS_UDF, &tr->keyd, "Open %p %x Digest:", urecord, urecord->flag); + rec_rv = udf_storage_record_open(urecord); + } + } else { + cf_detail_digest(AS_UDF, &urecord->tr->keyd, "udf_record_open: rec_get returned with %d ", + rec_rv); + } + return rec_rv; +} + +/* + * Function: Close storage record for udf record. Release + * all locks and partition reservation / namespace + * reservation etc. 
if requested. + * Also cleans up entire cache (updated from udf) + * + * Parameters: + * urec : UDF record being operated on + * + * Return value : Nothing + * + * Callers: + * query_agg_istream_read + * as_query__agg + * udf_record_destroy + */ +void +udf_record_close(udf_record *urecord) +{ + as_transaction *tr = urecord->tr; + cf_debug_digest(AS_UDF, &tr->keyd, "[ENTER] Closing record key:"); + + if (urecord->flag & UDF_RECORD_FLAG_OPEN) { + as_index_ref *r_ref = urecord->r_ref; + cf_detail(AS_UDF, "Closing Record"); + udf_storage_record_close(urecord); + as_record_done(r_ref, tr->rsv.ns); + urecord->flag &= ~UDF_RECORD_FLAG_OPEN; + cf_detail_digest(AS_UDF, &urecord->tr->keyd, + "Storage Close:: Rec(%p) Flag(%x) Digest:", urecord, urecord->flag ); + } + + // Replication happens when the main record replicates + if (urecord->particle_data) { + cf_free(urecord->particle_data); + urecord->particle_data = 0; + } + udf_record_cache_free(urecord); +} + +/* + * Function: This function called to reinitialize the udf_record. It sets up + * the basic value back to default. Can be called after the UDF + * record has been used. Reset the fact that record pre_exits or + * was actually read etc. 
+ * + * Parameters: + * urec : UDF record being initialized + * + * Return value : Nothing + * + * Callers: + * udf_rw_local (parent record before calling UDF) + */ +void +udf_record_init(udf_record *urecord, bool allow_updates) +{ + urecord->tr = NULL; + urecord->r_ref = NULL; + urecord->rd = NULL; + urecord->dirty = NULL; + urecord->nupdates = 0; + urecord->particle_data = NULL; + urecord->cur_particle_data = NULL; + urecord->end_particle_data = NULL; + urecord->starting_memory_bytes = 0; + + // Init flag + urecord->flag = UDF_RECORD_FLAG_ISVALID; + + if (allow_updates) { + urecord->flag |= UDF_RECORD_FLAG_ALLOW_UPDATES; + } + + urecord->keyd = cf_digest_zero; + for (uint32_t i = 0; i < UDF_RECORD_BIN_ULIMIT; i++) { + urecord->updates[i].particle_buf = NULL; + } +} + +/* +static int print_buffer(as_buffer * buff) { + msgpack_sbuffer sbuf; + msgpack_sbuffer_init(&sbuf); + + sbuf.data = buff->data; + sbuf.size = buff->size; + sbuf.alloc = buff->capacity; + + msgpack_zone mempool; + msgpack_zone_init(&mempool, 2048); + + msgpack_object deserialized; + msgpack_unpack(sbuf.data, sbuf.size, NULL, &mempool, &deserialized); + + printf("msg_buf:\n"); + msgpack_object_print(stdout, deserialized); + puts(""); + + msgpack_zone_destroy(&mempool); + return 0; +} +*/ + +/* + * Function: Get bin value from cached copy. All the update in a + * commit window is not applied to the record directly + * but maintained in-memory cache. 
This function used + * to retrieve cached value + * + * Similar function for get and free of cache + * + * Return value : + * value (as_val) in case of success [for get] + * NULL in case of failure + * set and free return Nothing + * + * Callers: + * GET and SET + * udf_record_get + * udf_record_set + * udf_record_remove + * + * FREE + * udf_aerospike__execute_updates (when crossing commit window) + * udf_record_close (finally closing record) + * udf_rw_commit (commit the udf record) + */ +static as_val * +udf_record_cache_get(udf_record * urecord, const char * name) +{ + cf_debug(AS_UDF, "[ENTER] BinName(%s) ", name ); + if ( urecord->nupdates > 0 ) { + cf_detail(AS_UDF, "udf_record_get: %s find", name); + for ( uint32_t i = 0; i < urecord->nupdates; i++ ) { + udf_record_bin * bin = &(urecord->updates[i]); + if ( strncmp(name, bin->name, AS_ID_BIN_SZ) == 0 ) { + cf_detail(AS_UDF, "Bin %s found, type(%d)", name, bin->value->type ); + return bin->value; // note it's OK if the bin contains a nil + } + } + } + return NULL; +} + +void +udf_record_cache_free(udf_record * urecord) +{ + cf_debug(AS_UDF, "[ENTER] NumUpdates(%d) ", urecord->nupdates ); + + for (uint32_t i = 0; i < urecord->nupdates; i ++ ) { + udf_record_bin * bin = &urecord->updates[i]; + if ( bin->name[0] != '\0' && bin->value != NULL ) { + bin->name[0] = '\0'; + as_val_destroy(bin->value); + bin->value = NULL; + } + if ( bin->name[0] != '\0' && bin->oldvalue != NULL ) { + bin->name[0] = '\0'; + as_val_destroy(bin->oldvalue); + bin->oldvalue = NULL; + } + } + + for (uint32_t i = 0; i < UDF_RECORD_BIN_ULIMIT; i++) { + if (urecord->updates[i].particle_buf) { + cf_free(urecord->updates[i].particle_buf); + urecord->updates[i].particle_buf = NULL; + } + } + urecord->nupdates = 0; + urecord->flag &= ~UDF_RECORD_FLAG_TOO_MANY_BINS; +} + +/** + * Set the cache value for a bin, including flags. 
+ */ +static void +udf_record_cache_set(udf_record * urecord, const char * name, as_val * value, + bool dirty) +{ + cf_debug(AS_UDF, "[ENTER] urecord(%p) name(%p)[%s] dirty(%d)", + urecord, name, name, dirty); + + bool modified = false; + + for ( uint32_t i = 0; i < urecord->nupdates; i++ ) { + udf_record_bin * bin = &(urecord->updates[i]); + + // bin exists, then we will release old value and set new value. + if ( strncmp(name, bin->name, AS_ID_BIN_SZ) == 0 ) { + cf_detail(AS_UDF, "udf_record_set: %s found", name); + + // release previously set value + as_val_destroy(bin->value); + + // set new value, with dirty flag + if( value != NULL ) { + bin->value = (as_val *) value; + } + bin->dirty = dirty; + cf_detail(AS_UDF, "udf_record_set: %s set for %p:%p", name, + urecord, bin->value); + + modified = true; + break; + } + } + + // If not modified, then we will add the bin to the cache + if ( ! modified ) { + if ( urecord->nupdates < UDF_RECORD_BIN_ULIMIT ) { + udf_record_bin * bin = &(urecord->updates[urecord->nupdates]); + strncpy(bin->name, name, AS_ID_BIN_SZ); + bin->value = (as_val *) value; + bin->dirty = dirty; + urecord->nupdates++; + cf_detail(AS_UDF, "udf_record_set: %s not modified, add for %p:%p", + name, urecord, bin->value); + } + else { + cf_warning(AS_UDF, "UDF bin limit (%d) exceeded (bin %s)", + UDF_RECORD_BIN_ULIMIT, name); + urecord->flag |= UDF_RECORD_FLAG_TOO_MANY_BINS; + } + } +} + +/* + * Internal Function: Read the bin from storage and convert it + * into as_val and return + * + * Parameters: + * r : udf record + * bname: Bin name of the bin which need to be read. + * + * Return value : + * value (as_val *) in case of success + * NULL in case of failure + * + * Description: + * Expectation is the record is already open. No checks are + * performed in this function. Caller needs to make sure the + * record is good to read e.g binname etc. + * + * NB: as_val which is returned is allocated one. 
It is callers + * responsibility to free else in case it is passed on to + * lua ... lua has responsibility of garbage collecting it. + * Hence this function call incurs and malloc cost. + * + * Callers: + * udf_record_get + */ +as_val * +udf_record_storage_get(const udf_record *urecord, const char *name) +{ + if (!name) { + cf_detail(AS_UDF, "Passed Null bin name to storage get"); + return NULL; + } + as_bin * bb = as_bin_get(urecord->rd, name); + + if ( !bb ) { + cf_detail(AS_UDF, "udf_record_get: bin not found (%s)", name); + return NULL; + } + + return as_bin_particle_to_asval(bb); +} + +/* + * Check and validate parameter before performing operation + * + * return: + * 2 : UDF_ERR_INTERNAL_PARAM + * 3 : UDF_ERR_RECORD_IS_NOT_VALID + * 4 : UDF_ERR_PARAMETER + * 0 : Success + * + */ +int +udf_record_param_check(const as_rec *rec, char *fname, int lineno) +{ + if (! rec) { + cf_warning(AS_UDF, "Invalid Parameter: null record"); + return UDF_ERR_INTERNAL_PARAMETER; + } + + udf_record *urecord = (udf_record *)as_rec_source(rec); + if (!urecord) { + return UDF_ERR_INTERNAL_PARAMETER;; + } + + if (!(urecord->flag & UDF_RECORD_FLAG_ISVALID)) { + cf_debug(AS_UDF, "(%s:%d): Trying to Open Invalid Record", fname, lineno); + return UDF_ERR_RECORD_NOT_VALID; + } + + return 0; +} + +static int +udf_record_param_check_w_bin(const as_rec *rec, const char *bname, char *fname, int lineno) +{ + int rv = udf_record_param_check(rec, fname, lineno); + + if (rv != 0) { + return rv; + } + + if (! 
bname) { + cf_warning(AS_UDF, "Invalid Parameter: null bin name"); + return UDF_ERR_INTERNAL_PARAMETER; + } + + udf_record *urecord = (udf_record *)as_rec_source(rec); + as_namespace *ns = urecord->tr->rsv.ns; + + if (ns->single_bin) { + if (*bname != 0) { + cf_warning(AS_UDF, "Invalid Parameter: non-empty bin name in single-bin namespace"); + return UDF_ERR_INTERNAL_PARAMETER; + } + + return 0; + } + + if (*bname == 0) { + cf_warning(AS_UDF, "Invalid Parameter: empty bin name"); + return UDF_ERR_INTERNAL_PARAMETER; + } + + if (strlen(bname) >= AS_ID_BIN_SZ) { + cf_warning(AS_UDF, "Invalid Parameter: bin name %s too big", bname); + return UDF_ERR_PARAMETER; + } + + if (! as_bin_name_within_quota(ns, bname)) { + cf_warning(AS_UDF, "{%s} exceeded bin name quota", ns->name); + return UDF_ERR_PARAMETER; + } + + return 0; +} + +/********************************************************************* + * INTERFACE FUNCTIONS * + * * + * See the as_aerospike for the API definition * + ********************************************************************/ +static as_val * +udf_record_get(const as_rec * rec, const char * name) +{ + if (udf_record_param_check_w_bin(rec, name, __FILE__, __LINE__)) { + return NULL; + } + udf_record * urecord = (udf_record *) as_rec_source(rec); + as_val * value = NULL; + + cf_debug(AS_UDF, "[ENTER] rec(%p) name(%s)", rec, name ); + + // Get from cache + value = udf_record_cache_get(urecord, name); + + // If value not NULL, then return it. + if ( value != NULL ) { + return value; + } + + // Check in the cache before trying to look up in record + // Note: Record may not have been created yet ... 
Do not + // change the order unless you fully understand what you + // are doing + if ( !(urecord->flag & UDF_RECORD_FLAG_STORAGE_OPEN) ) { + if (udf_record_open(urecord)) { // lazy read the record from storage + return NULL; + } + } + + // Check if storage is available + if ( !urecord->rd->ns ) { + cf_detail(AS_UDF, "udf_record_get: storage unavailable"); + return NULL; + } + + value = udf_record_storage_get(urecord, name); + + // We have a value, so we will cache it. + // DO NOT remove this. We need to cache copy to makes sure ref count + // gets decremented post handing this as_val over to the lua world + if (value) { + udf_record_cache_set(urecord, name, value, false); + } + + cf_detail(AS_UDF, "udf_record_get: end (%s) [%p,%p]", name, urecord, value); + return value; +} + +static int +udf_record_set(const as_rec * rec, const char * name, const as_val * value) +{ + int ret = udf_record_param_check_w_bin(rec, name, __FILE__, __LINE__); + if (ret) { + return ret; + } + + udf_record * urecord = (udf_record *) as_rec_source(rec); + cf_detail(AS_UDF, "udf_record_set: begin (%s)", name); + if ( urecord && name ) { + udf_record_cache_set(urecord, name, (as_val *) value, true); + } + cf_detail(AS_UDF, "udf_record_set: end (%s)", name); + + return 0; +} + +static int +udf_record_set_ttl(const as_rec * rec, uint32_t ttl) +{ + int ret = udf_record_param_check(rec, __FILE__, __LINE__); + if (ret) { + return ret; + } + + udf_record * urecord = (udf_record *) as_rec_source(rec); + if (!(urecord->flag & UDF_RECORD_FLAG_ALLOW_UPDATES)) { + return -1; + } + + urecord->tr->msgp->msg.record_ttl = ttl; + urecord->flag |= UDF_RECORD_FLAG_METADATA_UPDATED; + + return 0; +} + +static int +udf_record_drop_key(const as_rec * rec) +{ + int ret = udf_record_param_check(rec, __FILE__, __LINE__); + if (ret) { + return ret; + } + + udf_record * urecord = (udf_record *) as_rec_source(rec); + if (!(urecord->flag & UDF_RECORD_FLAG_ALLOW_UPDATES)) { + return -1; + } + + // Flag the key to be 
dropped. + if (urecord->rd->key) { + urecord->rd->key = NULL; + urecord->rd->key_size = 0; + } + + urecord->flag |= UDF_RECORD_FLAG_METADATA_UPDATED; + + return 0; +} + +static int +udf_record_remove(const as_rec * rec, const char * name) +{ + int ret = udf_record_param_check(rec, __FILE__, __LINE__); + if (ret) { + return ret; + } + udf_record * urecord = (udf_record *) as_rec_source(rec); + + + cf_detail(AS_UDF, "udf_record_remove: begin (%s)", name); + if ( urecord && name ) { + udf_record_cache_set(urecord, name, (as_val *) &as_nil, true); + } + cf_detail(AS_UDF, "udf_record_remove: end (%s)", name); + + return 0; +} + +static uint32_t +udf_record_ttl(const as_rec * rec) +{ + int ret = udf_record_param_check(rec, __FILE__, __LINE__); + if (ret) { + return 0; + } + + udf_record * urecord = (udf_record *) as_rec_source(rec); + + if ((urecord->flag & UDF_RECORD_FLAG_STORAGE_OPEN)) { + uint32_t now = as_record_void_time_get(); + + return urecord->r_ref->r->void_time > now ? + urecord->r_ref->r->void_time - now : 0; + } + else { + cf_info(AS_UDF, "Error in getting ttl: no record found"); + return 0; // since we can't indicate the record doesn't exist + } + return 0; +} + +static uint64_t +udf_record_last_update_time(const as_rec * rec) +{ + int ret = udf_record_param_check(rec, __FILE__, __LINE__); + if (ret) { + return 0; + } + + udf_record * urecord = (udf_record *) as_rec_source(rec); + if (urecord && (urecord->flag & UDF_RECORD_FLAG_STORAGE_OPEN)) { + return urecord->r_ref->r->last_update_time; + } + else { + cf_warning(AS_UDF, "Error getting last update time: no record found"); + return 0; + } +} + +static uint16_t +udf_record_gen(const as_rec * rec) +{ + int ret = udf_record_param_check(rec, __FILE__, __LINE__); + if (ret) { + return 0; + } + + udf_record * urecord = (udf_record *) as_rec_source(rec); + if (urecord && (urecord->flag & UDF_RECORD_FLAG_STORAGE_OPEN) != 0) { + return plain_generation(urecord->rd->r->generation, urecord->rd->ns); + } + else { + 
cf_warning(AS_UDF, "Error in getting generation: no record found"); + return 0; + } +} + +// Local utility. +static as_val * +as_val_from_flat_key(uint8_t * flat_key, uint32_t size) +{ + uint8_t type = *flat_key; + uint8_t * key = flat_key + 1; + + switch ( type ) { + case AS_PARTICLE_TYPE_INTEGER: + // TODO - verify size is (1 + 8) ??? + // Flat integer keys are in big-endian order. + return (as_val *) as_integer_new(cf_swap_from_be64(*(int64_t *)key)); + case AS_PARTICLE_TYPE_STRING: + { + // Key length is size - 1, then +1 for null-termination. + char * buf = cf_malloc(size); + uint32_t len = size - 1; + memcpy(buf, key, len); + buf[len] = '\0'; + + return (as_val *) as_string_new(buf, true); + } + case AS_PARTICLE_TYPE_BLOB: + { + uint32_t blob_size = size - 1; + uint8_t *buf = cf_malloc(blob_size); + + memcpy(buf, key, blob_size); + + return (as_val *) as_bytes_new_wrap(buf, blob_size, true); + } + default: + return NULL; + } +} + +static as_val * +udf_record_key(const as_rec * rec) +{ + int ret = udf_record_param_check(rec, __FILE__, __LINE__); + if (ret) { + return NULL; + } + + udf_record * urecord = (udf_record *) as_rec_source(rec); + if (urecord && (urecord->flag & UDF_RECORD_FLAG_STORAGE_OPEN)) { + if (urecord->rd->key) { + return as_val_from_flat_key(urecord->rd->key, urecord->rd->key_size); + } + // TODO - perhaps look for the key in the message. 
+ return NULL; + } + else { + cf_warning(AS_UDF, "Error in getting key: no record found"); + return NULL; + } +} + +static const char * +udf_record_setname(const as_rec * rec) +{ + int ret = udf_record_param_check(rec, __FILE__, __LINE__); + if (ret) { + return NULL; + } + + udf_record * urecord = (udf_record *) as_rec_source(rec); + if (urecord && (urecord->flag & UDF_RECORD_FLAG_STORAGE_OPEN)) { + return as_index_get_set_name(urecord->r_ref->r, urecord->rd->ns); + } + else { + cf_warning(AS_UDF, "Error in getting set name: no record found"); + return NULL; + } +} + +bool +udf_record_destroy(as_rec *rec) +{ + if (!rec) { + return false; + } + + udf_record *urecord = (udf_record *) as_rec_source(rec); + udf_record_close(urecord); + as_rec_destroy(rec); + return true; +} + +static as_bytes * +udf_record_digest(const as_rec *rec) +{ + int ret = udf_record_param_check(rec, __FILE__, __LINE__); + if (ret) { + return NULL; + } + + udf_record *urecord = (udf_record *)as_rec_source(rec); + if (urecord && urecord->flag & UDF_RECORD_FLAG_OPEN) { + cf_digest *keyd = cf_malloc(sizeof(cf_digest)); + memcpy(keyd, &urecord->keyd, CF_DIGEST_KEY_SZ); + as_bytes *b = as_bytes_new_wrap(keyd->digest, CF_DIGEST_KEY_SZ, true); + return b; + } + return NULL; +} + +static int +udf_record_bin_names(const as_rec *rec, as_rec_bin_names_callback callback, void * udata) +{ + int ret = udf_record_param_check(rec, __FILE__, __LINE__); + if (ret) { + return 1; + } + + udf_record *urecord = (udf_record *)as_rec_source(rec); + char * bin_names = NULL; + if (urecord && (urecord->flag & UDF_RECORD_FLAG_STORAGE_OPEN)) { + uint16_t nbins; + + if (urecord->rd->ns->single_bin) { + nbins = 1; + bin_names = alloca(1); + *bin_names = 0; + } + else { + nbins = urecord->rd->n_bins; + bin_names = alloca(nbins * AS_ID_BIN_SZ); + for (uint16_t i = 0; i < nbins; i++) { + as_bin *b = &urecord->rd->bins[i]; + if (! 
as_bin_inuse(b)) { + nbins = i; + break; + } + const char * name = as_bin_get_name_from_id(urecord->rd->ns, b->id); + strcpy(bin_names + (i * AS_ID_BIN_SZ), name); + } + } + callback(bin_names, nbins, AS_ID_BIN_SZ, udata); + return 0; + } + else { + cf_warning(AS_UDF, "Error in getting bin names: no record found"); + bin_names = alloca(1); + *bin_names = 0; + callback(bin_names, 1, AS_ID_BIN_SZ, udata); + return -1; + } +} + +static uint16_t +udf_record_numbins(const as_rec * rec) +{ + int ret = udf_record_param_check(rec, __FILE__, __LINE__); + if (ret) { + return 0; + } + + udf_record *urecord = (udf_record *) as_rec_source(rec); + if (urecord && (urecord->flag & UDF_RECORD_FLAG_STORAGE_OPEN)) { + + if (urecord->rd->ns->single_bin) { + return 1; + } + + uint16_t i; + as_storage_rd *rd = urecord->rd; + for (i = 0; i < rd->n_bins; i++) { + as_bin *b = &rd->bins[i]; + if (! as_bin_inuse(b)) { + break; + } + } + return i; + } + else { + cf_warning(AS_UDF, "Error in getting numbins: no record found"); + return 0; + } +} + +const as_rec_hooks udf_record_hooks = { + .get = udf_record_get, + .set = udf_record_set, + .remove = udf_record_remove, + .ttl = udf_record_ttl, + .last_update_time = udf_record_last_update_time, + .gen = udf_record_gen, + .key = udf_record_key, + .setname = udf_record_setname, + .destroy = NULL, + .digest = udf_record_digest, + .set_ttl = udf_record_set_ttl, + .drop_key = udf_record_drop_key, + .bin_names = udf_record_bin_names, + .numbins = udf_record_numbins +}; diff --git a/as/src/base/udf_timer.c b/as/src/base/udf_timer.c new file mode 100644 index 00000000..78a66fcd --- /dev/null +++ b/as/src/base/udf_timer.c @@ -0,0 +1,96 @@ +/* + * udf_timer.c + * + * Copyright (C) 2012-2014 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. 
+ * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#include "base/udf_timer.h" + +#include + +#include "citrusleaf/cf_clock.h" + +#include "fault.h" + + +/***************************************************************************** + * STATIC FUNCTIONS + *****************************************************************************/ + +static pthread_key_t timer_tlskey = 0; +static pthread_once_t key_once = PTHREAD_ONCE_INIT; + +static void +udf_make_key() +{ + pthread_key_create(&timer_tlskey, NULL); +} + +void +udf_timer_setup(time_tracker *tt) +{ + pthread_once(&key_once, udf_make_key); + pthread_setspecific(timer_tlskey, tt); + cf_detail(AS_UDF, "tid=%ld tt=%p", pthread_self(), tt); +} + +void +udf_timer_cleanup() +{ + pthread_setspecific(timer_tlskey, NULL); + cf_detail(AS_UDF, "tid=%ld", pthread_self()); +} + +bool +udf_timer_timedout(const as_timer * timer) +{ + time_tracker *tt = (time_tracker *)pthread_getspecific(timer_tlskey); + cf_detail(AS_UDF, "tid=%ld tt=%p", pthread_self(), tt); + + if (!tt || !tt->end_time) { + return true; + } + uint64_t now = cf_getns(); + bool timedout = (now > tt->end_time(tt)); + if (timedout) { + cf_warning(AS_UDF, "UDF Timed Out [%lu:%lu]", now / 1000000, tt->end_time(tt) / 1000000); + return true; + } + return false; +} + +uint64_t +udf_timer_timeslice(const as_timer * timer) // Remaining time in ms (at least 1), or 0 when this thread has no time tracker. +{ + time_tracker *tt = 
(time_tracker *)pthread_getspecific(timer_tlskey); + cf_detail(AS_UDF, "tid=%ld tt=%p", pthread_self(), tt); + + if (!tt || !tt->end_time) { + return 0; + } + uint64_t end_ns = tt->end_time(tt), now_ns = cf_getns(); // Unsigned values: compare before subtracting so a passed deadline cannot wrap around. + return (end_ns > now_ns && end_ns - now_ns >= 1000000) ? (end_ns - now_ns) / 1000000 : 1; // Expired or sub-millisecond remainder clamps to 1 ms. +} + + +const as_timer_hooks udf_timer_hooks = { + .destroy = NULL, + .timedout = udf_timer_timedout, + .timeslice = udf_timer_timeslice +}; diff --git a/as/src/base/xdr_config.c b/as/src/base/xdr_config.c new file mode 100644 index 00000000..d23f7658 --- /dev/null +++ b/as/src/base/xdr_config.c @@ -0,0 +1,73 @@ +/* + * xdr_config.c + * + * Copyright (C) 2011-2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +/* + * Configuration file-related routines shared between the server and XDR. 
+ */ + +#include +#include "base/xdr_config.h" + +void xdr_config_defaults() +{ + xdr_config *c = &g_xcfg; + memset(c, 0, sizeof(xdr_config)); + + c->xdr_section_configured = false; // Indicates if XDR is configured or not + c->xdr_global_enabled = false; // This config option overrides the enable-xdr setting of the namespace(s) + c->xdr_digestlog_path = NULL; // Path where the digest information is written to the disk + c->xdr_info_port = 0; + c->xdr_max_ship_throughput = 0; // XDR TPS limit + c->xdr_max_ship_bandwidth = 0; // XDR bandwidth limit + c->xdr_min_dlog_free_pct = 0; // Namespace writes are stopped below this limit + c->xdr_hotkey_time_ms = 100; // Expiration time for the de-duplication cache + c->xdr_read_threads = 4; // Number of XDR read threads. + c->xdr_write_timeout = 10000; // Timeout for each element that is shipped. + c->xdr_client_threads = 3; // Number of async client threads (event loops) + c->xdr_forward_xdrwrites = false; // If the writes due to xdr should be forwarded + c->xdr_nsup_deletes_enabled = false;// Shall XDR ship deletes of evictions or expiration + c->xdr_internal_shipping_delay = 0; // Default sleep between shipping each batch is 0 seconds + c->xdr_conf_change_flag = false; + c->xdr_shipping_enabled = true; + c->xdr_delete_shipping_enabled = true; + c->xdr_ship_bins = false; + c->xdr_info_request_timeout_ms = 10000; + c->xdr_compression_threshold = 0; // 0 disables compressed shipping, > 0 specifies minimum request size for compression + c->xdr_handle_failednode = true; + c->xdr_handle_linkdown = true; + c->xdr_digestlog_iowait_ms = 500; + + for (uint32_t index = 0; index < DC_MAX_NUM; index++) { + g_dc_xcfg_opt[index].dc_name = NULL; + g_dc_xcfg_opt[index].dc_node_v.vector = NULL; + g_dc_xcfg_opt[index].dc_addr_map_v.vector = NULL; + g_dc_xcfg_opt[index].dc_security_cfg.sec_config_file = NULL; + g_dc_xcfg_opt[index].dc_use_alternate_services = false; + g_dc_xcfg_opt[index].dc_connections = 64; + 
g_dc_xcfg_opt[index].dc_connections_idle_ms = 55000; + } +} + +xdr_config g_xcfg = { 0 }; +dc_config_opt g_dc_xcfg_opt[DC_MAX_NUM]; +int g_dc_count = 0; + diff --git a/as/src/base/xdr_serverside_stubs.c b/as/src/base/xdr_serverside_stubs.c new file mode 100644 index 00000000..5f29d328 --- /dev/null +++ b/as/src/base/xdr_serverside_stubs.c @@ -0,0 +1,130 @@ +/* + * xdr_serverside_stubs.c + * + * Copyright (C) 2014-2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. 
If not, see http://www.gnu.org/licenses/ + */ + +#include "base/xdr_serverside.h" + +int as_xdr_init() +{ + return -1; +} + +void xdr_config_post_process() +{ +} + +void as_xdr_start() +{ +} + +int as_xdr_shutdown() +{ + return -1; +} + +void xdr_sig_handler(int signum) +{ +} + +void xdr_broadcast_lastshipinfo(uint64_t val[]) +{ +} + +void xdr_clear_dirty_bins(xdr_dirty_bins *dirty) +{ +} + +void xdr_fill_dirty_bins(xdr_dirty_bins *dirty) +{ +} + +void xdr_copy_dirty_bins(xdr_dirty_bins *from, xdr_dirty_bins *to) +{ +} + +void xdr_add_dirty_bin(as_namespace *ns, xdr_dirty_bins *dirty, const char *name, size_t name_len) +{ +} + +void xdr_write(as_namespace *ns, cf_digest *keyd, uint16_t generation, cf_node masternode, xdr_op_type op_type, uint16_t set_id, xdr_dirty_bins *dirty) +{ +} + +void as_xdr_read_txn(as_transaction *txn) +{ +} + +void as_xdr_info_init(void) +{ +} + +void as_xdr_info_port(cf_serv_cfg *serv_cfg) +{ + (void)serv_cfg; +} + +int as_info_command_xdr(char *name, char *params, cf_dyn_buf *db) +{ + return -1; +} + +void as_xdr_get_stats(cf_dyn_buf *db) +{ +} + +void as_xdr_get_config(cf_dyn_buf *db) +{ +} + +bool as_xdr_set_config(char *params) +{ + return false; +} + +bool as_xdr_set_config_ns(char *ns_name, char *params) +{ + return false; +} + +bool is_xdr_delete_shipping_enabled() +{ + return false; +} + +bool is_xdr_digestlog_low(as_namespace *ns) +{ + return false; +} + +bool is_xdr_forwarding_enabled() +{ + return false; +} + +bool is_xdr_nsup_deletes_enabled() +{ + return false; +} + +void xdr_cfg_add_int_ext_mapping(dc_config_opt *dc_cfg, char* orig, char* alt) +{ +} + diff --git a/as/src/fabric/clustering.c b/as/src/fabric/clustering.c new file mode 100644 index 00000000..c07ebc30 --- /dev/null +++ b/as/src/fabric/clustering.c @@ -0,0 +1,8163 @@ +/* + * clustering.c + * + * Copyright (C) 2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. 
+ * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#include "fabric/clustering.h" + +#include +#include +#include +#include +#include // For MAX() and MIN(). + +#include "citrusleaf/alloc.h" +#include "citrusleaf/cf_clock.h" +#include "citrusleaf/cf_random.h" + +#include "fault.h" +#include "msg.h" +#include "node.h" +#include "shash.h" + +#include "base/cfg.h" +#include "fabric/fabric.h" +#include "fabric/hlc.h" + +/* + * Overview + * ======== + * Clustering v5 implementation based on the design at + * https://aerospike.atlassian.net/wiki/pages/viewpage.action?spaceKey=DEV&title=Central+Wiki%3A++Clustering+V5 + * + * Public and private view of the cluster + * ======================================= + * This clustering algorithm introduces an orphan state, in which this node is + * not part of a cluster, but is looking to form/join a cluster. During this + * transitionary phase, the public view of the cluster the tuple, . + * + * This ensures clients continue to function, (maybe with errors), during the + * transition from orphan to part of a cluster state. This is in line with the + * clustering v4 and prior behaviour. + * + * TODO: (revise) + * + * Deviations from paxos + * ===================== + * + * Accepted value + * --------------- + * + * Accepted value is not send along with accept and accepted message. 
The latest + * accepted value overwrites the previous value at a node. In paxos if a node + * has already accepted a value, it is sent back to the proposer who should use + * the value with highest proposal id as the final value. The proposer generates + * the final consensus value as the succession list with the nodes that have + * both returned promise and accepted replies. + * + * This is not safe in terms of achieving a single paxos value, however it is + * safe in that nodes courted by other principals will get filtered out during + * paxos and not require additional paxos rounds. + * + * It is still possible that the final consensus succession list might have a few + * nodes moving out owing to a neighboring principal. However the faulty node + * check in the next quantum interval will fix this. + * + * Quorum + * ------ + * The prepare phase uses a majority quorum for the promise messages, to speed + * through the paxos round. However the accept phase uses a complete / full + * quorum for accepted messages. This helps with ensuring that when a node + * generates a cluster change event all cluster members have applied the current + * cluster membership. + * + * Design + * ====== + * The clustering sub-system interacts with the rest of Aerospike via input event + * notifications (primarily heartbeat events) and output event notifications + * (primarily cluster change notifications). + * + * The subsystem is driven by internal events (that also encapsulate external + * input event notifications) like timer, quantum interval start, adjacency + * changed, message received, etc. + * + * The clustering-v5 subsystem is further organized as the following sub-modules + * each of which reacts to the above mentioned events based on individual state + * transition diagrams. + * + * 1. Timer + * 2. Quantum interval generator + * 3. Paxos proposer + * 4. Paxos acceptor + * 5. Register + * 6. External event publisher + * 7. Internal event dispatcher + * 8. 
Clustering main + * + * The sub modules also interact with each other via inline internal event + * dispatch and handling. + * + * Timer + * ----- + * Generates timer events that serve as the internal tick/clock for the + * clustering-v5 sub system. Other sub-modules use the timer events to drive + * actions to be performed at fixed intervals, e.g. message retransmits. + * + * Quantum interval generator + * -------------------------- + * Generates quantum interval start events, at which cluster change decisions are + * taken. + * + * Paxos proposer + * -------------- + * The paxos proposer proposes a cluster change. The node may or may not be the + * eventual principal for the cluster. + * + * Paxos acceptor + * -------------- + * Participates in voting for a proposal. A paxos proposer is also necessarily + * an acceptor in this design. + * + * Register + * -------- + * Holds current cluster membership and cluster key. It is responsible for + * ensuring all cluster members have their registers in sync before publishing + * an external cluster change event. + * + * External event publisher + * ------------------------ + * Generates and publishes external events or cluster changes. Runs as a separate + * thread to prevent interference and potential deadlocks with the clustering + * subsystem. + * + * Internal event dispatcher + * ------------------------- + * Dispatches internal events to current function based on the event type and + * current state. + * + * Clustering main + * --------------- + * Monitors the cluster and triggers cluster changes. + * + * State transitions + * ================= + * TODO: diagrams for each sub-module + * + * Message send rules + * ================== + * Message send should preferably be outside the main clustering lock and should + * not be followed by any state change in the same function. 
This is because + * fabric relays messages to self inline in the send call itself which can lead + * to corruption if the message handler involves a state change as well or can + * result in the message handler seeing inconsistent partially updated state. + */ + +/* + * ---------------------------------------------------------------------------- + * Constants + * ---------------------------------------------------------------------------- + */ + +/** + * A soft limit for the maximum cluster size. Meant to optimize hash and list + * data structures and not as a limit on the number of nodes. + */ +#define AS_CLUSTERING_CLUSTER_MAX_SIZE_SOFT 200 + +/** + * Timer event generation interval. + */ +#define CLUSTERING_TIMER_TICK_INTERVAL 75 + +/** + * Maximum time paxos round would take for completion. 3 RTTs of paxos message + * exchanges and 1 RTT as a buffer. + */ +#define PAXOS_COMPLETION_TIME_MAX (4 * network_rtt_max()) + +/** + * Maximum quantum interval duration, should be at least two heartbeat + * intervals, to ensure there is at least one exchange of clustering information + * over heartbeats. + */ +#define QUANTUM_INTERVAL_MAX MAX(5000, 2 * as_hb_tx_interval_get()) + +/** + * Block size for allocating node plugin data. Ensure the allocation is in + * multiples of 128 bytes, allowing expansion to 16 nodes without reallocating. + */ +#define HB_PLUGIN_DATA_BLOCK_SIZE 128 + +/** + * Scratch size for clustering messages. + * + * TODO: Compute this properly. + */ +#define AS_CLUSTERING_MSG_SCRATCH_SIZE 1024 + +/** + * Majority value for preferred principal to be selected for move. Intended to + * be two thirds. NOTE(review): (2 / 3) is integer division (== 0); confirm. + */ +#define AS_CLUSTERING_PREFERRRED_PRINCIPAL_MAJORITY (2 / 3) + +/* + * ---------------------------------------------------------------------------- + * Paxos data structures + * ---------------------------------------------------------------------------- + */ + +/** + * Paxos sequence number. 
We will use the hybrid logical clock timestamp as + * sequence numbers, to ensure node restarts do not reset the sequence number + * back to zero and sequence numbers are monotonically increasing. A sequence + * number value of zero is invalid. + */ +typedef as_hlc_timestamp as_paxos_sequence_number; + +/** + * Paxos proposal identifier. + * Note: The nodeid can be skipped when sending the proposal id over the wire + * and can be inferred from the source during paxos message exchanges. + */ +typedef struct as_paxos_proposal_id_s +{ + /** + * The sequence number. + */ + as_paxos_sequence_number sequence_number; + + /** + * The proposing node's nodeid to break ties. + */ + cf_node src_nodeid; +} as_paxos_proposal_id; + +/** + * The proposed cluster membership. + */ +typedef struct as_paxos_proposed_value_s +{ + /** + * The cluster key. + */ + as_cluster_key cluster_key; + + /** + * The succession list. + */ + cf_vector succession_list; +} as_paxos_proposed_value; + +/** + * Paxos acceptor state. + */ +typedef enum +{ + /** + * Acceptor is idle with no active paxos round. + */ + AS_PAXOS_ACCEPTOR_STATE_IDLE, + + /** + * Acceptor has received and acked a promise message. + */ + AS_PAXOS_ACCEPTOR_STATE_PROMISED, + + /** + * Acceptor has received and accepted an accept message from a proposer. + */ + AS_PAXOS_ACCEPTOR_STATE_ACCEPTED +} as_paxos_acceptor_state; + +/** + * Data tracked by the node in the role of a paxos acceptor. + * All nodes are paxos acceptors. + */ +typedef struct as_paxos_acceptor_s +{ + /** + * The paxos acceptor state. + */ + as_paxos_acceptor_state state; + + /** + * Monotonic timestamp when the first message for current proposal was + * received from the proposer. + */ + cf_clock acceptor_round_start; + + /** + * Monotonic timestamp when the promise message was sent. + */ + cf_clock promise_send_time; + + /** + * Monotonic timestamp when the accepted message was sent. 
+ */ + cf_clock accepted_send_time; + + /** + * Id of the last proposal, promised or accepted by this node. + */ + as_paxos_proposal_id last_proposal_received_id; +} as_paxos_acceptor; + +/** + * State of a paxos proposer. + */ +typedef enum as_paxos_proposer_state_e +{ + /** + * Paxos proposer is idle. No pending paxos rounds. + */ + AS_PAXOS_PROPOSER_STATE_IDLE, + + /** + * Paxos proposer sent out a prepare message. + */ + AS_PAXOS_PROPOSER_STATE_PREPARE_SENT, + + /** + * Paxos proposer has sent out an accept message. + */ + AS_PAXOS_PROPOSER_STATE_ACCEPT_SENT +} as_paxos_proposer_state; + +/** + * Data tracked by the node in the role of a paxos proposer. The proposer node + * may or may not be the current or eventual principal. + */ +typedef struct as_paxos_proposer_s +{ + /** + * The state of the proposer. + */ + as_paxos_proposer_state state; + + /** + * The sequence number / id for the last proposed paxos value. + */ + as_paxos_sequence_number sequence_number; + + /** + * The proposed cluster value. + */ + as_paxos_proposed_value proposed_value; + + /** + * The time current paxos round was started. + */ + cf_clock paxos_round_start_time; + + /** + * The time current proposal's prepare message was sent. + */ + cf_clock prepare_send_time; + + /** + * The time current proposal's accept message was sent. + */ + cf_clock accept_send_time; + + /** + * The time current proposal's learn message was sent. + */ + cf_clock learn_send_time; + + /** + * Indicates if learn message needs retransmit. + */ + bool learn_retransmit_needed; + + /** + * The set of acceptor nodes including self. + */ + cf_vector acceptors; + + /** + * Set of nodeids that sent out a promise response to the current prepare + * message. + */ + cf_vector promises_received; + + /** + * Set of nodeids that sent out an accepted response to the current accept + * message. + */ + cf_vector accepted_received; +} as_paxos_proposer; + +/** + * Result of paxos round start call. 
+ */ +typedef enum as_paxos_start_result_e +{ + /** + * Paxos round started successfully. + */ + AS_PAXOS_RESULT_STARTED, + + /** + * cluster size is less than minimum required cluster size. + */ + AS_PAXOS_RESULT_CLUSTER_TOO_SMALL, + + /** + * Paxos round already in progress. Paxos not started. + */ + AS_PAXOS_RESULT_ROUND_RUNNING +} as_paxos_start_result; + +/** + * Node clustering status. + */ +typedef enum +{ + /** + * Peer node is orphaned. + */ + AS_NODE_ORPHAN, + + /** + * Peer node has a cluster assigned. + */ + AS_NODE_CLUSTER_ASSIGNED, + + /** + * Peer node status is unknown. + */ + AS_NODE_UNKNOWN +} as_clustering_peer_node_state; + +/* + * ---------------------------------------------------------------------------- + * Clustering data structures + * ---------------------------------------------------------------------------- + */ + +/** + * Clustering message types. + */ +typedef enum +{ + /* + * ---- Clustering management messages ---- + */ + AS_CLUSTERING_MSG_TYPE_JOIN_REQUEST, + AS_CLUSTERING_MSG_TYPE_JOIN_REJECT, + AS_CLUSTERING_MSG_TYPE_MERGE_MOVE, + AS_CLUSTERING_MSG_TYPE_CLUSTER_CHANGE_APPLIED, + + /* + * ---- Paxos messages ---- + */ + AS_CLUSTERING_MSG_TYPE_PAXOS_PREPARE, + AS_CLUSTERING_MSG_TYPE_PAXOS_PROMISE, + AS_CLUSTERING_MSG_TYPE_PAXOS_PREPARE_NACK, + AS_CLUSTERING_MSG_TYPE_PAXOS_ACCEPT, + AS_CLUSTERING_MSG_TYPE_PAXOS_ACCEPTED, + AS_CLUSTERING_MSG_TYPE_PAXOS_ACCEPT_NACK, + AS_CLUSTERING_MSG_TYPE_PAXOS_LEARN, +} as_clustering_msg_type; + +/** + * The fields in the clustering message. + */ +typedef enum +{ + /** + * Clustering message identifier. + */ + AS_CLUSTERING_MSG_ID, + + /** + * Clustering message type. + */ + AS_CLUSTERING_MSG_TYPE, + + /** + * The source node send timestamp. + */ + AS_CLUSTERING_MSG_HLC_TIMESTAMP, + + /** + * The paxos sequence number. Not all messages will have this. + */ + AS_CLUSTERING_MSG_SEQUENCE_NUMBER, + + /** + * The proposed cluster key. Only part of the paxos accept message. 
+ */ + AS_CLUSTERING_MSG_CLUSTER_KEY, + + /** + * The proposed succession list. Only part of the paxos accept message. + */ + AS_CLUSTERING_MSG_SUCCESSION_LIST, + + /** + * The proposed principal relevant only to cluster move commands, which will + * merge two well formed paxos clusters. + */ + AS_CLUSTERING_MSG_PROPOSED_PRINCIPAL, + + /** + * Sentinel value to keep track of the number of message fields. + */ + AS_CLUSTERING_MGS_SENTINEL +} as_clustering_msg_field; + +/** + * Internal clustering event type. + */ +typedef enum +{ + /** + * Timer event. + */ + AS_CLUSTERING_INTERNAL_EVENT_TIMER, + + /** + * Incoming message event. + */ + AS_CLUSTERING_INTERNAL_EVENT_MSG, + + /** + * A join request was accepted. + */ + AS_CLUSTERING_INTERNAL_EVENT_JOIN_REQUEST_ACCEPTED, + + /** + * Indicates the start of a quantum interval. + */ + AS_CLUSTERING_INTERNAL_EVENT_QUANTUM_INTERVAL_START, + + /** + * Indicates that self node's cluster membership changed. + */ + AS_CLUSTERING_INTERNAL_EVENT_REGISTER_CLUSTER_CHANGED, + + /** + * Indicates that self node's cluster membership has been synced across all + * cluster members. + */ + AS_CLUSTERING_INTERNAL_EVENT_REGISTER_CLUSTER_SYNCED, + + /** + * Indicates that self node has been marked as an orphan. + */ + AS_CLUSTERING_INTERNAL_EVENT_REGISTER_ORPHANED, + + /** + * Indicates an incoming heartbeat event. + */ + AS_CLUSTERING_INTERNAL_EVENT_HB, + + /** + * Indicates that plugin data for a node has changed. + */ + AS_CLUSTERING_INTERNAL_EVENT_HB_PLUGIN_DATA_CHANGED, + + /** + * The paxos round being accepted succeeded and the proposed value should be + * committed. + * This implies that all the proposed cluster members have all agreed on the + * proposed cluster key and the proposed cluster membership. + */ + AS_CLUSTERING_INTERNAL_EVENT_PAXOS_ACCEPTOR_SUCCESS, + + /** + * The last paxos round being accepted failed. + */ + AS_CLUSTERING_INTERNAL_EVENT_PAXOS_ACCEPTOR_FAIL, + + /** + * The paxos round proposed by this node. 
+ */ + AS_CLUSTERING_INTERNAL_EVENT_PAXOS_PROPOSER_SUCCESS, + + /** + * The last paxos round proposed failed. + */ + AS_CLUSTERING_INTERNAL_EVENT_PAXOS_PROPOSER_FAIL, +} as_clustering_internal_event_type; + +/** + * An event used internally by the clustering subsystem. + */ +typedef struct as_clustering_internal_event_s +{ + /** + * The event type. + */ + as_clustering_internal_event_type type; + + /** + * The event qualifier. + */ + as_clustering_event_qualifier qualifier; + + /* + * ----- Quantum interval start event related fields + */ + /** + * Indicates if this quantum interval start can be skipped by the event + * handler. + */ + bool quantum_interval_is_skippable; + + /* + * ----- Message event related fields. + */ + /** + * The source node id. + */ + cf_node msg_src_nodeid; + + /** + * Incoming message type. + */ + as_clustering_msg_type msg_type; + + /** + * The hlc timestamp for message receipt. + */ + as_hlc_msg_timestamp msg_hlc_ts; + + /** + * Local monotonic received timestamp. + */ + cf_clock msg_recvd_ts; + + /** + * The received message. + */ + msg* msg; + + /* + * ----- HB event related fields. + */ + /** + * Number of heartbeat events. + */ + int hb_n_events; + + /** + * Heartbeat events. + */ + as_hb_event_node* hb_events; + + /* + * ----- HB plugin data changed event related fields. + */ + /** + * Node id of the node whose plugin data has changed. + */ + cf_node plugin_data_changed_nodeid; + + /** + * Node's plugin data. + */ + as_hb_plugin_node_data* plugin_data; + + /** + * The hlc timestamp for message receipt. + */ + as_hlc_msg_timestamp plugin_data_changed_hlc_ts; + + /** + * Local monotonic received timestamp. + */ + cf_clock plugin_data_changed_ts; + + /* + * ----- Join request handled related fields. + */ + cf_node join_request_source_nodeid; + + /* + * ----- Paxos success related fields. + */ + /** + * New succession list. + */ + cf_vector *new_succession_list; + + /** + * New cluster key. 
+ */ + as_cluster_key new_cluster_key; + + /** + * New paxos sequence number. + */ + as_paxos_sequence_number new_sequence_number; +} as_clustering_internal_event; + +/** + * The clustering timer state. + */ +typedef struct as_clustering_timer_s +{ + /** + * The timer thread id. + */ + pthread_t timer_tid; +} as_clustering_timer; + +/** + * Clustering subsystem state. + */ +typedef enum +{ + AS_CLUSTERING_SYS_STATE_UNINITIALIZED, + AS_CLUSTERING_SYS_STATE_RUNNING, + AS_CLUSTERING_SYS_STATE_SHUTTING_DOWN, + AS_CLUSTERING_SYS_STATE_STOPPED +} as_clustering_sys_state; + +/** + * Type of quantum interval fault. Ensure the vtable in quantum interval table is + * updated for each type. + */ +typedef enum as_clustering_quantum_fault_type_e +{ + /** + * A new node arrived. + */ + QUANTUM_FAULT_NODE_ARRIVED, + + /** + * A node that is not our principal departed from the cluster. + */ + QUANTUM_FAULT_NODE_DEPARTED, + + /** + * We are in a cluster and our principal departed. + */ + QUANTUM_FAULT_PRINCIPAL_DEPARTED, + + /** + * A member node's adjacency list has changed. + */ + QUANTUM_FAULT_PEER_ADJACENCY_CHANGED, + + /** + * Join request accepted. + */ + QUANTUM_FAULT_JOIN_ACCEPTED, + + /** + * We have seen a principal who might send us a merge request. + */ + QUANTUM_FAULT_INBOUND_MERGE_CANDIDATE_SEEN, + + /** + * A node in our cluster has been orphaned. + */ + QUANTUM_FAULT_CLUSTER_MEMBER_ORPHANED, + + /** + * Sentinel value. Should be the last in the enum. + */ + QUANTUM_FAULT_TYPE_SENTINEL +} as_clustering_quantum_fault_type; + +/** + * Fault information for the first fault event detected in a quantum interval. + */ +typedef struct as_clustering_quantum_fault_s +{ + /** + * First time the fault event was detected in current quantum based on + * monotonic clock. Should be initialized to zero at quantum start / end. + */ + cf_clock event_ts; + + /** + * Last time the fault event was detected in current quantum based on + * monotonic clock. 
Should be initialized to zero at quantum start / end. + */ + cf_clock last_event_ts; +} as_clustering_quantum_fault; + +/** + * Function to determine the minimum wait time after given fault happens. + */ +typedef uint32_t +(as_clustering_quantum_fault_wait_fn)(as_clustering_quantum_fault* fault); + +/** + * Vtable for different types of faults. + */ +typedef struct as_clustering_quantum_fault_vtable_s +{ + /** + * String used to log this fault type. + */ + char *fault_log_str; + + /** + * Function providing the wait time for this fault type. + */ + as_clustering_quantum_fault_wait_fn* wait_fn; +} as_clustering_quantum_fault_vtable; + +/** + * Generates quantum intervals. + */ +typedef struct as_clustering_quantum_interval_generator_s +{ + /** + * Quantum interval fault vtable. + */ + as_clustering_quantum_fault_vtable vtable[QUANTUM_FAULT_TYPE_SENTINEL]; + + /** + * Quantum interval faults. + */ + as_clustering_quantum_fault fault[QUANTUM_FAULT_TYPE_SENTINEL]; + + /** + * Time quantum interval last started. + */ + cf_clock last_quantum_start_time; + + /** + * For quantum interval being skippable respect the last quantum interval + * since quantum_interval() will be affected by changes to hb config. + */ + uint32_t last_quantum_interval; + + /** + * Indicates if current quantum interval should be postponed. + */ + bool is_interval_postponed; +} as_clustering_quantum_interval_generator; + +/** + * State of the clustering register. + */ +typedef enum +{ + /** + * The register contents are in synced with all cluster members. + */ + AS_CLUSTERING_REGISTER_STATE_SYNCED, + + /** + * The register contents are being synced with other cluster members. + */ + AS_CLUSTERING_REGISTER_STATE_SYNCING +} as_clustering_register_state; + +/** + * Stores current cluster key and succession list and generates external events. + */ +typedef struct as_clustering_register_s +{ + /** + * The register state. + */ + as_clustering_register_state state; + + /** + * Current cluster key. 
+ */ + as_cluster_key cluster_key; + + /** + * Current succession list. + */ + cf_vector succession_list; + + /** + * Indicates if this node has transitioned to orphan state after being in a + * valid cluster. + */ + bool has_orphan_transitioned; + + /** + * The sequence number for the current cluster. + */ + as_paxos_sequence_number sequence_number; + + /** + * Nodes pending sync. + */ + cf_vector sync_pending; + + /** + * Nodes that sent a sync applied for an unexpected cluster. Store it in + * case this is an imminent cluster change we will see in the future. All + * the nodes in this vector have sent the same cluster key and the same + * succession list. + */ + cf_vector ooo_change_applied_received; + + /** + * Cluster key sent by nodes in ooo_change_applied_received vector. + */ + as_cluster_key ooo_cluster_key; + + /** + * Succession list sent by nodes in ooo_change_applied_received vector. + */ + cf_vector ooo_succession_list; + + /** + * Timestamp of the first ooo change applied message. + */ + as_hlc_timestamp ooo_hlc_timestamp; + + /** + * The time cluster last changed. + */ + as_hlc_timestamp cluster_modified_hlc_ts; + + /** + * The monotonic clock time cluster last changed. + */ + cf_clock cluster_modified_time; + + /** + * The last time the register sync was checked in the syncing state. + */ + cf_clock last_sync_check_time; +} as_clustering_register; + +/** + * Clustering state. + */ +typedef enum +{ + /** + * Self node is not part of a cluster. + */ + AS_CLUSTERING_STATE_ORPHAN, + + /** + * Self node is part of a cluster and is the principal. + */ + AS_CLUSTERING_STATE_PRINCIPAL, + + /** + * Self node is part of a cluster but not the principal. + */ + AS_CLUSTERING_STATE_NON_PRINCIPAL +} as_clustering_state; + +/** + * Clustering state maintained by this node. + */ +typedef struct as_clustering_s +{ + + /** + * Clustering submodule state, indicates if the clustering sub system is + * running, stopped or initialized. 
+ */ + as_clustering_sys_state sys_state; + + /** + * Simple view of whether or not the cluster is well-formed. + */ + bool has_integrity; + + /** + * Clustering relevant state, e.g. orphan, principal, non-principal. + */ + as_clustering_state state; + + /** + * The preferred principal is a node such that removing current principal + * and making said node new principal will lead to a larger cluster. This is + * updated in the non-principal state at each quantum interval and is sent + * out with each heartbeat pulse. + */ + cf_node preferred_principal; + + /** + * Pending join requests. + */ + cf_vector pending_join_requests; + + /** + * The monotonic clock time when this node entered orphan state. + * Will be set to zero when the node is not an orphan. + */ + cf_clock orphan_state_start_time; + + /** + * Time when the last move command was sent. + */ + cf_clock move_cmd_issue_time; + + /** + * Hash from nodes whom join request was sent to the time the join request + * was send . Used to prevent sending join request too quickly to the same + * principal again and again. + */ + cf_shash* join_request_blackout; + + /** + * The principal to which the last join request was sent. + */ + cf_node last_join_request_principal; + + /** + * The time at which the last join request was sent, to track and timeout + * join requests. + */ + cf_clock last_join_request_sent_time; + + /** + * The time at which the last join request was retransmitted, to track and + * retransmit join requests. + */ + cf_clock last_join_request_retransmit_time; +} as_clustering; + +/** + * Result of sending out a join request. + */ +typedef enum as_clustering_join_request_result_e +{ + /** + * + * Join request was sent out. + */ + AS_CLUSTERING_JOIN_REQUEST_SENT, + + /** + * + * Join request was attempted, but sending failed. + */ + AS_CLUSTERING_JOIN_REQUEST_SEND_FAILED, + + /** + * Join request already pending. A new join request was not sent. 
+ */ + AS_CLUSTERING_JOIN_REQUEST_PENDING, + + /** + * No neighboring principals present to send the join request. + */ + AS_CLUSTERING_JOIN_REQUEST_NO_PRINCIPALS +} as_clustering_join_request_result; + +/** + * External event publisher state. + */ +typedef struct as_clustering_external_event_publisher_s +{ + /** + * State of the external event publisher. + */ + as_clustering_sys_state sys_state; + + /** + * Inidicates if there is an event to publish. + */ + bool event_queued; + + /** + * The pending event to publish. + */ + as_clustering_event to_publish; + + /** + * The static succession list published with the message. + */ + cf_vector published_succession_list; + + /** + * Conditional variable to signal pending event to publish. + */ + pthread_cond_t is_pending; + + /** + * Thread id of the publisher thread. + */ + pthread_t event_publisher_tid; + + /** + * Mutex to protect the conditional variable. + */ + pthread_mutex_t is_pending_mutex; +} as_clustering_external_event_publisher; + +/* + * ---------------------------------------------------------------------------- + * Forward declarations + * ---------------------------------------------------------------------------- + */ +static void +internal_event_dispatch(as_clustering_internal_event* timer_event); +static bool +clustering_is_our_principal(cf_node nodeid); +static bool +clustering_is_principal(); +static bool +clustering_is_cluster_member(cf_node nodeid); + +/* + * ---------------------------------------------------------------------------- + * Non-public hooks to exchange subsystem. + * ---------------------------------------------------------------------------- + */ +extern void +exchange_clustering_event_listener(as_clustering_event* event); + +/* + * ---------------------------------------------------------------------------- + * Timer, timeout values and intervals + * + * All values should be multiples of timer tick interval. 
+ * ---------------------------------------------------------------------------- + */ + +/** + * Timer tick interval, which should be a GCD of all clustering intervals. + */ +static uint32_t +timer_tick_interval() +{ + return CLUSTERING_TIMER_TICK_INTERVAL; +} + +/** + * Maximum network latency for the cluster. + */ +static uint32_t +network_latency_max() +{ + return g_config.fabric_latency_max_ms; +} + +/** + * Maximum network rtt for the cluster. + */ +static uint32_t +network_rtt_max() +{ + return 2 * network_latency_max(); +} + +/** + * Quantum interval in milliseconds. + */ +static uint32_t +quantum_interval() +{ + uint32_t std_quantum_interval = MIN(QUANTUM_INTERVAL_MAX, + as_hb_node_timeout_get() + + 2 * (as_hb_tx_interval_get() + network_latency_max())); + + // Ensure we give paxos enough time to complete. + return MAX(PAXOS_COMPLETION_TIME_MAX, std_quantum_interval); +} + +/** + * Maximum number of times quantum interval start can be skipped. + */ +static uint32_t +quantum_interval_skip_max() +{ + return 2; +} + +/** + * Interval at which register sync is checked. + */ +static uint32_t +register_sync_check_interval() +{ + return MAX(network_rtt_max(), as_hb_tx_interval_get()); +} + +/** + * Timeout for a join request, should definitely be larger than a quantum + * interval to prevent the requesting node from making new requests before the + * current requested principal node can finish the paxos round. + */ +static uint32_t +join_request_timeout() +{ + // Allow for + // - 1 quantum interval, where our request lands just after the potential + // principal's quantum interval start. + // - 0.5 quantum intervals to give time for a paxos round to finish + // - (quantum_interval_skip_max -1) intervals if the principal had to skip + // quantum intervals. + return (uint32_t)( + (1 + 0.5 + (quantum_interval_skip_max() - 1)) * quantum_interval()); +} + +/** + * Timeout for a retransmitting a join request. 
+ */ +static uint32_t +join_request_retransmit_timeout() +{ + return (uint32_t)(MIN(as_hb_tx_interval_get() / 2, quantum_interval() / 2)); +} + +/** + * The interval at which a node checks to see if it should join a cluster. + */ +static uint32_t +join_cluster_check_interval() +{ + return timer_tick_interval(); +} + +/** + * Blackout period for join requests to a particular principal to prevent + * bombarding it with join requests. Should be less than join_request_timeout(). + */ +static uint32_t +join_request_blackout_interval() +{ + return MIN(join_request_timeout(), + MIN(quantum_interval() / 2, 2 * as_hb_tx_interval_get())); +} + +/** + * Blackout period after sending a move command, during which join requests will + * be rejected. + */ +static uint32_t +join_request_move_reject_interval() +{ + // Wait for one quantum interval before accepting join requests after + // sending a move command. + return quantum_interval(); +} + +/** + * Maximum tolerable join request transmission delay in milliseconds. Join + * requests delayed by more than this amount will not be accepted. + */ +static uint32_t +join_request_accept_delay_max() +{ + // Join request is considered stale / delayed if the (received hlc timestamp + // - send hlc timestamp) > this value; + return (2 * as_hb_tx_interval_get() + network_latency_max()); +} + +/** + * Timeout in milliseconds for a paxos proposal. Give a paxos round two thirds + * of an interval to timeout. + * A paxos round should definitely timeout before the next quantum interval, so + * that it does not delay cluster convergence. + */ +static uint32_t +paxos_proposal_timeout() +{ + return MAX(quantum_interval() / 2, network_rtt_max()); +} + +/** + * Timeout in milliseconds after which a paxos message is retransmitted. + */ +static uint32_t +paxos_msg_timeout() +{ + return MAX(MIN(quantum_interval() / 4, 100), network_rtt_max()); +} + +/** + * Maximum amount of time a node will be in orphan state. 
After this timeout the + * node will try forming a new cluster even if there are other adjacent + * clusters/nodes visible. + */ +static uint32_t +clustering_orphan_timeout() +{ + return UINT_MAX; +} + +/* + * ---------------------------------------------------------------------------- + * Stack allocation + * ---------------------------------------------------------------------------- + */ + +/** + * Maximum memory size allocated on the call stack. + */ +#define STACK_ALLOC_LIMIT() (16 * 1024) + +/** + * Allocate a buffer on stack if possible. Larger buffers are heap allocated to + * prevent stack overflows. + */ +#define BUFFER_ALLOC_OR_DIE(size) \ +(((size) > STACK_ALLOC_LIMIT()) ? cf_malloc(size) : alloca(size)) + +/** + * Free the buffer allocated by BUFFER_ALLOC + */ +#define BUFFER_FREE(buffer, size) \ +if (((size) > STACK_ALLOC_LIMIT()) && buffer) {cf_free(buffer);} + +/* + * ---------------------------------------------------------------------------- + * Logging + * ---------------------------------------------------------------------------- + */ +#define LOG_LENGTH_MAX() (800) +#define CRASH(format, ...) cf_crash(AS_CLUSTERING, format, ##__VA_ARGS__) +#define WARNING(format, ...) cf_warning(AS_CLUSTERING, format, ##__VA_ARGS__) +#define INFO(format, ...) cf_info(AS_CLUSTERING, format, ##__VA_ARGS__) +#define DEBUG(format, ...) cf_debug(AS_CLUSTERING, format, ##__VA_ARGS__) +#define DETAIL(format, ...) cf_detail(AS_CLUSTERING, format, ##__VA_ARGS__) + +#ifdef TRACE_ENABLED +#define TRACE(format, ...) cf_detail(AS_CLUSTERING, format, ##__VA_ARGS__) +#else +#define TRACE(format, ...) +#endif + +#ifdef TRACE_ENABLED +#define TRACE_LOG(context, format, ...) cf_detail(context, format, ##__VA_ARGS__) +#else +#define TRACE_LOG(context, format, ...) +#endif + +#define CF_TRACE CF_FAULT_SEVERITY_UNDEF + +#define ASSERT(expression, message, ...) 
\ +if (!(expression)) {WARNING(message, ##__VA_ARGS__);} + +#define log_cf_node_array(message, nodes, node_count, severity) \ +as_clustering_log_cf_node_array(severity, AS_CLUSTERING, message, \ + nodes, node_count) +#define log_cf_node_vector(message, nodes, severity) \ + as_clustering_log_cf_node_vector(severity, AS_CLUSTERING, message, \ + nodes) + +/* + * ---------------------------------------------------------------------------- + * Vector functions + * ---------------------------------------------------------------------------- + */ + +/** + * Clear / delete all entries in a vector. + */ +static void +vector_clear(cf_vector* vector) +{ + cf_vector_delete_range(vector, 0, cf_vector_size(vector)); +} + +/** + * Create temporary stack variables. + */ +#define TOKEN_PASTE(x, y) x##y +#define STACK_VAR(x, y) TOKEN_PASTE(x, y) + +/** + * Initialize a lockless vector, initially sized to store cluster node number + * of elements. + */ +#define vector_lockless_init(vectorp, value_type) \ +({ \ + cf_vector_init(vectorp, sizeof(value_type), \ + AS_CLUSTERING_CLUSTER_MAX_SIZE_SOFT, VECTOR_FLAG_INITZERO); \ +}) + +/** + * Create and initialize a lockless stack allocated vector to initially sized to + * store cluster node number of elements. + */ +#define vector_stack_lockless_create(value_type) \ +({ \ + cf_vector * STACK_VAR(vector, __LINE__) = (cf_vector*)alloca( \ + sizeof(cf_vector)); \ + size_t buffer_size = AS_CLUSTERING_CLUSTER_MAX_SIZE_SOFT \ + * sizeof(value_type); \ + void* STACK_VAR(buff, __LINE__) = alloca(buffer_size); cf_vector_init_smalloc( \ + STACK_VAR(vector, __LINE__), sizeof(value_type), \ + (uint8_t*)STACK_VAR(buff, __LINE__), buffer_size, \ + VECTOR_FLAG_INITZERO); \ + STACK_VAR(vector, __LINE__); \ +}) + +/** + * Check two vector for equality. Two vector are euql if they have the same + * number of elements and corresponding elements are equal. For now simple + * memory compare is used to compare elements. 
Assumes the vectors are not + * accessed by other threads during this operation. + * + * @param v1 the first vector to compare. + * @param v2 the second vector to compare. + * @return true if the vectors are true, false otherwise. + */ +static bool +vector_equals(cf_vector* v1, cf_vector* v2) +{ + int v1_count = cf_vector_size(v1); + int v2_count = cf_vector_size(v2); + int v1_elem_sz = VECTOR_ELEM_SZ(v1); + int v2_elem_sz = VECTOR_ELEM_SZ(v2); + + if (v1_count != v2_count || v1_elem_sz != v2_elem_sz) { + return false; + } + + for (int i = 0; i < v1_count; i++) { + // No null check required since we are iterating under a lock and within + // vector bounds. + void* v1_element = cf_vector_getp(v1, i); + void* v2_element = cf_vector_getp(v2, i); + + if (v1_element == v2_element) { + // Same reference or both are NULL. + continue; + } + + if (v1_element == NULL || v2_element == NULL) { + // Exactly one reference is NULL. + return false; + } + + if (memcmp(v1_element, v2_element, v1_elem_sz) != 0) { + return false; + } + } + + return true; +} + +/** + * Find the index of an element in the vector. Equality is based on mem compare. + * + * @param vector the source vector. + * @param element the element to find. + * @return the index if the element is found, -1 otherwise. + */ +static int +vector_find(cf_vector* vector, void* element) +{ + int element_count = cf_vector_size(vector); + size_t value_len = VECTOR_ELEM_SZ(vector); + for (int i = 0; i < element_count; i++) { + // No null check required since we are iterating under a lock and within + // vector bounds. + void* src_element = cf_vector_getp(vector, i); + if (src_element) { + if (memcmp(element, src_element, value_len) == 0) { + return i; + } + } + } + return -1; +} + +/** + * Copy all elements form the source vector to the destination vector to the + * destination vector. Assumes the source and destination vector are not being + * modified while the copy operation is in progress. 
+ * + * @param dest the destination vector. + * @param src the source vector. + * @return the number of elements copied. + */ +static int +vector_copy(cf_vector* dest, cf_vector* src) +{ + int element_count = cf_vector_size(src); + int copied_count = 0; + for (int i = 0; i < element_count; i++) { + // No null check required since we are iterating under a lock and within + // vector bounds. + void* src_element = cf_vector_getp(src, i); + if (src_element) { + cf_vector_append(dest, src_element); + copied_count++; + } + } + return copied_count; +} + +/** + * Copy all elements form the source vector to the destination vector only if + * they do not exist in the destination vector. Assumes the source and + * destination vector are not being modified while the copy operation is in + * progress. + * + * @param dest the destination vector. + * @param src the source vector. + * @return the number of elements copied. + */ +static int +vector_copy_unique(cf_vector* dest, cf_vector* src) +{ + int element_count = cf_vector_size(src); + int copied_count = 0; + for (int i = 0; i < element_count; i++) { + // No null check required since we are iterating under a lock and within + // vector bounds. + void* src_element = cf_vector_getp(src, i); + if (src_element) { + cf_vector_append_unique(dest, src_element); + copied_count++; + } + } + return copied_count; +} + +/** + * Sorts in place the elements in the vector using the inout comparator function + * and retains only unique elements. Assumes the source vector is not being + * modified while the sort operation is in progress. + * + * @param src the source vector. 
+ * @return comparator the comparator function, which must return an integer less + * than, equal to, or greater than zero if the first argument is considered to + * be respectively less than, equal to, or greater than the second + */ +static void +vector_sort_unique(cf_vector* src, int +(*comparator)(const void*, const void*)) +{ + int element_count = cf_vector_size(src); + size_t value_len = VECTOR_ELEM_SZ(src); + size_t array_size = element_count * value_len; + void* element_array = BUFFER_ALLOC_OR_DIE(array_size); + + // A lame approach to sorting. Copying the elements to an array and invoking + // qsort. + uint8_t* next_element_ptr = element_array; + int array_element_count = 0; + for (int i = 0; i < element_count; i++) { + // No null check required since we are iterating under a lock and within + // vector bounds. + void* src_element = cf_vector_getp(src, i); + if (src_element) { + memcpy(next_element_ptr, src_element, value_len); + next_element_ptr += value_len; + array_element_count++; + } + } + + qsort(element_array, array_element_count, value_len, comparator); + + vector_clear(src); + next_element_ptr = element_array; + for (int i = 0; i < array_element_count; i++) { + cf_vector_append_unique(src, next_element_ptr); + next_element_ptr += value_len; + } + + BUFFER_FREE(element_array, array_size); + return; +} + +/** + * Remove all elements from the to_remove vector present in the target vector. + * Equality is based on simple mem compare. + * + * @param target the target vector being modified. + * @param to_remove the vector whose elements must be removed from the target. + * @return the number of elements removed. + */ +static int +vector_subtract(cf_vector* target, cf_vector* to_remove) +{ + int element_count = cf_vector_size(to_remove); + int removed_count = 0; + for (int i = 0; i < element_count; i++) { + // No null check required since we are iterating under a lock and within + // vector bounds. 
+ void* to_remove_element = cf_vector_getp(to_remove, i); + if (to_remove_element) { + int found_at = 0; + while ((found_at = vector_find(target, to_remove_element)) >= 0) { + cf_vector_delete(target, found_at); + removed_count++; + } + } + } + + return removed_count; +} + +/** + * Convert a vector to an array. + * FIXME: return pointer to the internal vector storage. + */ +static cf_node* +vector_to_array(cf_vector* vector) +{ + return (cf_node*)vector->vector; +} + +/** + * Copy elements in a vector to an array. + * @param array the destination array. Should be large enough to hold the number + * all elements in the vector. + * @param src the source vector. + * @param element_count the number of elements to copy from the source vector. + */ +static void +vector_array_cpy(void* array, cf_vector* src, int element_count) +{ + uint8_t* element_ptr = array; + int element_size = VECTOR_ELEM_SZ(src); + for (int i = 0; i < element_count; i++) { + cf_vector_get(src, i, element_ptr); + element_ptr += element_size; + } +} + +/* + * ---------------------------------------------------------------------------- + * Globals + * ---------------------------------------------------------------------------- + */ + +/** + * The big fat lock for all clustering state. + */ +static pthread_mutex_t g_clustering_lock = + PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP; + +/** + * The fat lock for all clustering events listener changes. + */ +static pthread_mutex_t g_clustering_event_publisher_lock = + PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP; + +/** + * Debugging lock acquition. + * #define LOCK_DEBUG_ENABLED 1 + */ +#ifdef LOCK_DEBUG_ENABLED +#define LOCK_DEBUG(format, ...) DEBUG(format, ##__VA_ARGS__) +#else +#define LOCK_DEBUG(format, ...) +#endif + +/** + * Acquire a lock on the clustering module. + */ +#define CLUSTERING_LOCK() \ +({ \ + pthread_mutex_lock (&g_clustering_lock); \ + LOCK_DEBUG("locked in %s", __FUNCTION__); \ +}) + +/** + * Relinquish the lock on the clustering module. 
+ */ +#define CLUSTERING_UNLOCK() \ +({ \ + pthread_mutex_unlock (&g_clustering_lock); \ + LOCK_DEBUG("unLocked in %s", __FUNCTION__); \ +}) + +/** + * Acquire a lock on the clustering publisher. + */ +#define CLUSTERING_EVENT_PUBLISHER_LOCK() \ +({ \ + pthread_mutex_lock (&g_clustering_event_publisher_lock); \ + LOCK_DEBUG("publisher locked in %s", __FUNCTION__); \ +}) + +/** + * Relinquish the lock on the clustering publisher. + */ +#define CLUSTERING_EVENT_PUBLISHER_UNLOCK() \ +({ \ + pthread_mutex_unlock (&g_clustering_event_publisher_lock); \ + LOCK_DEBUG("publisher unLocked in %s", __FUNCTION__); \ +}) + +/** + * Singleton timer. + */ +static as_clustering_timer g_timer; + +/** + * Singleton external events publisher. + */ +static as_clustering_external_event_publisher g_external_event_publisher; + +/** + * Singleton cluster register to store this node's cluster membership. + */ +static as_clustering_register g_register; + +/** + * Singleton clustering state, all initialized to zero. + */ +static as_clustering g_clustering = { 0 }; + +/** + * Singleton paxos proposer. + */ +static as_paxos_proposer g_proposer; + +/** + * Singleton paxos acceptor. + */ +static as_paxos_acceptor g_acceptor; + +/** + * Singleton quantum interval generator. + */ +static as_clustering_quantum_interval_generator g_quantum_interval_generator; + +/** + * Message template for clustering messages.
+ */ +static msg_template g_clustering_msg_template[] = { + +{ AS_CLUSTERING_MSG_ID, M_FT_UINT32 }, + +{ AS_CLUSTERING_MSG_TYPE, M_FT_UINT32 }, + +{ AS_CLUSTERING_MSG_HLC_TIMESTAMP, M_FT_UINT64 }, + +{ AS_CLUSTERING_MSG_SEQUENCE_NUMBER, M_FT_UINT64 }, + +{ AS_CLUSTERING_MSG_CLUSTER_KEY, M_FT_UINT64 }, + +{ AS_CLUSTERING_MSG_SUCCESSION_LIST, M_FT_BUF }, + +{ AS_CLUSTERING_MSG_PROPOSED_PRINCIPAL, M_FT_UINT64 } + +}; + +/* + * ---------------------------------------------------------------------------- + * Clustering life cycle + * ---------------------------------------------------------------------------- + */ + +/** + * Check if clustering is initialized. + */ +static bool +clustering_is_initialized() +{ + CLUSTERING_LOCK(); + bool initialized = (g_clustering.sys_state + != AS_CLUSTERING_SYS_STATE_UNINITIALIZED); + CLUSTERING_UNLOCK(); + return initialized; +} + +/** + * * Check if clustering is running. + */ +static bool +clustering_is_running() +{ + CLUSTERING_LOCK(); + bool running = g_clustering.sys_state == AS_CLUSTERING_SYS_STATE_RUNNING; + CLUSTERING_UNLOCK(); + return running; +} + +/* + * ---------------------------------------------------------------------------- + * Config related functions + * ---------------------------------------------------------------------------- + */ + +/** + * The nodeid for this node. + */ +static cf_node +config_self_nodeid_get() +{ + return g_config.self_node; +} + +/* + * ---------------------------------------------------------------------------- + * Compatibility mode functions + * ---------------------------------------------------------------------------- + */ + +/** + * Return current protocol version identifier. + */ +as_cluster_proto_identifier +clustering_protocol_identifier_get() +{ + return 0x707C; +} + +/** + * Compare clustering protocol versions for compatibility. 
+ */ +bool +clustering_versions_are_compatible(as_cluster_proto_identifier v1, + as_cluster_proto_identifier v2) +{ + return v1 == v2; +} + +/* + * ---------------------------------------------------------------------------- + * Timer event generator + * + * TODO: Can be abstracted out as a single scheduler single utility across + * modules. + * ---------------------------------------------------------------------------- + */ + +static void +timer_init() +{ + CLUSTERING_LOCK(); + memset(&g_timer, 0, sizeof(g_timer)); + CLUSTERING_UNLOCK(); +} + +/** + * Clustering timer event generator thread, to help with retries and retransmits + * across all states. + */ +static void* +timer_thr(void* arg) +{ + as_clustering_internal_event timer_event; + memset(&timer_event, 0, sizeof(timer_event)); + timer_event.type = AS_CLUSTERING_INTERNAL_EVENT_TIMER; + + while (clustering_is_running()) { + // Wait for a while and retry. + internal_event_dispatch(&timer_event); + usleep(timer_tick_interval() * 1000); + } + + return NULL; +} + +/** + * Start the timer. + */ +static void +timer_start() +{ + CLUSTERING_LOCK(); + if (pthread_create(&g_timer.timer_tid, 0, timer_thr, NULL) != 0) { + CRASH("could not create timer thread: %s", cf_strerror(errno)); + } + CLUSTERING_UNLOCK(); +} + +/** + * Stop the timer. + */ +static void +timer_stop() +{ + CLUSTERING_LOCK(); + pthread_join(g_timer.timer_tid, NULL); + CLUSTERING_UNLOCK(); +} + +/* + * ---------------------------------------------------------------------------- + * Heartbeat subsystem interfacing + * ---------------------------------------------------------------------------- + */ + +/* + * The structure of data clustring subsystem pushes with in hb pulse messages + * and retains as plugin data is as follows. + * + * Each row occupies 4 bytes. + * + * V5 heartbeat wire payload structure. 
+ * =============================== + * + * ------------|-------------|------------|------------| + * | Clustering Protocol identifier | + * |---------------------------------------------------| + * | | + * |-------- Cluster Key ------------------------------| + * | | + * |---------------------------------------------------| + * | | + * |-------- Paxos sequence number --------------------| + * | | + * |---------------------------------------------------| + * | | + * |-------- Preferred principal ----------------------| + * | | + * |---------------------------------------------------| + * | Length of succession list | + * |---------------------------------------------------| + * | | + * |-------- Succ. Node id 0 --------------------------| + * | | + * |---------------------------------------------------| + * | | + * |-------- Succ. Node id 1 --------------------------| + * | | + * |---------------------------------------------------| + * | . | + * | . | + * + * + * Cluster key and succession lists help with detecting cluster integrity. + * A plain cluster key should be good enough, but matching succession lists adds + * another level of safety (may not be required but being cautious). + * + * For an orphaned node the cluster key and the length of the succession list + * are set to zero. + * + * The parsed hb plugin data is just the same as the wire payload structure. + * The plugin code ensures invalid content will never be parsed as plugin data to + * memory. The direct implication is that if plugin data is not NULL, + * required fields + * - Clustering protocol identifier + * - Cluster key + * - Succession list length will always be present when read back from the + * heartbeat subsystem and the succession list will be consistent with the + * succession list length. + */ + +/** + * Read plugin data from hb layer for a node, using stack allocated space. + * Will attempt a max of 3 attempts before crashing.
+ * plugin_data_p->data_size will be zero and plugin_data_p->data will be NULL if + * an entry for the node does not exist. + */ +#define clustering_hb_plugin_data_get(nodeid, plugin_data_p, \ + hb_msg_hlc_ts_p, msg_recv_ts_p) \ +({ \ + (plugin_data_p)->data_capacity = 1024; \ + int tries_remaining = 3; \ + bool enoent = false; \ + bool rv = -1; \ + while (tries_remaining--) { \ + (plugin_data_p)->data = alloca((plugin_data_p)->data_capacity); \ + if (as_hb_plugin_data_get(nodeid, AS_HB_PLUGIN_CLUSTERING, \ + plugin_data_p, hb_msg_hlc_ts_p, msg_recv_ts_p) == 0) { \ + rv = 0; \ + break; \ + } \ + if (errno == ENOENT) { \ + enoent = true; \ + break; \ + } \ + if (errno == ENOMEM) { \ + (plugin_data_p)->data_capacity = (plugin_data_p)->data_size; \ + } \ + } \ + if (rv != 0 && !enoent && tries_remaining < 0) { \ + CRASH("error allocating space for paxos hb plugin data"); \ + } \ + if (enoent) { \ + (plugin_data_p)->data_size = 0; \ + (plugin_data_p)->data = NULL; \ + } \ + rv; \ +}) + +/** + * Get a pointer to the protocol identifier inside plugin data. Will be NULL if + * plugin data is null or there are not enough bytes in the data to hold the + * identifier. + * @param plugin_data can be NULL. + * @param plugin_data_size the size of plugin data. + * @return pointer to the protocol identifier on success, NULL on failure. + */ +static as_cluster_proto_identifier* +clustering_hb_plugin_proto_get(void* plugin_data, size_t plugin_data_size) +{ + if (plugin_data == NULL + || plugin_data_size < sizeof(as_cluster_proto_identifier)) { + // The data does not hold valid data or there is no cluster key and or + // succession list is missing. + return NULL; + } + + return (as_cluster_proto_identifier*)plugin_data; +} + +/** + * Retrieves the cluster key from clustering hb plugin data. + * @param plugin_data can be NULL. + * @param plugin_data_size the size of plugin data. + * @return pointer to the cluster key on success, NULL on failure. 
+ */ +static as_cluster_key* +clustering_hb_plugin_cluster_key_get(void* plugin_data, size_t plugin_data_size) +{ + uint8_t* proto = (uint8_t*)clustering_hb_plugin_proto_get(plugin_data, + plugin_data_size); + if (proto == NULL) { + // The data does not hold valid data. + return NULL; + } + + if ((uint8_t*)plugin_data + plugin_data_size + < proto + sizeof(as_cluster_proto_identifier) + + sizeof(as_cluster_key)) { + // Not enough bytes for cluster key. + return NULL; + } + + return (as_cluster_key*)(proto + sizeof(as_cluster_proto_identifier)); +} + +/** + * Retrieves the sequence number from clustering hb plugin data. + * @param plugin_data can be NULL. + * @param plugin_data_size the size of plugin data. + * @return pointer to the sequence number on success, NULL on failure. + */ +static as_paxos_sequence_number* +clustering_hb_plugin_sequence_number_get(void* plugin_data, + size_t plugin_data_size) +{ + uint8_t* cluster_key = (uint8_t*)clustering_hb_plugin_cluster_key_get( + plugin_data, plugin_data_size); + if (cluster_key == NULL) { + // The data does not hold valid data or there is no cluster key. + return NULL; + } + + if ((uint8_t*)plugin_data + plugin_data_size + < cluster_key + sizeof(as_cluster_key) + + sizeof(as_paxos_sequence_number)) { + // Not enough bytes for succession list length. + return NULL; + } + + return (as_paxos_sequence_number*)(cluster_key + sizeof(as_cluster_key)); +} + +/** + * Retrieves the preferred principal from clustering hb plugin data. + * @param plugin_data can be NULL. + * @param plugin_data_size the size of plugin data. + * @return pointer to the preferred principal on success, NULL on failure. 
+ */ +static cf_node* +clustering_hb_plugin_preferred_principal_get(void* plugin_data, + size_t plugin_data_size) +{ + uint8_t* sequence_number_p = + (uint8_t*)clustering_hb_plugin_sequence_number_get(plugin_data, + plugin_data_size); + if (sequence_number_p == NULL) { + // The data does not hold valid data or there is no sequence number. + return NULL; + } + + if ((uint8_t*)plugin_data + plugin_data_size + < sequence_number_p + sizeof(as_paxos_sequence_number) + + sizeof(cf_node)) { + // Not enough bytes for preferred principal. + return NULL; + } + + return (as_paxos_sequence_number*)(sequence_number_p + + sizeof(as_paxos_sequence_number)); +} + +/** + * Retrieves the succession list length pointer from clustering hb plugin data. + * @param plugin_data can be NULL. + * @param plugin_data_size the size of plugin data. + * @return pointer to succession list length on success, NULL on failure. + */ +static uint32_t* +clustering_hb_plugin_succession_length_get(void* plugin_data, + size_t plugin_data_size) +{ + uint8_t* preferred_principal_p = + (uint8_t*)clustering_hb_plugin_preferred_principal_get(plugin_data, + plugin_data_size); + if (preferred_principal_p == NULL) { + // The data does not hold valid data or there is no preferred principal + // and or succession list is missing. + return NULL; + } + + if ((uint8_t*)plugin_data + plugin_data_size + < preferred_principal_p + sizeof(cf_node) + sizeof(uint32_t)) { + // Not enough bytes for succession list length. + return NULL; + } + + return (uint32_t*)(preferred_principal_p + sizeof(cf_node)); +} + +/** + * Retrieves the pointer to the first node in the succession list. + * @param plugin_data can be NULL. + * @param plugin_data_size the size of plugin data. + * @return pointer to first node in succession list on success, NULL on failure + * or if the succession list is empty. 
+ */ +static cf_node* +clustering_hb_plugin_succession_get(void* plugin_data, size_t plugin_data_size) +{ + uint8_t* succession_list_length_p = + (uint8_t*)clustering_hb_plugin_succession_length_get(plugin_data, + plugin_data_size); + if (succession_list_length_p == NULL) { + // The data does not hold valid data or there is no cluster key and or + // succession list is missing. + return NULL; + } + + if (*(uint32_t*)succession_list_length_p == 0) { + // Empty succession list. + return NULL; + } + + if ((uint8_t*)plugin_data + plugin_data_size + < succession_list_length_p + sizeof(uint32_t) + + (sizeof(cf_node) * (*(uint32_t*)succession_list_length_p))) { + // Not enough bytes for succession list length. + return NULL; + } + + return (cf_node*)(succession_list_length_p + sizeof(uint32_t)); +} + +/** + * Validate the correctness of plugin data. By ensuring all required fields are + * present and the succession list matches the provided length. + * @param plugin_data can be NULL. + * @param plugin_data_size the size of plugin data. + * @return pointer to first node in succession list on success, NULL on failure. 
+ */ +static bool +clustering_hb_plugin_data_is_valid(void* plugin_data, size_t plugin_data_size) +{ + void* proto_identifier_p = clustering_hb_plugin_proto_get(plugin_data, + plugin_data_size); + if (proto_identifier_p == NULL) { + DEBUG("plugin data missing protocol identifier"); + return false; + } + + as_cluster_proto_identifier current_proto_identifier = + clustering_protocol_identifier_get(); + if (!clustering_versions_are_compatible(current_proto_identifier, + *(as_cluster_proto_identifier*)proto_identifier_p)) { + DEBUG("protocol versions incompatible - expected %"PRIx32" but was: %"PRIx32, + current_proto_identifier, + *(as_cluster_proto_identifier*)proto_identifier_p); + return false; + } + + void* cluster_key_p = clustering_hb_plugin_cluster_key_get(plugin_data, + plugin_data_size); + if (cluster_key_p == NULL) { + DEBUG("plugin data missing cluster key"); + return false; + } + + void* sequence_number_p = clustering_hb_plugin_sequence_number_get( + plugin_data, plugin_data_size); + if (sequence_number_p == NULL) { + DEBUG("plugin data missing sequence number"); + return false; + } + + void* preferred_principal_p = clustering_hb_plugin_preferred_principal_get( + plugin_data, plugin_data_size); + if (preferred_principal_p == NULL) { + DEBUG("plugin data missing preferred principal"); + return false; + } + + uint32_t* succession_list_length_p = + (void*)clustering_hb_plugin_succession_length_get(plugin_data, + plugin_data_size); + if (succession_list_length_p == NULL) { + DEBUG("plugin data missing succession list length"); + return false; + } + + void* succession_list_p = clustering_hb_plugin_succession_get(plugin_data, + plugin_data_size); + + if (*succession_list_length_p > 0 && succession_list_p == NULL) { + DEBUG("succession list length %d, but succession list is empty", + *succession_list_length_p); + return false; + } + + return true; +} + +/** + * Determines if the plugin data with hb subsystem is old to be ignored. 
+ * ALL access to plugin data should be vetted through this function. The plugin + * data is obsolete if it was send before the current cluster state or has a + * version mismatch. + * + * This is detemined by comparing the plugin data hb message hlc timestamp and + * monotonic timestamps with the cluster formation hlc and monotonic times. + * + * @param cluster_modified_hlc_ts the hlc timestamp when current cluster change + * happened. Sent to avoid locking in this function. + * @param cluster_modified_time the monotonic timestamp when current cluster + * change happened. Sento to avoid locking in this function. + * @param plugin_data the plugin data. + * @param plugin_data_size the size of plugin data. + * @param msg_recv_ts the monotonic timestamp for plugin data receive. + * @param hb_msg_hlc_ts the hlc timestamp for plugin data receive. + * @return true if plugin data is obsolete, false otherwise. + */ +static bool +clustering_hb_plugin_data_is_obsolete(as_hlc_timestamp cluster_modified_hlc_ts, + cf_clock cluster_modified_time, void* plugin_data, + size_t plugin_data_size, cf_clock msg_recv_ts, + as_hlc_msg_timestamp* hb_msg_hlc_ts) +{ + if (!clustering_hb_plugin_data_is_valid(plugin_data, plugin_data_size)) { + // Plugin data is invalid. Assume it to be obsolete. + // Seems like a redundant check but required in case clustering protocol + // was switched to an incompatible version. + return true; + } + + if (as_hlc_send_timestamp_order(cluster_modified_hlc_ts, hb_msg_hlc_ts) + != AS_HLC_HAPPENS_BEFORE) { + // Cluster formation time after message send or the order is unknown, + // assume cluster formation is after message send. the caller should + // ignore this message. 
+ return true; + } + + // HB data should be atleast after cluster formation time + one hb interval + // to send out our cluster state + one network delay for our information to + // reach the remote node + one hb interval for the other node to send out + // the his updated state + one network delay for the updated state to reach + // us. + if (cluster_modified_time + 2 * as_hb_tx_interval_get() + + 2 * g_config.fabric_latency_max_ms > msg_recv_ts) { + return true; + } + + return false; +} + +/** + * Indicates if the plugin data for a node indicates that it is an orphan node. + */ +static as_clustering_peer_node_state +clustering_hb_plugin_data_node_status(void* plugin_data, + size_t plugin_data_size) +{ + if (!clustering_hb_plugin_data_is_valid(plugin_data, plugin_data_size)) { + // Either we have not hb channel to this node or it has sen invalid + // plugin data. Assuming the cluster state is unknown. + return AS_NODE_UNKNOWN; + } + + as_cluster_key* cluster_key = clustering_hb_plugin_cluster_key_get( + plugin_data, plugin_data_size); + + if (*cluster_key == 0) { + return AS_NODE_ORPHAN; + } + + // Redundant paranoid check. + uint32_t* succession_list_length_p = + clustering_hb_plugin_succession_length_get(plugin_data, + plugin_data_size); + + if (*succession_list_length_p == 0) { + return AS_NODE_ORPHAN; + } + + return AS_NODE_CLUSTER_ASSIGNED; +} + +/** + * Push clustering payload into a heartbeat pulse message. The payload format is + * as described above. + */ +static void +clustering_hb_plugin_set_fn(msg* msg) +{ + if (!clustering_is_initialized()) { + // Clustering not initialized. Send no data at all. 
+ return; + } + + CLUSTERING_LOCK(); + + uint32_t cluster_size = cf_vector_size(&g_register.succession_list); + + size_t payload_size = + // For the paxos version identifier + sizeof(uint32_t) + // For cluster key + + sizeof(as_cluster_key) + // For sequence number + + sizeof(as_paxos_sequence_number) + // For preferred principal + + sizeof(cf_node) + // For succession list length. + + sizeof(uint32_t) + // For succession list. + + (sizeof(cf_node) * cluster_size); + + uint8_t* payload = alloca(payload_size); + + uint8_t* current_field_p = payload; + + // Set the paxos protocol identifier. + uint32_t protocol = clustering_protocol_identifier_get(); + memcpy(current_field_p, &protocol, sizeof(protocol)); + current_field_p += sizeof(protocol); + + // Set cluster key. + memcpy(current_field_p, &g_register.cluster_key, + sizeof(g_register.cluster_key)); + current_field_p += sizeof(g_register.cluster_key); + + // Set the sequence number. + memcpy(current_field_p, &g_register.sequence_number, + sizeof(g_register.sequence_number)); + current_field_p += sizeof(g_register.sequence_number); + + // Set the preferred principal. + memcpy(current_field_p, &g_clustering.preferred_principal, + sizeof(g_clustering.preferred_principal)); + current_field_p += sizeof(g_clustering.preferred_principal); + + // Set succession length + memcpy(current_field_p, &cluster_size, sizeof(cluster_size)); + current_field_p += sizeof(cluster_size); + + // Copy over the succession list. + cf_node* succession = (cf_node*)(current_field_p); + for (int i = 0; i < cluster_size; i++) { + cf_vector_get(&g_register.succession_list, i, &succession[i]); + } + + if (msg_set_buf(msg, AS_HB_MSG_PAXOS_DATA, payload, payload_size, + MSG_SET_COPY) != 0) { + CRASH("error setting succession list on msg"); + } + + CLUSTERING_UNLOCK(); +} + +/** + * Plugin parse function that copies the msg payload verbatim to a plugin data. 
+ */ +static void +clustering_hb_plugin_parse_data_fn(msg* msg, cf_node source, + as_hb_plugin_node_data* plugin_data) +{ + // Lockless check to prevent deadlocks. + if (g_clustering.sys_state == AS_CLUSTERING_SYS_STATE_UNINITIALIZED) { + // Ignore this heartbeat. + plugin_data->data_size = 0; + return; + } + + void* payload; + size_t payload_size; + + if (msg_get_buf(msg, AS_HB_MSG_PAXOS_DATA, (uint8_t**)&payload, + &payload_size, MSG_GET_DIRECT) != 0) { + cf_ticker_warning(AS_CLUSTERING, + "received empty clustering payload in heartbeat pulse from node %"PRIx64, + source); + plugin_data->data_size = 0; + return; + } + + // Validate and retain only valid plugin data. + if (!clustering_hb_plugin_data_is_valid(payload, payload_size)) { + cf_ticker_warning(AS_CLUSTERING, + "received invalid clustering payload in heartbeat pulse from node %"PRIx64, + source); + plugin_data->data_size = 0; + return; + } + + if (payload_size > plugin_data->data_capacity) { + // Round up to nearest multiple of block size to prevent very frequent + // reallocation. + size_t data_capacity = ((payload_size + HB_PLUGIN_DATA_BLOCK_SIZE - 1) + / HB_PLUGIN_DATA_BLOCK_SIZE) * HB_PLUGIN_DATA_BLOCK_SIZE; + + // Reallocate since we have outgrown existing capacity. + plugin_data->data = cf_realloc(plugin_data->data, data_capacity); + plugin_data->data_capacity = data_capacity; + } + + plugin_data->data_size = payload_size; + memcpy(plugin_data->data, payload, payload_size); +} + +/** + * Check if the input succession list from hb plugin data matches, with a + * succession list vector. + * @param succession_list the first succession list. + * @param succession_list_length the length of the succession list. + * @param succession_list_vector the second succession list as a vector. Should + * be protected from multithreaded access while this function is running. + * @return true if the succcession lists are equal, false otherwise. 
+ */ +bool +clustering_hb_succession_list_matches(cf_node* succession_list, + uint32_t succession_list_length, cf_vector* succession_list_vector) +{ + if (succession_list_length != cf_vector_size(succession_list_vector)) { + return false; + } + + for (uint32_t i = 0; i < succession_list_length; i++) { + cf_node* vector_element = cf_vector_getp(succession_list_vector, i); + if (vector_element == NULL || *vector_element != succession_list[i]) { + return false; + } + } + return true; +} + +/* + * ---------------------------------------------------------------------------- + * Quantum interval generator + * ---------------------------------------------------------------------------- + */ + +/** + * Time taken for the effect of a fault to get propogated via HB. + */ +static uint32_t +quantum_interval_hb_fault_comm_delay() +{ + return as_hb_tx_interval_get() + network_latency_max(); +} + +/** + * Quantum wait time after node arrived event. + */ +static uint32_t +quantum_interval_node_arrived_wait_time(as_clustering_quantum_fault* fault) +{ + return MIN(quantum_interval(), + (fault->last_event_ts - fault->event_ts) / 2 + + 2 * quantum_interval_hb_fault_comm_delay() + + quantum_interval() / 2); +} + +/** + * Quantum wait time after node departs. + */ +static uint32_t +quantum_interval_node_departed_wait_time(as_clustering_quantum_fault* fault) +{ + return MIN(quantum_interval(), + as_hb_node_timeout_get() + + 2 * quantum_interval_hb_fault_comm_delay() + + quantum_interval() / 4); +} + +/** + * Quantum wait time after a peer nodes adjacency changed. + */ +static uint32_t +quantum_interval_peer_adjacency_changed_wait_time( + as_clustering_quantum_fault* fault) +{ + return MIN(quantum_interval(), quantum_interval_hb_fault_comm_delay()); +} + +/** + * Quantum wait time after accepting a join request. 
+ */ +static uint32_t +quantum_interval_join_accepted_wait_time(as_clustering_quantum_fault* fault) +{ + // Ensure we wait for atleast one heartbeat interval to receive the latest + // heartbeat after the last join request and for other nodes to send their + // join requests as well. + return MIN(quantum_interval(), + (fault->last_event_ts - fault->event_ts) + + join_cluster_check_interval() + network_latency_max() + + as_hb_tx_interval_get()); +} + +/** + * Quantum wait time after principal node departs. + */ +static uint32_t +quantum_interval_principal_departed_wait_time( + as_clustering_quantum_fault* fault) +{ + // Anticipate an incoming join request from other orphaned cluster members. + return MIN(quantum_interval(), + as_hb_node_timeout_get() + + 2 * quantum_interval_hb_fault_comm_delay() + + MAX(quantum_interval() / 4, + quantum_interval_join_accepted_wait_time(fault))); +} + +/** + * Quantum wait time after seeing a cluster that might send us a join request. + */ +static uint32_t +quantum_interval_inbound_merge_candidate_wait_time( + as_clustering_quantum_fault* fault) +{ + return quantum_interval(); +} + +/** + * Quantum wait time after a cluster member has been orphaned. + */ +static uint32_t +quantum_interval_member_orphaned_wait_time(as_clustering_quantum_fault* fault) +{ + return quantum_interval(); +} + +/** + * Marks the current quantum interval as skipped. A kludge to allow quantum to + * allow quantum interval generator to mark quantum intervals as postponed. + */ +static void +quantum_interval_mark_postponed() +{ + CLUSTERING_LOCK(); + g_quantum_interval_generator.is_interval_postponed = true; + CLUSTERING_UNLOCK(); +} + +/** + * Update the vtable for a fault. 
+ */ +static void +quantum_interval_vtable_update(as_clustering_quantum_fault_type type, + char *fault_log_str, as_clustering_quantum_fault_wait_fn wait_fn) +{ + CLUSTERING_LOCK(); + g_quantum_interval_generator.vtable[type].fault_log_str = fault_log_str; + g_quantum_interval_generator.vtable[type].wait_fn = wait_fn; + CLUSTERING_UNLOCK(); +} + +/** + * Initialize quantum interval generator. + */ +static void +quantum_interval_generator_init() +{ + CLUSTERING_LOCK(); + memset(&g_quantum_interval_generator, 0, + sizeof(g_quantum_interval_generator)); + g_quantum_interval_generator.last_quantum_start_time = cf_getms(); + g_quantum_interval_generator.last_quantum_interval = quantum_interval(); + + // Initialize the vtable. + quantum_interval_vtable_update(QUANTUM_FAULT_NODE_ARRIVED, "node arrived", + quantum_interval_node_arrived_wait_time); + quantum_interval_vtable_update(QUANTUM_FAULT_NODE_DEPARTED, "node departed", + quantum_interval_node_departed_wait_time); + quantum_interval_vtable_update(QUANTUM_FAULT_PRINCIPAL_DEPARTED, + "principal departed", + quantum_interval_principal_departed_wait_time); + quantum_interval_vtable_update(QUANTUM_FAULT_PEER_ADJACENCY_CHANGED, + "peer adjacency changed", + quantum_interval_peer_adjacency_changed_wait_time); + quantum_interval_vtable_update(QUANTUM_FAULT_JOIN_ACCEPTED, + "join request accepted", quantum_interval_join_accepted_wait_time); + quantum_interval_vtable_update(QUANTUM_FAULT_INBOUND_MERGE_CANDIDATE_SEEN, + "merge candidate seen", + quantum_interval_inbound_merge_candidate_wait_time); + quantum_interval_vtable_update(QUANTUM_FAULT_CLUSTER_MEMBER_ORPHANED, + "member orphaned", quantum_interval_member_orphaned_wait_time); + + CLUSTERING_UNLOCK(); +} + +/** + * Get the earliest possible monotonic clock time the next quantum interval can + * start. + * + * Start quantum interval after the last update to any one of adjacency, + * pending_join_requests , neighboring_principals. 
The heuristic is that these + * should be stable to initiate cluster merge / join or cluster formation + * requests. + */ +static cf_clock +quantum_interval_earliest_start_time() +{ + CLUSTERING_LOCK(); + cf_clock fault_event_time = 0; + for (int i = 0; i < QUANTUM_FAULT_TYPE_SENTINEL; i++) { + if (g_quantum_interval_generator.fault[i].event_ts) { + fault_event_time = MAX(fault_event_time, + g_quantum_interval_generator.fault[i].event_ts + + g_quantum_interval_generator.vtable[i].wait_fn( + &g_quantum_interval_generator.fault[i])); + } + + TRACE("Fault:%s event_ts:%"PRIu64, + g_quantum_interval_generator.vtable[i].fault_log_str, + g_quantum_interval_generator.fault[i].event_ts); + } + + TRACE("Last Quantum interval:%"PRIu64, + g_quantum_interval_generator.last_quantum_start_time); + + cf_clock start_time = g_quantum_interval_generator.last_quantum_start_time + + quantum_interval(); + if (fault_event_time) { + // Ensure we have at least 1/2 quantum interval of separation between + // quantum intervals to give chance to multiple fault events that are + // resonably close in time. + start_time = MAX( + g_quantum_interval_generator.last_quantum_start_time + + quantum_interval() / 2, fault_event_time); + } + CLUSTERING_UNLOCK(); + + return start_time; +} + +/** + * Reset quantum interval fault. + * @param fault_type the fault type. + */ +static void +quantum_interval_fault_reset(as_clustering_quantum_fault_type fault_type) +{ + CLUSTERING_LOCK(); + memset(&g_quantum_interval_generator.fault[fault_type], 0, + sizeof(g_quantum_interval_generator.fault[fault_type])); + CLUSTERING_UNLOCK(); +} + +/** + * Update a fault event based on the current fault ts. + * @param fault the fault to update. + * @param fault_ts the new fault timestamp + * @param src_nodeid the fault causing nodeid, 0 if the nodeid is not known. 
+ */ +static void +quantum_interval_fault_update(as_clustering_quantum_fault_type fault_type, + cf_clock fault_ts, cf_node src_nodeid) +{ + CLUSTERING_LOCK(); + as_clustering_quantum_fault* fault = + &g_quantum_interval_generator.fault[fault_type]; + if (fault->event_ts == 0 + || fault_ts - fault->event_ts > quantum_interval() / 2) { + // Fault event detected first time in this quantum or we are seeing the + // effect of a different event more than half quantum apart. + fault->event_ts = fault_ts; + DETAIL("updated '%s' fault with ts %"PRIu64" for node %"PRIx64, + g_quantum_interval_generator.vtable[fault_type].fault_log_str, fault_ts, src_nodeid); + } + + fault->last_event_ts = fault_ts; + CLUSTERING_UNLOCK(); +} + +/** + * Reset the state for the next quantum interval. + */ +static void +quantum_interval_generator_reset(cf_clock last_quantum_start_time) +{ + CLUSTERING_LOCK(); + if (!g_quantum_interval_generator.is_interval_postponed) { + // Update last quantum interval. + g_quantum_interval_generator.last_quantum_interval = MAX(0, + last_quantum_start_time + - g_quantum_interval_generator.last_quantum_start_time); + + g_quantum_interval_generator.last_quantum_start_time = + last_quantum_start_time; + for (int i = 0; i < QUANTUM_FAULT_TYPE_SENTINEL; i++) { + quantum_interval_fault_reset(i); + } + } + g_quantum_interval_generator.is_interval_postponed = false; + + CLUSTERING_UNLOCK(); +} + +/** + * Handle timer event and generate a quantum internal event if required. 
+ */ +static void +quantum_interval_generator_timer_event_handle( + as_clustering_internal_event* timer_event) +{ + CLUSTERING_LOCK(); + cf_clock now = cf_getms(); + + cf_clock earliest_quantum_start_time = + quantum_interval_earliest_start_time(); + + cf_clock expected_quantum_start_time = + g_quantum_interval_generator.last_quantum_start_time + + g_quantum_interval_generator.last_quantum_interval; + + // Provide a buffer for current quantum interval to finish gracefully as + // long as it is less than half a quantum interval. + cf_clock quantum_wait_buffer = MIN( + earliest_quantum_start_time > expected_quantum_start_time ? + earliest_quantum_start_time - expected_quantum_start_time : + 0, g_quantum_interval_generator.last_quantum_interval / 2); + + // Fire quantum interval start event if it is time, or if we have skipped + // quantum interval start for more that the max skip number of intervals. + // Add a buffer of wait time to ensure we wait a bit more if we can cover + // the waiting time. + bool is_skippable = g_quantum_interval_generator.last_quantum_start_time + + (quantum_interval_skip_max() + 1) + * g_quantum_interval_generator.last_quantum_interval + + quantum_wait_buffer > now; + bool fire_quantum_event = earliest_quantum_start_time <= now + || !is_skippable; + CLUSTERING_UNLOCK(); + + if (fire_quantum_event) { + as_clustering_internal_event timer_event; + memset(&timer_event, 0, sizeof(timer_event)); + timer_event.type = AS_CLUSTERING_INTERNAL_EVENT_QUANTUM_INTERVAL_START; + timer_event.quantum_interval_is_skippable = is_skippable; + internal_event_dispatch(&timer_event); + + // Reset for next interval generation. + quantum_interval_generator_reset(now); + } +} + +/** + * Check if the interval generator has seen an adjacency fault in the current + * quantum interval. + * @return true if the quantum interval generator has seen an adjacency fault, + * false otherwise. 
+ */ +static bool +quantum_interval_is_adjacency_fault_seen() +{ + CLUSTERING_LOCK(); + bool is_fault_seen = + g_quantum_interval_generator.fault[QUANTUM_FAULT_NODE_ARRIVED].event_ts + || g_quantum_interval_generator.fault[QUANTUM_FAULT_NODE_DEPARTED].event_ts + || g_quantum_interval_generator.fault[QUANTUM_FAULT_PRINCIPAL_DEPARTED].event_ts; + CLUSTERING_UNLOCK(); + return is_fault_seen; +} + +/** + * Check if the interval generator has seen a peer node adjacency changed fault + * in current quantum interval. + * @return true if the quantum interval generator has seen a peer node adjacency + * changed fault, + * false otherwise. + */ +static bool +quantum_interval_is_peer_adjacency_fault_seen() +{ + CLUSTERING_LOCK(); + bool is_fault_seen = + g_quantum_interval_generator.fault[QUANTUM_FAULT_PEER_ADJACENCY_CHANGED].event_ts; + CLUSTERING_UNLOCK(); + return is_fault_seen; +} + +/** + * Update the fault time for this quantum on self heartbeat adjacency list + * change. + */ +static void +quantum_interval_generator_hb_event_handle( + as_clustering_internal_event* hb_event) +{ + CLUSTERING_LOCK(); + + cf_clock min_event_time[AS_HB_NODE_EVENT_SENTINEL]; + cf_clock min_event_node[AS_HB_NODE_EVENT_SENTINEL]; + + memset(min_event_time, 0, sizeof(min_event_time)); + memset(min_event_node, 0, sizeof(min_event_node)); + + as_hb_event_node* events = hb_event->hb_events; + for (int i = 0; i < hb_event->hb_n_events; i++) { + if (min_event_time[events[i].evt] == 0 + || min_event_time[events[i].evt] > events[i].event_time) { + min_event_time[events[i].evt] = events[i].event_time; + min_event_node[events[i].evt] = events[i].nodeid; + } + + if (events[i].evt == AS_HB_NODE_DEPART + && clustering_is_our_principal(events[i].nodeid)) { + quantum_interval_fault_update(QUANTUM_FAULT_PRINCIPAL_DEPARTED, + events[i].event_time, events[i].nodeid); + } + } + + for (int i = 0; i < AS_HB_NODE_EVENT_SENTINEL; i++) { + if (min_event_time[i]) { + switch (i) { + case AS_HB_NODE_ARRIVE: + 
quantum_interval_fault_update(QUANTUM_FAULT_NODE_ARRIVED, + min_event_time[i], min_event_node[i]); + break; + case AS_HB_NODE_DEPART: + quantum_interval_fault_update(QUANTUM_FAULT_NODE_DEPARTED, + min_event_time[i], min_event_node[i]); + break; + case AS_HB_NODE_ADJACENCY_CHANGED: + if (clustering_is_cluster_member(min_event_node[i])) { + quantum_interval_fault_update( + QUANTUM_FAULT_PEER_ADJACENCY_CHANGED, + min_event_time[i], min_event_node[i]); + } + break; + default: + break; + } + + } + } + CLUSTERING_UNLOCK(); +} + +/** + * Update the fault time for this quantum on clustering information for an + * adjacent node change. Assumes the node's plugin data is not obsolete. + */ +static void +quantum_interval_generator_hb_plugin_data_changed_handle( + as_clustering_internal_event* change_event) +{ + CLUSTERING_LOCK(); + + if (clustering_hb_plugin_data_is_obsolete( + g_register.cluster_modified_hlc_ts, + g_register.cluster_modified_time, change_event->plugin_data->data, + change_event->plugin_data->data_size, + change_event->plugin_data_changed_ts, + &change_event->plugin_data_changed_hlc_ts)) { + // The plugin data is obsolete. Can't take decisions based on it. + goto Exit; + } + + // Get the changed node's succession list, cluster key. All the fields + // should be present since the obsolete check also checked for fields being + // valid. + cf_node* succession_list_p = clustering_hb_plugin_succession_get( + change_event->plugin_data->data, + change_event->plugin_data->data_size); + uint32_t* succession_list_length_p = + clustering_hb_plugin_succession_length_get( + change_event->plugin_data->data, + change_event->plugin_data->data_size); + + if (*succession_list_length_p > 0 + && !clustering_is_our_principal(succession_list_p[0]) + && clustering_is_principal()) { + if (succession_list_p[0] < config_self_nodeid_get()) { + // We are seeing a new principal who could potentially merge with + // this cluster. 
+ if (g_quantum_interval_generator.fault[QUANTUM_FAULT_INBOUND_MERGE_CANDIDATE_SEEN].event_ts + != 1) { + quantum_interval_fault_update( + QUANTUM_FAULT_INBOUND_MERGE_CANDIDATE_SEEN, cf_getms(), + change_event->plugin_data_changed_nodeid); + } + } + else { + // We see a cluster with higher nodeid and most probably we will not + // be the principal of the merged cluster. Reset the fault + // timestamp, however set it to 1 to differentiate between no fault + // and a fault to be ingnored in this quantum interval. A value of 1 + // for practical purposes will never push the quantum interval + // forward. + quantum_interval_fault_update( + QUANTUM_FAULT_INBOUND_MERGE_CANDIDATE_SEEN, 1, + change_event->plugin_data_changed_nodeid); + } + } + else { + if (clustering_is_principal() && *succession_list_length_p == 0 + && vector_find(&g_register.succession_list, + &change_event->plugin_data_changed_nodeid) >= 0) { + // One of our cluster members switched to orphan state. Most likely + // a quick restart. + quantum_interval_fault_update(QUANTUM_FAULT_CLUSTER_MEMBER_ORPHANED, + cf_getms(), change_event->plugin_data_changed_nodeid); + } + else { + // A node becoming an orphan node or seeing a succession with our + // principal does not mean we have seen a new cluster. + } + } +Exit: + CLUSTERING_UNLOCK(); +} + +/** + * Update the fault time for this quantum on self heartbeat adjacency list + * change. + */ +static void +quantum_interval_generator_join_request_accepted_handle( + as_clustering_internal_event* join_request_event) +{ + quantum_interval_fault_update(QUANTUM_FAULT_JOIN_ACCEPTED, cf_getms(), + join_request_event->join_request_source_nodeid); +} + +/** + * Dispatch internal clustering events for the quantum interval generator. 
+ */ +static void +quantum_interval_generator_event_dispatch(as_clustering_internal_event* event) +{ + switch (event->type) { + case AS_CLUSTERING_INTERNAL_EVENT_TIMER: + quantum_interval_generator_timer_event_handle(event); + break; + case AS_CLUSTERING_INTERNAL_EVENT_HB: + quantum_interval_generator_hb_event_handle(event); + break; + case AS_CLUSTERING_INTERNAL_EVENT_HB_PLUGIN_DATA_CHANGED: + quantum_interval_generator_hb_plugin_data_changed_handle(event); + break; + case AS_CLUSTERING_INTERNAL_EVENT_JOIN_REQUEST_ACCEPTED: + quantum_interval_generator_join_request_accepted_handle(event); + break; + default: + break; + } +} + +/** + * Start quantum interval generator. + */ +static void +quantum_interval_generator_start() +{ + CLUSTERING_LOCK(); + g_quantum_interval_generator.last_quantum_start_time = cf_getms(); + CLUSTERING_UNLOCK(); +} + +/* + * ---------------------------------------------------------------------------- + * Clustering common + * ---------------------------------------------------------------------------- + */ + +/** + * Generate a new random and most likely a unique cluster key. + * @param current_cluster_key current cluster key to prevent collision. + * @return randomly generated cluster key. + */ +static as_cluster_key +clustering_cluster_key_generate(as_cluster_key current_cluster_key) +{ + // Generate one uuid and use this for the cluster key + as_cluster_key cluster_key = 0; + + // Generate a non-zero cluster key that fits in 6 bytes. + while ((cluster_key = (cf_get_rand64() >> 16)) == 0 + || cluster_key == current_cluster_key) { + ; + } + + return cluster_key; +} + +/** + * Indicates if this node is an orphan. A node is deemed orphan if it is not a + * memeber of any cluster. 
+ */ +static bool +clustering_is_orphan() +{ + CLUSTERING_LOCK(); + + bool is_orphan = cf_vector_size(&g_register.succession_list) <= 0 + || g_register.cluster_key == 0; + + CLUSTERING_UNLOCK(); + + return is_orphan; +} + +/** + * Return the principal node for current cluster. + * @param principal (output) the current principal for the cluster. + * @return 0 if there is a valid principal, -1 if the node is in orphan state + * and there is no valid principal. + */ +static int +clustering_principal_get(cf_node* principal) +{ + CLUSTERING_LOCK(); + int rv = -1; + + if (cf_vector_get(&g_register.succession_list, 0, principal) == 0) { + rv = 0; + } + + CLUSTERING_UNLOCK(); + + return rv; +} + +/** + * Indicates if this node is the principal for its cluster. + */ +static bool +clustering_is_principal() +{ + CLUSTERING_LOCK(); + cf_node current_principal; + + bool is_principal = clustering_principal_get(¤t_principal) == 0 + && current_principal == config_self_nodeid_get(); + + CLUSTERING_UNLOCK(); + + return is_principal; +} + +/** + * Indicates if input node is this node's principal. Input node can be self node + * as well. + */ +static bool +clustering_is_our_principal(cf_node nodeid) +{ + CLUSTERING_LOCK(); + cf_node current_principal; + + bool is_principal = clustering_principal_get(¤t_principal) == 0 + && current_principal == nodeid; + + CLUSTERING_UNLOCK(); + + return is_principal; +} + +/** + * Indicates if a node is our cluster member. + */ +static bool +clustering_is_cluster_member(cf_node nodeid) +{ + CLUSTERING_LOCK(); + bool is_member = vector_find(&g_register.succession_list, &nodeid) >= 0; + CLUSTERING_UNLOCK(); + return is_member; +} + +/** + * Indicates if the input node is present in a succession list. + * @param nodeid the nodeid to search. + * @param succession_list the succession list. + * @param succession_list_length the length of the succession list. + * @return true if the node is present in the succession list, false otherwise. 
+ */ +static bool +clustering_is_node_in_succession(cf_node nodeid, cf_node* succession_list, + int succession_list_length) +{ + for (int i = 0; i < succession_list_length; i++) { + if (succession_list[i] == nodeid) { + return true; + } + } + + return false; +} + +/** + * Indicates if the input node can be accepted as this a paxos proposer. We can + * accept the new node as our principal if we are in the orphan state or if the + * input node is already our principal. + * + * Note: In case we send a join request to a node with a lower node id, input + * node's nodeid can be less than our nodeid. This is still valid as the + * proposer who will hand over the principalship to us once paxos round is over. + * + * @param nodeid the nodeid of the proposer to check. + * @return true if this input node is an acceptable proposer. + */ +static bool +clustering_can_accept_as_proposer(cf_node nodeid) +{ + return clustering_is_orphan() || clustering_is_our_principal(nodeid); +} + +/** + * Plugin data iterate function that finds and collects neighboring principals, + * excluding current principal if any . + */ +static void +clustering_neighboring_principals_find(cf_node nodeid, void* plugin_data, + size_t plugin_data_size, cf_clock recv_monotonic_ts, + as_hlc_msg_timestamp* msg_hlc_ts, void* udata) +{ + cf_vector* neighboring_principals = (cf_vector*)udata; + + CLUSTERING_LOCK(); + + // For determining neighboring principal it is alright if this data is + // within two heartbeat intervals. So obsolete check has the timestamps as + // zero. This way we will not reject principals that have nothing to do with + // our cluster changes. 
+ if (recv_monotonic_ts + 2 * as_hb_tx_interval_get() >= cf_getms() + && !clustering_hb_plugin_data_is_obsolete(0, 0, plugin_data, + plugin_data_size, recv_monotonic_ts, msg_hlc_ts)) { + cf_node* succession_list = clustering_hb_plugin_succession_get( + plugin_data, plugin_data_size); + + uint32_t* succession_list_length_p = + clustering_hb_plugin_succession_length_get(plugin_data, + plugin_data_size); + + if (succession_list != NULL && succession_list_length_p != NULL + && *succession_list_length_p > 0 + && succession_list[0] != config_self_nodeid_get()) { + cf_vector_append_unique(neighboring_principals, + &succession_list[0]); + } + } + else { + DETAIL( + "neighboring principal check skipped - found obsolete plugin data for node %"PRIx64, + nodeid); + } + + CLUSTERING_UNLOCK(); +} + +/** + * Get a list of adjacent principal nodes ordered by descending nodeids. + */ +static void +clustering_neighboring_principals_get(cf_vector* neighboring_principals) +{ + CLUSTERING_LOCK(); + + // Use a single iteration over the clustering data received via the + // heartbeats instead of individual calls to get a consistent view and avoid + // small lock and releases. + as_hb_plugin_data_iterate_all(AS_HB_PLUGIN_CLUSTERING, + clustering_neighboring_principals_find, neighboring_principals); + + vector_sort_unique(neighboring_principals, cf_node_compare_desc); + + CLUSTERING_UNLOCK(); +} + +/** + * Find dead nodes in current succession list. + */ +static void +clustering_dead_nodes_find(cf_vector* dead_nodes) +{ + CLUSTERING_LOCK(); + + cf_vector* succession_list_p = &g_register.succession_list; + int succession_list_count = cf_vector_size(succession_list_p); + for (int i = 0; i < succession_list_count; i++) { + // No null check required since we are iterating under a lock and within + // vector bounds. 
+ cf_node cluster_member_nodeid = *((cf_node*)cf_vector_getp( + succession_list_p, i)); + + if (!as_hb_is_alive(cluster_member_nodeid)) { + cf_vector_append(dead_nodes, &cluster_member_nodeid); + } + } + + CLUSTERING_UNLOCK(); +} + +/** + * Indicates if a node is faulty. A node in the succecssion list deemed faulty + * - if the node is alive and it reports to be an orphan or is part of some + * other cluster. + * - if the node is alive its clustering protocol identifier does not match this + * node's clustering protocol identifier. + */ +static bool +clustering_node_is_faulty(cf_node nodeid) +{ + if (nodeid == config_self_nodeid_get()) { + // Self node is never faulty wrt clustering. + return false; + } + + CLUSTERING_LOCK(); + bool is_faulty = false; + as_hlc_msg_timestamp hb_msg_hlc_ts; + cf_clock msg_recv_ts = 0; + as_hb_plugin_node_data plugin_data = { 0 }; + + if (clustering_hb_plugin_data_get(nodeid, &plugin_data, &hb_msg_hlc_ts, + &msg_recv_ts) != 0 + || clustering_hb_plugin_data_is_obsolete( + g_register.cluster_modified_hlc_ts, + g_register.cluster_modified_time, plugin_data.data, + plugin_data.data_size, msg_recv_ts, &hb_msg_hlc_ts)) { + INFO( + "faulty check skipped - found obsolete plugin data for node %"PRIx64, + nodeid); + is_faulty = false; + goto Exit; + } + + // We have clustering data from the node after the current cluster change. + // Compare protocol identifier, clusterkey, and succession. + as_cluster_proto_identifier* proto_p = clustering_hb_plugin_proto_get( + plugin_data.data, plugin_data.data_size); + + if (proto_p == NULL + || !clustering_versions_are_compatible(*proto_p, + clustering_protocol_identifier_get())) { + DEBUG("for node %"PRIx64" protocol version mismatch - expected: %"PRIx32" but was : %"PRIx32, + nodeid, clustering_protocol_identifier_get(), + proto_p != NULL ? 
*proto_p : 0); + is_faulty = true; + goto Exit; + } + + as_cluster_key* cluster_key_p = clustering_hb_plugin_cluster_key_get( + plugin_data.data, plugin_data.data_size); + if (cluster_key_p == NULL || *cluster_key_p != g_register.cluster_key) { + DEBUG("for node %"PRIx64" cluster key mismatch - expected: %"PRIx64" but was : %"PRIx64, + nodeid, g_register.cluster_key, cluster_key_p != NULL ? *cluster_key_p : 0); + is_faulty = true; + goto Exit; + } + + // Check succession list just to be sure. + // We have clustering data from the node after the current cluster change. + cf_node* succession_list = clustering_hb_plugin_succession_get( + plugin_data.data, plugin_data.data_size); + + uint32_t* succession_list_length_p = + clustering_hb_plugin_succession_length_get(plugin_data.data, + plugin_data.data_size); + + if (succession_list == NULL || succession_list_length_p == NULL + || !clustering_hb_succession_list_matches(succession_list, + *succession_list_length_p, &g_register.succession_list)) { + INFO("for node %"PRIx64" succession list mismatch", nodeid); + + log_cf_node_vector("self succession list:", &g_register.succession_list, + CF_INFO); + + if (succession_list) { + log_cf_node_array("node succession list:", succession_list, + succession_list && succession_list_length_p ? + *succession_list_length_p : 0, CF_INFO); + } + else { + INFO("node succession list: (empty)"); + } + + is_faulty = true; + goto Exit; + } + +Exit: + CLUSTERING_UNLOCK(); + return is_faulty; +} + +/** + * Find "faulty" nodes in current succession list. + */ +static void +clustering_faulty_nodes_find(cf_vector* faulty_nodes) +{ + CLUSTERING_LOCK(); + + if (clustering_is_orphan()) { + goto Exit; + } + + cf_vector* succession_list_p = &g_register.succession_list; + int succession_list_count = cf_vector_size(succession_list_p); + for (int i = 0; i < succession_list_count; i++) { + // No null check required since we are iterating under a lock and within + // vector bounds. 
+ cf_node cluster_member_nodeid = *((cf_node*)cf_vector_getp( + succession_list_p, i)); + if (clustering_node_is_faulty(cluster_member_nodeid)) { + cf_vector_append(faulty_nodes, &cluster_member_nodeid); + } + } + +Exit: + CLUSTERING_UNLOCK(); +} + +/** + * Indicates if a node is in sync with this node's cluster. A node in the + * succecssion list is deemed in sync if the node is alive and it reports to be + * in the same cluster via its heartbeats. + */ +static bool +clustering_node_is_sync(cf_node nodeid) +{ + if (nodeid == config_self_nodeid_get()) { + // Self node is always in sync wrt clustering. + return true; + } + + CLUSTERING_LOCK(); + bool is_sync = false; + as_hlc_msg_timestamp hb_msg_hlc_ts; + cf_clock msg_recv_ts = 0; + as_hb_plugin_node_data plugin_data = { 0 }; + bool data_exists = + clustering_hb_plugin_data_get(nodeid, &plugin_data, &hb_msg_hlc_ts, + &msg_recv_ts) == 0; + + // Latest valid plugin data is ok as long as other checks are met. Hence the + // timestamps are zero. + if (!data_exists || msg_recv_ts + 2 * as_hb_tx_interval_get() < cf_getms() + || clustering_hb_plugin_data_is_obsolete(0, 0, plugin_data.data, + plugin_data.data_size, msg_recv_ts, &hb_msg_hlc_ts)) { + is_sync = false; + goto Exit; + } + + // We have clustering data from the node after the current cluster change. + // Compare protocol identifier, clusterkey, and succession. + as_cluster_proto_identifier* proto_p = clustering_hb_plugin_proto_get( + plugin_data.data, plugin_data.data_size); + + if (proto_p == NULL + || !clustering_versions_are_compatible(*proto_p, + clustering_protocol_identifier_get())) { + DEBUG( + "for node %"PRIx64" protocol version mismatch - expected: %"PRIx32" but was : %"PRIx32, + nodeid, clustering_protocol_identifier_get(), + proto_p != NULL ? 
*proto_p : 0); + is_sync = false; + goto Exit; + } + + as_cluster_key* cluster_key_p = clustering_hb_plugin_cluster_key_get( + plugin_data.data, plugin_data.data_size); + if (cluster_key_p == NULL || *cluster_key_p != g_register.cluster_key) { + DEBUG( + "for node %"PRIx64" cluster key mismatch - expected: %"PRIx64" but was : %"PRIx64, + nodeid, g_register.cluster_key, cluster_key_p != NULL ? *cluster_key_p : 0); + is_sync = false; + goto Exit; + } + + // Check succession list just to be sure. + // We have clustering data from the node after the current cluster change. + cf_node* succession_list = clustering_hb_plugin_succession_get( + plugin_data.data, plugin_data.data_size); + + uint32_t* succession_list_length_p = + clustering_hb_plugin_succession_length_get(plugin_data.data, + plugin_data.data_size); + + if (succession_list == NULL || succession_list_length_p == NULL + || !clustering_hb_succession_list_matches(succession_list, + *succession_list_length_p, &g_register.succession_list)) { + DEBUG("for node %"PRIx64" succession list mismatch", nodeid); + + log_cf_node_vector("self succession list:", &g_register.succession_list, + CF_DEBUG); + + if (succession_list) { + log_cf_node_array("node succession list:", succession_list, + succession_list && succession_list_length_p ? + *succession_list_length_p : 0, CF_DEBUG); + } + else { + DEBUG("node succession list: (empty)"); + } + + is_sync = false; + goto Exit; + } + + is_sync = true; + +Exit: + CLUSTERING_UNLOCK(); + return is_sync; +} + +/** + * Find orphan nodes using clustering data for each node in the heartbeat's + * adjacency list. + */ +static void +clustering_orphan_nodes_find(cf_node nodeid, void* plugin_data, + size_t plugin_data_size, cf_clock recv_monotonic_ts, + as_hlc_msg_timestamp* msg_hlc_ts, void* udata) +{ + cf_vector* orphans = udata; + + CLUSTERING_LOCK(); + + // For determining orphan it is alright if this data is within two heartbeat + // intervals. 
So obsolete check has the timestamps as zero. + if (recv_monotonic_ts + 2 * as_hb_tx_interval_get() >= cf_getms() + && !clustering_hb_plugin_data_is_obsolete(0, 0, plugin_data, + plugin_data_size, recv_monotonic_ts, msg_hlc_ts)) { + if (clustering_hb_plugin_data_node_status(plugin_data, plugin_data_size) + == AS_NODE_ORPHAN) { + cf_vector_append(orphans, &nodeid); + } + + } + else { + DETAIL( + "orphan check skipped - found obsolete plugin data for node %"PRIx64, + nodeid); + } + + CLUSTERING_UNLOCK(); +} + +/** + * Get a list of neighboring nodes that are orphans. Does not include self node. + */ +static void +clustering_neighboring_orphans_get(cf_vector* neighboring_orphans) +{ + CLUSTERING_LOCK(); + + // Use a single iteration over the clustering data received via the + // heartbeats instead of individual calls to get a consistent view and avoid + // small lock and release. + as_hb_plugin_data_iterate_all(AS_HB_PLUGIN_CLUSTERING, + clustering_orphan_nodes_find, neighboring_orphans); + + CLUSTERING_UNLOCK(); +} + +/** + * Find neighboring nodes using clustering data for each node in the heartbeat's + * adjacency list. + */ +static void +clustering_neighboring_nodes_find(cf_node nodeid, void* plugin_data, + size_t plugin_data_size, cf_clock recv_monotonic_ts, + as_hlc_msg_timestamp* msg_hlc_ts, void* udata) +{ + cf_vector* nodes = udata; + cf_vector_append(nodes, &nodeid); +} + +/** + * Get a list of all neighboring nodes. Does not include self node. + */ +static void +clustering_neighboring_nodes_get(cf_vector* neighboring_nodes) +{ + CLUSTERING_LOCK(); + + // Use a single iteration over the clustering data received via the + // heartbeats instead of individual calls to get a consistent view and avoid + // small lock and release. + as_hb_plugin_data_iterate_all(AS_HB_PLUGIN_CLUSTERING, + clustering_neighboring_nodes_find, neighboring_nodes); + + CLUSTERING_UNLOCK(); +} + +/** + * Evict nodes not forming a clique from the succession list. 
+ */ +static uint32_t +clustering_succession_list_clique_evict(cf_vector* succession_list, + char* evict_msg) +{ + uint32_t num_evicted = 0; + if (g_config.clustering_config.clique_based_eviction_enabled) { + // Remove nodes that do not form a clique. + cf_vector* evicted_nodes = vector_stack_lockless_create(cf_node); + as_hb_maximal_clique_evict(succession_list, evicted_nodes); + num_evicted = cf_vector_size(evicted_nodes); + log_cf_node_vector(evict_msg, evicted_nodes, + num_evicted > 0 ? CF_INFO : CF_DEBUG); + + vector_subtract(succession_list, evicted_nodes); + cf_vector_destroy(evicted_nodes); + } + return num_evicted; +} + +/* + * ---------------------------------------------------------------------------- + * Clustering network message functions + * ---------------------------------------------------------------------------- + */ + +/** + * Fill common source node specific fields for the message. + * @param msg the message to fill the source fields into. + */ +static void +msg_src_fields_fill(msg* msg) +{ + // Set the hb protocol id / version. + if (msg_set_uint32(msg, AS_CLUSTERING_MSG_ID, + clustering_protocol_identifier_get()) != 0) { + CRASH("error setting clustering protocol on msg"); + } + + // Set the send timestamp + if (msg_set_uint64(msg, AS_CLUSTERING_MSG_HLC_TIMESTAMP, + as_hlc_timestamp_now()) != 0) { + CRASH("error setting send timestamp on msg"); + } +} + +/** + * Read the protocol identifier for this clustering message. These functions can + * get called multiple times for a single message. Hence they do not increment + * error counters. + * @param msg the incoming message. + * @param id the output id. + * @return 0 if the type could be parsed -1 on failure. + */ +static int +msg_proto_id_get(msg* msg, uint32_t* id) +{ + if (msg_get_uint32(msg, AS_CLUSTERING_MSG_ID, id) != 0) { + return -1; + } + + return 0; +} + +/** + * Read the message type. These functions can get called multiple times for a + * single message. 
Hence they do not increment error counters. + * @param msg the incoming message. + * @param type the output message type. + * @return 0 if the type could be parsed -1 on failure. + */ +static int +msg_type_get(msg* msg, as_clustering_msg_type* type) +{ + if (msg_get_uint32(msg, AS_CLUSTERING_MSG_TYPE, type) != 0) { + return -1; + } + + return 0; +} + +/** + * Set the type for an outgoing message. + * @param msg the outgoing message. + * @param msg_type the type to set. + */ +static void +msg_type_set(msg* msg, as_clustering_msg_type msg_type) +{ + // Set the message type. + if (msg_set_uint32(msg, AS_CLUSTERING_MSG_TYPE, msg_type) != 0) { + CRASH("error setting type on msg"); + } +} + +/** + * Read the proposed principal field from the message. + * @param msg the incoming message. + * @param nodeid the output nodeid. + * @return 0 if the type could be parsed -1 on failure. + */ +static int +msg_proposed_principal_get(msg* msg, cf_node* nodeid) +{ + if (msg_get_uint64(msg, AS_CLUSTERING_MSG_PROPOSED_PRINCIPAL, nodeid) + != 0) { + return -1; + } + + return 0; +} + +/** + * Set the proposed principal field in the message. + * @param msg the outgoing message. + * @param nodeid the proposed principal nodeid. + */ +static void +msg_proposed_principal_set(msg* msg, cf_node nodeid) +{ + if (msg_set_uint64(msg, AS_CLUSTERING_MSG_PROPOSED_PRINCIPAL, nodeid) + != 0) { + CRASH("error setting proposed principal"); + } +} + +/** + * Read the HLC send timestamp for the message. These functions can get called + * multiple times for a single message. Hence they do not increment error + * counters. + * @param msg the incoming message. + * @param send_ts the output hls timestamp. + * @return 0 if the type could be parsed -1 on failure. + */ +static int +msg_send_ts_get(msg* msg, as_hlc_timestamp* send_ts) +{ + if (msg_get_uint64(msg, AS_CLUSTERING_MSG_HLC_TIMESTAMP, send_ts) != 0) { + return -1; + } + + return 0; +} + +/** + * Set the sequence number for an outgoing message. 
+ * @param msg the outgoing message. + * @param sequence_number the sequence number to set. + */ +static void +msg_sequence_number_set(msg* msg, as_paxos_sequence_number sequence_number) +{ + // Set the message type. + if (msg_set_uint64(msg, AS_CLUSTERING_MSG_SEQUENCE_NUMBER, sequence_number) + != 0) { + CRASH("error setting sequence number on msg"); + } +} + +/** + * Read sequence number from the message. + * @param msg the incoming message. + * @param sequence_number the output sequence number. + * @return 0 if the sequence number could be parsed -1 on failure. + */ +static int +msg_sequence_number_get(msg* msg, as_paxos_sequence_number* sequence_number) +{ + if (msg_get_uint64(msg, AS_CLUSTERING_MSG_SEQUENCE_NUMBER, sequence_number) + != 0) { + return -1; + } + + return 0; +} + +/** + * Set the cluster key for an outgoing message field. + * @param msg the outgoing message. + * @param cluster_key the cluster key to set. + * @param field the field to set the cluster key to. + */ +static void +msg_cluster_key_field_set(msg* msg, as_cluster_key cluster_key, + as_clustering_msg_field field) +{ + // Set the cluster key. + if (msg_set_uint64(msg, field, cluster_key) != 0) { + CRASH("error setting cluster key on msg"); + } +} + +/** + * Set the cluster key for an outgoing message. + * @param msg the outgoing message. + * @param cluster_key the cluster key to set. + */ +static void +msg_cluster_key_set(msg* msg, as_cluster_key cluster_key) +{ + msg_cluster_key_field_set(msg, cluster_key, AS_CLUSTERING_MSG_CLUSTER_KEY); +} + +/** + * Read cluster key from a message field. + * @param msg the incoming message. + * @param cluster_key the output cluster key. + * @param field the field to set the cluster key to. + * @return 0 if the cluster key could be parsed -1 on failure. 
+ */ +static int +msg_cluster_key_field_get(msg* msg, as_cluster_key* cluster_key, + as_clustering_msg_field field) +{ + if (msg_get_uint64(msg, field, cluster_key) != 0) { + return -1; + } + + return 0; +} + +/** + * Read cluster key from the message. + * @param msg the incoming message. + * @param cluster_key the output cluster key. + * @return 0 if the cluster key could be parsed -1 on failure. + */ +static int +msg_cluster_key_get(msg* msg, as_cluster_key* cluster_key) +{ + return msg_cluster_key_field_get(msg, cluster_key, + AS_CLUSTERING_MSG_CLUSTER_KEY); +} + +/** + * Set the succession list for an outgoing message in a particular field. + * @param msg the outgoing message. + * @param succession_list the succession list to set. + * @param field the field to set for the succession list. + */ +static void +msg_succession_list_field_set(msg* msg, cf_vector* succession_list, + as_clustering_msg_field field) + +{ + int num_elements = cf_vector_size(succession_list); + size_t buffer_size = num_elements * sizeof(cf_node); + cf_node* succession_buffer = (cf_node*)BUFFER_ALLOC_OR_DIE(buffer_size); + + for (int i = 0; i < num_elements; i++) { + cf_vector_get(succession_list, i, &succession_buffer[i]); + } + + if (msg_set_buf(msg, field, (uint8_t*)succession_buffer, buffer_size, + MSG_SET_COPY) != 0) { + CRASH("error setting succession list on msg"); + } + + BUFFER_FREE(succession_buffer, buffer_size); +} + +/** + * Set the succession list for an outgoing message. + * @param msg the outgoing message. + * @param succession_list the succession list to set. + */ +static void +msg_succession_list_set(msg* msg, cf_vector* succession_list) +{ + int num_elements = cf_vector_size(succession_list); + if (num_elements <= 0) { + // Empty succession list being sent. Definitely wrong.Something is amiss + // let it through. The receiver will reject it anyways. 
+ WARNING("setting empty succession list"); + return; + } + + msg_succession_list_field_set(msg, succession_list, + AS_CLUSTERING_MSG_SUCCESSION_LIST); +} + +/** + * Read succession list from a message field. + * @param msg the incoming message. + * @param succession_list the output succession list. + * @param field the field to read from. + * @return 0 if the succession list could be parsed -1 on failure. + */ +static int +msg_succession_list_field_get(msg* msg, cf_vector* succession_list, + as_clustering_msg_field field) +{ + vector_clear(succession_list); + cf_node* succession_buffer; + size_t buffer_size; + if (msg_get_buf(msg, field, (uint8_t**)&succession_buffer, &buffer_size, + MSG_GET_DIRECT) != 0) { + // Empty succession list should not be allowed. + return -1; + } + + // Correct adjacency list length. + int num_elements = buffer_size / sizeof(cf_node); + + for (int i = 0; i < num_elements; i++) { + cf_vector_append(succession_list, &succession_buffer[i]); + } + + vector_sort_unique(succession_list, cf_node_compare_desc); + + return 0; +} + +/** + * Read succession list from the message. + * @param msg the incoming message. + * @param succession_list the output succession list. + * @return 0 if the succession list could be parsed -1 on failure. + */ +static int +msg_succession_list_get(msg* msg, cf_vector* succession_list) +{ + return msg_succession_list_field_get(msg, succession_list, + AS_CLUSTERING_MSG_SUCCESSION_LIST); +} + +/** + * Get the paxos proposal id for message event. + * @param event the message event. + * @param proposal_id the paxos proposal id. + * @return 0 if the type could be parsed -1 on failure. 
+ */ +static int +msg_event_proposal_id_get(as_clustering_internal_event* event, + as_paxos_proposal_id* proposal_id) +{ + if (msg_sequence_number_get(event->msg, &proposal_id->sequence_number) + != 0) { + return -1; + } + proposal_id->src_nodeid = event->msg_src_nodeid; + return 0; +} + +/** + * Get a network message object from the message pool with all common fields for + * clustering, like protocol identifier, and hlc timestamp filled in. + * @param type the type of the message. + */ +static msg* +msg_pool_get(as_clustering_msg_type type) +{ + msg* msg = as_fabric_msg_get(M_TYPE_CLUSTERING); + msg_src_fields_fill(msg); + msg_type_set(msg, type); + return msg; +} + +/** + * Return a message back to the message pool. + */ +static void +msg_pool_return(msg* msg) +{ + as_fabric_msg_put(msg); +} + +/** + * Determines if the received message is old to be ignored. + * + * This is detemined by comparing the message hlc timestamp and monotonic + * timestamps with the cluster formation hlc and monotonic times. + * + * @param cluster_modified_hlc_ts the hlc timestamp when for current cluster + * change happened. Sent to avoid locking in this function. + * @param cluster_modified_time the monotonic timestamp when for current + * cluster change happened. Sento to avoid locking in this function. + * @param msg_recv_ts the monotonic timestamp for plugin data receive. + * @param msg_hlc_ts the hlc timestamp for plugin data receive. + * @return true if plugin data is obsolete, false otherwise. + */ +bool +msg_is_obsolete(as_hlc_timestamp cluster_modified_hlc_ts, + cf_clock cluster_modified_time, cf_clock msg_recv_ts, + as_hlc_msg_timestamp* msg_hlc_ts) +{ + if (as_hlc_send_timestamp_order(cluster_modified_hlc_ts, msg_hlc_ts) + != AS_HLC_HAPPENS_BEFORE) { + // Cluster formation time after message send or the order is unknown, + // assume cluster formation is after message received. + // The caller should ignore this message. 
+ return true; + } + + // MSG should be atleast after cluster formation time + one hb interval to + // send out our cluster state + one network delay for our information to + // reach the remote node + one hb for the other node to send out the his + // updated state + + // one network delay for the updated state to reach us. + if (cluster_modified_time + 2 * as_hb_tx_interval_get() + + 2 * g_config.fabric_latency_max_ms > msg_recv_ts) { + return true; + } + + return false; +} + +/** + * Send a message to all input nodes. This is best effort some sends could fail. + * The message will be returned back to the pool. + * @param msg the message to send. + * @param nodes the nodes to send the message to. + * @return 0 on successfu queueing of message (does not imply guaranteed + * delivery), -1 if the message could not be queued. + */ +static int +msg_node_send(msg* msg, cf_node node) +{ + int rv = as_fabric_send(node, msg, AS_FABRIC_CHANNEL_CTRL); + if (rv) { + // Fabric did not clean up the message, return it back to the message + // pool. + msg_pool_return(msg); + } + return rv; +} + +/** + * Send a message to all input nodes. This is best effort some sends could fail. + * The message will be returned back to the pool. + * @param msg the message to send. + * @param nodes the nodes to send the message to. + * @return the number of nodes the message was sent to. Does not imply + * guaranteed receipt by these nodes however. + */ +static int +msg_nodes_send(msg* msg, cf_vector* nodes) +{ + int node_count = cf_vector_size(nodes); + int sent_count = 0; + + if (node_count <= 0) { + return sent_count; + } + + int alloc_size = node_count * sizeof(cf_node); + cf_node* send_list = (cf_node*)BUFFER_ALLOC_OR_DIE(alloc_size); + + vector_array_cpy(send_list, nodes, node_count); + + if (as_fabric_send_list(send_list, node_count, msg, AS_FABRIC_CHANNEL_CTRL) + != 0) { + // Fabric did not clean up the message, return it back to the message + // pool. 
+ msg_pool_return(msg); + } + + BUFFER_FREE(send_list, alloc_size); + return sent_count; +} + +/* + * ---------------------------------------------------------------------------- + * Paxos common + * ---------------------------------------------------------------------------- + */ + +/** + * Compare paxos proposal ids. Compares the sequence numbers, ties in sequence + * number are broken by nodeids. + * + * @param id1 the first identifier. + * @param id2 the second identifier. + * + * @return 0 if id1 equals id2, 1 if id1 > id2 and -1 if id1 < id2. + */ +static int +paxos_proposal_id_compare(as_paxos_proposal_id* id1, as_paxos_proposal_id* id2) +{ + if (id1->sequence_number != id2->sequence_number) { + return id1->sequence_number > id2->sequence_number ? 1 : -1; + } + + // Sequence numbers match, compare nodeids. + if (id1->src_nodeid != id2->src_nodeid) { + return id1->src_nodeid > id2->src_nodeid ? 1 : -1; + } + + // Node id and sequence numbers match. + return 0; +} + +/* + * ---------------------------------------------------------------------------- + * Paxos proposer + * ---------------------------------------------------------------------------- + */ + +/** + * Dump paxos proposer state to logs. + */ +static void +paxos_proposer_dump(bool verbose) +{ + CLUSTERING_LOCK(); + + // Output paxos proposer state. 
+ switch (g_proposer.state) { + case AS_PAXOS_PROPOSER_STATE_IDLE: + INFO("CL: paxos proposer: idle"); + break; + case AS_PAXOS_PROPOSER_STATE_PREPARE_SENT: + INFO("CL: paxos proposer: prepare sent"); + break; + case AS_PAXOS_PROPOSER_STATE_ACCEPT_SENT: + INFO("CL: paxos proposer: accept sent"); + break; + } + + if (verbose) { + if (g_proposer.state != AS_PAXOS_PROPOSER_STATE_IDLE) { + INFO("CL: paxos proposal start time: %"PRIu64" now: %"PRIu64, + g_proposer.paxos_round_start_time, cf_getms()); + INFO("CL: paxos proposed cluster key: %"PRIx64, + g_proposer.proposed_value.cluster_key); + INFO("CL: paxos proposed sequence: %"PRIu64, + g_proposer.sequence_number); + log_cf_node_vector("CL: paxos proposed succession:", + &g_proposer.proposed_value.succession_list, CF_INFO); + log_cf_node_vector("CL: paxos promises received:", + &g_proposer.promises_received, CF_INFO); + log_cf_node_vector("CL: paxos accepted received:", + &g_proposer.accepted_received, CF_INFO); + } + } + + CLUSTERING_UNLOCK(); +} + +/** + * Reset state on failure of a paxos round. + */ +static void +paxos_proposer_reset() +{ + CLUSTERING_LOCK(); + + // Flipping state to idle to indicate paxos round is over. + g_proposer.state = AS_PAXOS_PROPOSER_STATE_IDLE; + memset(&g_proposer.sequence_number, 0, sizeof(g_proposer.sequence_number)); + + g_proposer.proposed_value.cluster_key = 0; + vector_clear(&g_proposer.proposed_value.succession_list); + + vector_clear(&g_proposer.acceptors); + + DETAIL("paxos round over for proposal id %"PRIx64":%"PRIu64, + config_self_nodeid_get(), g_proposer.sequence_number); + + CLUSTERING_UNLOCK(); +} + +/** + * Invoked to fail an ongoing paxos proposal. + */ +static void +paxos_proposer_fail() +{ + // Cleanup state for the paxos round. 
+ paxos_proposer_reset(); + + as_clustering_internal_event paxos_fail_event; + memset(&paxos_fail_event, 0, sizeof(paxos_fail_event)); + paxos_fail_event.type = AS_CLUSTERING_INTERNAL_EVENT_PAXOS_PROPOSER_FAIL; + + internal_event_dispatch(&paxos_fail_event); +} + +/** + * Indicates if a paxos proposal from self node is active. + */ +static bool +paxos_proposer_proposal_is_active() +{ + CLUSTERING_LOCK(); + bool rv = g_proposer.state != AS_PAXOS_PROPOSER_STATE_IDLE; + CLUSTERING_UNLOCK(); + return rv; +} + +/** + * Send paxos prepare message current list of acceptor nodes. + */ +static void +paxos_proposer_prepare_send() +{ + msg* msg = msg_pool_get(AS_CLUSTERING_MSG_TYPE_PAXOS_PREPARE); + + CLUSTERING_LOCK(); + + // Set the sequence number + msg_sequence_number_set(msg, g_proposer.sequence_number); + + log_cf_node_vector("paxos prepare message sent to:", &g_proposer.acceptors, + CF_DEBUG); + + g_proposer.prepare_send_time = cf_getms(); + + cf_vector* acceptors = vector_stack_lockless_create(cf_node); + vector_copy(acceptors, &g_proposer.acceptors); + + CLUSTERING_UNLOCK(); + + // Sent the message to the acceptors. + msg_nodes_send(msg, acceptors); + cf_vector_destroy(acceptors); +} + +/** + * Send paxos accept message current list of acceptor nodes. + */ +static void +paxos_proposer_accept_send() +{ + msg* msg = msg_pool_get(AS_CLUSTERING_MSG_TYPE_PAXOS_ACCEPT); + + CLUSTERING_LOCK(); + + // Set the sequence number + msg_sequence_number_set(msg, g_proposer.sequence_number); + + // Skip send of the proposed value for accept, since we do not use it. Learn + // message is the only way a consensus value is sent out. + log_cf_node_vector("paxos accept message sent to:", &g_proposer.acceptors, + CF_DEBUG); + + g_proposer.accept_send_time = cf_getms(); + + cf_vector* acceptors = vector_stack_lockless_create(cf_node); + vector_copy(acceptors, &g_proposer.acceptors); + + CLUSTERING_UNLOCK(); + + // Sent the message to the acceptors. 
+ msg_nodes_send(msg, acceptors); + cf_vector_destroy(acceptors); +} + +/** + * Send paxos learn message current list of acceptor nodes. + */ +static void +paxos_proposer_learn_send() +{ + msg* msg = msg_pool_get(AS_CLUSTERING_MSG_TYPE_PAXOS_LEARN); + + CLUSTERING_LOCK(); + + // Set the sequence number + msg_sequence_number_set(msg, g_proposer.sequence_number); + + // Set the cluster key + msg_cluster_key_set(msg, g_proposer.proposed_value.cluster_key); + + // Set the succession list + msg_succession_list_set(msg, &g_proposer.proposed_value.succession_list); + + log_cf_node_vector("paxos learn message sent to:", &g_proposer.acceptors, + CF_DEBUG); + + g_proposer.learn_send_time = cf_getms(); + + cf_vector* acceptors = vector_stack_lockless_create(cf_node); + vector_copy(acceptors, &g_proposer.acceptors); + + CLUSTERING_UNLOCK(); + + // Sent the message to the acceptors. + msg_nodes_send(msg, acceptors); + cf_vector_destroy(acceptors); +} + +/** + * Handle an incoming paxos promise message. + */ +static void +paxos_proposer_promise_handle(as_clustering_internal_event* event) +{ + cf_node src_nodeid = event->msg_src_nodeid; + msg* msg = event->msg; + + DEBUG("received paxos promise from node %"PRIx64, src_nodeid); + + CLUSTERING_LOCK(); + if (g_proposer.state != AS_PAXOS_PROPOSER_STATE_PREPARE_SENT) { + // We are not in the prepare phase. Reject this message. + DEBUG("ignoring paxos promise from node %"PRIx64" - we are not in prepare phase", + src_nodeid); + goto Exit; + } + + if (vector_find(&g_proposer.acceptors, &src_nodeid) < 0) { + WARNING("ignoring paxos promise from node %"PRIx64" - it is not in acceptor list", + src_nodeid); + goto Exit; + } + + as_paxos_sequence_number sequence_number = 0; + if (msg_sequence_number_get(msg, &sequence_number) != 0) { + WARNING("ignoring paxos promise from node %"PRIx64" with invalid proposal id", + src_nodeid); + goto Exit; + } + + if (sequence_number != g_proposer.sequence_number) { + // Not a matching promise message. 
Ignore. + INFO("ignoring paxos promise from node %"PRIx64" because its proposal id %"PRIu64" does not match expected id %"PRIu64, + src_nodeid, sequence_number, + g_proposer.sequence_number); + goto Exit; + } + + cf_vector_append_unique(&g_proposer.promises_received, &src_nodeid); + + int promised_count = cf_vector_size(&g_proposer.promises_received); + int acceptor_count = cf_vector_size(&g_proposer.acceptors); + + // Use majority quorum to move on. + if (promised_count >= 1 + (acceptor_count / 2)) { + // We have quorum number of promises. go ahead to the accept phase. + g_proposer.state = AS_PAXOS_PROPOSER_STATE_ACCEPT_SENT; + paxos_proposer_accept_send(); + } + +Exit: + CLUSTERING_UNLOCK(); +} + +/** + * Handle an incoming paxos prepare nack message. + */ +static void +paxos_proposer_prepare_nack_handle(as_clustering_internal_event* event) +{ + cf_node src_nodeid = event->msg_src_nodeid; + msg* msg = event->msg; + + DEBUG("received paxos prepare nack from node %"PRIx64, src_nodeid); + + CLUSTERING_LOCK(); + if (g_proposer.state != AS_PAXOS_PROPOSER_STATE_PREPARE_SENT) { + // We are not in the prepare phase. Reject this message. + INFO("ignoring paxos prepare nack from node %"PRIx64" - we are not in prepare phase", + src_nodeid); + goto Exit; + } + + if (vector_find(&g_proposer.acceptors, &src_nodeid) < 0) { + WARNING("ignoring paxos prepare nack from node %"PRIx64" - it is not in acceptor list", + src_nodeid); + goto Exit; + } + + as_paxos_sequence_number sequence_number = 0; + if (msg_sequence_number_get(msg, &sequence_number) != 0) { + WARNING("ignoring paxos prepare nack from node %"PRIx64" with invalid proposal id", + src_nodeid); + goto Exit; + } + + if (sequence_number != g_proposer.sequence_number) { + // Not a matching prepare nack message. Ignore. 
+ INFO("ignoring paxos prepare nack from node %"PRIx64" because its proposal id %"PRIu64" does not match expected id %"PRIu64, + src_nodeid, sequence_number, + g_proposer.sequence_number); + goto Exit; + } + + INFO( + "aborting current paxos proposal because of a prepare nack from node %"PRIx64, + src_nodeid); + paxos_proposer_fail(); + +Exit: + CLUSTERING_UNLOCK(); +} + +/** + * Invoked when all acceptors have accepted the proposal. + */ +static void +paxos_proposer_success() +{ + CLUSTERING_LOCK(); + + // Set the proposer to back idle state. + g_proposer.state = AS_PAXOS_PROPOSER_STATE_IDLE; + + // Send out learn message and enable retransmits of learn message. + g_proposer.learn_retransmit_needed = true; + paxos_proposer_learn_send(); + + // Retain the sequence_number, cluster key and succession list for + // retransmits of the learn message. + as_clustering_internal_event paxos_success_event; + memset(&paxos_success_event, 0, sizeof(paxos_success_event)); + paxos_success_event.type = + AS_CLUSTERING_INTERNAL_EVENT_PAXOS_PROPOSER_SUCCESS; + + CLUSTERING_UNLOCK(); +} + +/** + * Handle an incoming paxos accepted message. + */ +static void +paxos_proposer_accepted_handle(as_clustering_internal_event* event) +{ + cf_node src_nodeid = event->msg_src_nodeid; + msg* msg = event->msg; + + DEBUG("received paxos accepted from node %"PRIx64, src_nodeid); + + CLUSTERING_LOCK(); + + // We also allow accepted messages in the idle state to deal with a loss of + // the learn message. + if (g_proposer.state != AS_PAXOS_PROPOSER_STATE_ACCEPT_SENT + && g_proposer.state != AS_PAXOS_PROPOSER_STATE_IDLE) { + // We are not in the accept phase. Reject this message. + DEBUG("ignoring paxos accepted from node %"PRIx64" - we are not in accept phase. 
Actual phase %d", + src_nodeid, g_proposer.state); + goto Exit; + } + + if (vector_find(&g_proposer.acceptors, &src_nodeid) < 0) { + WARNING("ignoring paxos accepted from node %"PRIx64" - it is not in acceptor list", + src_nodeid); + goto Exit; + } + + as_paxos_sequence_number sequence_number = 0; + if (msg_sequence_number_get(msg, &sequence_number) != 0) { + WARNING("ignoring paxos accepted from node %"PRIx64" with invalid proposal id", + src_nodeid); + goto Exit; + } + + if (sequence_number != g_proposer.sequence_number) { + // Not a matching accepted message. Ignore. + INFO("ignoring paxos accepted from node %"PRIx64" because its proposal id %"PRIu64" does not match expected id %"PRIu64, + src_nodeid, sequence_number, + g_proposer.sequence_number); + goto Exit; + } + + cf_vector_append_unique(&g_proposer.accepted_received, &src_nodeid); + + int accepted_count = cf_vector_size(&g_proposer.accepted_received); + int acceptor_count = cf_vector_size(&g_proposer.acceptors); + + // Use a simple quorum, all acceptors should accept for success. + if (accepted_count == acceptor_count) { + // This is the point after which the succession list will not change for + // this paxos round. Ensure that we meet the minimum cluster size + // criterion. + int cluster_size = cf_vector_size( + &g_proposer.proposed_value.succession_list); + if (cluster_size < g_config.clustering_config.cluster_size_min) { + WARNING( + "failing paxos round - the remaining number of nodes %d is less than minimum cluster size %d", + cluster_size, g_config.clustering_config.cluster_size_min); + // Fail paxos. + paxos_proposer_fail(); + goto Exit; + } + + // We have quorum number of accepted nodes. The proposal succeeded. + paxos_proposer_success(); + } + +Exit: + CLUSTERING_UNLOCK(); +} + +/** + * Handle an incoming paxos accept nack message. 
+ */ +static void +paxos_proposer_accept_nack_handle(as_clustering_internal_event* event) +{ + cf_node src_nodeid = event->msg_src_nodeid; + msg* msg = event->msg; + + DEBUG("received paxos accept nack from node %"PRIx64, src_nodeid); + + CLUSTERING_LOCK(); + if (g_proposer.state != AS_PAXOS_PROPOSER_STATE_ACCEPT_SENT) { + // We are not in the accept phase. Reject this message. + INFO("ignoring paxos accept nack from node %"PRIx64" - we are not in accept phase", + src_nodeid); + goto Exit; + } + + if (vector_find(&g_proposer.acceptors, &src_nodeid) < 0) { + WARNING("ignoring paxos accept nack from node %"PRIx64" - it is not in acceptor list", + src_nodeid); + goto Exit; + } + + as_paxos_sequence_number sequence_number = 0; + if (msg_sequence_number_get(msg, &sequence_number) != 0) { + WARNING("ignoring paxos accept nack from node %"PRIx64" with invalid proposal id", + src_nodeid); + goto Exit; + } + + if (sequence_number != g_proposer.sequence_number) { + // Not a matching accept nack message. Ignore. + INFO("ignoring paxos accept nack from node %"PRIx64"because its proposal id %"PRIu64" does not match expected id %"PRIu64, + src_nodeid, sequence_number, + g_proposer.sequence_number); + goto Exit; + } + + INFO( + "aborting current paxos proposal because of an accept nack from node %"PRIx64, + src_nodeid); + paxos_proposer_fail(); + +Exit: + CLUSTERING_UNLOCK(); +} + +/** + * Handle an incoming message. 
+ */ +static void +paxos_proposer_msg_event_handle(as_clustering_internal_event* msg_event) +{ + switch (msg_event->msg_type) { + case AS_CLUSTERING_MSG_TYPE_PAXOS_PROMISE: + paxos_proposer_promise_handle(msg_event); + break; + case AS_CLUSTERING_MSG_TYPE_PAXOS_PREPARE_NACK: + paxos_proposer_prepare_nack_handle(msg_event); + break; + case AS_CLUSTERING_MSG_TYPE_PAXOS_ACCEPTED: + paxos_proposer_accepted_handle(msg_event); + break; + case AS_CLUSTERING_MSG_TYPE_PAXOS_ACCEPT_NACK: + paxos_proposer_accept_nack_handle(msg_event); + break; + default: // Other message types are not of interest. + break; + } +} + +/** + * Handle heartbeat event. + */ +static void +paxos_proposer_hb_event_handle(as_clustering_internal_event* hb_event) +{ + if (!paxos_proposer_proposal_is_active()) { + return; + } + + CLUSTERING_LOCK(); + for (int i = 0; i < hb_event->hb_n_events; i++) { + if (hb_event->hb_events[i].evt == AS_HB_NODE_DEPART) { + cf_node departed_node = hb_event->hb_events[i].nodeid; + if (vector_find(&g_proposer.acceptors, &departed_node)) { + // One of the acceptors has departed. Abort the paxos proposal. + INFO("paxos acceptor %"PRIx64" departed - aborting current paxos proposal", departed_node); + paxos_proposer_fail(); + break; + } + } + } + CLUSTERING_UNLOCK(); +} + +/** + * Check and retransmit prepare message if paxos promise messages have not yet + * being received. + */ +static void +paxos_proposer_prepare_check_retransmit() +{ + CLUSTERING_LOCK(); + cf_clock now = cf_getms(); + if (g_proposer.state == AS_PAXOS_PROPOSER_STATE_PREPARE_SENT + && g_proposer.prepare_send_time + paxos_msg_timeout() < now) { + paxos_proposer_prepare_send(); + } + CLUSTERING_UNLOCK(); +} + +/** + * Check and retransmit accept message if paxos accepted has yet being received. 
+ */ +static void +paxos_proposer_accept_check_retransmit() +{ + CLUSTERING_LOCK(); + cf_clock now = cf_getms(); + if (g_proposer.state == AS_PAXOS_PROPOSER_STATE_ACCEPT_SENT + && g_proposer.accept_send_time + paxos_msg_timeout() < now) { + paxos_proposer_accept_send(); + } + CLUSTERING_UNLOCK(); +} + +/** + * Check and retransmit learn message if all acceptors have not applied the + * current cluster change. + */ +static void +paxos_proposer_learn_check_retransmit() +{ + CLUSTERING_LOCK(); + cf_clock now = cf_getms(); + bool learn_timedout = g_proposer.learn_retransmit_needed + && (g_proposer.state == AS_PAXOS_PROPOSER_STATE_IDLE) + && (g_proposer.proposed_value.cluster_key != 0) + && (g_proposer.learn_send_time + paxos_msg_timeout() < now); + + if (learn_timedout) { + // If the register is not synced, most likely the learn message did not + // make it through, retransmit the learn message to move the paxos + // acceptor forward and start register sync. + INFO("retransmitting paxos learn message"); + paxos_proposer_learn_send(); + } + CLUSTERING_UNLOCK(); +} + +/** + * Handle a timer event and retransmit messages if required. + */ +static void +paxos_proposer_timer_event_handle() +{ + CLUSTERING_LOCK(); + switch (g_proposer.state) { + case AS_PAXOS_PROPOSER_STATE_IDLE: + paxos_proposer_learn_check_retransmit(); + break; + case AS_PAXOS_PROPOSER_STATE_PREPARE_SENT: + paxos_proposer_prepare_check_retransmit(); + break; + case AS_PAXOS_PROPOSER_STATE_ACCEPT_SENT: + paxos_proposer_accept_check_retransmit(); + break; + } + CLUSTERING_UNLOCK(); +} + +/** + * Handle register getting synched. + */ +static void +paxos_proposer_register_synched() +{ + CLUSTERING_LOCK(); + // Register synched we no longer need learn messages to be retransmitted. + g_proposer.learn_retransmit_needed = false; + CLUSTERING_UNLOCK(); +} + +/** + * Initialize paxos proposer state. 
+ */ +static void +paxos_proposer_init() +{ + CLUSTERING_LOCK(); + // Memset to zero which ensures that all proposer state variables have zero + // which is the correct initial value for elements other that contained + // vectors and status. + memset(&g_proposer, 0, sizeof(g_proposer)); + + // Initialize the proposer state. + // No paxos round running, so the state has to be idle. + g_proposer.state = AS_PAXOS_PROPOSER_STATE_IDLE; + + // Set the current acceptor list to be empty. + vector_lockless_init(&g_proposer.acceptors, cf_node); + + // Set the current promises received node list to empty. + vector_lockless_init(&g_proposer.promises_received, cf_node); + + // Set the current accepted received node list to empty. + vector_lockless_init(&g_proposer.accepted_received, cf_node); + + // Initialize the proposed value. + vector_lockless_init(&g_proposer.proposed_value.succession_list, cf_node); + g_proposer.proposed_value.cluster_key = 0; + + CLUSTERING_UNLOCK(); +} + +/** + * Log paxos results. + */ +static void +paxos_result_log(as_paxos_start_result result, cf_vector* new_succession_list) +{ + CLUSTERING_LOCK(); + switch (result) { + case AS_PAXOS_RESULT_STARTED: { + // Running check required because paxos round finished for single node + // cluster by this time. + if (paxos_proposer_proposal_is_active()) { + INFO("paxos round started - cluster key: %"PRIx64, + g_proposer.proposed_value.cluster_key); + log_cf_node_vector("paxos round started - succession list:", + &g_proposer.proposed_value.succession_list, CF_INFO); + } + break; + } + + case AS_PAXOS_RESULT_CLUSTER_TOO_SMALL: { + WARNING( + "paxos round aborted - new cluster size %d less than min cluster size %d", + cf_vector_size(new_succession_list), + g_config.clustering_config.cluster_size_min); + break; + } + + case AS_PAXOS_RESULT_ROUND_RUNNING: { + // Should never happen in practice. Let the old round finish or timeout. 
+ WARNING( + "older paxos round still running - should have finished by now"); + } + } + + CLUSTERING_UNLOCK(); +} + +/** + * Start a new paxos round. + * + * @param new_succession_list the new succession list. + * @param acceptor_list the list of nodes to use for paxos acceptors. + * @param current_cluster_key the current cluster key + * @param current_succession_list the current succession list, can be null if + * this node is an orphan. + */ +static as_paxos_start_result +paxos_proposer_proposal_start(cf_vector* new_succession_list, + cf_vector* acceptor_list) +{ + if (cf_vector_size(new_succession_list) + < g_config.clustering_config.cluster_size_min) { + // Fail paxos. + return AS_PAXOS_RESULT_CLUSTER_TOO_SMALL; + } + + CLUSTERING_LOCK(); + + as_paxos_start_result result; + if (paxos_proposer_proposal_is_active()) { + result = AS_PAXOS_RESULT_ROUND_RUNNING; + goto Exit; + } + + // Update state to prepare. + g_proposer.state = AS_PAXOS_PROPOSER_STATE_PREPARE_SENT; + + g_proposer.sequence_number = as_hlc_timestamp_now(); + + g_proposer.paxos_round_start_time = cf_getms(); + + // Populate the proposed value struct with new succession list and a new + // cluster key. + vector_clear(&g_proposer.proposed_value.succession_list); + vector_copy(&g_proposer.proposed_value.succession_list, + new_succession_list); + g_proposer.proposed_value.cluster_key = clustering_cluster_key_generate( + g_register.cluster_key); + + // Remember the acceptors for this paxos round. + vector_clear(&g_proposer.acceptors); + vector_copy(&g_proposer.acceptors, acceptor_list); + + // Clear the promise received and accepted received vectors for this new + // round. + vector_clear(&g_proposer.promises_received); + vector_clear(&g_proposer.accepted_received); + + paxos_proposer_prepare_send(); + + result = AS_PAXOS_RESULT_STARTED; + +Exit: + CLUSTERING_UNLOCK(); + + return result; +} + +/** + * Paxos proposer monitor to detect and cleanup long running and most likely + * failed paxos rounds. 
+ */ +static void +paxos_proposer_monitor() +{ + CLUSTERING_LOCK(); + if (paxos_proposer_proposal_is_active()) { + if (g_proposer.paxos_round_start_time + paxos_proposal_timeout() + <= cf_getms()) { + // Paxos round is running and has timed out. + // Consider paxos round failed. + INFO("paxos round timed out for proposal id %"PRIx64":%"PRIu64, + config_self_nodeid_get(), + g_proposer.sequence_number); + paxos_proposer_fail(); + } + } + CLUSTERING_UNLOCK(); +} + +/* + * ---------------------------------------------------------------------------- + * Paxos acceptor + * ---------------------------------------------------------------------------- + */ + +/** + * Dump paxos acceptor state to logs. + */ +static void +paxos_acceptor_dump(bool verbose) +{ + CLUSTERING_LOCK(); + + // Output paxos acceptor state. + switch (g_acceptor.state) { + case AS_PAXOS_ACCEPTOR_STATE_IDLE: + INFO("CL: paxos acceptor: idle"); + break; + case AS_PAXOS_ACCEPTOR_STATE_PROMISED: + INFO("CL: paxos acceptor: promised"); + break; + case AS_PAXOS_ACCEPTOR_STATE_ACCEPTED: + INFO("CL: paxos acceptor: accepted"); + break; + } + + if (verbose) { + if (g_acceptor.state != AS_PAXOS_ACCEPTOR_STATE_IDLE) { + INFO("CL: paxos acceptor start time: %"PRIu64" now: %"PRIu64, + g_acceptor.acceptor_round_start, cf_getms()); + INFO("CL: paxos acceptor proposal id: (%"PRIx64":%"PRIu64")", + g_acceptor.last_proposal_received_id.src_nodeid, + g_acceptor.last_proposal_received_id.sequence_number); + INFO("CL: paxos acceptor promised time: %"PRIu64" now: %"PRIu64, + g_acceptor.promise_send_time, cf_getms()); + INFO("CL: paxos acceptor accepted time: %"PRIu64" now: %"PRIu64, + g_acceptor.accepted_send_time, cf_getms()); + } + } + + CLUSTERING_UNLOCK(); +} + +/** + * Reset the acceptor for the next round. 
+ */ +static void +paxos_acceptor_reset() +{ + CLUSTERING_LOCK(); + g_acceptor.state = AS_PAXOS_ACCEPTOR_STATE_IDLE; + g_acceptor.acceptor_round_start = 0; + g_acceptor.promise_send_time = 0; + g_acceptor.accepted_send_time = 0; + CLUSTERING_UNLOCK(); +} + +/** + * Invoked to fail an ongoing paxos proposal. + */ +static void +paxos_acceptor_fail() +{ + // Cleanup state for the paxos round. + paxos_acceptor_reset(); + + as_clustering_internal_event paxos_fail_event; + memset(&paxos_fail_event, 0, sizeof(paxos_fail_event)); + paxos_fail_event.type = AS_CLUSTERING_INTERNAL_EVENT_PAXOS_ACCEPTOR_FAIL; + + internal_event_dispatch(&paxos_fail_event); +} + +/** + * Invoked on success of an ongoing paxos proposal. + */ +static void +paxos_acceptor_success(as_cluster_key cluster_key, cf_vector* succession_list, + as_paxos_sequence_number sequence_number) +{ + // Cleanup state for the paxos round. + paxos_acceptor_reset(); + + as_clustering_internal_event paxos_success_event; + memset(&paxos_success_event, 0, sizeof(paxos_success_event)); + paxos_success_event.type = + AS_CLUSTERING_INTERNAL_EVENT_PAXOS_ACCEPTOR_SUCCESS; + paxos_success_event.new_succession_list = succession_list; + paxos_success_event.new_cluster_key = cluster_key; + paxos_success_event.new_sequence_number = sequence_number; + + internal_event_dispatch(&paxos_success_event); +} + +/** + * Send paxos promise message to the proposer node. + * @param dest the destination node. + * @param sequence_number the sequence number from the incoming message. 
+ */ +static void +paxos_acceptor_promise_send(cf_node dest, + as_paxos_sequence_number sequence_number) +{ + msg* msg = msg_pool_get(AS_CLUSTERING_MSG_TYPE_PAXOS_PROMISE); + + msg_sequence_number_set(msg, sequence_number); + + DEBUG("paxos promise message sent to node %"PRIx64" with proposal id (%"PRIx64":%"PRIu64")", dest, dest, sequence_number); + + CLUSTERING_LOCK(); + g_acceptor.promise_send_time = cf_getms(); + CLUSTERING_UNLOCK(); + + // Send the message to the proposer. + msg_node_send(msg, dest); +} + +/** + * Send paxos prepare nack message to the proposer. + * @param dest the destination node. + * @param sequence_number the sequence number from the incoming message. + */ +static void +paxos_acceptor_prepare_nack_send(cf_node dest, + as_paxos_sequence_number sequence_number) +{ + msg* msg = msg_pool_get(AS_CLUSTERING_MSG_TYPE_PAXOS_PREPARE_NACK); + + msg_sequence_number_set(msg, sequence_number); + + DEBUG("paxos prepare nack message sent to node %"PRIx64" with proposal id (%"PRIx64":%"PRIu64")", dest, dest, sequence_number); + + // Send the message to the proposer. + msg_node_send(msg, dest); +} + +/** + * Send paxos accepted message to the proposer node. + * @param dest the destination node. + * @param sequence_number the sequence number from the incoming message. + */ +static void +paxos_acceptor_accepted_send(cf_node dest, + as_paxos_sequence_number sequence_number) +{ + msg* msg = msg_pool_get(AS_CLUSTERING_MSG_TYPE_PAXOS_ACCEPTED); + + msg_sequence_number_set(msg, sequence_number); + + DEBUG("paxos accepted message sent to node %"PRIx64" with proposal id (%"PRIx64":%"PRIu64")", dest, dest, sequence_number); + + CLUSTERING_LOCK(); + g_acceptor.accepted_send_time = cf_getms(); + CLUSTERING_UNLOCK(); + + // Send the message to the proposer. + msg_node_send(msg, dest); +} + +/** + * Send paxos accept nack message to the proposer. + * @param dest the destination node. + * @param sequence_number the sequence number from the incoming message. 
+ */ +static void +paxos_acceptor_accept_nack_send(cf_node dest, + as_paxos_sequence_number sequence_number) +{ + msg* msg = msg_pool_get(AS_CLUSTERING_MSG_TYPE_PAXOS_ACCEPT_NACK); + + msg_sequence_number_set(msg, sequence_number); + + DEBUG("paxos accept nack message sent to node %"PRIx64" with proposal id (%"PRIx64":%"PRIu64")", dest, dest, sequence_number); + + // Send the message to the proposer. + msg_node_send(msg, dest); +} + +/** + * Check if the incoming prepare can be promised. + */ +static bool +paxos_acceptor_prepare_can_promise(cf_node src_nodeid, + as_paxos_proposal_id* proposal_id) +{ + if (!clustering_can_accept_as_proposer(src_nodeid)) { + INFO("ignoring paxos prepare from node %"PRIx64" because it cannot be a principal", + src_nodeid); + return false; + } + + bool can_promise = false; + CLUSTERING_LOCK(); + int comparison = paxos_proposal_id_compare(proposal_id, + &g_acceptor.last_proposal_received_id); + + switch (g_acceptor.state) { + case AS_PAXOS_ACCEPTOR_STATE_IDLE: + case AS_PAXOS_ACCEPTOR_STATE_ACCEPTED: { + // Allow only higher valued proposal to prevent replays and also to + // ensure convergence in the face of competing proposals. + can_promise = comparison > 0; + } + break; + case AS_PAXOS_ACCEPTOR_STATE_PROMISED: { + // We allow for replays of the prepare message as well so that the + // proposer can receive a promise for this node's lost promise message. + can_promise = comparison >= 0; + } + break; + } + + CLUSTERING_UNLOCK(); + + return can_promise; +} + +/** + * Handle an incoming paxos prepare message. 
+ */ +static void +paxos_acceptor_prepare_handle(as_clustering_internal_event* event) +{ + cf_node src_nodeid = event->msg_src_nodeid; + DEBUG("received paxos prepare from node %"PRIx64, src_nodeid); + + as_paxos_proposal_id proposal_id = { 0 }; + if (msg_event_proposal_id_get(event, &proposal_id) != 0) { + INFO("ignoring paxos prepare from node %"PRIx64" with invalid proposal id", + src_nodeid); + return; + } + + if (!paxos_acceptor_prepare_can_promise(src_nodeid, &proposal_id)) { + INFO("ignoring paxos prepare from node %"PRIx64" with obsolete proposal id (%"PRIx64":%"PRIu64")", proposal_id.src_nodeid, proposal_id.src_nodeid, proposal_id.sequence_number); + paxos_acceptor_prepare_nack_send(src_nodeid, + proposal_id.sequence_number); + return; + } + + CLUSTERING_LOCK(); + + bool is_new_proposal = paxos_proposal_id_compare(&proposal_id, + &g_acceptor.last_proposal_received_id) != 0; + + if (is_new_proposal) { + // Remember this to be the last proposal id we received. + memcpy(&g_acceptor.last_proposal_received_id, &proposal_id, + sizeof(proposal_id)); + + // Update the round start time. + g_acceptor.acceptor_round_start = cf_getms(); + + // Switch to promised state. + g_acceptor.state = AS_PAXOS_ACCEPTOR_STATE_PROMISED; + } + else { + // This is a retransmit or delayed message in which case we do not + // update the state. + // If we have already accepted this proposal, we would want to remain in + // accepted state. + } + + // The proposal is promised. Send back a paxos promise. + paxos_acceptor_promise_send(src_nodeid, proposal_id.sequence_number); + + CLUSTERING_UNLOCK(); +} + +/** + * Check if the incoming accept can be accepted. 
+ */ +static bool +paxos_acceptor_accept_can_accept(cf_node src_nodeid, + as_paxos_proposal_id* proposal_id) +{ + if (!clustering_can_accept_as_proposer(src_nodeid)) { + INFO("ignoring paxos accept from node %"PRIx64" because it cannot be a principal", + src_nodeid); + return false; + } + + bool can_accept = false; + CLUSTERING_LOCK(); + int comparison = paxos_proposal_id_compare(proposal_id, + &g_acceptor.last_proposal_received_id); + + switch (g_acceptor.state) { + case AS_PAXOS_ACCEPTOR_STATE_IDLE: + case AS_PAXOS_ACCEPTOR_STATE_PROMISED: + case AS_PAXOS_ACCEPTOR_STATE_ACCEPTED: { + // We allow for replays of the accept message as well, so that the + // proposer can receive an accepted for this node's lost accepted + // message. + can_accept = comparison >= 0; + } + break; + } + + CLUSTERING_UNLOCK(); + + return can_accept; +} + +/** + * Handle an incoming paxos accept message. + */ +static void +paxos_acceptor_accept_handle(as_clustering_internal_event* event) +{ + cf_node src_nodeid = event->msg_src_nodeid; + + DEBUG("received paxos accept from node %"PRIx64, src_nodeid); + + // Its ok to proceed even is paxos is running, because this could be a + // competing proposal and the winner will be decided by paxos sequence + // number. 
+ as_paxos_proposal_id proposal_id = { 0 }; + if (msg_event_proposal_id_get(event, &proposal_id) != 0) { + INFO("ignoring paxos accept from node %"PRIx64" with invalid proposal id", + src_nodeid); + return; + } + + if (!paxos_acceptor_accept_can_accept(src_nodeid, &proposal_id)) { + INFO("ignoring paxos accept from node %"PRIx64" with obsolete proposal id (%"PRIx64":%"PRIu64")", proposal_id.src_nodeid, proposal_id.src_nodeid, proposal_id.sequence_number); + paxos_acceptor_accept_nack_send(src_nodeid, + proposal_id.sequence_number); + return; + } + + CLUSTERING_LOCK(); + + bool is_new_proposal = paxos_proposal_id_compare(&proposal_id, + &g_acceptor.last_proposal_received_id) != 0; + + if (is_new_proposal) { + // This node has missed the prepare message, but received the accept + // message. This is alright. + + // Remember this to be the last proposal id we received. + memcpy(&g_acceptor.last_proposal_received_id, &proposal_id, + sizeof(proposal_id)); + + // Mark this as the start of the acceptor paxos round. + g_acceptor.acceptor_round_start = cf_getms(); + } + + g_acceptor.state = AS_PAXOS_ACCEPTOR_STATE_ACCEPTED; + // The proposal is accepted. Send back a paxos accept. + paxos_acceptor_accepted_send(src_nodeid, proposal_id.sequence_number); + + CLUSTERING_UNLOCK(); +} + +/** + * Handle an incoming paxos learn message. + */ +static void +paxos_acceptor_learn_handle(as_clustering_internal_event* event) +{ + cf_node src_nodeid = event->msg_src_nodeid; + msg* msg = event->msg; + + DEBUG("received paxos learn from node %"PRIx64, src_nodeid); + + if (!clustering_can_accept_as_proposer(src_nodeid)) { + INFO("ignoring learn message from a non-principal node %"PRIx64" because we are already in a cluster", + src_nodeid); + return; + } + + // Its ok to proceed even if paxos is running, because this could be a + // competing proposal and the winner was decided by paxos sequence number. 
+ as_paxos_proposal_id proposal_id = { 0 }; + if (msg_event_proposal_id_get(event, &proposal_id) != 0) { + INFO("ignoring paxos learn from node %"PRIx64"with invalid proposal id", + src_nodeid); + return; + } + + CLUSTERING_LOCK(); + + if (g_acceptor.state != AS_PAXOS_ACCEPTOR_STATE_ACCEPTED) { + INFO( + "ignoring paxos learn from node %"PRIx64" - proposal id (%"PRIx64":%"PRIu64") we are already in a cluster", + src_nodeid, proposal_id.src_nodeid, + proposal_id.sequence_number); + goto Exit; + } + + if (paxos_proposal_id_compare(&proposal_id, + &g_acceptor.last_proposal_received_id) != 0) { + // We have not promised nor accepted this proposal, + // ignore the learn message. + INFO( + "ignoring paxos learn from node %"PRIx64" - proposal id (%"PRIx64":%"PRIu64") mismatches current proposal id (%"PRIx64":%"PRIu64")", + src_nodeid, proposal_id.src_nodeid, + proposal_id.sequence_number, + g_acceptor.last_proposal_received_id.src_nodeid, + g_acceptor.last_proposal_received_id.sequence_number); + goto Exit; + } + + as_cluster_key new_cluster_key = 0; + cf_vector* new_succession_list = vector_stack_lockless_create(cf_node); + + if (msg_cluster_key_get(msg, &new_cluster_key) != 0) { + INFO("ignoring paxos learn from node %"PRIx64" without cluster key", + src_nodeid); + goto Exit_destory_succession; + } + + if (msg_succession_list_get(msg, new_succession_list) != 0) { + INFO("ignoring paxos learn from node %"PRIx64" without succession list", + src_nodeid); + goto Exit_destory_succession; + } + + if (new_cluster_key == g_register.cluster_key) { + if (!vector_equals(new_succession_list, &g_register.succession_list)) { + // We have the same cluster key repeated for a new round. Should + // never happen. 
+ CRASH("duplicate cluster key %"PRIx64" generated for different paxos rounds - disastrous", new_cluster_key); + } + + INFO("ignoring duplicate paxos learn from node %"PRIx64, src_nodeid); + goto Exit_destory_succession; + } + + // Paxos round converged, apply the new cluster configuration. + paxos_acceptor_success(new_cluster_key, new_succession_list, + proposal_id.sequence_number); + +Exit_destory_succession: + cf_vector_destroy(new_succession_list); + +Exit: + CLUSTERING_UNLOCK(); +} + +/** + * Handle an incoming message. + */ +static void +paxos_acceptor_msg_event_handle(as_clustering_internal_event *msg_event) +{ + switch (msg_event->msg_type) { + case AS_CLUSTERING_MSG_TYPE_PAXOS_PREPARE: + paxos_acceptor_prepare_handle(msg_event); + break; + case AS_CLUSTERING_MSG_TYPE_PAXOS_ACCEPT: + paxos_acceptor_accept_handle(msg_event); + break; + case AS_CLUSTERING_MSG_TYPE_PAXOS_LEARN: + paxos_acceptor_learn_handle(msg_event); + break; + default: // Other message types are not of interest. + break; + } +} + +/** + * Check and retransmit promise message if paxos proposer has not moved ahead + * and send back an accept message. + */ +static void +paxos_acceptor_promise_check_retransmit() +{ + CLUSTERING_LOCK(); + cf_clock now = cf_getms(); + if (g_acceptor.state == AS_PAXOS_ACCEPTOR_STATE_PROMISED + && g_acceptor.promise_send_time + paxos_msg_timeout() < now) { + paxos_acceptor_promise_send( + g_acceptor.last_proposal_received_id.src_nodeid, + g_acceptor.last_proposal_received_id.sequence_number); + } + CLUSTERING_UNLOCK(); +} + +/** + * Check and retransmit accepted message if paxos proposer has not send back a + * learn message. 
+ */ +static void +paxos_acceptor_accepted_check_retransmit() +{ + CLUSTERING_LOCK(); + cf_clock now = cf_getms(); + if (g_acceptor.state == AS_PAXOS_ACCEPTOR_STATE_ACCEPTED + && g_acceptor.accepted_send_time + paxos_msg_timeout() < now) { + paxos_acceptor_accepted_send( + g_acceptor.last_proposal_received_id.src_nodeid, + g_acceptor.last_proposal_received_id.sequence_number); + } + CLUSTERING_UNLOCK(); +} + +/** + * Handle a timer event and retransmit messages if required. + */ +static void +paxos_acceptor_timer_event_handle() +{ + CLUSTERING_LOCK(); + switch (g_acceptor.state) { + case AS_PAXOS_ACCEPTOR_STATE_IDLE: { + // No retransmitts required. + break; + } + case AS_PAXOS_ACCEPTOR_STATE_PROMISED: + paxos_acceptor_promise_check_retransmit(); + break; + case AS_PAXOS_ACCEPTOR_STATE_ACCEPTED: + paxos_acceptor_accepted_check_retransmit(); + break; + } + + CLUSTERING_UNLOCK(); +} + +/** + * Initialize paxos acceptor state. + */ +static void +paxos_acceptor_init() +{ + CLUSTERING_LOCK(); + // Memset to zero which ensures that all acceptor state variables have zero + // which is the correct initial value for elements other that contained + // vectors and status. + memset(&g_acceptor, 0, sizeof(g_acceptor)); + g_acceptor.state = AS_PAXOS_ACCEPTOR_STATE_IDLE; + CLUSTERING_UNLOCK(); +} + +/** + * Paxos acceptor monitor to detect and cleanup long running and most likely + * failed paxos rounds. + */ +static void +paxos_acceptor_monitor() +{ + CLUSTERING_LOCK(); + if (g_acceptor.state != AS_PAXOS_ACCEPTOR_STATE_IDLE + && g_acceptor.acceptor_round_start + paxos_proposal_timeout() + <= cf_getms()) { + // Paxos round is running and has timed out. + // Consider paxos round failed. 
+ INFO("paxos round timed out for proposal id %"PRIx64":%"PRIu64, + config_self_nodeid_get(), + g_proposer.sequence_number); + paxos_acceptor_fail(); + } + CLUSTERING_UNLOCK(); +} + +/* + * ---------------------------------------------------------------------------- + * Paxos lifecycle and common event handling + * ---------------------------------------------------------------------------- + */ + +/** + * Paxos monitor to detect and cleanup long running and most likely failed paxos + * rounds. + */ +static void +paxos_monitor() +{ + paxos_proposer_monitor(); + paxos_acceptor_monitor(); +} + +/** + * Handle an incoming timer event. + */ +static void +paxos_timer_event_handle() +{ + // Acceptor retransmits handled here. + paxos_acceptor_timer_event_handle(); + + // Proposer retransmits handled here. + paxos_proposer_timer_event_handle(); + + // Invoke Paxos monitor to timeout long running paxos rounds. + paxos_monitor(); +} + +/** + * Handle incoming messages. + */ +static void +paxos_msg_event_handle(as_clustering_internal_event* msg_event) +{ + paxos_acceptor_msg_event_handle(msg_event); + paxos_proposer_msg_event_handle(msg_event); +} + +/** + * Handle heartbeat event. + */ +static void +paxos_hb_event_handle(as_clustering_internal_event* hb_event) +{ + paxos_proposer_hb_event_handle(hb_event); +} + +/** + * Dispatch clustering events. + */ +static void +paxos_event_dispatch(as_clustering_internal_event* event) +{ + switch (event->type) { + case AS_CLUSTERING_INTERNAL_EVENT_TIMER: + paxos_timer_event_handle(); + break; + case AS_CLUSTERING_INTERNAL_EVENT_MSG: + paxos_msg_event_handle(event); + break; + case AS_CLUSTERING_INTERNAL_EVENT_HB: + paxos_hb_event_handle(event); + break; + case AS_CLUSTERING_INTERNAL_EVENT_REGISTER_CLUSTER_SYNCED: + paxos_proposer_register_synched(); + default: // Not of interest for paxos. + break; + } +} + +/** + * Initialize paxos proposer and acceptor data structures. 
+ */ +static void +paxos_init() +{ + paxos_proposer_init(); + paxos_acceptor_init(); +} + +/* + * ---------------------------------------------------------------------------- + * Clustering external event publisher + * ---------------------------------------------------------------------------- + */ + +/** + * * Check if event publisher is running. + */ +static bool +external_event_publisher_is_running() +{ + CLUSTERING_EVENT_PUBLISHER_LOCK(); + bool running = g_external_event_publisher.sys_state + == AS_CLUSTERING_SYS_STATE_RUNNING; + CLUSTERING_EVENT_PUBLISHER_UNLOCK(); + return running; +} + +/** + * Initialize the event publisher. + */ +static void +external_event_publisher_init() +{ + CLUSTERING_EVENT_PUBLISHER_LOCK(); + memset(&g_external_event_publisher, 0, sizeof(g_external_event_publisher)); + vector_lockless_init(&g_external_event_publisher.published_succession_list, + cf_node); + + pthread_mutex_init(&g_external_event_publisher.is_pending_mutex, NULL); + pthread_cond_init(&g_external_event_publisher.is_pending, NULL); + CLUSTERING_EVENT_PUBLISHER_UNLOCK(); +} + +/** + * Wakeup the publisher thread. + */ +static void +external_event_publisher_thr_wakeup() +{ + pthread_mutex_lock(&g_external_event_publisher.is_pending_mutex); + pthread_cond_signal(&g_external_event_publisher.is_pending); + pthread_mutex_unlock(&g_external_event_publisher.is_pending_mutex); +} + +/** + * Queue up and external event to publish. + */ +static void +external_event_queue(as_clustering_event* event) +{ + CLUSTERING_EVENT_PUBLISHER_LOCK(); + memcpy(&g_external_event_publisher.to_publish, event, + sizeof(g_external_event_publisher.to_publish)); + + vector_clear(&g_external_event_publisher.published_succession_list); + if (event->succession_list) { + // Use the static list for the published event, so that the input event + // object can be destroyed irrespective of when the it is published. 
+ vector_copy(&g_external_event_publisher.published_succession_list, + event->succession_list); + g_external_event_publisher.to_publish.succession_list = + &g_external_event_publisher.published_succession_list; + + } + + g_external_event_publisher.event_queued = true; + + CLUSTERING_EVENT_PUBLISHER_UNLOCK(); + + // Wake up the publisher thread. + external_event_publisher_thr_wakeup(); +} + +/** + * Publish external events if any are pending. + */ +static void +external_events_publish() +{ + CLUSTERING_EVENT_PUBLISHER_LOCK(); + + if (g_external_event_publisher.event_queued) { + g_external_event_publisher.event_queued = false; + exchange_clustering_event_listener( + &g_external_event_publisher.to_publish); + } + CLUSTERING_EVENT_PUBLISHER_UNLOCK(); +} + +/** + * External event publisher thread. + */ +static void* +external_event_publisher_thr(void* arg) +{ + pthread_mutex_lock(&g_external_event_publisher.is_pending_mutex); + + while (true) { + pthread_cond_wait(&g_external_event_publisher.is_pending, + &g_external_event_publisher.is_pending_mutex); + if (external_event_publisher_is_running()) { + external_events_publish(); + } + else { + // Publisher stopped, exit the tread. + break; + } + } + + pthread_mutex_unlock(&g_external_event_publisher.is_pending_mutex); + return NULL; +} + +/** + * Start the event publisher. + */ +static void +external_event_publisher_start() +{ + CLUSTERING_EVENT_PUBLISHER_LOCK(); + g_external_event_publisher.sys_state = AS_CLUSTERING_SYS_STATE_RUNNING; + + // Start the event publishing thread. + if (pthread_create(&g_external_event_publisher.event_publisher_tid, 0, + external_event_publisher_thr, NULL) != 0) { + CRASH("could not create event publishing thread: %s", + cf_strerror(errno)); + } + CLUSTERING_EVENT_PUBLISHER_UNLOCK(); +} + +/** + * Stop the event publisher. 
+ */ +static void +external_event_publisher_stop() +{ + CLUSTERING_EVENT_PUBLISHER_LOCK(); + g_external_event_publisher.sys_state = + AS_CLUSTERING_SYS_STATE_SHUTTING_DOWN; + CLUSTERING_EVENT_PUBLISHER_UNLOCK(); + + external_event_publisher_thr_wakeup(); + pthread_join(g_external_event_publisher.event_publisher_tid, NULL); + + CLUSTERING_EVENT_PUBLISHER_LOCK(); + g_external_event_publisher.sys_state = AS_CLUSTERING_SYS_STATE_STOPPED; + g_external_event_publisher.event_queued = false; + CLUSTERING_EVENT_PUBLISHER_UNLOCK(); +} + +/* + * ---------------------------------------------------------------------------- + * Clustering register + * ---------------------------------------------------------------------------- + */ + +/** + * Dump register state to logs. + */ +static void +register_dump(bool verbose) +{ + CLUSTERING_LOCK(); + + // Output register state. + switch (g_register.state) { + case AS_CLUSTERING_REGISTER_STATE_SYNCED: + INFO("CL: register: synced"); + break; + case AS_CLUSTERING_REGISTER_STATE_SYNCING: + INFO("CL: register: syncing"); + break; + } + + // Cluster state details. + INFO("CL: cluster changed at: %"PRIu64" now: %"PRIu64, + g_register.cluster_modified_time, cf_getms()); + + INFO("CL: cluster key: %"PRIx64, g_register.cluster_key); + INFO("CL: cluster sequence: %"PRIu64, g_register.sequence_number); + INFO("CL: cluster size: %d", cf_vector_size(&g_register.succession_list)); + + if (verbose) { + log_cf_node_vector("CL: succession:", &g_register.succession_list, + CF_INFO); + } + + CLUSTERING_UNLOCK(); +} + +/** + * Initialize the register. 
+ */ +static void +register_init() +{ + CLUSTERING_LOCK(); + memset(&g_register, 0, sizeof(g_register)); + vector_lockless_init(&g_register.succession_list, cf_node); + vector_lockless_init(&g_register.sync_pending, cf_node); + vector_lockless_init(&g_register.ooo_change_applied_received, cf_node); + vector_lockless_init(&g_register.ooo_succession_list, cf_node); + + // We are in the orphan state but that will be considered as sync state. + g_register.state = AS_CLUSTERING_REGISTER_STATE_SYNCED; + CLUSTERING_UNLOCK(); +} + +/** + * Returns true if register sync is pending. + */ +static bool +register_is_sycn_pending() +{ + CLUSTERING_LOCK(); + bool sync_pending = cf_vector_size(&g_register.sync_pending) > 0; + log_cf_node_vector("pending register sync:", &g_register.sync_pending, + CF_TRACE); + CLUSTERING_UNLOCK(); + return sync_pending; +} + +/** + * Check if the register is synced across the cluster and move to sync state if + * it is synced. + */ +static void +register_check_and_switch_synced() +{ + CLUSTERING_LOCK(); + if (!register_is_sycn_pending() + && g_register.state != AS_CLUSTERING_REGISTER_STATE_SYNCED) { + g_register.state = AS_CLUSTERING_REGISTER_STATE_SYNCED; + // Generate internal cluster changed synced. + as_clustering_internal_event cluster_synced; + memset(&cluster_synced, 0, sizeof(cluster_synced)); + cluster_synced.type = + AS_CLUSTERING_INTERNAL_EVENT_REGISTER_CLUSTER_SYNCED; + internal_event_dispatch(&cluster_synced); + } + CLUSTERING_UNLOCK(); +} + +/** + * Update register to become an orphan node. 
+ */ +static void +register_become_orphan(as_clustering_event_qualifier qualifier) +{ + CLUSTERING_LOCK(); + g_register.state = AS_CLUSTERING_REGISTER_STATE_SYNCED; + g_register.cluster_key = 0; + g_register.sequence_number = 0; + g_register.has_orphan_transitioned = true; + g_clustering.has_integrity = false; + vector_clear(&g_register.succession_list); + vector_clear(&g_register.sync_pending); + + g_register.cluster_modified_time = cf_getms(); + g_register.cluster_modified_hlc_ts = as_hlc_timestamp_now(); + + // Queue internal orphaned event. + as_clustering_internal_event orphaned_event; + memset(&orphaned_event, 0, sizeof(orphaned_event)); + orphaned_event.type = AS_CLUSTERING_INTERNAL_EVENT_REGISTER_ORPHANED; + orphaned_event.qualifier = qualifier; + internal_event_dispatch(&orphaned_event); + + CLUSTERING_UNLOCK(); + + INFO("moved self node to orphan state"); +} + +/** + * Handle timer event in the syncing state. + */ +static void +register_syncing_timer_event_handle() +{ + CLUSTERING_LOCK(); + cf_clock now = cf_getms(); + if (g_register.last_sync_check_time + register_sync_check_interval() + > now) { + // Give more time before checking for sync. + goto Exit; + } + + if (register_is_sycn_pending()) { + // Update pending nodes based on heartbeat status. + int num_pending = cf_vector_size(&g_register.sync_pending); + for (int i = 0; i < num_pending; i++) { + cf_node pending; + cf_vector_get(&g_register.sync_pending, i, &pending); + if (clustering_node_is_sync(pending)) { + cf_vector_delete(&g_register.sync_pending, i); + + // Compensate the index for the delete. + i--; + + // Adjust vector size. + num_pending--; + } + } + } + + register_check_and_switch_synced(); + +Exit: + CLUSTERING_UNLOCK(); +} + +/** + * Send cluster change applied message to all cluster members. + */ +static void +register_cluster_change_applied_msg_send() +{ + msg* msg = msg_pool_get(AS_CLUSTERING_MSG_TYPE_CLUSTER_CHANGE_APPLIED); + + CLUSTERING_LOCK(); + + // Set the cluster key. 
+ msg_cluster_key_set(msg, g_register.cluster_key); + + // Set the succession list. + msg_succession_list_set(msg, &g_register.succession_list); + + log_cf_node_vector("cluster change applied message sent to:", + &g_register.succession_list, CF_DEBUG); + + cf_vector* members = vector_stack_lockless_create(cf_node); + vector_copy(members, &g_register.succession_list); + + CLUSTERING_UNLOCK(); + + // Sent the message to the cluster members. + msg_nodes_send(msg, members); + cf_vector_destroy(members); +} + +/** + * Validate cluster state. For now ensure the cluster size is greater than the + * min cluster size. + */ +static void +register_validate_cluster() +{ + CLUSTERING_LOCK(); + int cluster_size = cf_vector_size(&g_register.succession_list); + if (!clustering_is_orphan() + && cluster_size < g_config.clustering_config.cluster_size_min) { + WARNING( + "cluster size %d less than required minimum size %d - switching to orphan state", + cluster_size, g_config.clustering_config.cluster_size_min); + register_become_orphan (AS_CLUSTERING_MEMBERSHIP_LOST); + } + CLUSTERING_UNLOCK(); +} + +/** + * Handle a timer event for the register. + */ +static void +register_timer_event_handle() +{ + CLUSTERING_LOCK(); + switch (g_register.state) { + case AS_CLUSTERING_REGISTER_STATE_SYNCED: + register_validate_cluster(); + break; + case AS_CLUSTERING_REGISTER_STATE_SYNCING: + register_syncing_timer_event_handle(); + break; + } + CLUSTERING_UNLOCK(); +} + +/** + * Handle paxos round succeeding. 
+ */ +static void +register_paxos_acceptor_success_handle( + as_clustering_internal_event* paxos_success_event) +{ + CLUSTERING_LOCK(); + + g_register.has_orphan_transitioned = false; + + g_register.cluster_key = paxos_success_event->new_cluster_key; + g_register.sequence_number = paxos_success_event->new_sequence_number; + + vector_clear(&g_register.succession_list); + vector_copy(&g_register.succession_list, + paxos_success_event->new_succession_list); + + // Update the timestamps as the register has changed its contents. + g_register.cluster_modified_time = cf_getms(); + g_register.cluster_modified_hlc_ts = as_hlc_timestamp_now(); + + // Initialize pending list with all cluster members. + g_register.state = AS_CLUSTERING_REGISTER_STATE_SYNCING; + vector_clear(&g_register.sync_pending); + vector_copy(&g_register.sync_pending, &g_register.succession_list); + register_cluster_change_applied_msg_send(); + + if (g_register.cluster_key == g_register.ooo_cluster_key + && vector_equals(&g_register.succession_list, + &g_register.ooo_succession_list)) { + // We have already received change applied message from these node + // account for them. + vector_subtract(&g_register.sync_pending, + &g_register.ooo_change_applied_received); + } + vector_clear(&g_register.ooo_change_applied_received); + vector_clear(&g_register.ooo_succession_list); + g_register.ooo_cluster_key = 0; + g_register.ooo_hlc_timestamp = 0; + + INFO("applied new cluster key %"PRIx64, + paxos_success_event->new_cluster_key); + log_cf_node_vector("applied new succession list", + &g_register.succession_list, CF_INFO); + INFO("applied cluster size %d", + cf_vector_size(&g_register.succession_list)); + + as_clustering_internal_event cluster_changed; + memset(&cluster_changed, 0, sizeof(cluster_changed)); + cluster_changed.type = + AS_CLUSTERING_INTERNAL_EVENT_REGISTER_CLUSTER_CHANGED; + internal_event_dispatch(&cluster_changed); + + // Send change appied message. Its alright even if they are out of order. 
+ register_cluster_change_applied_msg_send(); + + CLUSTERING_UNLOCK(); +} + +/** + * Handle incoming cluster change applied message. + */ +static void +register_cluster_change_applied_msg_handle( + as_clustering_internal_event* msg_event) +{ + CLUSTERING_LOCK(); + as_cluster_key msg_cluster_key = 0; + msg_cluster_key_get(msg_event->msg, &msg_cluster_key); + cf_vector *msg_succession_list = vector_stack_lockless_create(cf_node); + msg_succession_list_get(msg_event->msg, msg_succession_list); + as_hlc_timestamp msg_hlc_timestamp = 0; + msg_send_ts_get(msg_event->msg, &msg_hlc_timestamp); + + DEBUG("received cluster change applied message from node %"PRIx64, + msg_event->msg_src_nodeid); + if (g_register.cluster_key == msg_cluster_key + && vector_equals(&g_register.succession_list, + msg_succession_list)) { + // This is a matching change applied message. + int found_at = 0; + if ((found_at = vector_find(&g_register.sync_pending, + &msg_event->msg_src_nodeid)) >= 0) { + // Remove from the pending list. + cf_vector_delete(&g_register.sync_pending, found_at); + } + + } + else if (g_register.ooo_cluster_key == msg_cluster_key + && vector_equals(&g_register.ooo_succession_list, + msg_succession_list)) { + DEBUG("received ooo cluster change applied message from node %"PRIx64" with cluster key %"PRIx64, msg_event->msg_src_nodeid, msg_cluster_key); + cf_vector_append_unique(&g_register.ooo_change_applied_received, + &msg_event->msg_src_nodeid); + + } + else if (g_register.ooo_hlc_timestamp < msg_hlc_timestamp) { + // Prefer a later version of OOO message. 
+ g_register.ooo_cluster_key = msg_cluster_key; + g_register.ooo_hlc_timestamp = msg_hlc_timestamp; + vector_clear(&g_register.ooo_succession_list); + vector_copy(&g_register.ooo_succession_list, msg_succession_list); + vector_clear(&g_register.ooo_change_applied_received); + cf_vector_append_unique(&g_register.ooo_change_applied_received, + &msg_event->msg_src_nodeid); + DEBUG("received ooo cluster change applied message from node %"PRIx64" with cluster key %"PRIx64, msg_event->msg_src_nodeid, msg_cluster_key); + } + else { + INFO( + "ignoring cluster mismatching change applied message from node %"PRIx64, + msg_event->msg_src_nodeid); + } + cf_vector_destroy(msg_succession_list); + register_check_and_switch_synced(); + CLUSTERING_UNLOCK(); +} + +/** + * Handle incoming message. + */ +static void +register_msg_event_handle(as_clustering_internal_event* msg_event) +{ + CLUSTERING_LOCK(); + as_clustering_msg_type type; + msg_type_get(msg_event->msg, &type); + + if (type == AS_CLUSTERING_MSG_TYPE_CLUSTER_CHANGE_APPLIED) { + register_cluster_change_applied_msg_handle(msg_event); + } + CLUSTERING_UNLOCK(); +} + +/** + * Dispatch internal events to the register. + */ +static void +register_event_dispatch(as_clustering_internal_event* event) +{ + switch (event->type) { + case AS_CLUSTERING_INTERNAL_EVENT_TIMER: + register_timer_event_handle(); + break; + case AS_CLUSTERING_INTERNAL_EVENT_PAXOS_ACCEPTOR_SUCCESS: + register_paxos_acceptor_success_handle(event); + break; + case AS_CLUSTERING_INTERNAL_EVENT_MSG: + register_msg_event_handle(event); + break; + default: // Not of interest for the register. + break; + } +} + +/* + * ---------------------------------------------------------------------------- + * Clustering core (triggers cluster changes) + * ---------------------------------------------------------------------------- + */ + +/** + * Send a join reject message to destination node. 
+ */ +static void +clustering_join_reject_send(cf_node dest) +{ + msg* msg = msg_pool_get(AS_CLUSTERING_MSG_TYPE_JOIN_REJECT); + + DETAIL("sent join reject to node %"PRIx64, dest); + + // Sent the message to the acceptors. + msg_node_send(msg, dest); +} + +/** + * Send cluster join reject message to all nodes in the vector. + */ +static void +clustering_join_requests_reject(cf_vector* rejected_nodes) +{ + int rejected_node_count = cf_vector_size(rejected_nodes); + for (int i = 0; i < rejected_node_count; i++) { + // No null check required since we are iterating under a lock and within + // vector bounds. + cf_node requesting_nodeid = *((cf_node*)cf_vector_getp(rejected_nodes, + i)); + + // Send the reject message. + clustering_join_reject_send(requesting_nodeid); + } +} + +/** + * Send join reject message for all pending join requests. + */ +static void +clustering_join_requests_reject_all() +{ + CLUSTERING_LOCK(); + + cf_vector* rejected_nodes = vector_stack_lockless_create(cf_node); + vector_copy_unique(rejected_nodes, &g_clustering.pending_join_requests); + + vector_clear(&g_clustering.pending_join_requests); + + CLUSTERING_UNLOCK(); + + clustering_join_requests_reject(rejected_nodes); + + cf_vector_destroy(rejected_nodes); +} + +/** + * Send a join request to a principal. + * @param new_principal the destination principal node. + * @return 0 on successful message queue, -1 on failure. 
+ */ +static int +clustering_join_request_send(cf_node new_principal) +{ + int rv = -1; + CLUSTERING_LOCK(); + + msg* msg = msg_pool_get(AS_CLUSTERING_MSG_TYPE_JOIN_REQUEST); + + DETAIL("sending cluster join request to node %"PRIx64, new_principal); + + if (msg_node_send(msg, new_principal) == 0) { + cf_clock now = cf_getms(); + cf_shash_put(g_clustering.join_request_blackout, &new_principal, &now); + + g_clustering.last_join_request_principal = new_principal; + g_clustering.last_join_request_sent_time = + g_clustering.last_join_request_retransmit_time = cf_getms(); + + INFO("sent cluster join request to %"PRIx64, new_principal); + rv = 0; + } + + // Send early reject to all nodes that have send us a join request in the + // orphan state, because self node is not going to become a principal node. + // This allows the requesting nodes to send requests to other + // (potential)principals. + clustering_join_requests_reject_all(); + + CLUSTERING_UNLOCK(); + return rv; +} + +/** + * Retransmit a join request to a previously attmepted principal. + * @param last_join_request_principal the principal to retransmit to. + */ +static void +clustering_join_request_retransmit(cf_node last_join_request_principal) +{ + CLUSTERING_LOCK(); + cf_node new_principal = g_clustering.last_join_request_principal; + g_clustering.last_join_request_retransmit_time = cf_getms(); + CLUSTERING_UNLOCK(); + + if (new_principal != last_join_request_principal) { + // The last attempted principal has changed. Don't retransmit. + return; + } + + msg* msg = msg_pool_get(AS_CLUSTERING_MSG_TYPE_JOIN_REQUEST); + DETAIL("re-sending cluster join request to node %"PRIx64, new_principal); + if (msg_node_send(msg, new_principal) == 0) { + DEBUG("re-sent cluster join request to %"PRIx64, new_principal); + } +} + +/** + * Remove nodes for which join requests are blocked. + * + * @param requestees the nodes considered for join requests. + * @param target the result with requestees that are not blocked. 
+ */ +static void +clustering_join_request_filter_blocked(cf_vector* requestees, cf_vector* target) +{ + CLUSTERING_LOCK(); + cf_clock last_sent; + int requestee_count = cf_vector_size(requestees); + for (int i = 0; i < requestee_count; i++) { + cf_node requestee; + cf_vector_get(requestees, i, &requestee); + if (cf_shash_get(g_clustering.join_request_blackout, &requestee, + &last_sent) != CF_SHASH_OK) { + // The requestee is not marked for blackout + cf_vector_append(target, &requestee); + } + } + CLUSTERING_UNLOCK(); +} + +/** + * Send a cluster join request to a neighboring principal. If + * preferred_principal is set and it is an eligible neighboring principal, a + * request is sent to that principal, else this function cycles among eligible + * neighboring principals at each call. + * + * A request will not be sent if there is no neighboring principal. + * + * @param preferred_principal the preferred principal to join. User zero if + * there is no preference. + * @return 0 if the join request was send or there is one in progress. -1 if + * there are no principals to try and send the join request. + */ +static as_clustering_join_request_result +clustering_principal_join_request_attempt(cf_node preferred_principal) +{ + CLUSTERING_LOCK(); + + as_clustering_join_request_result rv = AS_CLUSTERING_JOIN_REQUEST_SENT; + cf_vector* neighboring_principals = vector_stack_lockless_create(cf_node); + cf_vector* eligible_principals = vector_stack_lockless_create(cf_node); + + // Get list of neighboring principals. 
+ clustering_neighboring_principals_get(neighboring_principals); + if (cf_vector_size(neighboring_principals) == 0) { + DEBUG("no neighboring principal found - not sending join request"); + rv = AS_CLUSTERING_JOIN_REQUEST_NO_PRINCIPALS; + goto Exit; + } + + clustering_join_request_filter_blocked(neighboring_principals, + eligible_principals); + + if (cf_vector_size(eligible_principals) == 0) { + DETAIL("no eligible principals found to make a join request"); + // This principal is still in the blackout list. Do not send a request. + rv = AS_CLUSTERING_JOIN_REQUEST_PENDING; + goto Exit; + } + + int next_join_request_principal_index = -1; + + // We have some well-formed neighboring clusters, try and join them + if (preferred_principal != 0) { + int preferred_principal_index = vector_find(eligible_principals, + &preferred_principal); + if (preferred_principal_index >= 0) { + DETAIL("sending join request to preferred principal %"PRIx64, + preferred_principal); + + // Update the index of the principal to try. + next_join_request_principal_index = preferred_principal_index; + } + } + + if (next_join_request_principal_index == -1) { + // Choose the first entry, since we have no valid preferred principal. + next_join_request_principal_index = 0; + if (g_clustering.last_join_request_principal != 0) { + // Choose the node after the current principal. If the current + // principal is not found we start at index 0 else the next index. + next_join_request_principal_index = vector_find(eligible_principals, + &g_clustering.last_join_request_principal) + 1; + } + } + + // Forget the fact that a join request is pending for a principal. + g_clustering.last_join_request_principal = 0; + + cf_node* principal_to_try = cf_vector_getp(eligible_principals, + next_join_request_principal_index + % cf_vector_size(eligible_principals)); + + if (principal_to_try) { + rv = clustering_join_request_send(*principal_to_try) == 0 ? 
+ AS_CLUSTERING_JOIN_REQUEST_SENT : + AS_CLUSTERING_JOIN_REQUEST_SEND_FAILED; + + } + else { + DEBUG("no neighboring principal found - not sending join request"); + rv = AS_CLUSTERING_JOIN_REQUEST_NO_PRINCIPALS; + } + +Exit: + if (rv != AS_CLUSTERING_JOIN_REQUEST_SENT) { + // Forget the last principal we sent the join request to. + g_clustering.last_join_request_principal = 0; + g_clustering.last_join_request_sent_time = 0; + } + + CLUSTERING_UNLOCK(); + + cf_vector_destroy(neighboring_principals); + cf_vector_destroy(eligible_principals); + + return rv; +} + +/** + * Send a cluster join request to a neighboring orphan who this node thinks will + * be best suited to form a new cluster. + */ +static as_clustering_join_request_result +clustering_orphan_join_request_attempt() +{ + CLUSTERING_LOCK(); + + // Get list of neighboring orphans. + cf_vector* orphans = vector_stack_lockless_create(cf_node); + clustering_neighboring_orphans_get(orphans); + + // Get filtered list of orphans. + cf_vector* new_succession_list = vector_stack_lockless_create(cf_node); + clustering_join_request_filter_blocked(orphans, new_succession_list); + + log_cf_node_vector("neighboring orphans for join request:", + new_succession_list, CF_DEBUG); + + // Add self node. + cf_node self_nodeid = config_self_nodeid_get(); + cf_vector_append_unique(new_succession_list, &self_nodeid); + + clustering_succession_list_clique_evict(new_succession_list, + "clique based evicted nodes for potential cluster:"); + + // Sort the new succession list. + vector_sort_unique(new_succession_list, cf_node_compare_desc); + + as_clustering_join_request_result rv = + AS_CLUSTERING_JOIN_REQUEST_NO_PRINCIPALS; + + if (cf_vector_size(new_succession_list) > 0) { + cf_node new_principal = *((cf_node*)cf_vector_getp(new_succession_list, + 0)); + if (new_principal == config_self_nodeid_get()) { + // No need to send self a join request. + goto Exit; + } + else { + rv = clustering_join_request_send(new_principal) == 0 ? 
+ AS_CLUSTERING_JOIN_REQUEST_SENT : + AS_CLUSTERING_JOIN_REQUEST_SEND_FAILED; + } + } + +Exit: + cf_vector_destroy(new_succession_list); + cf_vector_destroy(orphans); + + CLUSTERING_UNLOCK(); + return rv; +} + +/** + * Remove nodes from the blackout hash once they have been in the list for + * greater than the blackout period. + */ +int +clustering_join_request_blackout_tend_reduce(const void* key, void* data, + void* udata) +{ + cf_clock* join_request_send_time = (cf_clock*)data; + if (*join_request_send_time + join_request_blackout_interval() + < cf_getms()) { + return CF_SHASH_REDUCE_DELETE; + } + return CF_SHASH_OK; +} + +/** + * Tend the join request blackout data structure to remove blacked out + * principals. + */ +static void +clustering_join_request_blackout_tend() +{ + CLUSTERING_LOCK(); + cf_shash_reduce(g_clustering.join_request_blackout, + clustering_join_request_blackout_tend_reduce, NULL); + CLUSTERING_UNLOCK(); +} + +/** + * Send a cluster join request to a neighboring principal if one exists, else if + * there are no neighboring principals, send a join request to a neighboring + * orphan node if this node thinks it will win paxos and become the new + * principal. + */ +static as_clustering_join_request_result +clustering_join_request_attempt() +{ + clustering_join_request_blackout_tend(); + + CLUSTERING_LOCK(); + cf_node last_join_request_principal = + g_clustering.last_join_request_principal; + cf_clock last_join_request_sent_time = + g_clustering.last_join_request_sent_time; + cf_clock last_join_request_retransmit_time = + g_clustering.last_join_request_retransmit_time; + CLUSTERING_UNLOCK(); + + // Check if the outgoing join request has timed out. 
+ if (last_join_request_principal + && as_hb_is_alive(last_join_request_principal)) { + if (last_join_request_sent_time + join_request_timeout() > cf_getms()) { + if (last_join_request_retransmit_time + + join_request_retransmit_timeout() < cf_getms()) { + // Re-transmit join request to the same principal, to cover the + // case where the previous join request was lost. + clustering_join_request_retransmit(last_join_request_principal); + } + // Wait for the principal to respond. do nothing + DETAIL( + "join request to principal %"PRIx64" pending - not attempting new join request", + last_join_request_principal); + + return AS_CLUSTERING_JOIN_REQUEST_PENDING; + } + // Timeout joining a principal. Choose a different principal. + INFO("join request timed out for principal %"PRIx64, + last_join_request_principal); + + } + + // Try sending a join request to a neighboring principal. + as_clustering_join_request_result rv = + clustering_principal_join_request_attempt(0); + + if (rv != AS_CLUSTERING_JOIN_REQUEST_NO_PRINCIPALS) { + // There are valid principals around. Don't send a request to + // neighboring orphan nodes. + return rv; + } + + // Send a join request to an orphan node, best suited to be the new + // principal. + return clustering_orphan_join_request_attempt(); +} + +/** + * Try to become a principal and start a new cluster. 
+ */ +static void +clustering_cluster_form() +{ + ASSERT(clustering_is_orphan(), + "should not attempt forming new cluster when not an orphan node"); + + CLUSTERING_LOCK(); + bool paxos_proposal_started = false; + cf_vector* new_succession_list = vector_stack_lockless_create(cf_node); + cf_vector* expected_succession_list = vector_stack_lockless_create(cf_node); + cf_vector* orphans = vector_stack_lockless_create(cf_node); + + clustering_neighboring_orphans_get(orphans); + vector_copy(new_succession_list, orphans); + + log_cf_node_vector("neighboring orphans for cluster formation:", + new_succession_list, + cf_vector_size(new_succession_list) > 0 ? CF_INFO : CF_DEBUG); + log_cf_node_vector("pending join requests:", + &g_clustering.pending_join_requests, + cf_vector_size(&g_clustering.pending_join_requests) > 0 ? + CF_INFO : CF_DEBUG); + + // Add self node. + cf_node self_nodeid = config_self_nodeid_get(); + cf_vector_append_unique(new_succession_list, &self_nodeid); + + clustering_succession_list_clique_evict(new_succession_list, + "clique based evicted nodes at cluster formation:"); + + // Sort the new succession list. + vector_sort_unique(new_succession_list, cf_node_compare_desc); + + cf_vector_append(expected_succession_list, &self_nodeid); + vector_copy_unique(expected_succession_list, + &g_clustering.pending_join_requests); + // Sort the expected succession list. + vector_sort_unique(expected_succession_list, cf_node_compare_desc); + // The result should match the pending join requests exactly to consider the + // new succession list. 
+ if (!vector_equals(expected_succession_list, new_succession_list)) { + log_cf_node_vector( + "skipping forming cluster - cannot form new cluster from pending join requests", + &g_clustering.pending_join_requests, CF_INFO); + goto Exit; + } + + if (cf_vector_size(orphans) > 0 + && cf_vector_size(new_succession_list) == 1) { + log_cf_node_vector( + "skipping forming cluster - there are neighboring orphans that cannot be clustered with", + orphans, CF_INFO); + goto Exit; + } + + if (cf_vector_size(new_succession_list) > 0) { + cf_node new_principal = *((cf_node*)cf_vector_getp(new_succession_list, + 0)); + if (new_principal == config_self_nodeid_get()) { + log_cf_node_vector( + "principal node - forming new cluster with succession list:", + new_succession_list, CF_INFO); + + as_paxos_start_result result = paxos_proposer_proposal_start( + new_succession_list, new_succession_list); + + // Log paxos result. + paxos_result_log(result, new_succession_list); + + paxos_proposal_started = (result == AS_PAXOS_RESULT_STARTED); + } + else { + INFO("skipping cluster formation - a new potential principal %"PRIx64" exists", + new_principal); + } + } + +Exit: + // Compute list of rejected nodes. + if (paxos_proposal_started) { + // Nodes in set (pending_join - new succession list) could not be + // accomodated and should receive a join reject. + vector_subtract(&g_clustering.pending_join_requests, + new_succession_list); + } + else { + // Reject all pending join requests. Will happen below. + } + + cf_vector* rejected_nodes = vector_stack_lockless_create(cf_node); + vector_copy_unique(rejected_nodes, &g_clustering.pending_join_requests); + + // Clear the pending join requests + vector_clear(&g_clustering.pending_join_requests); + + // Send reject messages to rejected nodes. 
+ clustering_join_requests_reject(rejected_nodes); + + cf_vector_destroy(rejected_nodes); + + cf_vector_destroy(orphans); + cf_vector_destroy(expected_succession_list); + cf_vector_destroy(new_succession_list); + + CLUSTERING_UNLOCK(); +} + +/** + * Try to join a cluster if there is a neighboring one, + * else try to form one. + */ +static void +clustering_join_or_form_cluster() +{ + ASSERT(clustering_is_orphan(), + "should not attempt forming new cluster when not an orphan node"); + + if (paxos_proposer_proposal_is_active()) { + // There is an active paxos round with this node as the proposed + // principal. + // Skip join cluster attempt and give current paxos round a chance to + // form the cluster. + return; + } + + CLUSTERING_LOCK(); + + // TODO (Discuss this): after some timeout and exhausting all neighboring + // principals, become a single node cluster / try our own cluster. This + // might not be required. Nonetheless discuss and figure this out. Current + // behaviour is form new cluster after a timeout. + + // A node is orphan for too long if it has attempted a join request which + // timedout and its in orphan state for a while. + bool orphan_for_too_long = (clustering_orphan_timeout() + + g_clustering.orphan_state_start_time) < cf_getms() + && g_clustering.last_join_request_principal + && g_clustering.last_join_request_sent_time + join_request_timeout() + < cf_getms(); + + if (orphan_for_too_long + || clustering_join_request_attempt() + == AS_CLUSTERING_JOIN_REQUEST_NO_PRINCIPALS) { + // No neighboring principal found or we have been orphan for too long, + // try and form a new cluster. + clustering_cluster_form(); + } + else { + // A join request sent successfully or pending. Wait for the new + // principal to respond. + + // We are not going to be a principal node in this quantum, reject all + // pending join requests. 
+ clustering_join_requests_reject_all(); + } + + CLUSTERING_UNLOCK(); +} + +/** + * Get a list of nodes that need to be added to current succession list from + * pending join requests. Bascially filters out node that are not orphans. + */ +static void +clustering_nodes_to_add_get(cf_vector* nodes_to_add) +{ + CLUSTERING_LOCK(); + + // Use a single iteration over the clustering data received via the + // heartbeats instead of individual calls to get a consistent view and avoid + // small lock and release. + as_hb_plugin_data_iterate(&g_clustering.pending_join_requests, + AS_HB_PLUGIN_CLUSTERING, clustering_orphan_nodes_find, + nodes_to_add); + + CLUSTERING_UNLOCK(); +} + +/** + * Handle quantum interval start in the orphan state. Try and join / form a + * cluster. + */ +static void +clustering_orphan_quantum_interval_start_handle() +{ + if (!as_hb_self_is_duplicate()) { + // Try to join a cluster or form a new one. + clustering_join_or_form_cluster(); + } +} + +/** + * Send a cluster move command to all nodes in the input list. + * + * @param candidate_principal the principal to which the other nodes should try + * and join after receiving the move command. + * @param cluster_key current cluster key for receiver validation. + * @param nodeids the nodes to send move command to. + */ +static void +clustering_cluster_move_send(cf_node candidate_principal, + as_cluster_key cluster_key, cf_vector* nodeids) +{ + msg* msg = msg_pool_get(AS_CLUSTERING_MSG_TYPE_MERGE_MOVE); + + // Set the proposed principal. + msg_proposed_principal_set(msg, candidate_principal); + + // Set cluster key for message validation. + msg_cluster_key_set(msg, cluster_key); + + log_cf_node_vector("cluster merge move command sent to:", nodeids, + CF_DEBUG); + + // Sent the message to the acceptors. + msg_nodes_send(msg, nodeids); +} + +/** + * Update preferred principal votes using hb plugin data. 
+ */ +static void +clustering_principal_preferred_principal_votes_count(cf_node nodeid, + void* plugin_data, size_t plugin_data_size, cf_clock recv_monotonic_ts, + as_hlc_msg_timestamp* msg_hlc_ts, void* udata) +{ + // A hash from each unique non null vinfo to a vector of partition ids + // having the vinfo. + cf_shash* preferred_principal_votes = (cf_shash*)udata; + + CLUSTERING_LOCK(); + if (!clustering_hb_plugin_data_is_obsolete( + g_register.cluster_modified_hlc_ts, + g_register.cluster_modified_time, plugin_data, plugin_data_size, + recv_monotonic_ts, msg_hlc_ts)) { + cf_node* preferred_principal_p = + clustering_hb_plugin_preferred_principal_get(plugin_data, + plugin_data_size); + + int current_votes = 0; + if (cf_shash_get(preferred_principal_votes, preferred_principal_p, + ¤t_votes) == CF_SHASH_OK) { + current_votes++; + } + else { + // We are seeing this preferred principal for the first time. + current_votes = 0; + } + + cf_shash_put(preferred_principal_votes, preferred_principal_p, + ¤t_votes); + } + else { + DETAIL( + "preferred principal voting skipped - found obsolete plugin data for node %"PRIx64, + nodeid); + } + CLUSTERING_UNLOCK(); +} + +/** + * Get the preferred majority principal. + */ +static int +clustering_principal_preferred_principal_majority_find(const void* key, + void* data, void* udata) +{ + + const cf_node* current_preferred_principal = (const cf_node*)key; + int current_preferred_principal_votes = *(int*)data; + cf_node* majority_preferred_principal = (cf_node*)udata; + + CLUSTERING_LOCK(); + int preferred_principal_majority = + (int)ceil( + cf_vector_size( + &g_register.succession_list) * AS_CLUSTERING_PREFERRRED_PRINCIPAL_MAJORITY); + bool is_majority = current_preferred_principal_votes + >= preferred_principal_majority; + CLUSTERING_UNLOCK(); + + if (is_majority) { + *majority_preferred_principal = *current_preferred_principal; + // Majority found, halt reduce. 
+ return CF_SHASH_ERR_FOUND; + } + + return CF_SHASH_OK; +} + +/** + * Get preferred principal based on a majority of non-principal's preferred + * principals. + * @return the preferred principal nodeid if there is a majority, else zero. + */ +static cf_node +clustering_principal_majority_preferred_principal_get() +{ + // A hash from each unique non null vinfo to a vector of partition ids + // having the vinfo. + cf_shash* preferred_principal_votes = cf_shash_create(cf_nodeid_shash_fn, + sizeof(cf_node), sizeof(int), AS_CLUSTERING_CLUSTER_MAX_SIZE_SOFT, + 0); + + CLUSTERING_LOCK(); + + // Use a single iteration over the clustering data received via the + // heartbeats instead of individual calls to get a consistent view and avoid + // small lock and release. + as_hb_plugin_data_iterate(&g_register.succession_list, + AS_HB_PLUGIN_CLUSTERING, + clustering_principal_preferred_principal_votes_count, + preferred_principal_votes); + + // Find the majority preferred principal. + cf_node preferred_principal = 0; + cf_shash_reduce(preferred_principal_votes, + clustering_principal_preferred_principal_majority_find, + &preferred_principal); + + CLUSTERING_UNLOCK(); + + cf_shash_destroy(preferred_principal_votes); + + DETAIL("preferred principal is %"PRIx64, preferred_principal); + + return preferred_principal; +} + +/** + * Indicates if this node is a principal and its cluster can be merged with this + * principal node's cluster. + * + * @param nodeid the candidate nodeid. + * @param node_succession_list the candidate node's succession list. + * @param node_succession_list_length the length of the node's succession list. + * @return true if current node can be merged with this node's cluster. + */ +bool +clustering_is_merge_candidate(cf_node nodeid, cf_node* node_succession_list, + int node_succession_list_length) +{ + if (node_succession_list_length <= 0 || node_succession_list[0] != nodeid) { + // Not a principal node. Ignore. 
+ return false; + } + + if (nodeid < config_self_nodeid_get()) { + // Has a smaller nodeid. Ignore. This node will merge with our cluster. + return false; + } + + cf_vector* new_succession_list = vector_stack_lockless_create(cf_node); + + CLUSTERING_LOCK(); + vector_copy_unique(new_succession_list, &g_register.succession_list); + CLUSTERING_UNLOCK(); + + bool is_candidate = false; + + // Node is the principal of its cluster. Create the new succession list. + for (int i = 0; i < node_succession_list_length; i++) { + cf_vector_append_unique(new_succession_list, &node_succession_list[i]); + } + + int expected_cluster_size = cf_vector_size(new_succession_list); + + // Find and evict the nodes that are not well connected. + clustering_succession_list_clique_evict(new_succession_list, + "clique based evicted nodes at cluster merge:"); + int new_cluster_size = cf_vector_size(new_succession_list); + + // If no nodes need to be evicted then the merge is fine. + is_candidate = (expected_cluster_size == new_cluster_size); + + // Exit: + cf_vector_destroy(new_succession_list); + + return is_candidate; +} + +/** + * HB plugin iterate function to find principals that this node's cluster can be + * merged with. 
+ */ +static void +clustering_merge_candiate_find(cf_node nodeid, void* plugin_data, + size_t plugin_data_size, cf_clock recv_monotonic_ts, + as_hlc_msg_timestamp* msg_hlc_ts, void* udata) +{ + cf_node* candidate_principal = (cf_node*)udata; + + CLUSTERING_LOCK(); + + if (!clustering_hb_plugin_data_is_obsolete( + g_register.cluster_modified_hlc_ts, + g_register.cluster_modified_time, plugin_data, plugin_data_size, + recv_monotonic_ts, msg_hlc_ts)) { + uint32_t* other_succession_list_length = + clustering_hb_plugin_succession_length_get(plugin_data, + plugin_data_size); + + cf_node* other_succession_list = clustering_hb_plugin_succession_get( + plugin_data, plugin_data_size); + + if (other_succession_list != NULL + && clustering_is_merge_candidate(nodeid, other_succession_list, + *other_succession_list_length) + && *candidate_principal < nodeid) { + DETAIL("principal node %"PRIx64" potential candidate for cluster merge", nodeid); + *candidate_principal = nodeid; + } + + } + else { + DETAIL( + "merge check skipped - found obsolete plugin data for node %"PRIx64, + nodeid); + } + + CLUSTERING_UNLOCK(); +} + +/** + * Attempt to move to the majority preferred principal. + * + * @return 0 if the move to preferred principal was attempted, -1 otherwise. + */ +static int +clustering_preferred_principal_move() +{ + cf_node preferred_principal = + clustering_principal_majority_preferred_principal_get(); + + if (preferred_principal == 0 + || preferred_principal == config_self_nodeid_get()) { + return -1; + } + + cf_vector* succession_list = vector_stack_lockless_create(cf_node); + as_cluster_key cluster_key = 0; + CLUSTERING_LOCK(); + vector_copy(succession_list, &g_register.succession_list); + cluster_key = g_register.cluster_key; + // Update the time move command was sent. 
+ g_clustering.move_cmd_issue_time = cf_getms(); + CLUSTERING_UNLOCK(); + + INFO("majority nodes find %"PRIx64" to be a better principal - sending move command to all cluster members", + preferred_principal); + clustering_cluster_move_send(preferred_principal, cluster_key, + succession_list); + cf_vector_destroy(succession_list); + + return 0; +} + +/** + * Attempt to merge with a larger adjacent cluster is the resulting cluster will + * form a clique. + * + * @return 0 if a merge is attempted, -1 otherwise. + */ +static int +clustering_merge_attempt() +{ + int rv = -1; + CLUSTERING_LOCK(); + cf_vector* succession_list = vector_stack_lockless_create(cf_node); + vector_copy(succession_list, &g_register.succession_list); + as_cluster_key cluster_key = g_register.cluster_key; + cf_node candidate_principal = 0; + + // Use a single iteration over the clustering data received via the + // heartbeats instead of individual calls to get a consistent view and avoid + // small lock and release. + as_hb_plugin_data_iterate_all(AS_HB_PLUGIN_CLUSTERING, + clustering_merge_candiate_find, &candidate_principal); + + CLUSTERING_UNLOCK(); + + if (candidate_principal == 0) { + DEBUG("no cluster merge candidates found"); + rv = -1; + goto Exit; + } + + // Send a move command to all nodes in the succession list. Need not switch + // to orphan state immediately, this node will receive the move command too + // and will handle the move accordingly. + INFO("this cluster can merge with cluster with principal %"PRIx64" - sending move command to all cluster members", + candidate_principal); + clustering_cluster_move_send(candidate_principal, cluster_key, + succession_list); + rv = 0; +Exit: + cf_vector_destroy(succession_list); + return rv; +} + +/** + * Handle quantum interval start when self node is the principal of its cluster. 
+ */ +static void +clustering_principal_quantum_interval_start_handle( + as_clustering_internal_event* event) +{ + DETAIL("principal node quantum wakeup"); + + if (as_hb_self_is_duplicate()) { + // Cluster is in a bad shape and self node has a duplicate node-id. + register_become_orphan (AS_CLUSTERING_MEMBERSHIP_LOST); + return; + } + + CLUSTERING_LOCK(); + bool paxos_proposal_started = false; + + cf_vector* dead_nodes = vector_stack_lockless_create(cf_node); + clustering_dead_nodes_find(dead_nodes); + + log_cf_node_vector("dead nodes at quantum start:", dead_nodes, + cf_vector_size(dead_nodes) > 0 ? CF_INFO : CF_DEBUG); + + cf_vector* faulty_nodes = vector_stack_lockless_create(cf_node); + clustering_faulty_nodes_find(faulty_nodes); + + log_cf_node_vector("faulty nodes at quantum start:", faulty_nodes, + cf_vector_size(faulty_nodes) > 0 ? CF_INFO : CF_DEBUG); + + // Having dead node or faulty nodes is a sign of cluster integrity breach. + // New nodes should not count as integrity breach. + g_clustering.has_integrity = cf_vector_size(faulty_nodes) == 0 + && cf_vector_size(dead_nodes) == 0; + + cf_vector* new_nodes = vector_stack_lockless_create(cf_node); + clustering_nodes_to_add_get(new_nodes); + log_cf_node_vector("join requests at quantum start:", new_nodes, + cf_vector_size(new_nodes) > 0 ? CF_INFO : CF_DEBUG); + + cf_vector* new_succession_list = vector_stack_lockless_create(cf_node); + vector_copy_unique(new_succession_list, &g_register.succession_list); + vector_subtract(new_succession_list, dead_nodes); + vector_subtract(new_succession_list, faulty_nodes); + vector_copy_unique(new_succession_list, new_nodes); + + // Add self node. We should not miss self in the succession list, but be + // doubly sure. 
+ cf_node self_nodeid = config_self_nodeid_get(); + cf_vector_append_unique(new_succession_list, &self_nodeid); + + vector_sort_unique(new_succession_list, cf_node_compare_desc); + uint32_t num_evicted = clustering_succession_list_clique_evict( + new_succession_list, + "clique based evicted nodes at quantum start:"); + + if (event->quantum_interval_is_skippable && cf_vector_size(dead_nodes) != 0 + && !quantum_interval_is_adjacency_fault_seen()) { + // There is an imminent adjacency fault that has not been seen by the + // quantum interval generator, lets not take any action. + DEBUG("adjacency fault imminent - skipping quantum interval handling"); + quantum_interval_mark_postponed(); + goto Exit; + } + + if (event->quantum_interval_is_skippable && num_evicted != 0 + && !quantum_interval_is_peer_adjacency_fault_seen()) { + // There is an imminent adjacency fault that has not been seen by the + // quantum interval generator, lets not take any action. + DEBUG( + "peer adjacency fault imminent - skipping quantum interval handling"); + quantum_interval_mark_postponed(); + goto Exit; + } + + if (cf_vector_size(faulty_nodes) == 0 && cf_vector_size(dead_nodes) == 0) { + // We might have only pending join requests. Attempt a move to a + // preferred principal or a merge before trying to add new nodes. + if (clustering_preferred_principal_move() == 0 + || clustering_merge_attempt() == 0) { + goto Exit; + } + } + + if (vector_equals(new_succession_list, &g_register.succession_list) + && cf_vector_size(faulty_nodes) == 0) { + // There is no change in the succession list and also there are no + // faulty nodes. If there are faulty nodes they have probably restarted + // quickly, in which case a new cluster transition with the same + // succession list is required. + goto Exit; + } + + if (cf_vector_size(faulty_nodes) != 0 + && cf_vector_size(new_succession_list) == 1) { + // This node most likely lost time (slept/paused) and the rest of the + // cluster reformed. 
Its best to go to the orphan state and start from + // there instead of moving to a single node cluster and again eventually + // forming a larger cluster. + WARNING( + "all cluster members are part of different cluster - changing state to orphan"); + register_become_orphan (AS_CLUSTERING_MEMBERSHIP_LOST); + goto Exit; + } + + // Start a new paxos round. + log_cf_node_vector("current succession list", &g_register.succession_list, + CF_DEBUG); + + log_cf_node_vector("proposed succession list", new_succession_list, + CF_DEBUG); + DEBUG("proposed cluster size %d", cf_vector_size(new_succession_list)); + + as_paxos_start_result result = paxos_proposer_proposal_start( + new_succession_list, new_succession_list); + + // Log paxos result. + paxos_result_log(result, new_succession_list); + + // TODO: Should we move to orphan state if there are not enough nodes in the + // cluster. + // Tentatively yes.... + if (result == AS_PAXOS_RESULT_CLUSTER_TOO_SMALL) { + register_become_orphan (AS_CLUSTERING_MEMBERSHIP_LOST); + } + + paxos_proposal_started = (result == AS_PAXOS_RESULT_STARTED); +Exit: + // Although these are stack vectors the contents can be heap allocated on + // resize. Destroy call is prudent. + cf_vector_destroy(dead_nodes); + cf_vector_destroy(faulty_nodes); + cf_vector_destroy(new_nodes); + cf_vector_destroy(new_succession_list); + + // Compute list of rejected nodes. + if (paxos_proposal_started) { + // Nodes in set (pending_join - new succession list) could not be + // accomodated and should receive a join reject. + vector_subtract(&g_clustering.pending_join_requests, + new_succession_list); + } + else { + // Nodes in set (pending_join - current succession list) could not be + // accomodated and should receive a join reject. 
+ vector_subtract(&g_clustering.pending_join_requests, + &g_register.succession_list); + + } + + cf_vector* rejected_nodes = vector_stack_lockless_create(cf_node); + vector_copy_unique(rejected_nodes, &g_clustering.pending_join_requests); + + // Clear the pending join requests + vector_clear(&g_clustering.pending_join_requests); + + // Send reject messages to rejected nodes. + clustering_join_requests_reject(rejected_nodes); + + cf_vector_destroy(rejected_nodes); + + CLUSTERING_UNLOCK(); +} + +/** + * Check for and handle eviction by self node's principal. + * + * @param principal_plugin_data the pluging data for the principal. + * @param plugin_data_hlc_ts the hlc timestamp when the plugin data was + * received. + * @param plugin_data_ts the monotonic clock timestamp when the plugin data was + * recvied. + */ +static void +clustering_non_principal_evicted_check(cf_node principal_nodeid, + as_hb_plugin_node_data* principal_plugin_data, + as_hlc_msg_timestamp* plugin_data_hlc_ts, cf_clock plugin_data_ts) +{ + CLUSTERING_LOCK(); + bool is_evicted = false; + + if (!as_hb_is_alive(principal_nodeid)) { + is_evicted = true; + goto Exit; + } + + if (!clustering_is_our_principal(principal_nodeid) + || clustering_hb_plugin_data_is_obsolete( + g_register.cluster_modified_hlc_ts, + g_register.cluster_modified_time, + principal_plugin_data->data, + principal_plugin_data->data_size, plugin_data_ts, + plugin_data_hlc_ts)) { + // The plugin data is obsolete. Can't take decisions based on it. + goto Exit; + } + + // Get the changed node's succession list, cluster key. All the fields + // should be present since the obsolete check also checked for fields being + // valid. 
+ cf_node* succession_list_p = clustering_hb_plugin_succession_get( + principal_plugin_data->data, principal_plugin_data->data_size); + uint32_t* succession_list_length_p = + clustering_hb_plugin_succession_length_get( + principal_plugin_data->data, + principal_plugin_data->data_size); + + // Check if we have been evicted. + if (!clustering_is_node_in_succession(config_self_nodeid_get(), + succession_list_p, *succession_list_length_p)) { + is_evicted = true; + } + +Exit: + if (is_evicted) { + // This node has been evicted from the cluster. + WARNING("evicted from cluster by principal node %"PRIx64"- changing state to orphan", + principal_nodeid); + register_become_orphan (AS_CLUSTERING_MEMBERSHIP_LOST); + } + + CLUSTERING_UNLOCK(); +} + +/** + * Monitor plugin data change events for evictions. + */ +static void +clustering_non_principal_hb_plugin_data_changed_handle( + as_clustering_internal_event* change_event) +{ + clustering_non_principal_evicted_check( + change_event->plugin_data_changed_nodeid, change_event->plugin_data, + &change_event->plugin_data_changed_hlc_ts, + change_event->plugin_data_changed_ts); +} + +/** + * Update the preferred principal in the non-principal mode. + */ +static void +clustering_non_principal_preferred_principal_update() +{ + cf_node current_principal = 0; + if (clustering_principal_get(¤t_principal) != 0 + || current_principal == 0) { + // We are an orphan. + return; + } + + cf_vector* new_succession_list = vector_stack_lockless_create(cf_node); + + clustering_neighboring_nodes_get(new_succession_list); + cf_node self_nodeid = config_self_nodeid_get(); + cf_vector_append(new_succession_list, &self_nodeid); + + clustering_succession_list_clique_evict(new_succession_list, + "clique based evicted nodes while updating preferred principal:"); + + // Sort the new succession list. 
+ vector_sort_unique(new_succession_list, cf_node_compare_desc); + + cf_node preferred_principal = 0; + int new_cluster_size = cf_vector_size(new_succession_list); + if (new_cluster_size > 0) { + if (vector_find(new_succession_list, ¤t_principal) < 0) { + cf_vector_get(new_succession_list, 0, &preferred_principal); + } + } + + CLUSTERING_LOCK(); + if (preferred_principal != 0 + && g_clustering.preferred_principal != preferred_principal) { + INFO("preferred principal updated to %"PRIx64, + g_clustering.preferred_principal); + } + g_clustering.preferred_principal = preferred_principal; + + cf_vector_destroy(new_succession_list); + CLUSTERING_UNLOCK(); +} + +/** + * Handle quantum interval start in the non principal state. + */ +static void +clustering_non_principal_quantum_interval_start_handle() +{ + // Reject all accumulated join requests since we are no longer a principal. + clustering_join_requests_reject_all(); + + if (as_hb_self_is_duplicate()) { + // Cluster is in a bad shape and self node has a duplicate node-id. + register_become_orphan (AS_CLUSTERING_MEMBERSHIP_LOST); + return; + } + + // Update the preferred principal. + clustering_non_principal_preferred_principal_update(); + + // Check if we have been evicted. + cf_node principal = 0; + + if (clustering_principal_get(&principal) != 0) { + WARNING("could not get principal for self node"); + return; + } + + as_hlc_msg_timestamp plugin_data_hlc_ts; + cf_clock plugin_data_ts = 0; + as_hb_plugin_node_data plugin_data = { 0 }; + + if (clustering_hb_plugin_data_get(principal, &plugin_data, + &plugin_data_hlc_ts, &plugin_data_ts) != 0) { + plugin_data_ts = 0; + memset(&plugin_data, 0, sizeof(plugin_data)); + } + + clustering_non_principal_evicted_check(principal, &plugin_data, + &plugin_data_hlc_ts, plugin_data_ts); +} + +/** + * Handle quantum interval start. + */ +static void +clustering_quantum_interval_start_handle(as_clustering_internal_event* event) +{ + CLUSTERING_LOCK(); + + // Dispatch based on state. 
+ switch (g_clustering.state) { + case AS_CLUSTERING_STATE_ORPHAN: + clustering_orphan_quantum_interval_start_handle(); + break; + case AS_CLUSTERING_STATE_PRINCIPAL: + clustering_principal_quantum_interval_start_handle(event); + break; + case AS_CLUSTERING_STATE_NON_PRINCIPAL: + clustering_non_principal_quantum_interval_start_handle(); + default: + break; + } + + CLUSTERING_UNLOCK(); +} + +/** + * Handle a timer event in the orphan state. + */ +static void +clustering_orphan_timer_event_handle() +{ + // Attempt a join request. + DETAIL("attempting join request from orphan state"); + clustering_join_request_attempt(); +} + +/** + * Handle a timer event for the clustering module. + */ +static void +clustering_timer_event_handle() +{ + CLUSTERING_LOCK(); + + // Dispatch based on state. + switch (g_clustering.state) { + case AS_CLUSTERING_STATE_ORPHAN: + clustering_orphan_timer_event_handle(); + break; + default: + break; + } + + CLUSTERING_UNLOCK(); +} + +/** + * Check if the incoming message is sane to be proccessed further. + */ +static bool +clustering_message_sanity_check(cf_node src_nodeid, msg* msg) +{ + as_cluster_proto_identifier proto; + if (msg_proto_id_get(msg, &proto) != 0) { + WARNING( + "received message with no clustering protocol identifier from node %"PRIx64, + src_nodeid); + return false; + } + + return clustering_versions_are_compatible(proto, + clustering_protocol_identifier_get()); +} + +/** + * Handle an incoming join request. We do not bother with older replay's for + * join requests because the pending request are cleanup during new cluster + * formation. 
+ */ +static void +clustering_join_request_handle(as_clustering_internal_event* msg_event) +{ + cf_node src_nodeid = msg_event->msg_src_nodeid; + DEBUG("received cluster join request from node %"PRIx64, src_nodeid); + bool fire_quantum_event = false; + + CLUSTERING_LOCK(); + + cf_clock now = cf_getms(); + + if (g_clustering.move_cmd_issue_time + join_request_move_reject_interval() + > now) { + // We have just send out a move request. Reject this join request. + INFO("ignoring join request from node %"PRIx64" since we have just issued a move command", + src_nodeid); + clustering_join_reject_send(src_nodeid); + goto Exit; + } + + if ((!clustering_is_principal() && !clustering_is_orphan()) + || g_clustering.last_join_request_sent_time + join_request_timeout() + >= cf_getms()) { + // Can't handle a join request this node is not the principal right now + // or this node is trying to join another cluster. + msg* msg = msg_pool_get(AS_CLUSTERING_MSG_TYPE_JOIN_REJECT); + + DETAIL("sent join reject to node %"PRIx64, msg_event->msg_src_nodeid); + + // Sent the message to the acceptors. + msg_node_send(msg, msg_event->msg_src_nodeid); + + goto Exit; + } + + if (vector_find(&g_clustering.pending_join_requests, &src_nodeid) >= 0) { + DEBUG("ignoring join request from node %"PRIx64" since a request is already pending", + src_nodeid); + goto Exit; + } + + // Check if we are receiving a stale or very delayed join request. + int64_t message_delay_estimate = as_hlc_timestamp_diff_ms( + as_hlc_timestamp_now(), msg_event->msg_hlc_ts.send_ts); + if (message_delay_estimate < 0 + || message_delay_estimate > join_request_accept_delay_max()) { + INFO("ignoring stale join request from node %"PRIx64" - delay estimate %lu(ms) ", + src_nodeid, message_delay_estimate); + goto Exit; + } + + // Add this request to the pending queue. + cf_vector_append_unique(&g_clustering.pending_join_requests, &src_nodeid); + + // Generate a join request accepted event for the quantum interval + // generator. 
+ as_clustering_internal_event join_request_event; + memset(&join_request_event, 0, sizeof(join_request_event)); + join_request_event.type = + AS_CLUSTERING_INTERNAL_EVENT_JOIN_REQUEST_ACCEPTED; + join_request_event.join_request_source_nodeid = src_nodeid; + internal_event_dispatch(&join_request_event); + fire_quantum_event = true; + + INFO("accepted join request from node %"PRIx64, src_nodeid); + +Exit: + CLUSTERING_UNLOCK(); + + if (fire_quantum_event) { + internal_event_dispatch(&join_request_event); + } +} + +/** + * Handle an incoming join reject. + */ +static void +clustering_join_reject_handle(as_clustering_internal_event* event) +{ + cf_node src_nodeid = event->msg_src_nodeid; + + DEBUG("received cluster join reject from node %"PRIx64, src_nodeid); + + CLUSTERING_LOCK(); + + if (!clustering_is_orphan()) { + // Already part of a cluster. Ignore the reject. + INFO( + "already part of a cluster - ignoring join reject from node %"PRIx64, + src_nodeid); + goto Exit; + } + + if (paxos_proposer_proposal_is_active()) { + // This node is attempting to form a new cluster. + INFO( + "already trying to form a cluster - ignoring join reject from node %"PRIx64, + src_nodeid); + goto Exit; + } + + if (g_clustering.last_join_request_principal == src_nodeid) { + // This node had requested the source principal for cluster membership + // which was rejected. Try and join a different cluster. + + // This join request should not be considered as pending, so reset the + // join request sent time. + g_clustering.last_join_request_sent_time = 0; + g_clustering.last_join_request_principal = 0; + clustering_join_request_attempt(); + } + +Exit: + CLUSTERING_UNLOCK(); +} + +/** + * Handle an incoming merge move command. Basically this node switched to orphan + * state and sends a join request to the principal listed in the merge move. 
+ */ +static void +clustering_merge_move_handle(as_clustering_internal_event* event) +{ + cf_node src_nodeid = event->msg_src_nodeid; + + DEBUG("received cluster merge move from node %"PRIx64, src_nodeid); + + CLUSTERING_LOCK(); + + as_cluster_key msg_cluster_key = 0; + msg_cluster_key_get(event->msg, &msg_cluster_key); + + if (clustering_is_orphan()) { + // Already part of a cluster. Ignore the reject. + INFO( + "already orphan node - ignoring merge move command from node %"PRIx64, + src_nodeid); + goto Exit; + } + + if (msg_is_obsolete(g_register.cluster_modified_hlc_ts, + g_register.cluster_modified_time, event->msg_recvd_ts, + &event->msg_hlc_ts) || !clustering_is_our_principal(src_nodeid) + || paxos_proposer_proposal_is_active() + || msg_cluster_key != g_register.cluster_key) { + INFO("ignoring cluster merge move from node %"PRIx64, src_nodeid); + goto Exit; + } + + // Madril simulation black lists current principal so that we do not end up + // joining him again immediately. However the check for obsolete data should + // make that check from madril redundant. + cf_node new_principal = 0; + + if (msg_proposed_principal_get(event->msg, &new_principal) != 0) { + // Move command does not have the proposed principal + WARNING( + "received merge move command without a proposed principal. Will join the first available principal"); + new_principal = 0; + } + + // Switch to orphan cluster state so that we move to the new principal. + register_become_orphan (AS_CLUSTERING_ATTEMPTING_MERGE); + + // Send a join request to a the new principal + clustering_principal_join_request_attempt(new_principal); +Exit: + CLUSTERING_UNLOCK(); +} + +/** + * Handle an incoming message. + */ +static void +clustering_msg_event_handle(as_clustering_internal_event* msg_event) +{ + // Delegate handling based on message type. 
+ switch (msg_event->msg_type) { + case AS_CLUSTERING_MSG_TYPE_JOIN_REQUEST: + clustering_join_request_handle(msg_event); + break; + case AS_CLUSTERING_MSG_TYPE_JOIN_REJECT: + clustering_join_reject_handle(msg_event); + break; + case AS_CLUSTERING_MSG_TYPE_MERGE_MOVE: + clustering_merge_move_handle(msg_event); + break; + default: // Non cluster management messages. + break; + } +} + +/** + * Fabric msg listener that generates an internal message event and dispatches + * it to the sub system. + */ +static int +clustering_fabric_msg_listener(cf_node msg_src_nodeid, msg* msg, void* udata) +{ + if (!clustering_is_running()) { + // Ignore fabric messages when clustering is not running. + WARNING("clustering stopped - ignoring message from node %"PRIx64, + msg_src_nodeid); + goto Exit; + } + + // Sanity check. + if (!clustering_message_sanity_check(msg_src_nodeid, msg)) { + WARNING("invalid mesage received from node %"PRIx64, msg_src_nodeid); + goto Exit; + } + + as_clustering_internal_event msg_event; + memset(&msg_event, 0, sizeof(msg_event)); + msg_event.type = AS_CLUSTERING_INTERNAL_EVENT_MSG; + + msg_event.msg_src_nodeid = msg_src_nodeid; + + // Update hlc and store update message timestamp for the event. + as_hlc_timestamp send_ts = 0; + msg_send_ts_get(msg, &send_ts); + as_hlc_timestamp_update(msg_event.msg_src_nodeid, send_ts, + &msg_event.msg_hlc_ts); + + msg_event.msg = msg; + msg_event.msg_recvd_ts = cf_getms(); + msg_type_get(msg, &msg_event.msg_type); + + internal_event_dispatch(&msg_event); + +Exit: + as_fabric_msg_put(msg); + return 0; +} + +/** + * Handle register cluster changed. + */ +static void +clustering_register_cluster_changed_handle() +{ + CLUSTERING_LOCK(); + + if (paxos_proposer_proposal_is_active()) { + paxos_proposer_fail(); + } + + if (clustering_is_principal()) { + g_clustering.state = AS_CLUSTERING_STATE_PRINCIPAL; + } + else { + g_clustering.state = AS_CLUSTERING_STATE_NON_PRINCIPAL; + // We are a non-principal. 
Reject all pending join requests. + clustering_join_requests_reject_all(); + } + + g_clustering.preferred_principal = 0; + g_clustering.last_join_request_principal = 0; + g_clustering.move_cmd_issue_time = 0; + + CLUSTERING_UNLOCK(); +} + +/** + * Handle register synced events. Basically this means it is safe to publish the + * cluster changed event to external sub systems. + */ +static void +clustering_register_cluster_synced_handle(as_clustering_internal_event* event) +{ + CLUSTERING_LOCK(); + + // Queue the cluster change event for publishing. + as_clustering_event cluster_change_event; + cluster_change_event.type = AS_CLUSTERING_CLUSTER_CHANGED; + cluster_change_event.qualifier = event->qualifier; + cluster_change_event.cluster_key = g_register.cluster_key; + cluster_change_event.succession_list = &g_register.succession_list; + external_event_queue(&cluster_change_event); + + g_clustering.has_integrity = true; + + CLUSTERING_UNLOCK(); +} + +/** + * Handle the register going to orphaned state. + */ +static void +clustering_register_orphaned_handle(as_clustering_internal_event* event) +{ + CLUSTERING_LOCK(); + g_clustering.state = AS_CLUSTERING_STATE_ORPHAN; + g_clustering.orphan_state_start_time = cf_getms(); + g_clustering.preferred_principal = 0; + + // Queue the cluster change event for publishing. + as_clustering_event orphaned_event; + orphaned_event.type = AS_CLUSTERING_ORPHANED; + orphaned_event.qualifier = event->qualifier; + orphaned_event.cluster_key = 0; + orphaned_event.succession_list = NULL; + external_event_queue(&orphaned_event); + CLUSTERING_UNLOCK(); +} + +/** + * Handle hb plugin data change by dispatching it based on clustering change. 
+ */ +static void +clustering_hb_plugin_data_changed_event_handle( + as_clustering_internal_event* change_event) +{ + CLUSTERING_LOCK(); + switch (g_clustering.state) { + case AS_CLUSTERING_STATE_NON_PRINCIPAL: + clustering_non_principal_hb_plugin_data_changed_handle(change_event); + break; + default: + break; + } + CLUSTERING_UNLOCK(); +} + +/** + * Handle heartbeat event. + */ +static void +clustering_hb_event_handle(as_clustering_internal_event* hb_event) +{ + for (int i = 0; i < hb_event->hb_n_events; i++) { + if (hb_event->hb_events[i].evt == AS_HB_NODE_DEPART + && clustering_is_our_principal(hb_event->hb_events[i].nodeid)) { + // Our principal is no longer visible. + INFO("principal node %"PRIx64" departed - switching to orphan state", + hb_event->hb_events[i].nodeid); + register_become_orphan (AS_CLUSTERING_MEMBERSHIP_LOST); + } + } +} + +/** + * Handle the fail of a paxos proposal started by the self node. + */ +static void +clustering_paxos_proposer_fail_handle() +{ + // Send reject to all pending join requesters. + clustering_join_requests_reject_all(); +} + +/** + * Clustering module event handler. + */ +static void +clustering_event_handle(as_clustering_internal_event* event) +{ + // Lock to enusure the entire event handling is atomic and parallel events + // events (hb/fabric) do not interfere. 
+ CLUSTERING_LOCK(); + + switch (event->type) { + case AS_CLUSTERING_INTERNAL_EVENT_TIMER: + clustering_timer_event_handle(); + break; + case AS_CLUSTERING_INTERNAL_EVENT_QUANTUM_INTERVAL_START: + clustering_quantum_interval_start_handle(event); + break; + case AS_CLUSTERING_INTERNAL_EVENT_HB: + clustering_hb_event_handle(event); + break; + case AS_CLUSTERING_INTERNAL_EVENT_HB_PLUGIN_DATA_CHANGED: + clustering_hb_plugin_data_changed_event_handle(event); + break; + case AS_CLUSTERING_INTERNAL_EVENT_MSG: + clustering_msg_event_handle(event); + break; + case AS_CLUSTERING_INTERNAL_EVENT_REGISTER_ORPHANED: + clustering_register_orphaned_handle(event); + break; + case AS_CLUSTERING_INTERNAL_EVENT_REGISTER_CLUSTER_CHANGED: + clustering_register_cluster_changed_handle(); + break; + case AS_CLUSTERING_INTERNAL_EVENT_REGISTER_CLUSTER_SYNCED: + clustering_register_cluster_synced_handle(event); + break; + case AS_CLUSTERING_INTERNAL_EVENT_PAXOS_PROPOSER_FAIL: // Send reject message to all + clustering_paxos_proposer_fail_handle(); + break; + default: // Not of interest for main clustering module. + break; + } + + CLUSTERING_UNLOCK(); +} + +/** + * Initialize the template to be used for clustering messages. + */ +static void +clustering_msg_init() +{ + // Register fabric clustering msg type with no processing function: + // This permits getting / putting clustering msgs to be moderated via an + // idle msg queue. + as_fabric_register_msg_fn(M_TYPE_CLUSTERING, g_clustering_msg_template, + sizeof(g_clustering_msg_template), AS_CLUSTERING_MSG_SCRATCH_SIZE, + clustering_fabric_msg_listener, NULL); +} + +/** + * Change listener that updates the first time in current quantum. 
+ */ +static void +clustering_hb_plugin_data_change_listener(cf_node changed_node_id) +{ + if (!clustering_is_running()) { + return; + } + + DETAIL("cluster information change detected for node %"PRIx64, + changed_node_id); + + as_hb_plugin_node_data plugin_data; + as_clustering_internal_event change_event; + memset(&change_event, 0, sizeof(change_event)); + change_event.type = AS_CLUSTERING_INTERNAL_EVENT_HB_PLUGIN_DATA_CHANGED; + change_event.plugin_data_changed_nodeid = changed_node_id; + change_event.plugin_data = &plugin_data; + + if (clustering_hb_plugin_data_get(changed_node_id, &plugin_data, + &change_event.plugin_data_changed_hlc_ts, + &change_event.plugin_data_changed_ts) != 0) { + // Not possible. We should be able to read the plugin data that changed. + return; + } + internal_event_dispatch(&change_event); +} + +/** + * Listen to external heartbeat event and dispatch an internal heartbeat event. + */ +static void +clustering_hb_event_listener(int n_events, as_hb_event_node* hb_node_events, + void* udata) +{ + if (!clustering_is_running()) { + return; + } + + // Wrap the events in an internal event and dispatch. + as_clustering_internal_event hb_event; + memset(&hb_event, 0, sizeof(hb_event)); + hb_event.type = AS_CLUSTERING_INTERNAL_EVENT_HB; + hb_event.hb_n_events = n_events; + hb_event.hb_events = hb_node_events; + + internal_event_dispatch(&hb_event); +} + +/** + * Reform the cluster with the same succession list.This would trigger the + * generation of new partition info and the cluster would get a new cluster key. + * + * @return 0 if new clustering round started, 1 if not principal, -1 otherwise. + */ +static int +clustering_cluster_reform() +{ + int rv = -1; + CLUSTERING_LOCK(); + + cf_vector* dead_nodes = vector_stack_lockless_create(cf_node); + clustering_dead_nodes_find(dead_nodes); + + log_cf_node_vector("recluster: dead nodes - ", dead_nodes, + cf_vector_size(dead_nodes) > 0 ? 
CF_INFO : CF_DEBUG); + + cf_vector* faulty_nodes = vector_stack_lockless_create(cf_node); + clustering_faulty_nodes_find(faulty_nodes); + + log_cf_node_vector("recluster: faulty nodes - ", faulty_nodes, + cf_vector_size(faulty_nodes) > 0 ? CF_INFO : CF_DEBUG); + + cf_vector* new_nodes = vector_stack_lockless_create(cf_node); + clustering_nodes_to_add_get(new_nodes); + log_cf_node_vector("recluster: pending join requests - ", new_nodes, + cf_vector_size(new_nodes) > 0 ? CF_INFO : CF_DEBUG); + + if (!clustering_is_running() || !clustering_is_principal() + || cf_vector_size(dead_nodes) > 0 + || cf_vector_size(faulty_nodes) > 0 + || cf_vector_size(new_nodes) > 0) { + INFO( + "recluster: skipped - principal %s dead_nodes %d faulty_nodes %d new_nodes %d", + clustering_is_principal() ? "true" : "false", + cf_vector_size(dead_nodes), cf_vector_size(faulty_nodes), + cf_vector_size(new_nodes)); + + if (!clustering_is_principal()) { + // Common case - command will likely be sent to all nodes. + rv = 1; + } + + goto Exit; + } + + cf_vector* succession_list = vector_stack_lockless_create(cf_node); + vector_copy(succession_list, &g_register.succession_list); + + log_cf_node_vector( + "recluster: principal node - reforming new cluster with succession list:", + succession_list, CF_INFO); + + as_paxos_start_result result = paxos_proposer_proposal_start( + succession_list, succession_list); + + // Log paxos result. + paxos_result_log(result, succession_list); + + rv = (result == AS_PAXOS_RESULT_STARTED) ? 0 : -1; + + if (rv == -1) { + INFO("recluster: skipped"); + } + else { + INFO("recluster: triggered..."); + } + + cf_vector_destroy(succession_list); + +Exit: + cf_vector_destroy(dead_nodes); + cf_vector_destroy(faulty_nodes); + cf_vector_destroy(new_nodes); + CLUSTERING_UNLOCK(); + return rv; +} + +/** + * Initialize clustering subsystem. 
+ */
+static void
+clustering_init()
+{
+	if (clustering_is_initialized()) {
+		return;
+	}
+
+	CLUSTERING_LOCK();
+	memset(&g_clustering, 0, sizeof(g_clustering));
+
+	// Start out as an orphan cluster.
+	g_clustering.state = AS_CLUSTERING_STATE_ORPHAN;
+	g_clustering.orphan_state_start_time = cf_getms();
+
+	g_clustering.join_request_blackout = cf_shash_create(cf_nodeid_shash_fn,
+			sizeof(cf_node), sizeof(cf_clock),
+			AS_CLUSTERING_CLUSTER_MAX_SIZE_SOFT, 0);
+
+	vector_lockless_init(&g_clustering.pending_join_requests, cf_node);
+
+	// Register as a plugin with the heartbeat subsystem.
+	as_hb_plugin clustering_plugin;
+	memset(&clustering_plugin, 0, sizeof(clustering_plugin));
+
+	clustering_plugin.id = AS_HB_PLUGIN_CLUSTERING;
+	// Includes the size for the protocol version, the cluster key, the paxos
+	// sequence number for current cluster and the preferred principal.
+	clustering_plugin.wire_size_fixed = sizeof(uint32_t)
+			+ sizeof(as_cluster_key) + sizeof(as_paxos_sequence_number)
+			+ sizeof(cf_node);
+	// Size of the node in succession list.
+	clustering_plugin.wire_size_per_node = sizeof(cf_node);
+	clustering_plugin.set_fn = clustering_hb_plugin_set_fn;
+	clustering_plugin.parse_fn = clustering_hb_plugin_parse_data_fn;
+	clustering_plugin.change_listener =
+			clustering_hb_plugin_data_change_listener;
+
+	as_hb_plugin_register(&clustering_plugin);
+
+	// Register as hb event listener
+	as_hb_register_listener(clustering_hb_event_listener, NULL);
+
+	// Initialize fabric message pool.
+	clustering_msg_init();
+
+	// Initialize external event publisher.
+	external_event_publisher_init();
+
+	// Initialize the register.
+	register_init();
+
+	// Initialize timer.
+	timer_init();
+
+	// Initialize the quantum interval generator
+	quantum_interval_generator_init();
+
+	// Initialize paxos.
+	paxos_init();
+
+	// Initialized but not yet started.
+	g_clustering.sys_state = AS_CLUSTERING_SYS_STATE_STOPPED;
+
+	DETAIL("clustering module initialized");
+
+	CLUSTERING_UNLOCK();
+}
+
+/**
+ * Start the clustering sub-system. Does nothing if it is already running.
+ */
+static void
+clustering_start()
+{
+	if (clustering_is_running()) {
+		return;
+	}
+
+	CLUSTERING_LOCK();
+	g_clustering.sys_state = AS_CLUSTERING_SYS_STATE_RUNNING;
+	CLUSTERING_UNLOCK();
+
+	// Start quantum interval generator.
+	quantum_interval_generator_start();
+
+	// Start the timer.
+	timer_start();
+
+	// Start the external event publisher.
+	external_event_publisher_start();
+}
+
+/**
+ * Stop the clustering sub-system. Does nothing if it is not running. The
+ * system state passes through SHUTTING_DOWN while the timer and the external
+ * event publisher are being stopped.
+ */
+static void
+clustering_stop()
+{
+	if (!clustering_is_running()) {
+		return;
+	}
+
+	CLUSTERING_LOCK();
+	g_clustering.sys_state = AS_CLUSTERING_SYS_STATE_SHUTTING_DOWN;
+	CLUSTERING_UNLOCK();
+
+	// Stop the timer.
+	timer_stop();
+
+	// Stop the external event publisher.
+	external_event_publisher_stop();
+
+	CLUSTERING_LOCK();
+	g_clustering.sys_state = AS_CLUSTERING_SYS_STATE_STOPPED;
+	CLUSTERING_UNLOCK();
+}
+
+/**
+ * Dump clustering state to logs.
+ *
+ * @param verbose passed on to the sub-module dumps; when true the pending
+ * join requests are logged as well.
+ */
+static void
+clustering_dump(bool verbose)
+{
+	if (!clustering_is_running()) {
+		INFO("CL: stopped");
+		return;
+	}
+
+	paxos_proposer_dump(verbose);
+	paxos_acceptor_dump(verbose);
+	register_dump(verbose);
+
+	CLUSTERING_LOCK();
+
+	switch (g_clustering.state) {
+	case AS_CLUSTERING_STATE_ORPHAN:
+		INFO("CL: state: orphan");
+		break;
+	case AS_CLUSTERING_STATE_PRINCIPAL:
+		INFO("CL: state: principal");
+		break;
+	case AS_CLUSTERING_STATE_NON_PRINCIPAL:
+		INFO("CL: state: non-principal");
+		break;
+	}
+
+	INFO("CL: %s",
+			g_clustering.has_integrity ? "has integrity" : "integrity fault");
+	cf_node current_principal;
+	// NOTE(review): the preferred principal is only compared when
+	// clustering_principal_get() returns non-zero. If that helper follows a
+	// 0-on-success convention, current_principal is read here without having
+	// been set - confirm against the helper's contract.
+	if (clustering_principal_get(&current_principal) != 0) {
+		if (g_clustering.preferred_principal != current_principal) {
+			INFO("CL: preferred principal %"PRIx64,
+					g_clustering.preferred_principal);
+		}
+	}
+
+	if (g_clustering.state == AS_CLUSTERING_STATE_ORPHAN) {
+		INFO("CL: join request sent to principal %"PRIx64,
+				g_clustering.last_join_request_principal);
+		INFO("CL: join request sent time: %"PRIu64" now: %"PRIu64 ,
+				g_clustering.last_join_request_sent_time, cf_getms());
+	}
+
+	if (verbose) {
+		log_cf_node_vector("CL: pending join requests:",
+				&g_clustering.pending_join_requests, CF_INFO);
+	}
+
+	CLUSTERING_UNLOCK();
+}
+
+/*
+ * ----------------------------------------------------------------------------
+ * Internal event dispatcher
+ * ----------------------------------------------------------------------------
+ */
+
+/**
+ * Simple dispatcher for events. The order of dispatch is from lower (less
+ * dependent) to higher (more dependent) sub-modules.
+ */
+static void
+internal_event_dispatch(as_clustering_internal_event* event)
+{
+	// Sub-module dispatch.
+	quantum_interval_generator_event_dispatch(event);
+	paxos_event_dispatch(event);
+	register_event_dispatch(event);
+
+	// Dispatch to the main clustering module.
+	clustering_event_handle(event);
+}
+
+/*
+ * ----------------------------------------------------------------------------
+ * Public API.
+ * ----------------------------------------------------------------------------
+ */
+
+/**
+ *
+ * Initialize clustering subsystem.
+ */
+void
+as_clustering_init()
+{
+	clustering_init();
+}
+
+/**
+ * Start clustering subsystem.
+ */
+void
+as_clustering_start()
+{
+	clustering_start();
+}
+
+/**
+ * Stop clustering subsystem.
+ */
+void
+as_clustering_stop()
+{
+	clustering_stop();
+}
+
+/**
+ * Reform the cluster with the same succession list. This would trigger the
+ * generation of new partition info and the cluster would get a new cluster key.
+ *
+ * @return 0 if new clustering round started, 1 if this node is not the
+ * principal, -1 otherwise.
+ */
+int
+as_clustering_cluster_reform()
+{
+	return clustering_cluster_reform();
+}
+
+/**
+ * Return the quantum interval, i.e., the interval at which cluster change
+ * decisions are taken. The unit is milliseconds.
+ */
+uint64_t
+as_clustering_quantum_interval()
+{
+	return quantum_interval();
+}
+
+/**
+ * TEMPORARY - used by paxos only.
+ */
+void
+as_clustering_set_integrity(bool has_integrity)
+{
+	g_clustering.has_integrity = has_integrity;
+}
+
+/*
+ * ----------------------------------------------------------------------------
+ * Clustering info command functions.
+ * ----------------------------------------------------------------------------
+ */
+
+/**
+ * If false, it means that either this node is orphaned, or is undergoing a
+ * cluster change.
+ */
+bool
+as_clustering_has_integrity()
+{
+	return g_clustering.has_integrity;
+}
+
+/**
+ * Indicates if self node is orphaned.
+ */
+bool
+as_clustering_is_orphan()
+{
+	return clustering_is_orphan();
+}
+
+/**
+ * Dump clustering state to the log.
+ */
+void
+as_clustering_dump(bool verbose)
+{
+	clustering_dump(verbose);
+}
+
+/**
+ * Set the min cluster size.
+ *
+ * The new value is accepted when this node is an orphan or when the current
+ * cluster already has at least that many nodes.
+ *
+ * @param new_cluster_size_min the requested minimum cluster size.
+ * @return 0 if the value was applied, -1 if it was rejected.
+ */
+int
+as_clustering_cluster_size_min_set(uint32_t new_cluster_size_min)
+{
+	CLUSTERING_LOCK();
+	int rv = 0;
+	uint32_t cluster_size = cf_vector_size(&g_register.succession_list);
+	if (clustering_is_orphan() || cluster_size >= new_cluster_size_min) {
+		INFO("changing value of min-cluster-size from %u to %u",
+				g_config.clustering_config.cluster_size_min,
+				new_cluster_size_min);
+		g_config.clustering_config.cluster_size_min = new_cluster_size_min;
+	}
+	else {
+		// Both values are uint32_t, so they are printed as unsigned.
+		WARNING(
+				"min-cluster-size %u should be <= current cluster size %u - ignoring",
+				new_cluster_size_min, cluster_size);
+		rv = -1;
+	}
+	CLUSTERING_UNLOCK();
+	return rv;
+}
+
+/**
+ * Log a vector of node-ids at input severity splitting long vectors over
+ * multiple lines. The call might not work if the vector is not protected
+ * against multi-threaded access.
+ *
+ * @param context the logging context.
+ * @param severity the log severity.
+ * @param file_name the source file name for the log line.
+ * @param line the source file line number for the log line.
+ * @param message the message prefix for each log line. Message and node list
+ * will be separated with a space. Can be NULL for no prefix.
+ * @param nodes the vector of nodes.
+ */
+void
+as_clustering_cf_node_vector_event(cf_fault_severity severity,
+		cf_fault_context context, char* file_name, int line, char* message,
+		cf_vector* nodes)
+{
+	as_clustering_cf_node_array_event(severity, context, file_name, line,
+			message, vector_to_array(nodes), cf_vector_size(nodes));
+}
+
+/**
+ * Log an array of node-ids at input severity splitting long vectors over
+ * multiple lines. The call might not work if the array is not protected against
+ * multi-threaded access.
+ *
+ * @param context the logging context.
+ * @param severity the log severity.
+ * @param file_name the source file name for the log line.
+ * @param line the source file line number for the log line.
+ * @param message the message prefix for each log line.
Message and node list
+ * will be separated with a space. Can be NULL for no prefix.
+ * @param nodes the array of nodes.
+ * @param node_count the count of nodes in the array.
+ */
+void
+as_clustering_cf_node_array_event(cf_fault_severity severity,
+		cf_fault_context context, char* file_name, int line, char* message,
+		cf_node* nodes, int node_count)
+{
+	if (!cf_context_at_severity(context, severity) && severity != CF_TRACE) {
+		return;
+	}
+
+	// Also account the space following the nodeid.
+	int node_str_len = 2 * (sizeof(cf_node)) + 1;
+
+	int message_length = 0;
+	char copied_message[LOG_LENGTH_MAX()];
+
+	if (message) {
+		// Limit the message length to allow at least one node to fit in the log
+		// line. Accounting for the separator between message and node list.
+		message_length = MIN(strnlen(message, LOG_LENGTH_MAX() - 1),
+				LOG_LENGTH_MAX() - 1 - node_str_len) + 1;
+
+		// Truncate the message. message_length counts the separator, so the
+		// text itself is one character shorter. Terminate explicitly: a
+		// truncating copy would otherwise leave the buffer unterminated.
+		memcpy(copied_message, message, message_length - 1);
+		copied_message[message_length - 1] = 0;
+		message = copied_message;
+	}
+
+	// Allow for the NULL terminator.
+	int nodes_per_line = (LOG_LENGTH_MAX() - message_length - 1) / node_str_len;
+	nodes_per_line = MAX(1, nodes_per_line);
+
+	// Have a buffer large enough to accommodate the message and nodes per line.
+	char log_buffer[message_length + (nodes_per_line * node_str_len) + 1]; // For the NULL terminator.
+	int output_node_count = 0;
+
+	// Marks the start of the nodeid list in the log line buffer.
+	char* node_buffer_start = log_buffer;
+	if (message) {
+		node_buffer_start += sprintf(log_buffer, "%s ", message);
+	}
+
+	for (int i = 0; i < node_count;) {
+		char* buffer = node_buffer_start;
+
+		for (int j = 0; j < nodes_per_line && i < node_count; j++) {
+			buffer += sprintf(buffer, "%"PRIx64" ", nodes[i]);
+			output_node_count++;
+			i++;
+		}
+
+		// Overwrite the space from the last node on the log line only if there
+		// is at least one node output
+		if (buffer != node_buffer_start) {
+			*(buffer - 1) = 0;
+			cf_fault_event(context, severity, file_name, line, "%s",
+					log_buffer);
+		}
+	}
+
+	// Handle the empty vector case.
+	if (output_node_count == 0) {
+		sprintf(node_buffer_start, "(empty)");
+		cf_fault_event(context, severity, file_name, line, "%s", log_buffer);
+	}
+}
diff --git a/as/src/fabric/endpoint.c b/as/src/fabric/endpoint.c
new file mode 100644
index 00000000..d0538b70
--- /dev/null
+++ b/as/src/fabric/endpoint.c
@@ -0,0 +1,880 @@
+/*
+ * endpoint.c
+ *
+ * Copyright (C) 2016 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program.
If not, see http://www.gnu.org/licenses/
+ */
+
+#include "fabric/endpoint.h"
+
+// NOTE(review): the five system header names were lost from this copy of the
+// patch. They are reconstructed from the standard functions and types this file
+// uses - confirm against the original source.
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "citrusleaf/alloc.h"
+
+#include "fault.h"
+#include "socket.h"
+
+#include "base/cfg.h"
+
+/*----------------------------------------------------------------------------
+ * Private internal data structures.
+ *----------------------------------------------------------------------------*/
+typedef struct as_endpoint_collect_udata_s
+{
+	/**
+	 * Collected endpoint pointers.
+	 */
+	const as_endpoint** endpoints;
+
+	/**
+	 * Collected endpoint count.
+	 */
+	uint32_t collected_count;
+} as_endpoint_collect_udata;
+
+typedef struct as_endpoint_to_string_udata_s
+{
+	/**
+	 * Current write pointer.
+	 */
+	char* write_ptr;
+
+	/**
+	 * buffer remaining capacity.
+	 */
+	size_t buffer_remaining;
+
+	/**
+	 * Number of endpoints converted. Endpoints skipped because their
+	 * capabilities do not match are counted as well.
+	 */
+	uint32_t endpoints_converted;
+
+	/**
+	 * Capabilities of endpoint.
+	 */
+	uint8_t capabilities;
+
+	/**
+	 * Capability mask. Set to 0 to match all the endpoints.
+	 */
+	uint8_t capability_mask;
+} as_endpoint_to_string_udata;
+
+typedef struct as_endpoint_list_overlap_udata_s
+{
+	/**
+	 * Indicates if there was an overlap.
+	 */
+	bool overlapped;
+
+	/**
+	 * Indicates if endpoint capabilities should be ignored.
+	 */
+	bool ignore_capabilities;
+
+	/**
+	 * The other list to compare.
+	 */
+	const as_endpoint_list* other;
+} as_endpoint_list_overlap_udata;
+
+typedef struct as_endpoint_list_endpoint_find_udata_s
+{
+	/**
+	 * Indicates if a matching endpoint was found.
+	 */
+	bool match_found;
+
+	/**
+	 * Indicates if endpoint capabilities should be ignored.
+	 */
+	bool ignore_capabilities;
+
+	/**
+	 * The endpoint to search for.
+	 */
+	const as_endpoint* to_find;
+} as_endpoint_list_endpoint_find_udata;
+
+/*----------------------------------------------------------------------------
+ * Private internal function forward declarations.
+ *----------------------------------------------------------------------------*/
+static bool endpoint_addr_type_is_valid(uint8_t type);
+static size_t endpoint_addr_binary_size(uint8_t type);
+static size_t endpoint_sizeof_by_addr_type(uint8_t addr_type);
+static as_endpoint* endpoint_allocate(uint8_t addr_type);
+static void endpoint_collect_iterate_fn(const as_endpoint* endpoint, void* udata);
+static void endpoint_to_string_iterate(const as_endpoint* endpoint, void* udata);
+static uint8_t endpoint_addr_type_from_cf_ip_addr(const cf_ip_addr* addr);
+static void endpoint_from_sock_cfg(const cf_sock_cfg* src, as_endpoint* endpoint);
+static void endpoint_list_overlap_iterate(const as_endpoint* endpoint, void* udata);
+static void endpoint_list_find_iterate(const as_endpoint* endpoint, void* udata);
+
+static bool endpoints_are_equal(const as_endpoint* endpoint1, const as_endpoint* endpoint2, const bool ignore_capabilities);
+static void endpoints_preference_sort(const as_endpoint* endpoints[], size_t n_endpoints);
+
+/*----------------------------------------------------------------------------
+ * Public API.
+ *----------------------------------------------------------------------------*/
+
+/**
+ * Size in bytes of an endpoint, including its variable sized address field.
+ * The size is derived from the endpoint's address type. Callers in this file
+ * treat a zero result as an invalid endpoint.
+ * @param endpoint the endpoint.
+ * @return the size of the endpoint.
+ */
+size_t
+as_endpoint_sizeof(const as_endpoint* endpoint)
+{
+	uint8_t addr_type = endpoint->addr_type;
+	return endpoint_sizeof_by_addr_type(addr_type);
+}
+
+/**
+ * Turn on the capability bits selected by a mask.
+ * @param endpoint the endpoint.
+ * @param capability_mask the capability mask.
+ */
+void
+as_endpoint_capability_enable(as_endpoint* endpoint, uint8_t capability_mask)
+{
+	endpoint->capabilities = endpoint->capabilities | capability_mask;
+}
+
+/**
+ * Disable a capability on an endpoint given its mask.
+ * @param endpoint the endpoint.
+ * @param capability_mask the capability mask.
+ */
+void
+as_endpoint_capability_disable(as_endpoint* endpoint, uint8_t capability_mask)
+{
+	endpoint->capabilities &= ~capability_mask;
+}
+
+/**
+ * Connect to an endpoint.
+ *
+ * @param endpoint the peer endpoint to connect to.
+ * @param timeout the overall connect timeout.
+ * @param sock (output) will be populated if connection is successful.
+ * @return 0 on success, non-zero on failure (-1 when the endpoint's address
+ * type or address is invalid).
+ */
+int
+as_endpoint_connect(const as_endpoint* endpoint, int32_t timeout, cf_socket* sock)
+{
+	if (!endpoint_addr_type_is_valid(endpoint->addr_type)) {
+		return -1;
+	}
+
+	cf_sock_cfg cfg;
+	cf_sock_cfg_init(&cfg, CF_SOCK_OWNER_INVALID);
+	cfg.port = endpoint->port;
+	if (cf_ip_addr_from_binary(endpoint->addr, endpoint_addr_binary_size(endpoint->addr_type),
+			&cfg.addr) <= 0) {
+		return -1;
+	}
+
+	int rv = cf_socket_init_client(&cfg, timeout, sock);
+
+	// Reset the client sock config, because the config is a stack pointer.
+	sock->cfg = NULL;
+	return rv;
+}
+
+/**
+ * Connect to the best matching endpoint in the endpoint list.
+ *
+ * @param endpoint_list the list of endpoints.
+ * @param filter_fn filter function to discard incompatible endpoints. Can be
+ * NULL.
+ * @param filter_udata udata passed on as is to the filter function.
+ * @param timeout the overall connect timeout.
+ * @param sock (output) will be populated if connection is successful.
+ * @return the connected endpoint on success, NULL if no endpoint could be
+ * connected.
+ */
+const as_endpoint*
+as_endpoint_connect_any(const as_endpoint_list* endpoint_list,
+		as_endpoint_filter_fn filter_fn, void* filter_udata, int32_t timeout, cf_socket* sock)
+{
+	if (endpoint_list->n_endpoints == 0) {
+		return NULL;
+	}
+
+	const as_endpoint* ordered_endpoints[endpoint_list->n_endpoints];
+	const as_endpoint* rv = NULL;
+
+	as_endpoint_collect_udata collect_udata;
+	collect_udata.endpoints = ordered_endpoints;
+	collect_udata.collected_count = 0;
+
+	// Collect all endpoints in a pointer array.
+	as_endpoint_list_iterate(endpoint_list, endpoint_collect_iterate_fn, &collect_udata);
+
+	// Only the slots the iteration actually filled are sorted and tried.
+	uint32_t n_collected = collect_udata.collected_count;
+
+	// Sort by descending preference.
+	endpoints_preference_sort(ordered_endpoints, n_collected);
+
+	// TODO: Timeout individual connect or have the caller adjust based on
+	// number of endpoints
+	for (uint32_t i = 0; i < n_collected; i++) {
+		if (filter_fn && !(filter_fn)(ordered_endpoints[i], filter_udata)) {
+			continue;
+		}
+
+		// Try this potential candidate.
+		if (as_endpoint_connect(ordered_endpoints[i], timeout, sock) == 0) {
+			// Connect succeeded.
+			rv = ordered_endpoints[i];
+			break;
+		}
+	}
+
+	return rv;
+}
+
+/**
+ * Convert a socket configuration to an endpoint in place.
+ * @param src the source socket configuration.
+ * @param endpoint (output) the endpoint to fill. Must be large enough for the
+ * address type of the source address.
+ */
+void
+as_endpoint_from_sock_cfg_fill(const cf_sock_cfg* src, as_endpoint* endpoint)
+{
+	endpoint_from_sock_cfg(src, endpoint);
+}
+
+/**
+ * Convert a socket configuration to an endpoint.
+ * @return a heap allocated, converted endpoint. Should be freed using cf_free
+ * once the endpoint is no longer needed.
+ */
+as_endpoint*
+as_endpoint_from_sock_cfg(const cf_sock_cfg* src)
+{
+	uint8_t addr_type = endpoint_addr_type_from_cf_ip_addr(&src->addr);
+	as_endpoint* endpoint = endpoint_allocate(addr_type);
+	endpoint_from_sock_cfg(src, endpoint);
+	return endpoint;
+}
+
+/**
+ * Convert an endpoint to a cf_sock_addr.
+ * @param endpoint the source endpoint.
+ * @param sock_addr the target socket address.
+ * @return 0 on success, -1 on failure.
+ */
+int
+as_endpoint_to_sock_addr(const as_endpoint* endpoint, cf_sock_addr* sock_addr)
+{
+	sock_addr->port = endpoint->port;
+	return
+			cf_ip_addr_from_binary(endpoint->addr, endpoint_addr_binary_size(endpoint->addr_type),
+			&sock_addr->addr) > 0 ? 0 : -1;
+}
+
+/**
+ * Indicates if an endpoint supports listed capabilities.
+ * @return true if the endpoint supports the input capability.
+ */
+bool
+as_endpoint_capability_is_supported(const as_endpoint* endpoint, uint8_t capability_mask)
+{
+	return (endpoint->capabilities & capability_mask) > 0;
+}
+
+/**
+ * Return the in memory size in bytes of the endpoint list.
+ * @param endpoint_list the endpoint list.
+ * @param size (output) the size of the list on success.
+ * @return 0 on successful size calculation, -1 otherwise.
+ */
+int
+as_endpoint_list_sizeof(const as_endpoint_list* endpoint_list, size_t* size)
+{
+	return as_endpoint_list_nsizeof(endpoint_list, size, SIZE_MAX);
+}
+
+/**
+ * Return the in memory size in bytes of the endpoint list, but abort if the
+ * size of the read exceeds the input size. Nothing beyond size_max bytes from
+ * the start of the list is read.
+ * @param endpoint_list the endpoint list. A NULL list has size zero.
+ * @param size (output) the size of the list on success, zero on failure.
+ * @param size_max the maximum size until which parsing will be attempted.
+ * @return 0 on successful size calculation, -1 otherwise.
+ */
+int
+as_endpoint_list_nsizeof(const as_endpoint_list* endpoint_list, size_t* size, size_t size_max)
+{
+	if (!endpoint_list) {
+		// Report an empty list instead of leaving the output untouched.
+		if (size) {
+			*size = 0;
+		}
+		return 0;
+	}
+
+	size_t total = sizeof(as_endpoint_list);
+
+	// The list header must fit before the endpoint count can be read.
+	if (total > size_max) {
+		*size = 0;
+		return -1;
+	}
+
+	const uint8_t* endpoint_ptr = (const uint8_t*) endpoint_list->endpoints;
+	for (int i = 0; i < endpoint_list->n_endpoints; i++) {
+		// The fixed part of the endpoint must fit before its address type can
+		// be read.
+		if (sizeof(as_endpoint) > size_max - total) {
+			*size = 0;
+			return -1;
+		}
+
+		size_t endpoint_size = as_endpoint_sizeof((const as_endpoint*) endpoint_ptr);
+		if (endpoint_size == 0) {
+			// Invalid endpoint. Signal error
+			*size = 0;
+			return -1;
+		}
+
+		// Written as a subtraction so the check itself cannot overflow.
+		if (endpoint_size > size_max - total) {
+			*size = 0;
+			return -1;
+		}
+
+		total += endpoint_size;
+		endpoint_ptr += endpoint_size;
+	}
+
+	*size = total;
+	return 0;
+}
+
+/**
+ * Iterate over endpoints in an endpoint list and invoke the iterate function
+ * for each endpoint.
+ * @param endpoint_list the list to walk. A NULL list is treated as empty.
+ * @param iterate_fn the iterate function invoked for each endpoint in the list.
+ * @param udata passed as is to the iterate function. Useful for getting results
+ * out of the iteration.
+ */
+void
+as_endpoint_list_iterate(const as_endpoint_list* endpoint_list,
+		const as_endpoint_iterate_fn iterate_fn, void* udata)
+{
+	if (!endpoint_list) {
+		return;
+	}
+
+	const uint8_t* endpoint_ptr = (const uint8_t*) endpoint_list->endpoints;
+
+	for (int i = 0; i < endpoint_list->n_endpoints; i++) {
+		const as_endpoint* endpoint = (const as_endpoint*) endpoint_ptr;
+		size_t endpoint_size = as_endpoint_sizeof(endpoint);
+
+		if (endpoint_size == 0) {
+			// Invalid endpoint. Its size is unknown, so the walk cannot
+			// advance past it.
+			break;
+		}
+
+		if (iterate_fn) {
+			(iterate_fn)(endpoint, udata);
+		}
+		endpoint_ptr += endpoint_size;
+	}
+}
+
+/**
+ * Convert a server configuration to an endpoint list in place into the
+ * destination endpoint list. The destination must be large enough to hold one
+ * endpoint per socket configuration.
+ * @param serv_cfg source server configuration.
+ * @param endpoint_list destination endpoint list.
+ */
+void
+as_endpoint_list_from_serv_cfg_fill(const cf_serv_cfg* serv_cfg, as_endpoint_list* endpoint_list)
+{
+	endpoint_list->n_endpoints = serv_cfg->n_cfgs;
+
+	// Endpoints are variable sized, so they are laid out back to back.
+	uint8_t* endpoint_ptr = (uint8_t*) &endpoint_list->endpoints[0];
+	for (int i = 0; i < serv_cfg->n_cfgs; i++) {
+		as_endpoint* endpoint = (as_endpoint*) endpoint_ptr;
+		endpoint_from_sock_cfg(&serv_cfg->cfgs[i], endpoint);
+		endpoint_ptr += as_endpoint_sizeof(endpoint);
+	}
+}
+
+/**
+ * Convert a server configuration to an endpoint list.
+ * @param serv_cfg server configuration.
+ * @return a heap allocated endpoint list. Should be freed using cf_free
+ * once the endpoint is no longer needed.
+ */
+as_endpoint_list*
+as_endpoint_list_from_serv_cfg(const cf_serv_cfg* serv_cfg)
+{
+	// List header plus one variable sized endpoint per socket configuration.
+	size_t result_size = sizeof(as_endpoint_list);
+	for (int i = 0; i < serv_cfg->n_cfgs; i++) {
+		result_size += endpoint_sizeof_by_addr_type(
+				endpoint_addr_type_from_cf_ip_addr(&serv_cfg->cfgs[i].addr));
+	}
+
+	as_endpoint_list* endpoint_list = (as_endpoint_list*) cf_malloc(result_size);
+
+	as_endpoint_list_from_serv_cfg_fill(serv_cfg, endpoint_list);
+
+	return endpoint_list;
+}
+
+/**
+ * Compare two endpoint lists for equality.
+ * @param list1 the first. NULL allowed.
+ * @param list2 the second list. NULL allowed.
+ * @return true iff the lists are equal, false otherwise.
+ */
+bool
+as_endpoint_lists_are_equal(const as_endpoint_list* list1, const as_endpoint_list* list2)
+{
+	// Same list, or both NULL.
+	if (list1 == list2) {
+		return true;
+	}
+
+	// Exactly one of them is NULL.
+	if (!list1 || !list2) {
+		return false;
+	}
+
+	size_t size1;
+	size_t size2;
+
+	if (as_endpoint_list_sizeof(list1, &size1) != 0
+			|| as_endpoint_list_sizeof(list2, &size2) != 0) {
+		return false;
+	}
+
+	// Equal lists have the same in memory size and the same bytes.
+	return size1 == size2 && memcmp(list1, list2, size1) == 0;
+}
+
+/**
+ * Check if two lists overlap in at least one endpoint.
+ * @param list1 the first. NULL allowed.
+ * @param list2 the second list. NULL allowed.
+ * @param ignore_capabilities set to true if the overlap match should ignore
+ * node capabilities, false if capabilities should also be matched.
+ * @return true iff the lists overlap, false otherwise.
+ */
+bool
+as_endpoint_lists_are_overlapping(const as_endpoint_list* list1, const as_endpoint_list* list2,
+		bool ignore_capabilities)
+{
+	if (list1 == list2) {
+		return true;
+	}
+
+	if (!list1 || !list2) {
+		return false;
+	}
+
+	// Every endpoint of list1 is searched for in list2.
+	as_endpoint_list_overlap_udata overlap_state;
+	overlap_state.ignore_capabilities = ignore_capabilities;
+	overlap_state.other = list2;
+	overlap_state.overlapped = false;
+
+	as_endpoint_list_iterate(list1, endpoint_list_overlap_iterate, &overlap_state);
+
+	return overlap_state.overlapped;
+}
+
+/**
+ * Convert an endpoint list to a string. All endpoints are included,
+ * whatever their capabilities.
+ * @param endpoint_list the input list. NULL allowed.
+ * @param buffer the output buffer.
+ * @param buffer_capacity the capacity of the output buffer.
+ * @return the number of characters printed (excluding the null byte used to
+ * end output to strings)
+ */
+int
+as_endpoint_list_to_string(const as_endpoint_list* endpoint_list, char* buffer,
+		size_t buffer_capacity)
+{
+	// A zero mask matches every endpoint.
+	const uint8_t match_all_mask = 0;
+	const uint8_t no_capabilities = 0;
+
+	return as_endpoint_list_to_string_match_capabilities(endpoint_list, buffer,
+			buffer_capacity, match_all_mask, no_capabilities);
+}
+
+/**
+ * Convert an endpoint list to a string matching capabilities.
+ * Endpoints are written as a comma separated list. If some endpoints could
+ * not be written, "..." is appended when there is room for it.
+ * @param endpoint_list the input list. NULL allowed.
+ * @param buffer the output buffer.
+ * @param buffer_capacity the capacity of the output buffer.
+ * @param capability_mask specifies which bit to match.
+ * @param capabilities specifies capabilities to be match for.
+ * @return the number of characters printed (excluding the null byte used to
+ * end output to strings)
+ */
+int
+as_endpoint_list_to_string_match_capabilities(
+		const as_endpoint_list* endpoint_list, char* buffer,
+		size_t buffer_capacity, uint8_t capability_mask, uint8_t capabilities)
+{
+	if (!buffer || buffer_capacity == 0) {
+		// No room even for the NULL terminator.
+		return 0;
+	}
+
+	if (!endpoint_list) {
+		buffer[0] = 0;
+		return 0;
+	}
+
+	as_endpoint_to_string_udata udata = { 0 };
+	udata.write_ptr = buffer;
+	udata.buffer_remaining = buffer_capacity;
+	udata.capabilities = capabilities;
+	udata.capability_mask = capability_mask;
+	as_endpoint_list_iterate(endpoint_list, endpoint_to_string_iterate, &udata);
+
+	if (udata.endpoints_converted) {
+		if (udata.endpoints_converted != endpoint_list->n_endpoints) {
+			// Truncation has happened. Add ellipses.
+			if (udata.buffer_remaining > 4) {
+				int n_written = sprintf(udata.write_ptr, "...");
+				// Step past the ellipses, else the terminator below would
+				// erase them.
+				udata.write_ptr += n_written;
+				udata.buffer_remaining -= n_written;
+			}
+		}
+		else if (udata.write_ptr > buffer) {
+			// Remove the dangling comma from the last endpoint. Nothing to
+			// remove if every endpoint was skipped by the capability filter,
+			// because nothing has been written in that case.
+			udata.write_ptr--;
+			udata.buffer_remaining++;
+		}
+	}
+
+	// Ensure NULL termination.
+	*udata.write_ptr = 0;
+
+	return buffer_capacity - udata.buffer_remaining;
+}
+
+/**
+ * Populate dyn buf with endpoints info
+ * @param endpoint_list the input list. NULL allowed.
+ * @param db the dynamic buffer.
+ *
+ * Appends "endpoint=<non TLS endpoints>:endpoint-tls=<TLS endpoints>".
+ */
+void
+as_endpoint_list_info(const as_endpoint_list* endpoint_list, cf_dyn_buf* db)
+{
+	size_t endpoint_list_size = 0;
+	as_endpoint_list_sizeof(endpoint_list, &endpoint_list_size);
+	// 4 chars for delimiters, 50 chars for ipv6 ip and port, rounded to 64.
+	// The extra byte keeps the buffer non-empty for a NULL or invalid list.
+	size_t endpoint_list_str_size = 64 * endpoint_list_size + 1;
+
+	char endpoint_list_str[endpoint_list_str_size];
+
+	// Start from an empty string so that a conversion which writes nothing
+	// cannot expose stale buffer content.
+	endpoint_list_str[0] = '\0';
+	as_endpoint_list_to_string_match_capabilities(endpoint_list,
+			endpoint_list_str, sizeof(endpoint_list_str), AS_ENDPOINT_TLS_MASK,
+			0);
+
+	cf_dyn_buf_append_string(db, "endpoint=");
+	if (endpoint_list_str[0] != '\0') {
+		cf_dyn_buf_append_string(db, endpoint_list_str);
+	}
+	cf_dyn_buf_append_string(db, ":");
+
+	endpoint_list_str[0] = '\0';
+	as_endpoint_list_to_string_match_capabilities(endpoint_list,
+			endpoint_list_str, sizeof(endpoint_list_str), AS_ENDPOINT_TLS_MASK,
+			AS_ENDPOINT_TLS_MASK);
+
+	cf_dyn_buf_append_string(db, "endpoint-tls=");
+	if (endpoint_list_str[0] != '\0') {
+		cf_dyn_buf_append_string(db, endpoint_list_str);
+	}
+
+}
+
+/*----------------------------------------------------------------------------
+ * Private internal functions.
+ *----------------------------------------------------------------------------*/
+/**
+ * Indicates if input address type is valid.
+ */
+static bool
+endpoint_addr_type_is_valid(uint8_t type)
+{
+	return type > AS_ENDPOINT_ADDR_TYPE_UNDEF && type < AS_ENDPOINT_ADDR_TYPE_SENTINEL;
+}
+
+/**
+ * Get the size of the binary for input address type. Every type other than
+ * IPv4 is sized as IPv6.
+ * TODO: Move to socket API. Not if we support DNS names.
+ */
+static size_t
+endpoint_addr_binary_size(uint8_t type)
+{
+	return (type == AS_ENDPOINT_ADDR_TYPE_IPv4) ? 4 : 16;
+}
+
+/**
+ * Return the sizeof endpoint given its address type.
+ * @return the size in bytes, zero if the address type is invalid.
+ */
+static size_t
+endpoint_sizeof_by_addr_type(uint8_t addr_type)
+{
+	if (!endpoint_addr_type_is_valid(addr_type)) {
+		// Callers use a zero size to detect invalid endpoints.
+		return 0;
+	}
+
+	return sizeof(as_endpoint) + endpoint_addr_binary_size(addr_type);
+}
+
+/**
+ * Convert cf_ip address to endpoint address type.
+ */
+static uint8_t
+endpoint_addr_type_from_cf_ip_addr(const cf_ip_addr* addr)
+{
+	if (cf_ip_addr_is_legacy(addr)) {
+		return AS_ENDPOINT_ADDR_TYPE_IPv4;
+	}
+
+	return AS_ENDPOINT_ADDR_TYPE_IPv6;
+}
+
+/**
+ * Heap allocate an endpoint sized for the input address type. The memory is
+ * not initialized.
+ */
+static as_endpoint*
+endpoint_allocate(uint8_t addr_type)
+{
+	size_t n_bytes = endpoint_sizeof_by_addr_type(addr_type);
+	return cf_malloc(n_bytes);
+}
+
+/**
+ * Convert a socket configuration to an endpoint. The endpoint is marked TLS
+ * capable if the socket is owned by the TLS heartbeat or TLS fabric.
+ */
+static void
+endpoint_from_sock_cfg(const cf_sock_cfg* src, as_endpoint* endpoint)
+{
+	endpoint->addr_type = endpoint_addr_type_from_cf_ip_addr(&src->addr);
+	endpoint->port = src->port;
+
+	// We will have allocated correct binary size.
+	CF_IGNORE_ERROR(
+			cf_ip_addr_to_binary(&src->addr, endpoint->addr,
+			endpoint_addr_binary_size(endpoint->addr_type)));
+
+	if (src->owner == CF_SOCK_OWNER_HEARTBEAT_TLS ||
+			src->owner == CF_SOCK_OWNER_FABRIC_TLS) {
+		endpoint->capabilities = AS_ENDPOINT_TLS_MASK;
+	}
+	else {
+		endpoint->capabilities = 0;
+	}
+}
+
+/**
+ * Feed a run of bytes into a running jenkins one-at-a-time hash, without the
+ * final avalanche step.
+ */
+static uint32_t
+endpoint_sort_hash_mix(uint32_t hash, const uint8_t* bytes, size_t n_bytes)
+{
+	for (size_t i = 0; i < n_bytes; ++i) {
+		hash += bytes[i];
+		hash += (hash << 10);
+		hash ^= (hash >> 6);
+	}
+
+	return hash;
+}
+
+/**
+ * Generate a hash for an endpoint, but salted with a random tie breaker to
+ * generate random looking shuffles for "equal" endpoints. This is jenkins
+ * one-at-a-time hash of the tie breaker concatenated with the endpoint.
+ */
+static uint32_t
+endpoint_sort_hash(const as_endpoint* endpoint, int tie_breaker)
+{
+	// Hash the tie breaker first.
+	uint32_t hash = endpoint_sort_hash_mix(0, (const uint8_t*)&tie_breaker,
+			sizeof(tie_breaker));
+
+	// Then the raw bytes of the endpoint.
+	hash = endpoint_sort_hash_mix(hash, (const uint8_t*)endpoint,
+			as_endpoint_sizeof(endpoint));
+
+	// Final avalanche.
+	hash += (hash << 3);
+	hash ^= (hash >> 11);
+	hash += (hash << 15);
+	return hash;
+}
+
+/**
+ * Comparator to sort endpoints in descending order of preference.
+ * TLS endpoints come before clear text ones, then IPv6 before IPv4. Remaining
+ * ties are ordered by a hash salted with the tie breaker.
+ *
+ * @param e1 pointer to the first endpoint pointer.
+ * @param e2 pointer to the second endpoint pointer.
+ * @param arg pointer to the int tie breaker.
+ * @return negative, zero or positive, as for qsort comparators.
+ */
+static int
+endpoint_preference_compare(const void* e1, const void* e2, void* arg)
+{
+	const as_endpoint* endpoint1 = *(const as_endpoint* const*)e1;
+	const as_endpoint* endpoint2 = *(const as_endpoint* const*)e2;
+	int tie_breaker = *((int*)arg);
+
+	// Prefer TLS over clear text.
+	bool endpoint1_is_tls = as_endpoint_capability_is_supported(endpoint1, AS_ENDPOINT_TLS_MASK);
+
+	bool endpoint2_is_tls = as_endpoint_capability_is_supported(endpoint2, AS_ENDPOINT_TLS_MASK);
+
+	if (endpoint1_is_tls != endpoint2_is_tls) {
+		return endpoint1_is_tls ? -1 : 1;
+	}
+
+	// If TLS capabilities match prefer IPv6.
+	bool endpoint1_is_ipv6 = endpoint1->addr_type == AS_ENDPOINT_ADDR_TYPE_IPv6;
+	bool endpoint2_is_ipv6 = endpoint2->addr_type == AS_ENDPOINT_ADDR_TYPE_IPv6;
+
+	if (endpoint1_is_ipv6 != endpoint2_is_ipv6) {
+		return endpoint1_is_ipv6 ? -1 : 1;
+	}
+
+	// Used tie breaker parameter to salt the hashes for load balancing.
+	uint32_t hash1 = endpoint_sort_hash(endpoint1, tie_breaker);
+	uint32_t hash2 = endpoint_sort_hash(endpoint2, tie_breaker);
+
+	// Three way compare. Subtracting the unsigned hashes would wrap around and
+	// produce an inconsistent ordering.
+	return (hash1 > hash2) - (hash1 < hash2);
+}
+
+/**
+ * Sort endpoints in place in descending order of preference.
+ * @param endpoints array of endpoint pointers.
+ * @param n_endpoints the number of pointers in the array.
+ */
+static void
+endpoints_preference_sort(const as_endpoint* endpoints[], size_t n_endpoints)
+{
+	// Random tie breaker to load balance between two equivalent endpoints.
+	int tie_breaker = rand();
+
+	qsort_r(endpoints, n_endpoints, sizeof(as_endpoint*),
+			endpoint_preference_compare, &tie_breaker);
+}
+
+/**
+ * Iterate and collect all endpoint addresses in passed in udata. The udata's
+ * pointer array must have room for every endpoint iterated.
+ */
+static void
+endpoint_collect_iterate_fn(const as_endpoint* endpoint, void* udata)
+{
+	as_endpoint_collect_udata* endpoints_data = (as_endpoint_collect_udata*) udata;
+	endpoints_data->endpoints[endpoints_data->collected_count++] = endpoint;
+}
+
+/**
+ * Iterate over endpoints and convert them to strings.
+ */ +static void +endpoint_to_string_iterate(const as_endpoint* endpoint, void* udata) +{ + as_endpoint_to_string_udata* to_string_data = + (as_endpoint_to_string_udata*)udata; + + if ((endpoint->capabilities & to_string_data->capability_mask) + != (to_string_data->capabilities & to_string_data->capability_mask)) { + // skip as the capabilities do not match + to_string_data->endpoints_converted++; + return; + } + + char address_buffer[1024]; + int capacity = sizeof(address_buffer); + char* endpoint_str_ptr = address_buffer; + + cf_sock_addr temp_addr; + if (cf_ip_addr_from_binary(endpoint->addr, + endpoint_addr_binary_size(endpoint->addr_type), &temp_addr.addr) + <= 0) { + return; + } + + int rv = 0; + if (endpoint->port) { + temp_addr.port = endpoint->port; + rv = cf_sock_addr_to_string(&temp_addr, endpoint_str_ptr, capacity); + if (rv <= 0) { + return; + } + + capacity -= rv; + endpoint_str_ptr += rv; + rv = snprintf(endpoint_str_ptr, capacity, ","); + } + else { + // Skip port and tls capabilities. + rv = cf_ip_addr_to_string(&temp_addr.addr, endpoint_str_ptr, capacity); + if (rv <= 0) { + return; + } + + capacity -= rv; + endpoint_str_ptr += rv; + rv = snprintf(endpoint_str_ptr, capacity, ","); + } + + if (rv == capacity) { + // Output truncated. Abort. + return; + } + + int to_write = strnlen(address_buffer, sizeof(address_buffer)); + + // Ensure we leave space for the NULL terminator. + if (to_write + 1 <= to_string_data->buffer_remaining) { + sprintf(to_string_data->write_ptr, "%s", address_buffer); + to_string_data->buffer_remaining -= to_write; + to_string_data->write_ptr += to_write; + to_string_data->endpoints_converted++; + } +} + +/** + * Compare two endpoints for equality. + * @param endpoint1 the first. NULL allowed. + * @param endpoint2 the second endpoint. NULL allowed. + * @param ignore_capabilities indicates if endpoint capabilities should be + * ignored. + * @return true iff the endpoints are equals, false otherwise. 
+ */ +static bool +endpoints_are_equal(const as_endpoint* endpoint1, const as_endpoint* endpoint2, + bool ignore_capabilities) +{ + if (endpoint1 == endpoint2) { + return true; + } + + if (!endpoint1 || !endpoint2) { + return false; + } + + size_t size1 = as_endpoint_sizeof(endpoint1); + if (!size1) { + return false; + } + + size_t size2 = as_endpoint_sizeof(endpoint2); + if (!size2) { + return false; + } + + if (size1 != size2) { + return false; + } + + return (ignore_capabilities || endpoint1->capabilities == endpoint2->capabilities) + && endpoint1->port == endpoint2->port && endpoint1->addr_type == endpoint2->addr_type + && memcmp(endpoint1->addr, endpoint2->addr, endpoint_addr_binary_size(endpoint1->addr_type)) == 0; +} + +/** + * Iterate function to find an overlap. + */ +static void +endpoint_list_overlap_iterate(const as_endpoint* endpoint, void* udata) +{ + as_endpoint_list_overlap_udata* overlap_udata = (as_endpoint_list_overlap_udata*) udata; + as_endpoint_list_endpoint_find_udata find_udata; + find_udata.match_found = false; + find_udata.ignore_capabilities = overlap_udata->ignore_capabilities; + find_udata.to_find = endpoint; + + as_endpoint_list_iterate(overlap_udata->other, endpoint_list_find_iterate, &find_udata); + + overlap_udata->overlapped |= find_udata.match_found; +} + +/** + * Iterate function to search for an endpoint. + */ +static void +endpoint_list_find_iterate(const as_endpoint* endpoint, void* udata) +{ + as_endpoint_list_endpoint_find_udata* find_udata = (as_endpoint_list_endpoint_find_udata*) udata; + + const as_endpoint* to_find = find_udata->to_find; + if (!to_find) { + return; + } + + find_udata->match_found |= endpoints_are_equal(endpoint, to_find, + find_udata->ignore_capabilities); +} diff --git a/as/src/fabric/exchange.c b/as/src/fabric/exchange.c new file mode 100644 index 00000000..fe9a94d1 --- /dev/null +++ b/as/src/fabric/exchange.c @@ -0,0 +1,3457 @@ +/* + * exchange.c + * + * Copyright (C) 2016 Aerospike, Inc. 
+ * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#include "fabric/exchange.h" + +#include +#include +#include +#include // For MAX() and MIN(). + +#include "citrusleaf/alloc.h" +#include "citrusleaf/cf_atomic.h" +#include "citrusleaf/cf_clock.h" +#include "citrusleaf/cf_queue.h" + +#include "dynbuf.h" +#include "fault.h" +#include "shash.h" +#include "socket.h" + +#include "base/cfg.h" +#include "base/datamodel.h" +#include "base/stats.h" +#include "fabric/fabric.h" +#include "fabric/hb.h" +#include "fabric/partition_balance.h" +#include "storage/storage.h" + +/* + * Overview + * ======== + * Cluster data exchange state machine. Exchanges per namespace partition + * version exchange for now, after evey cluster change. + * + * State transition diagram + * ======================== + * The exchange state transition diagram responds to three events + * 1. Incoming message + * 2. Timer event + * 3. Clustering module's cluster change event. + * + * There are four states + * 1. Rest - the exchange is complete with all exchanged data committed. + * 2. Exchanging - the cluster has changed since the last commit and new data + * exchange is in progress. + * 3. 
Ready to commit - this node has send its exchange data to all cluster + * members, received corresponding acks and also exchange data from all cluster + * members. + * 4. Orphaned - this node is an orphan. After a timeout blocks client + * transactions. + * + * Exchange starts by being in the orphaned state. + * + * Code organization + * ================= + * + * There are different sections for each state. Each state has a dispatcher + * which delegates the event handing to a state specific function. All state is + * protected under a single lock. + */ + +/* + * ---------------------------------------------------------------------------- + * Constants + * ---------------------------------------------------------------------------- + */ + +/** + * Exchange protocol version information. + */ +#define AS_EXCHANGE_PROTOCOL_IDENTIFIER 1 + +/** + * A soft limit for the maximum cluster size. Meant to be optimize hash and list + * data structures and not as a limit on the number of nodes. + */ +#define AS_EXCHANGE_CLUSTER_MAX_SIZE_SOFT 200 + +/** + * A soft limit for the maximum number of unique vinfo's in a namespace. Meant + * to be optimize hash and list data structures and not as a limit on the number + * of vinfos processed. + */ +#define AS_EXCHANGE_UNIQUE_VINFO_MAX_SIZE_SOFT 200 + +/** + * Average number of partitions for a version information. Used as initial + * allocation size for every unique vinfo, hence a smaller value. + */ +#define AS_EXCHANGE_VINFO_NUM_PIDS_AVG 1024 + +/** + * Maximum event listeners. + */ +#define AS_EXTERNAL_EVENT_LISTENER_MAX 7 + +/* + * ---------------------------------------------------------------------------- + * Exchange data format for namespaces payload + * ---------------------------------------------------------------------------- + */ + +/** + * Partition data exchanged for each unique vinfo for a namespace. + */ +typedef struct as_exchange_vinfo_payload_s +{ + /** + * The partition vinfo. 
+ */ + as_partition_version vinfo; + + /** + * Count of partitions having this vinfo. + */ + uint32_t num_pids; + + /** + * Partition having this vinfo. + */ + uint16_t pids[]; +}__attribute__((__packed__)) as_exchange_vinfo_payload; + +/** + * Information exchanged for a single namespace. + */ +typedef struct as_exchange_ns_vinfos_payload_s +{ + /** + * Count of version infos. + */ + uint32_t num_vinfos; + + /** + * Parition version information for each unique version. + */ + as_exchange_vinfo_payload vinfos[]; +}__attribute__((__packed__)) as_exchange_ns_vinfos_payload; + +/** + * Received data stored per node, per namespace, before actual commit. + */ +typedef struct as_exchange_node_namespace_data_s +{ + /** + * Mapped local namespace. + */ + as_namespace* local_namespace; + + /** + * Partition versions for this namespace. This field is reused across + * exchange rounds and may not be null even if the local namespace is null. + */ + as_exchange_ns_vinfos_payload* partition_versions; + + /** + * Sending node's rack id for this namespace. + */ + uint32_t rack_id; + + /** + * Sending node's roster generation for this namespace. + */ + uint32_t roster_generation; + + /** + * Sending node's roster count for this namespace. + */ + uint32_t roster_count; + + /** + * Sending node's roster for this namespace. + */ + cf_node* roster; + + /** + * Sending node's roster rack-ids for this namespace. + */ + cf_node* roster_rack_ids; + + /** + * Sender's eventual regime for this namespace. + */ + uint32_t eventual_regime; + + /** + * Sender's rebalance regime for this namespace. + */ + uint32_t rebalance_regime; +} as_exchange_node_namespace_data; + +/** + * Exchanged data for a single node. + */ +typedef struct as_exchange_node_data_s +{ + /** + * Number of sender's namespaces that have a matching local namespace. + */ + uint32_t num_namespaces; + + /** + * Data for sender's namespaces having a matching local namespace. 
+ */ + as_exchange_node_namespace_data namespace_data[AS_NAMESPACE_SZ]; +} as_exchange_node_data; + +/* + * ---------------------------------------------------------------------------- + * Exchange internal data structures + * ---------------------------------------------------------------------------- + */ + +/** + * Exchange subsystem status. + */ +typedef enum +{ + AS_EXCHANGE_SYS_STATE_UNINITIALIZED, + AS_EXCHANGE_SYS_STATE_RUNNING, + AS_EXCHANGE_SYS_STATE_SHUTTING_DOWN, + AS_EXCHANGE_SYS_STATE_STOPPED +} as_exchange_sys_state; + +/** + * Exchange message types. + */ +typedef enum +{ + /** + * Exchange data for one node. + */ + AS_EXCHANGE_MSG_TYPE_DATA, + + /** + * Ack on receipt of exchanged data. + */ + AS_EXCHANGE_MSG_TYPE_DATA_ACK, + + /** + * Not used. + */ + AS_EXCHANGE_MSG_TYPE_DATA_NACK, + + /** + * The source is ready to commit exchanged information. + */ + AS_EXCHANGE_MSG_TYPE_READY_TO_COMMIT, + + /** + * Message from the principal asking all nodes to commit the exchanged + * information. + */ + AS_EXCHANGE_MSG_TYPE_COMMIT, + + /** + * Sentinel value for exchange message types. + */ + AS_EXCHANGE_MSG_TYPE_SENTINEL +} as_exchange_msg_type; + +/** + * Internal exchange event type. + */ +typedef enum +{ + /** + * Cluster change event. + */ + AS_EXCHANGE_EVENT_CLUSTER_CHANGE, + + /** + * Timer event. + */ + AS_EXCHANGE_EVENT_TIMER, + + /** + * Incoming message event. + */ + AS_EXCHANGE_EVENT_MSG, +} as_exchange_event_type; + +/** + * Internal exchange event. + */ +typedef struct as_exchange_event_s +{ + /** + * The type of the event. + */ + as_exchange_event_type type; + + /** + * Message for incoming message events. + */ + msg* msg; + + /** + * Source for incoming message events. + */ + cf_node msg_source; + + /** + * Clustering event instance for clustering events. + */ + as_clustering_event* clustering_event; +} as_exchange_event; + +/** + * Exchange subsystem state in the state transition diagram. 
+ */ +typedef enum as_exchange_state_s +{ + /** + * Exchange subsystem is at rest will all data exchanged synchronized and + * committed. + */ + AS_EXCHANGE_STATE_REST, + + /** + * Data exchange is in progress. + */ + AS_EXCHANGE_STATE_EXCHANGING, + + /** + * Data exchange is complete and this node is ready to commit data. + */ + AS_EXCHANGE_STATE_READY_TO_COMMIT, + + /** + * Self node is orphaned. + */ + AS_EXCHANGE_STATE_ORPHANED +} as_exchange_state; + +/** + * State for a single node in the succession list. + */ +typedef struct as_exchange_node_state_s +{ + /** + * Inidicates if peer node has acknowledged send from self. + */ + bool send_acked; + + /** + * Inidicates if self node has received data from this peer. + */ + bool received; + + /** + * Inidicates if this peer node is ready to commit. Only relevant and used + * by the current principal. + */ + bool is_ready_to_commit; + + /** + * Exchange data received from this peer node. Member variables may be heap + * allocated and hence should be freed carefully while discarding this + * structure instance. + */ + as_exchange_node_data* data; +} as_exchange_node_state; + +/** + * State maintained by the exchange subsystem. + */ +typedef struct as_exchange_s +{ + /** + * Exchange subsystem status. + */ + as_exchange_sys_state sys_state; + + /** + * Exchange state in the state transition diagram. + */ + as_exchange_state state; + + /** + * Time when this node's exchange data was sent out. + */ + cf_clock send_ts; + + /** + * Time when this node's ready to commit was sent out. + */ + cf_clock ready_to_commit_send_ts; + + /** + * Thread id of the timer event generator. + */ + pthread_t timer_tid; + + /** + * Nodes that are not yet ready to commit. + */ + cf_vector ready_to_commit_pending_nodes; + + /** + * Current cluster key. + */ + as_cluster_key cluster_key; + + /** + * Cluster size - size of the succession list. + */ + uint32_t cluster_size; + + /** + * Exchange's copy of the succession list. 
+ */ + cf_vector succession_list; + + /** + * The principal node in current succession list. Always the first node. + */ + cf_node principal; + + /** + * Last committed cluster key. + */ + as_cluster_key committed_cluster_key; + + /** + * Last committed cluster size - size of the succession list. + */ + uint32_t committed_cluster_size; + + /** + * Last committed exchange's succession list. + */ + cf_vector committed_succession_list; + + /** + * The principal node in the committed succession list. Always the first + * node. + */ + cf_node committed_principal; + + /** + * The time this node entered orphan state. + */ + cf_clock orphan_state_start_time; + + /** + * Indicates if transactions have already been blocked in the orphan state. + */ + bool orphan_state_are_transactions_blocked; + + /** + * Will have an as_exchange_node_state entry for every node in the + * succession list. + */ + cf_shash* nodeid_to_node_state; + + /** + * This node's data payload for current round. + */ + cf_dyn_buf self_data_dyn_buf[AS_NAMESPACE_SZ]; +} as_exchange; + +/** + * Internal storage for external event listeners. + */ +typedef struct as_exchange_event_listener_s +{ + /** + * The listener's calback function. + */ + as_exchange_cluster_changed_cb event_callback; + + /** + * The listeners user data object passed back as is to the callback + * function. + */ + void* udata; +} as_exchange_event_listener; + +/** + * External event publisher state. + */ +typedef struct as_exchange_external_event_publisher_s +{ + /** + * State of the external event publisher. + */ + as_exchange_sys_state sys_state; + + /** + * Inidicates if there is an event to publish. + */ + bool event_queued; + + /** + * The pending event to publish. + */ + as_exchange_cluster_changed_event to_publish; + + /** + * The static succession list published with the message. + */ + cf_vector published_succession_list; + + /** + * Conditional variable to signal a pending event. 
+ */ + pthread_cond_t is_pending; + + /** + * Thread id of the publisher thread. + */ + pthread_t event_publisher_tid; + + /** + * Mutex to protect the conditional variable. + */ + pthread_mutex_t is_pending_mutex; + + /** + * External event listeners. + */ + as_exchange_event_listener event_listeners[AS_EXTERNAL_EVENT_LISTENER_MAX]; + + /** + * Event listener count. + */ + uint32_t event_listener_count; +} as_exchange_external_event_publisher; + + +/* + * ---------------------------------------------------------------------------- + * Externs + * ---------------------------------------------------------------------------- + */ +void +as_skew_monitor_update(); + +/* + * ---------------------------------------------------------------------------- + * Globals + * ---------------------------------------------------------------------------- + */ + +/** + * Singleton exchange state all initialized to zero. + */ +static as_exchange g_exchange = { 0 }; + +/** + * The fields in the exchange message. Should never change the order or elements + * in between. + */ +typedef enum +{ + AS_EXCHANGE_MSG_ID, + AS_EXCHANGE_MSG_TYPE, + AS_EXCHANGE_MSG_CLUSTER_KEY, + AS_EXCHANGE_MSG_NAMESPACES, + AS_EXCHANGE_MSG_NS_PARTITION_VERSIONS, + AS_EXCHANGE_MSG_NS_RACK_IDS, + AS_EXCHANGE_MSG_NS_ROSTER_GENERATIONS, + AS_EXCHANGE_MSG_NS_ROSTERS, + AS_EXCHANGE_MSG_NS_ROSTERS_RACK_IDS, + AS_EXCHANGE_MSG_NS_EVENTUAL_REGIMES, + AS_EXCHANGE_MSG_NS_REBALANCE_REGIMES, + + NUM_EXCHANGE_MSG_FIELDS +} as_exchange_msg_fields; + +/** + * Exchange message template. 
+ */ +static const msg_template exchange_msg_template[] = { + { AS_EXCHANGE_MSG_ID, M_FT_UINT32 }, + { AS_EXCHANGE_MSG_TYPE, M_FT_UINT32 }, + { AS_EXCHANGE_MSG_CLUSTER_KEY, M_FT_UINT64 }, + { AS_EXCHANGE_MSG_NAMESPACES, M_FT_MSGPACK }, + { AS_EXCHANGE_MSG_NS_PARTITION_VERSIONS, M_FT_MSGPACK }, + { AS_EXCHANGE_MSG_NS_RACK_IDS, M_FT_MSGPACK }, + { AS_EXCHANGE_MSG_NS_ROSTER_GENERATIONS, M_FT_MSGPACK }, + { AS_EXCHANGE_MSG_NS_ROSTERS, M_FT_MSGPACK }, + { AS_EXCHANGE_MSG_NS_ROSTERS_RACK_IDS, M_FT_MSGPACK }, + { AS_EXCHANGE_MSG_NS_EVENTUAL_REGIMES, M_FT_MSGPACK }, + { AS_EXCHANGE_MSG_NS_REBALANCE_REGIMES, M_FT_MSGPACK } +}; + +COMPILER_ASSERT(sizeof(exchange_msg_template) / sizeof(msg_template) == + NUM_EXCHANGE_MSG_FIELDS); + +/** + * Global lock to set or get exchanged info from other threads. + */ +pthread_mutex_t g_exchanged_info_lock = PTHREAD_MUTEX_INITIALIZER; + +/** + * Global lock to serialize all reads and writes to the exchange state. + */ +pthread_mutex_t g_exchange_lock = PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP; + +/** + * Singleton external events publisher. + */ +static as_exchange_external_event_publisher g_external_event_publisher; + +/** + * The fat lock for all clustering events listener changes. + */ +static pthread_mutex_t g_external_event_publisher_lock = + PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP; + +/** + * Acquire a lock on the event publisher. + */ +#define EXTERNAL_EVENT_PUBLISHER_LOCK() \ +({ \ + pthread_mutex_lock (&g_external_event_publisher_lock); \ + LOCK_DEBUG("publisher locked in %s", __FUNCTION__); \ +}) + +/** + * Relinquish the lock on the external event publisher. + */ +#define EXTERNAL_EVENT_PUBLISHER_UNLOCK() \ +({ \ + pthread_mutex_unlock (&g_external_event_publisher_lock); \ + LOCK_DEBUG("publisher unLocked in %s", __FUNCTION__); \ +}) + +/* + * ---------------------------------------------------------------------------- + * Logging macros. 
+ * ---------------------------------------------------------------------------- + */ + +/** + * Used to limit potentially long log lines. Includes space for NULL terminator. + */ +#define LOG_LENGTH_MAX() (800) +#define CRASH(format, ...) cf_crash(AS_EXCHANGE, format, ##__VA_ARGS__) +#define WARNING(format, ...) cf_warning(AS_EXCHANGE, format, ##__VA_ARGS__) +#define INFO(format, ...) cf_info(AS_EXCHANGE, format, ##__VA_ARGS__) +#define DEBUG(format, ...) cf_debug(AS_EXCHANGE, format, ##__VA_ARGS__) +#define DETAIL(format, ...) cf_detail(AS_EXCHANGE, format, ##__VA_ARGS__) +#define LOG(severity, format, ...) \ +({ \ + switch (severity) { \ + case CF_CRITICAL: \ + CRASH(format, ##__VA_ARGS__); \ + break; \ + case CF_WARNING: \ + WARNING(format, ##__VA_ARGS__); \ + break; \ + case CF_INFO: \ + INFO(format, ##__VA_ARGS__); \ + break; \ + case CF_DEBUG: \ + DEBUG(format, ##__VA_ARGS__); \ + break; \ + case CF_DETAIL: \ + DETAIL(format, ##__VA_ARGS__); \ + break; \ + default: \ + break; \ + } \ +}) + +/** + * Size of the (per-namespace) self payload dynamic buffer. + */ +#define AS_EXCHANGE_SELF_DYN_BUF_SIZE() (AS_EXCHANGE_UNIQUE_VINFO_MAX_SIZE_SOFT \ + * ((AS_EXCHANGE_VINFO_NUM_PIDS_AVG * sizeof(uint16_t)) \ + + sizeof(as_partition_version))) + +/** + * Scratch size for exchange messages. + * TODO: Compute this properly. + */ +#define AS_EXCHANGE_MSG_SCRATCH_SIZE 2048 + +#ifdef LOCK_DEBUG_ENABLED +#define LOCK_DEBUG(format, ...) DEBUG(format, ##__VA_ARGS__) +#else +#define LOCK_DEBUG(format, ...) +#endif + +/** + * Acquire a lock on the exchange subsystem. + */ +#define EXCHANGE_LOCK() \ +({ \ + pthread_mutex_lock (&g_exchange_lock); \ + LOCK_DEBUG("locked in %s", __FUNCTION__); \ +}) + +/** + * Relinquish the lock on the exchange subsystem. + */ +#define EXCHANGE_UNLOCK() \ +({ \ + pthread_mutex_unlock (&g_exchange_lock); \ + LOCK_DEBUG("unLocked in %s", __FUNCTION__); \ +}) + +/** + * Timer event generation interval. 
+ */ +#define EXCHANGE_TIMER_TICK_INTERVAL() (75) + +/** + * Minimum timeout interval for sent exchange data. + */ +#define EXCHANGE_SEND_MIN_TIMEOUT() (MAX(75, as_hb_tx_interval_get() / 2)) + +/** + * Maximum timeout interval for sent exchange data. + */ +#define EXCHANGE_SEND_MAX_TIMEOUT() (30000) + +/** + * Timeout for receiving commit message after transitioning to ready to commit. + */ +#define EXCHANGE_READY_TO_COMMIT_TIMEOUT() (EXCHANGE_SEND_MIN_TIMEOUT()) + +/** + * Send timeout is a step function with this value as the interval for each + * step. + */ +#define EXCHANGE_SEND_STEP_INTERVAL() \ +(MAX(EXCHANGE_SEND_MIN_TIMEOUT(), as_hb_tx_interval_get())) + +/** + * Check if exchange is initialized. + */ +#define EXCHANGE_IS_INITIALIZED() \ +({ \ + EXCHANGE_LOCK(); \ + bool initialized = (g_exchange.sys_state \ + != AS_EXCHANGE_SYS_STATE_UNINITIALIZED); \ + EXCHANGE_UNLOCK(); \ + initialized; \ +}) + +/** + * * Check if exchange is running. + */ +#define EXCHANGE_IS_RUNNING() \ +({ \ + EXCHANGE_LOCK(); \ + bool running = (EXCHANGE_IS_INITIALIZED() \ + && g_exchange.sys_state == AS_EXCHANGE_SYS_STATE_RUNNING); \ + EXCHANGE_UNLOCK(); \ + running; \ +}) + +/** + * Create temporary stack variables. + */ +#define TOKEN_PASTE(x, y) x##y +#define STACK_VAR(x, y) TOKEN_PASTE(x, y) + +/** + * Convert a vector to a stack allocated array. + */ +#define cf_vector_to_stack_array(vector_p, nodes_array_p, num_nodes_p) \ +({ \ + *num_nodes_p = cf_vector_size(vector_p); \ + if (*num_nodes_p > 0) { \ + *nodes_array_p = alloca(sizeof(cf_node) * (*num_nodes_p)); \ + for (int i = 0; i < *num_nodes_p; i++) { \ + cf_vector_get(vector_p, i, &(*nodes_array_p)[i]); \ + } \ + } \ + else { \ + *nodes_array_p = NULL; \ + } \ +}) + +/** + * Create and initialize a lockless stack allocated vector to initially sized to + * store cluster node number of elements. 
+ */ +#define cf_vector_stack_create(value_type) \ +({ \ + cf_vector * STACK_VAR(vector, __LINE__) = (cf_vector*)alloca( \ + sizeof(cf_vector)); \ + size_t buffer_size = AS_EXCHANGE_CLUSTER_MAX_SIZE_SOFT \ + * sizeof(value_type); \ + void* STACK_VAR(buff, __LINE__) = alloca(buffer_size); cf_vector_init_smalloc( \ + STACK_VAR(vector, __LINE__), sizeof(value_type), \ + (uint8_t*)STACK_VAR(buff, __LINE__), buffer_size, \ + VECTOR_FLAG_INITZERO); \ + STACK_VAR(vector, __LINE__); \ +}) + +/* + * ---------------------------------------------------------------------------- + * Vector functions to be moved to cf_vector + * ---------------------------------------------------------------------------- + */ + +/** + * Convert a vector to an array. + * FIXME: return pointer to the internal vector storage. + */ +static cf_node* +vector_to_array(cf_vector* vector) +{ + return (cf_node*)vector->vector; +} + +/** + * Clear / delete all entries in a vector. + */ +static void +vector_clear(cf_vector* vector) +{ + cf_vector_delete_range(vector, 0, cf_vector_size(vector)); +} + +/** + * Find the index of an element in the vector. Equality is based on mem compare. + * + * @param vector the source vector. + * @param element the element to find. + * @return the index if the element is found, -1 otherwise. + */ +static int +vector_find(cf_vector* vector, const void* element) +{ + int element_count = cf_vector_size(vector); + size_t value_len = VECTOR_ELEM_SZ(vector); + for (int i = 0; i < element_count; i++) { + // No null check required since we are iterating under a lock and within + // vector bounds. + void* src_element = cf_vector_getp(vector, i); + if (src_element) { + if (memcmp(element, src_element, value_len) == 0) { + return i; + } + } + } + return -1; +} + +/** + * Copy all elements form the source vector to the destination vector to the + * destination vector. Assumes the source and destination vector are not being + * modified while the copy operation is in progress. 
+ * + * @param dest the destination vector. + * @param src the source vector. + * @return the number of elements copied. + */ +static int +vector_copy(cf_vector* dest, cf_vector* src) +{ + int element_count = cf_vector_size(src); + int copied_count = 0; + for (int i = 0; i < element_count; i++) { + // No null check required since we are iterating under a lock and within + // vector bounds. + void* src_element = cf_vector_getp(src, i); + if (src_element) { + cf_vector_append(dest, src_element); + copied_count++; + } + } + return copied_count; +} + +/** + * Generate a hash code for a blob using Jenkins hash function. + */ +static uint32_t +exchange_blob_hash(const uint8_t* value, size_t value_size) +{ + uint32_t hash = 0; + for (int i = 0; i < value_size; ++i) { + hash += value[i]; + hash += (hash << 10); + hash ^= (hash >> 6); + } + hash += (hash << 3); + hash ^= (hash >> 11); + hash += (hash << 15); + + return hash; +} + +/** + * Generate a hash code for a mesh node key. + */ +static uint32_t +exchange_vinfo_shash(const void* value) +{ + return exchange_blob_hash((const uint8_t*)value, + sizeof(as_partition_version)); +} + +/* + * ---------------------------------------------------------------------------- + * Clustering external event publisher + * ---------------------------------------------------------------------------- + */ + +/** + * * Check if event publisher is running. + */ +static bool +exchange_external_event_publisher_is_running() +{ + EXTERNAL_EVENT_PUBLISHER_LOCK(); + bool running = g_external_event_publisher.sys_state + == AS_EXCHANGE_SYS_STATE_RUNNING; + EXTERNAL_EVENT_PUBLISHER_UNLOCK(); + return running; +} + +/** + * Initialize the event publisher. 
+ */ +static void +exchange_external_event_publisher_init() +{ + EXTERNAL_EVENT_PUBLISHER_LOCK(); + memset(&g_external_event_publisher, 0, sizeof(g_external_event_publisher)); + cf_vector_init(&g_external_event_publisher.published_succession_list, + sizeof(cf_node), + AS_EXCHANGE_CLUSTER_MAX_SIZE_SOFT, VECTOR_FLAG_INITZERO); + + pthread_mutex_init(&g_external_event_publisher.is_pending_mutex, NULL); + pthread_cond_init(&g_external_event_publisher.is_pending, NULL); + EXTERNAL_EVENT_PUBLISHER_UNLOCK(); +} + +/** + * Register a clustering event listener. + */ +static void +exchange_external_event_listener_register( + as_exchange_cluster_changed_cb event_callback, void* udata) +{ + EXTERNAL_EVENT_PUBLISHER_LOCK(); + + if (g_external_event_publisher.event_listener_count + >= AS_EXTERNAL_EVENT_LISTENER_MAX) { + CRASH("cannot register more than %d event listeners", + AS_EXTERNAL_EVENT_LISTENER_MAX); + } + + g_external_event_publisher.event_listeners[g_external_event_publisher.event_listener_count].event_callback = + event_callback; + g_external_event_publisher.event_listeners[g_external_event_publisher.event_listener_count].udata = + udata; + g_external_event_publisher.event_listener_count++; + + EXTERNAL_EVENT_PUBLISHER_UNLOCK(); +} + +/** + * Wakeup the publisher thread. + */ +static void +exchange_external_event_publisher_thr_wakeup() +{ + pthread_mutex_lock(&g_external_event_publisher.is_pending_mutex); + pthread_cond_signal(&g_external_event_publisher.is_pending); + pthread_mutex_unlock(&g_external_event_publisher.is_pending_mutex); +} + +/** + * Queue up and external event to publish. 
+ */ +static void +exchange_external_event_queue(as_exchange_cluster_changed_event* event) +{ + EXTERNAL_EVENT_PUBLISHER_LOCK(); + memcpy(&g_external_event_publisher.to_publish, event, + sizeof(g_external_event_publisher.to_publish)); + + vector_clear(&g_external_event_publisher.published_succession_list); + if (event->succession) { + // Use the static list for the published event, so that the input event + // object can be destroyed irrespective of when the it is published. + for (int i = 0; i < event->cluster_size; i++) { + cf_vector_append( + &g_external_event_publisher.published_succession_list, + &event->succession[i]); + } + g_external_event_publisher.to_publish.succession = vector_to_array( + &g_external_event_publisher.published_succession_list); + + } + else { + g_external_event_publisher.to_publish.succession = NULL; + } + + g_external_event_publisher.event_queued = true; + + EXTERNAL_EVENT_PUBLISHER_UNLOCK(); + + // Wake up the publisher thread. + exchange_external_event_publisher_thr_wakeup(); +} + +/** + * Publish external events if any are pending. + */ +static void +exchange_external_events_publish() +{ + EXTERNAL_EVENT_PUBLISHER_LOCK(); + + if (g_external_event_publisher.event_queued) { + g_external_event_publisher.event_queued = false; + for (uint32_t i = 0; + i < g_external_event_publisher.event_listener_count; i++) { + (g_external_event_publisher.event_listeners[i].event_callback)( + &g_external_event_publisher.to_publish, + g_external_event_publisher.event_listeners[i].udata); + } + } + EXTERNAL_EVENT_PUBLISHER_UNLOCK(); +} + +/** + * External event publisher thread. 
+ */ +static void* +exchange_external_event_publisher_thr(void* arg) +{ + pthread_mutex_lock(&g_external_event_publisher.is_pending_mutex); + + while (true) { + pthread_cond_wait(&g_external_event_publisher.is_pending, + &g_external_event_publisher.is_pending_mutex); + if (exchange_external_event_publisher_is_running()) { + exchange_external_events_publish(); + } + else { + // Publisher stopped, exit the tread. + break; + } + } + + return NULL; +} + +/** + * Start the event publisher. + */ +static void +exchange_external_event_publisher_start() +{ + EXTERNAL_EVENT_PUBLISHER_LOCK(); + g_external_event_publisher.sys_state = AS_EXCHANGE_SYS_STATE_RUNNING; + + // Start the event publishing thread. + if (pthread_create(&g_external_event_publisher.event_publisher_tid, 0, + exchange_external_event_publisher_thr, NULL) != 0) { + CRASH("could not create event publishing thread: %s", + cf_strerror(errno)); + } + EXTERNAL_EVENT_PUBLISHER_UNLOCK(); +} + +/** + * Stop the event publisher. + */ +static void +external_event_publisher_stop() +{ + EXTERNAL_EVENT_PUBLISHER_LOCK(); + g_external_event_publisher.sys_state = AS_EXCHANGE_SYS_STATE_SHUTTING_DOWN; + EXTERNAL_EVENT_PUBLISHER_UNLOCK(); + + exchange_external_event_publisher_thr_wakeup(); + pthread_join(g_external_event_publisher.event_publisher_tid, NULL); + + EXTERNAL_EVENT_PUBLISHER_LOCK(); + g_external_event_publisher.sys_state = AS_EXCHANGE_SYS_STATE_STOPPED; + g_external_event_publisher.event_queued = false; + EXTERNAL_EVENT_PUBLISHER_UNLOCK(); +} + +/* + * ---------------------------------------------------------------------------- + * Node state related + * ---------------------------------------------------------------------------- + */ + +/** + * Initialize node state. + */ +static void +exchange_node_state_init(as_exchange_node_state* node_state) +{ + memset(node_state, 0, sizeof(*node_state)); + + node_state->data = cf_calloc(1, sizeof(as_exchange_node_data)); +} + +/** + * Reset node state. 
+ */ +static void +exchange_node_state_reset(as_exchange_node_state* node_state) +{ + node_state->send_acked = false; + node_state->received = false; + node_state->is_ready_to_commit = false; + + node_state->data->num_namespaces = 0; + for (int i = 0; i < AS_NAMESPACE_SZ; i++) { + node_state->data->namespace_data[i].local_namespace = NULL; + } +} + +/** + * Destroy node state. + */ +static void +exchange_node_state_destroy(as_exchange_node_state* node_state) +{ + for (int i = 0; i < AS_NAMESPACE_SZ; i++) { + if (node_state->data->namespace_data[i].partition_versions) { + cf_free(node_state->data->namespace_data[i].partition_versions); + } + + if (node_state->data->namespace_data[i].roster) { + cf_free(node_state->data->namespace_data[i].roster); + } + + if (node_state->data->namespace_data[i].roster_rack_ids) { + cf_free(node_state->data->namespace_data[i].roster_rack_ids); + } + } + + cf_free(node_state->data); +} + +/** + * Reduce function to match node -> node state hash to the succession list. + * Should always be invoked under a lock over the main hash. + */ +static int +exchange_node_states_reset_reduce(const void* key, void* data, void* udata) +{ + const cf_node* node = (const cf_node*)key; + as_exchange_node_state* node_state = (as_exchange_node_state*)data; + + int node_index = vector_find(&g_exchange.succession_list, node); + if (node_index < 0) { + // Node not in succession list + exchange_node_state_destroy(node_state); + return CF_SHASH_REDUCE_DELETE; + } + + exchange_node_state_reset(node_state); + return CF_SHASH_OK; +} + +/** + * Adjust the nodeid_to_node_state hash to have an entry for every node in the + * succession list with state reset for a new round of exchange. Removes entries + * not in the succession list. + */ +static void +exchange_node_states_reset() +{ + EXCHANGE_LOCK(); + + // Fix existing entries by reseting entries in succession and removing + // entries not in succession list. 
+ cf_shash_reduce(g_exchange.nodeid_to_node_state, + exchange_node_states_reset_reduce, NULL); + + // Add missing entries. + int succession_length = cf_vector_size(&g_exchange.succession_list); + + as_exchange_node_state temp_state; + for (int i = 0; i < succession_length; i++) { + cf_node nodeid; + + cf_vector_get(&g_exchange.succession_list, i, &nodeid); + if (cf_shash_get(g_exchange.nodeid_to_node_state, &nodeid, &temp_state) + == CF_SHASH_ERR_NOT_FOUND) { + exchange_node_state_init(&temp_state); + + cf_shash_put(g_exchange.nodeid_to_node_state, &nodeid, &temp_state); + } + } + + EXCHANGE_UNLOCK(); +} + +/** + * Reduce function to find nodes that had not acked self node's exchange data. + */ +static int +exchange_nodes_find_send_unacked_reduce(const void* key, void* data, + void* udata) +{ + const cf_node* node = (const cf_node*)key; + as_exchange_node_state* node_state = (as_exchange_node_state*)data; + cf_vector* unacked = (cf_vector*)udata; + + if (!node_state->send_acked) { + cf_vector_append(unacked, node); + } + return CF_SHASH_OK; +} + +/** + * Find nodes that have not acked self node's exchange data. + */ +static void +exchange_nodes_find_send_unacked(cf_vector* unacked) +{ + cf_shash_reduce(g_exchange.nodeid_to_node_state, + exchange_nodes_find_send_unacked_reduce, unacked); +} + +/** + * Reduce function to find peer nodes from whom self node has not received + * exchange data. + */ +static int +exchange_nodes_find_not_received_reduce(const void* key, void* data, + void* udata) +{ + const cf_node* node = (const cf_node*)key; + as_exchange_node_state* node_state = (as_exchange_node_state*)data; + cf_vector* not_received = (cf_vector*)udata; + + if (!node_state->received) { + cf_vector_append(not_received, node); + } + return CF_SHASH_OK; +} + +/** + * Find peer nodes from whom self node has not received exchange data. 
+ */ +static void +exchange_nodes_find_not_received(cf_vector* not_received) +{ + cf_shash_reduce(g_exchange.nodeid_to_node_state, + exchange_nodes_find_not_received_reduce, not_received); +} + +/** + * Reduce function to find peer nodes that are not ready to commit. + */ +static int +exchange_nodes_find_not_ready_to_commit_reduce(const void* key, void* data, + void* udata) +{ + const cf_node* node = (const cf_node*)key; + as_exchange_node_state* node_state = (as_exchange_node_state*)data; + cf_vector* not_ready_to_commit = (cf_vector*)udata; + + if (!node_state->is_ready_to_commit) { + cf_vector_append(not_ready_to_commit, node); + } + return CF_SHASH_OK; +} + +/** + * Find peer nodes that are not ready to commit. + */ +static void +exchange_nodes_find_not_ready_to_commit(cf_vector* not_ready_to_commit) +{ + cf_shash_reduce(g_exchange.nodeid_to_node_state, + exchange_nodes_find_not_ready_to_commit_reduce, + not_ready_to_commit); +} + +/** + * Update the node state for a node. + */ +static void +exchange_node_state_update(cf_node nodeid, as_exchange_node_state* node_state) +{ + cf_shash_put(g_exchange.nodeid_to_node_state, &nodeid, node_state); +} + +/** + * Get state of a node from the hash. If not found crash because this entry + * should be present in the hash. + */ +static void +exchange_node_state_get_safe(cf_node nodeid, as_exchange_node_state* node_state) +{ + if (cf_shash_get(g_exchange.nodeid_to_node_state, &nodeid, node_state) + == CF_SHASH_ERR_NOT_FOUND) { + CRASH( + "node entry for node %"PRIx64" missing from node state hash", nodeid); + } +} + +/* + * ---------------------------------------------------------------------------- + * Message related + * ---------------------------------------------------------------------------- + */ + +/** + * Fill compulsary fields in a message common to all message types. 
+ */ +static void +exchange_msg_src_fill(msg* msg, as_exchange_msg_type type) +{ + EXCHANGE_LOCK(); + msg_set_uint32(msg, AS_EXCHANGE_MSG_ID, AS_EXCHANGE_PROTOCOL_IDENTIFIER); + msg_set_uint64(msg, AS_EXCHANGE_MSG_CLUSTER_KEY, g_exchange.cluster_key); + msg_set_uint32(msg, AS_EXCHANGE_MSG_TYPE, type); + EXCHANGE_UNLOCK(); +} + +/** + * Get the msg buffer from a pool and fill in all compulsory fields. + * @return the msg buff with compulsory fields filled in. + */ +static msg* +exchange_msg_get(as_exchange_msg_type type) +{ + msg* msg = as_fabric_msg_get(M_TYPE_EXCHANGE); + exchange_msg_src_fill(msg, type); + return msg; +} + +/** + * Return the message buffer back to the pool. + */ +static void +exchange_msg_return(msg* msg) +{ + as_fabric_msg_put(msg); +} + +/** + * Get message id. + */ +static int +exchange_msg_id_get(msg* msg, uint32_t* msg_id) +{ + if (msg_get_uint32(msg, AS_EXCHANGE_MSG_ID, msg_id) != 0) { + return -1; + } + return 0; +} + +/** + * Get message type. + */ +static int +exchange_msg_type_get(msg* msg, as_exchange_msg_type* msg_type) +{ + if (msg_get_uint32(msg, AS_EXCHANGE_MSG_TYPE, msg_type) != 0) { + return -1; + } + return 0; +} + +/** + * Get message cluster key. + */ +static int +exchange_msg_cluster_key_get(msg* msg, as_cluster_key* cluster_key) +{ + if (msg_get_uint64(msg, AS_EXCHANGE_MSG_CLUSTER_KEY, cluster_key) != 0) { + return -1; + } + return 0; +} + +/** + * Set data payload for a message. 
+ */
+static void
+exchange_msg_data_payload_set(msg* msg)
+{
+	uint32_t ns_count = g_config.n_namespaces;
+
+	// One element per configured namespace in every per-namespace list below;
+	// all lists are indexed by the namespace's position in g_config.namespaces.
+	cf_vector_define(namespace_list, sizeof(msg_buf_ele), ns_count, 0);
+	cf_vector_define(partition_versions, sizeof(msg_buf_ele), ns_count, 0);
+	uint32_t rack_ids[ns_count];
+
+	// Roster fields are optional - they are only sent when some namespace has
+	// a non-zero roster generation (and rack ids only when one is non-zero).
+	bool have_roster = false;
+	bool have_roster_rack_ids = false;
+	uint32_t roster_generations[ns_count];
+	cf_vector_define(rosters, sizeof(msg_buf_ele), ns_count, 0);
+	cf_vector_define(rosters_rack_ids, sizeof(msg_buf_ele), ns_count, 0);
+
+	// Regime fields are optional - only sent when some regime is non-zero.
+	bool have_regimes = false;
+	uint32_t eventual_regimes[ns_count];
+	uint32_t rebalance_regimes[ns_count];
+
+	// Held until the message fields are set, since the buffer elements below
+	// point directly at namespace-owned memory rather than copies.
+	pthread_mutex_lock(&g_exchanged_info_lock);
+
+	for (uint32_t ns_ix = 0; ns_ix < ns_count; ns_ix++) {
+		as_namespace* ns = g_config.namespaces[ns_ix];
+
+		// Namespace name, without the terminating null.
+		msg_buf_ele ns_ele = {
+				.sz = (uint32_t)strlen(ns->name),
+				.ptr = (uint8_t*)ns->name
+		};
+
+		// Previously prepared partition versions payload for this namespace.
+		msg_buf_ele pv_ele = {
+				.sz = (uint32_t)g_exchange.self_data_dyn_buf[ns_ix].used_sz,
+				.ptr = g_exchange.self_data_dyn_buf[ns_ix].buf
+		};
+
+		// Roster node ids.
+		msg_buf_ele rn_ele = {
+				.sz = (uint32_t)(ns->smd_roster_count * sizeof(cf_node)),
+				.ptr = (uint8_t*)ns->smd_roster
+		};
+
+		// Roster rack ids, parallel to the roster node ids.
+		msg_buf_ele rri_ele = {
+				.sz = (uint32_t)(ns->smd_roster_count * sizeof(uint32_t)),
+				.ptr = (uint8_t*)ns->smd_roster_rack_ids
+		};
+
+		cf_vector_append(&namespace_list, &ns_ele);
+		cf_vector_append(&partition_versions, &pv_ele);
+		rack_ids[ns_ix] = ns->rack_id;
+
+		if (ns->smd_roster_generation != 0) {
+			have_roster = true;
+
+			// Once any non-zero rack id has been seen there is no need to
+			// scan further rosters.
+			if (! have_roster_rack_ids) {
+				for (uint32_t n = 0; n < ns->smd_roster_count; n++) {
+					if (ns->smd_roster_rack_ids[n] != 0) {
+						have_roster_rack_ids = true;
+						break;
+					}
+				}
+			}
+		}
+
+		// Collected unconditionally so the lists stay aligned with the
+		// namespace list; whether they are sent is decided after the loop.
+		roster_generations[ns_ix] = ns->smd_roster_generation;
+		cf_vector_append(&rosters, &rn_ele);
+		cf_vector_append(&rosters_rack_ids, &rri_ele);
+
+		eventual_regimes[ns_ix] = ns->eventual_regime;
+		rebalance_regimes[ns_ix] = ns->rebalance_regime;
+
+		if (eventual_regimes[ns_ix] != 0 || rebalance_regimes[ns_ix] != 0) {
+			have_regimes = true;
+		}
+	}
+
+	// Compulsory per-namespace fields.
+	msg_msgpack_list_set_buf(msg, AS_EXCHANGE_MSG_NAMESPACES, &namespace_list);
+	msg_msgpack_list_set_buf(msg, AS_EXCHANGE_MSG_NS_PARTITION_VERSIONS,
+			&partition_versions);
+	msg_msgpack_list_set_uint32(msg, AS_EXCHANGE_MSG_NS_RACK_IDS, rack_ids,
+			ns_count);
+
+	// Optional roster fields.
+	if (have_roster) {
+		msg_msgpack_list_set_uint32(msg, AS_EXCHANGE_MSG_NS_ROSTER_GENERATIONS,
+				roster_generations, ns_count);
+		msg_msgpack_list_set_buf(msg, AS_EXCHANGE_MSG_NS_ROSTERS, &rosters);
+
+		if (have_roster_rack_ids) {
+			msg_msgpack_list_set_buf(msg, AS_EXCHANGE_MSG_NS_ROSTERS_RACK_IDS,
+					&rosters_rack_ids);
+		}
+	}
+
+	// Optional regime fields.
+	if (have_regimes) {
+		msg_msgpack_list_set_uint32(msg, AS_EXCHANGE_MSG_NS_EVENTUAL_REGIMES,
+				eventual_regimes, ns_count);
+		msg_msgpack_list_set_uint32(msg, AS_EXCHANGE_MSG_NS_REBALANCE_REGIMES,
+				rebalance_regimes, ns_count);
+	}
+
+	pthread_mutex_unlock(&g_exchanged_info_lock);
+}
+
+/**
+ * Check sanity of an incoming message. If this check passes the message is
+ * guaranteed to have valid protocol identifier, valid type and valid matching
+ * cluster key with source node being a part of the cluster.
+ * @return true if the message is valid, false if the message is invalid and
+ * should be ignored.
+ */ +static bool +exchange_msg_is_sane(cf_node source, msg* msg) +{ + uint32_t id = 0; + if (exchange_msg_id_get(msg, &id) != 0|| + id != AS_EXCHANGE_PROTOCOL_IDENTIFIER) { + DEBUG( + "received exchange message with mismatching identifier - expected %u but was %u", + AS_EXCHANGE_PROTOCOL_IDENTIFIER, id); + return false; + } + + as_exchange_msg_type msg_type = 0; + + if (exchange_msg_type_get(msg, &msg_type) != 0 + || msg_type >= AS_EXCHANGE_MSG_TYPE_SENTINEL) { + WARNING("received exchange message with invalid message type %u", + msg_type); + return false; + } + + EXCHANGE_LOCK(); + as_cluster_key current_cluster_key = g_exchange.cluster_key; + bool is_in_cluster = vector_find(&g_exchange.succession_list, &source) >= 0; + EXCHANGE_UNLOCK(); + + if (!is_in_cluster) { + DEBUG("received exchange message from node %"PRIx64" not in cluster", + source); + return false; + } + + as_cluster_key incoming_cluster_key = 0; + if (exchange_msg_cluster_key_get(msg, &incoming_cluster_key) != 0 + || (current_cluster_key != incoming_cluster_key) + || current_cluster_key == 0) { + DEBUG("received exchange message with mismatching cluster key - expected %"PRIx64" but was %"PRIx64, + current_cluster_key, incoming_cluster_key); + return false; + } + + return true; +} + +/** + * Send a message over fabric. + * + * @param msg the message to send. + * @param dest the desination node. + * @param error_msg the error message. + */ +static void +exchange_msg_send(msg* msg, cf_node dest, char* error_msg) +{ + if (as_fabric_send(dest, msg, AS_FABRIC_CHANNEL_CTRL)) { + // Fabric will not return the message to the pool. Do it ourself. + exchange_msg_return(msg); + WARNING("%s (dest:%"PRIx64")", error_msg, dest); + } +} + +/** + * Send a message over to a list of destination nodes. + * + * @param msg the message to send. + * @param dests the node list to send the message to. + * @param num_dests the number of destination nodes. + * @param error_msg the error message. 
+ */ +static void +exchange_msg_send_list(msg* msg, cf_node* dests, int num_dests, char* error_msg) +{ + if (as_fabric_send_list(dests, num_dests, msg, AS_FABRIC_CHANNEL_CTRL) + != 0) { + // Fabric will not return the message to the pool. Do it ourself. + exchange_msg_return(msg); + as_clustering_log_cf_node_array(CF_WARNING, AS_EXCHANGE, error_msg, + dests, num_dests); + } +} + +/** + * Send a commit message to a destination node. + * @param dest the destination node. + */ +static void +exchange_commit_msg_send(cf_node dest) +{ + msg* commit_msg = exchange_msg_get(AS_EXCHANGE_MSG_TYPE_COMMIT); + DEBUG("sending commit message to node %"PRIx64, dest); + exchange_msg_send(commit_msg, dest, "error sending commit message"); +} + +/** + * Send a commit message to a list of destination nodes. + * @param dests the destination nodes. + * @param num_dests the number of destination nodes. + */ +static void +exchange_commit_msg_send_all(cf_node* dests, int num_dests) +{ + msg* commit_msg = exchange_msg_get(AS_EXCHANGE_MSG_TYPE_COMMIT); + as_clustering_log_cf_node_array(CF_DEBUG, AS_EXCHANGE, + "sending commit message to nodes:", dests, num_dests); + exchange_msg_send_list(commit_msg, dests, num_dests, + "error sending commit message"); +} + +/** + * Send ready to commit message to the principal. + */ +static void +exchange_ready_to_commit_msg_send() +{ + EXCHANGE_LOCK(); + g_exchange.ready_to_commit_send_ts = cf_getms(); + cf_node principal = g_exchange.principal; + EXCHANGE_UNLOCK(); + + msg* ready_to_commit_msg = exchange_msg_get( + AS_EXCHANGE_MSG_TYPE_READY_TO_COMMIT); + DEBUG("sending ready to commit message to node %"PRIx64, principal); + exchange_msg_send(ready_to_commit_msg, principal, + "error sending ready to commit message"); +} + +/** + * Send exchange data to all nodes that have not acked the send. 
+ */ +static void +exchange_data_msg_send_pending_ack() +{ + EXCHANGE_LOCK(); + g_exchange.send_ts = cf_getms(); + + cf_node* unacked_nodes; + int num_unacked_nodes; + cf_vector* unacked_nodes_vector = cf_vector_stack_create(cf_node); + + exchange_nodes_find_send_unacked(unacked_nodes_vector); + cf_vector_to_stack_array(unacked_nodes_vector, &unacked_nodes, + &num_unacked_nodes); + + cf_vector_destroy(unacked_nodes_vector); + + if (!num_unacked_nodes) { + goto Exit; + } + + msg* data_msg = exchange_msg_get(AS_EXCHANGE_MSG_TYPE_DATA); + exchange_msg_data_payload_set(data_msg); + + as_clustering_log_cf_node_array(CF_DEBUG, AS_EXCHANGE, + "sending exchange data to nodes:", unacked_nodes, + num_unacked_nodes); + + exchange_msg_send_list(data_msg, unacked_nodes, num_unacked_nodes, + "error sending exchange data"); +Exit: + EXCHANGE_UNLOCK(); +} + +/** + * Send a commit message to a destination node. + * @param dest the destination node. + */ +static void +exchange_data_ack_msg_send(cf_node dest) +{ + msg* ack_msg = exchange_msg_get(AS_EXCHANGE_MSG_TYPE_DATA_ACK); + DEBUG("sending data ack message to node %"PRIx64, dest); + exchange_msg_send(ack_msg, dest, "error sending data ack message"); +} + +/* + * ---------------------------------------------------------------------------- + * Data payload related + * ---------------------------------------------------------------------------- + */ + +/** + * Add a pid to the namespace hash for the input vinfo. + */ +static void +exchange_namespace_hash_pid_add(cf_shash* ns_hash, as_partition_version* vinfo, + uint16_t pid) +{ + if (as_partition_version_is_null(vinfo)) { + // Ignore NULL vinfos. + return; + } + + cf_vector* pid_vector; + + // Append the hash. + if (cf_shash_get(ns_hash, vinfo, &pid_vector) != CF_SHASH_OK) { + // We are seeing this vinfo for the first time. 
+ pid_vector = cf_vector_create(sizeof(uint16_t), + AS_EXCHANGE_VINFO_NUM_PIDS_AVG, 0); + cf_shash_put(ns_hash, vinfo, &pid_vector); + } + + cf_vector_append(pid_vector, &pid); +} + +/** + * Destroy the pid vector for each vinfo. + */ +static int +exchange_namespace_hash_destroy_reduce(const void* key, void* data, void* udata) +{ + cf_vector* pid_vector = *(cf_vector**)data; + cf_vector_destroy(pid_vector); + return CF_SHASH_REDUCE_DELETE; +} + +/** + * Serialize each vinfo and accumulated pids to the input buffer. + */ +static int +exchange_namespace_hash_serialize_reduce(const void* key, void* data, + void* udata) +{ + const as_partition_version* vinfo = (const as_partition_version*)key; + cf_vector* pid_vector = *(cf_vector**)data; + cf_dyn_buf* dyn_buf = (cf_dyn_buf*)udata; + + // Append the vinfo. + cf_dyn_buf_append_buf(dyn_buf, (uint8_t*)vinfo, sizeof(*vinfo)); + + // Append the count of pids. + uint32_t num_pids = cf_vector_size(pid_vector); + cf_dyn_buf_append_buf(dyn_buf, (uint8_t*)&num_pids, sizeof(num_pids)); + + // Append each pid. + for (int i = 0; i < num_pids; i++) { + uint16_t* pid = cf_vector_getp(pid_vector, i); + cf_dyn_buf_append_buf(dyn_buf, (uint8_t*)pid, sizeof(*pid)); + } + + return CF_SHASH_OK; +} + +/** + * Append namespace payload, in as_exchange_namespace_payload format, for a + * namespace to the dynamic buffer. + * + * @param ns the namespace. + * @param dyn_buf the dynamic buffer. + */ +static void +exchange_data_namespace_payload_add(as_namespace* ns, cf_dyn_buf* dyn_buf) +{ + // A hash from each unique non null vinfo to a vector of partition ids + // having the vinfo. 
+ cf_shash* ns_hash = cf_shash_create(exchange_vinfo_shash, + sizeof(as_partition_version), sizeof(cf_vector*), + AS_EXCHANGE_UNIQUE_VINFO_MAX_SIZE_SOFT, 0); + + as_partition* partitions = ns->partitions; + + // Populate the hash with one entry for each vinfo + for (int i = 0; i < AS_PARTITIONS; i++) { + as_partition_version* current_vinfo = &partitions[i].version; + exchange_namespace_hash_pid_add(ns_hash, current_vinfo, i); + } + + // We are ready to populate the dyn buffer with this ns's data. + DEBUG("namespace %s has %d unique vinfos", ns->name, + cf_shash_get_size(ns_hash)); + + // Append the vinfo count. + uint32_t num_vinfos = cf_shash_get_size(ns_hash); + cf_dyn_buf_append_buf(dyn_buf, (uint8_t*)&num_vinfos, sizeof(num_vinfos)); + + // Append vinfos and partitions. + cf_shash_reduce(ns_hash, exchange_namespace_hash_serialize_reduce, dyn_buf); + + // Destroy the intermediate hash and the pid vectors. + cf_shash_reduce(ns_hash, exchange_namespace_hash_destroy_reduce, NULL); + + cf_shash_destroy(ns_hash); +} + +/** + * Prepare the exchanged data payloads. + */ +static void +exchange_data_payloads_prepare() +{ + EXCHANGE_LOCK(); + + // Block / abort migrations and freeze the partition version infos. + as_partition_balance_disallow_migrations(); + as_partition_balance_synchronize_migrations(); + + for (uint32_t ns_ix = 0; ns_ix < g_config.n_namespaces; ns_ix++) { + // Append payload for each namespace. + + // TODO - add API to reset dynbuf? + g_exchange.self_data_dyn_buf[ns_ix].used_sz = 0; + + exchange_data_namespace_payload_add(g_config.namespaces[ns_ix], + &g_exchange.self_data_dyn_buf[ns_ix]); + } + + EXCHANGE_UNLOCK(); +} + +/** + * Indicates if the per-namespace fields in an incoming data message are valid. + * + * @return number of namespaces. 
+ */
+static uint32_t
+exchange_data_msg_get_num_namespaces(as_exchange_event* msg_event)
+{
+	uint32_t num_namespaces_sent = 0;
+	uint32_t num_namespace_elements_sent = 0;
+
+	// The namespace list is compulsory and fixes the element count that every
+	// other per-namespace list must match.
+	if (!msg_msgpack_container_get_count(msg_event->msg,
+			AS_EXCHANGE_MSG_NAMESPACES, &num_namespaces_sent)
+			|| num_namespaces_sent > AS_NAMESPACE_SZ) {
+		WARNING("received invalid namespaces from node %"PRIx64,
+				msg_event->msg_source);
+		return 0;
+	}
+
+	// Compulsory: partition versions.
+	if (!msg_msgpack_container_get_count(msg_event->msg,
+			AS_EXCHANGE_MSG_NS_PARTITION_VERSIONS, &num_namespace_elements_sent)
+			|| num_namespaces_sent != num_namespace_elements_sent) {
+		WARNING("received invalid partition versions from node %"PRIx64,
+				msg_event->msg_source);
+		return 0;
+	}
+
+	// Compulsory: rack ids.
+	if (!msg_msgpack_container_get_count(msg_event->msg,
+			AS_EXCHANGE_MSG_NS_RACK_IDS, &num_namespace_elements_sent)
+			|| num_namespaces_sent != num_namespace_elements_sent) {
+		WARNING("received invalid cluster groups from node %"PRIx64,
+				msg_event->msg_source);
+		return 0;
+	}
+
+	// The remaining fields are optional - each is only checked if it is set.
+
+	if (msg_is_set(msg_event->msg, AS_EXCHANGE_MSG_NS_ROSTER_GENERATIONS)
+			&& (!msg_msgpack_container_get_count(msg_event->msg,
+					AS_EXCHANGE_MSG_NS_ROSTER_GENERATIONS,
+					&num_namespace_elements_sent)
+					|| num_namespaces_sent != num_namespace_elements_sent)) {
+		WARNING("received invalid roster generations from node %"PRIx64,
+				msg_event->msg_source);
+		return 0;
+	}
+
+	if (msg_is_set(msg_event->msg, AS_EXCHANGE_MSG_NS_ROSTERS)
+			&& (!msg_msgpack_container_get_count(msg_event->msg,
+					AS_EXCHANGE_MSG_NS_ROSTERS,
+					&num_namespace_elements_sent)
+					|| num_namespaces_sent != num_namespace_elements_sent)) {
+		WARNING("received invalid rosters from node %"PRIx64,
+				msg_event->msg_source);
+		return 0;
+	}
+
+	if (msg_is_set(msg_event->msg, AS_EXCHANGE_MSG_NS_ROSTERS_RACK_IDS)
+			&& (!msg_msgpack_container_get_count(msg_event->msg,
+					AS_EXCHANGE_MSG_NS_ROSTERS_RACK_IDS,
+					&num_namespace_elements_sent)
+					|| num_namespaces_sent != num_namespace_elements_sent)) {
+		WARNING("received invalid rosters-rack-ids from node %"PRIx64,
+				msg_event->msg_source);
+		return 0;
+	}
+
+	if (msg_is_set(msg_event->msg, AS_EXCHANGE_MSG_NS_EVENTUAL_REGIMES)
+			&& (!msg_msgpack_container_get_count(msg_event->msg,
+					AS_EXCHANGE_MSG_NS_EVENTUAL_REGIMES,
+					&num_namespace_elements_sent)
+					|| num_namespaces_sent != num_namespace_elements_sent)) {
+		WARNING("received invalid eventual regimes from node %"PRIx64,
+				msg_event->msg_source);
+		return 0;
+	}
+
+	if (msg_is_set(msg_event->msg, AS_EXCHANGE_MSG_NS_REBALANCE_REGIMES)
+			&& (!msg_msgpack_container_get_count(msg_event->msg,
+					AS_EXCHANGE_MSG_NS_REBALANCE_REGIMES,
+					&num_namespace_elements_sent)
+					|| num_namespaces_sent != num_namespace_elements_sent)) {
+		WARNING("received invalid rebalance regimes from node %"PRIx64,
+				msg_event->msg_source);
+		return 0;
+	}
+
+	return num_namespaces_sent;
+}
+
+/**
+ * Basic validation for incoming namespace payload.
+ * Validates that
+ * 1. Number of vinfos <= AS_PARTITIONS.
+ * 2. Each vinfo lists at most AS_PARTITIONS pids and each pid is
+ * < AS_PARTITIONS.
+ * 3. Namespaces payload ends exactly at payload_end_ptr - it neither exceeds
+ * it nor leaves trailing bytes.
+ *
+ * @param ns_payload pointer to start of the namespace payload.
+ * @param ns_payload_size the size of the input namespace payload.
+ * @return true if this is a valid payload.
+ */
+static bool
+exchange_namespace_payload_is_valid(as_exchange_ns_vinfos_payload* ns_payload,
+		uint32_t ns_payload_size)
+{
+	// Pointer past the last byte in the payload.
+	uint8_t* payload_end_ptr = (uint8_t*)ns_payload + ns_payload_size;
+
+	// The fixed header must fit before num_vinfos is read.
+	if ((uint8_t*)ns_payload->vinfos > payload_end_ptr) {
+		return false;
+	}
+
+	if (ns_payload->num_vinfos > AS_PARTITIONS) {
+		return false;
+	}
+
+	uint8_t* read_ptr = (uint8_t*)ns_payload->vinfos;
+
+	for (uint32_t i = 0; i < ns_payload->num_vinfos; i++) {
+		// More vinfos are claimed than there are bytes left.
+		if (read_ptr >= payload_end_ptr) {
+			return false;
+		}
+
+		as_exchange_vinfo_payload* vinfo_payload =
+				(as_exchange_vinfo_payload*)read_ptr;
+
+		// The vinfo header must fit before num_pids is read.
+		if ((uint8_t*)vinfo_payload->pids > payload_end_ptr) {
+			return false;
+		}
+
+		if (vinfo_payload->num_pids > AS_PARTITIONS) {
+			return false;
+		}
+
+		size_t pids_size = vinfo_payload->num_pids * sizeof(uint16_t);
+
+		// The pid array must fit before any pid is read.
+		if ((uint8_t*)vinfo_payload->pids + pids_size > payload_end_ptr) {
+			return false;
+		}
+
+		for (uint32_t j = 0; j < vinfo_payload->num_pids; j++) {
+			if (vinfo_payload->pids[j] >= AS_PARTITIONS) {
+				return false;
+			}
+		}
+
+		// Advance to the next variable length vinfo entry.
+		read_ptr += sizeof(as_exchange_vinfo_payload) + pids_size;
+	}
+
+	if (read_ptr != payload_end_ptr) {
+		// There are unaccounted for extra bytes in the payload.
+		return false;
+	}
+
+	return true;
+}
+
+/*
+ * ----------------------------------------------------------------------------
+ * Common across all states
+ * ----------------------------------------------------------------------------
+ */
+
+/**
+ * Indicates if self node is the cluster principal.
+ *
+ * @return true if g_config.self_node equals the current exchange principal.
+ */
+static bool
+exchange_self_is_principal()
+{
+	EXCHANGE_LOCK();
+	bool is_principal = (g_config.self_node == g_exchange.principal);
+	EXCHANGE_UNLOCK();
+	return is_principal;
+}
+
+/**
+ * Dump exchange state.
+ */
+static void
+exchange_dump(cf_fault_severity severity, bool verbose)
+{
+	EXCHANGE_LOCK();
+	cf_vector* pending = cf_vector_stack_create(cf_node);
+
+	// Human readable name of the current state; empty for an unknown state.
+	char* state_name = "";
+
+	if (g_exchange.state == AS_EXCHANGE_STATE_REST) {
+		state_name = "rest";
+	}
+	else if (g_exchange.state == AS_EXCHANGE_STATE_EXCHANGING) {
+		state_name = "exchanging";
+	}
+	else if (g_exchange.state == AS_EXCHANGE_STATE_READY_TO_COMMIT) {
+		state_name = "ready to commit";
+	}
+	else if (g_exchange.state == AS_EXCHANGE_STATE_ORPHANED) {
+		state_name = "orphaned";
+	}
+
+	LOG(severity, "EXG: state: %s", state_name);
+
+	if (g_exchange.state != AS_EXCHANGE_STATE_ORPHANED) {
+		// Part of a cluster - describe it.
+		LOG(severity, "EXG: cluster key: %"PRIx64, g_exchange.cluster_key);
+		as_clustering_log_cf_node_vector(severity, AS_EXCHANGE,
+				"EXG: succession:", &g_exchange.succession_list);
+
+		if (verbose) {
+			// The same scratch vector is reused for each pending list.
+			vector_clear(pending);
+			exchange_nodes_find_send_unacked(pending);
+			as_clustering_log_cf_node_vector(severity, AS_EXCHANGE,
+					"EXG: send pending:", pending);
+
+			vector_clear(pending);
+			exchange_nodes_find_not_received(pending);
+			as_clustering_log_cf_node_vector(severity, AS_EXCHANGE,
+					"EXG: receive pending:", pending);
+
+			if (exchange_self_is_principal()) {
+				vector_clear(pending);
+				exchange_nodes_find_not_ready_to_commit(pending);
+				as_clustering_log_cf_node_vector(severity, AS_EXCHANGE,
+						"EXG: ready to commit pending:", pending);
+			}
+		}
+	}
+	else {
+		// Orphaned - report whether transactions are blocked and for how long
+		// the node has been an orphan.
+		LOG(severity, "EXG: client transactions blocked: %s",
+				g_exchange.orphan_state_are_transactions_blocked ?
+						"true" : "false");
+		LOG(severity, "EXG: orphan since: %"PRIu64"(millis)",
+				cf_getms() - g_exchange.orphan_state_start_time);
+	}
+
+	cf_vector_destroy(pending);
+	EXCHANGE_UNLOCK();
+}
+
+/**
+ * Reset state for new round of exchange, while reusing as much heap allocated
+ * space for exchanged data.
+ * @param new_succession_list new succession list. Can be NULL for orphaned
+ * state.
+ * @param new_cluster_key 0 for orphaned state.
+ */
+static void
+exchange_reset_for_new_round(cf_vector* new_succession_list,
+		as_cluster_key new_cluster_key)
+{
+	EXCHANGE_LOCK();
+	vector_clear(&g_exchange.succession_list);
+	g_exchange.principal = 0;
+
+	if (new_succession_list && cf_vector_size(new_succession_list) > 0) {
+		vector_copy(&g_exchange.succession_list, new_succession_list);
+		// Set the principal node - the first node in the succession list.
+		cf_vector_get(&g_exchange.succession_list, 0, &g_exchange.principal);
+		g_exchange.cluster_size = cf_vector_size(new_succession_list);
+	}
+	else {
+		// No succession list - principal stays 0 and the cluster is empty.
+		g_exchange.cluster_size = 0;
+	}
+
+	// Reset accumulated node states. Done after the succession list is
+	// updated, since the reset matches hash entries against that list.
+	exchange_node_states_reset();
+
+	g_exchange.cluster_key = new_cluster_key;
+	EXCHANGE_UNLOCK();
+}
+
+/**
+ * Commit exchange state to reflect self node being an orphan. Clears the
+ * committed cluster details, reverts partition balance to orphan and marks
+ * client transactions as blocked.
+ */
+static void
+exchange_orphan_commit()
+{
+	EXCHANGE_LOCK();
+	g_exchange.committed_cluster_key = 0;
+	g_exchange.committed_cluster_size = 0;
+	g_exchange.committed_principal = 0;
+	vector_clear(&g_exchange.committed_succession_list);
+	WARNING("blocking client transactions in orphan state!");
+	as_partition_balance_revert_to_orphan();
+	g_exchange.orphan_state_are_transactions_blocked = true;
+	EXCHANGE_UNLOCK();
+}
+
+/**
+ * Receive an orphaned event and abort current round.
+ *
+ * @param orphaned_event the orphaned event. Not read by this function.
+ */
+static void
+exchange_orphaned_handle(as_clustering_event* orphaned_event)
+{
+	DEBUG("got orphaned event");
+
+	// NOTE(review): exchange_reset_for_new_round() below takes EXCHANGE_LOCK
+	// again while it is held here, so the lock is presumably recursive -
+	// confirm against the lock definition.
+	EXCHANGE_LOCK();
+
+	// Only log an abort if an exchange round was actually in progress.
+	if (g_exchange.state != AS_EXCHANGE_STATE_REST
+			&& g_exchange.state != AS_EXCHANGE_STATE_ORPHANED) {
+		INFO("aborting partition exchange with cluster key %"PRIx64,
+				g_exchange.cluster_key);
+	}
+
+	g_exchange.state = AS_EXCHANGE_STATE_ORPHANED;
+	exchange_reset_for_new_round(NULL, 0);
+
+	// Stop ongoing migrations if any.
+	as_partition_balance_disallow_migrations();
+	as_partition_balance_synchronize_migrations();
+
+	// Update the time this node got into orphan state.
+	g_exchange.orphan_state_start_time = cf_getms();
+
+	// Potentially temporary orphan state. We will timeout and commit orphan
+	// state if this persists for long.
+	g_exchange.orphan_state_are_transactions_blocked = false;
+
+	EXCHANGE_UNLOCK();
+}
+
+/**
+ * Receive a cluster change event and start a new data exchange round.
+ *
+ * @param clustering_event the cluster change event, supplying the new
+ * succession list and cluster key.
+ */
+static void
+exchange_cluster_change_handle(as_clustering_event* clustering_event)
+{
+	EXCHANGE_LOCK();
+
+	DEBUG("got cluster change event");
+
+	// Only log an abort if an exchange round was actually in progress.
+	if (g_exchange.state != AS_EXCHANGE_STATE_REST
+			&& g_exchange.state != AS_EXCHANGE_STATE_ORPHANED) {
+		INFO("aborting partition exchange with cluster key %"PRIx64,
+				g_exchange.cluster_key);
+	}
+
+	exchange_reset_for_new_round(clustering_event->succession_list,
+			clustering_event->cluster_key);
+
+	g_exchange.state = AS_EXCHANGE_STATE_EXCHANGING;
+
+	INFO("data exchange started with cluster key %"PRIx64,
+			g_exchange.cluster_key);
+
+	// Prepare the data payloads.
+	exchange_data_payloads_prepare();
+
+	EXCHANGE_UNLOCK();
+
+	// Sent after the unlock above; the send function takes the lock itself.
+	exchange_data_msg_send_pending_ack();
+}
+
+/**
+ * Handle a clustering event, dispatching on its type to the orphaned or the
+ * cluster change handler. Other event types are ignored.
+ * @param exchange_clustering_event the exchange event wrapping the clustering
+ * event.
+ */
+static void
+exchange_clustering_event_handle(as_exchange_event* exchange_clustering_event)
+{
+	as_clustering_event* clustering_event =
+			exchange_clustering_event->clustering_event;
+
+	switch (clustering_event->type) {
+	case AS_CLUSTERING_ORPHANED:
+		exchange_orphaned_handle(clustering_event);
+		break;
+	case AS_CLUSTERING_CLUSTER_CHANGED:
+		exchange_cluster_change_handle(clustering_event);
+		break;
+	}
+}
+
+/*
+ * ----------------------------------------------------------------------------
+ * Orphan state event handling
+ * ----------------------------------------------------------------------------
+ */
+
+/**
+ * The wait time in orphan state after which client transactions and transaction
+ * related interactions (e.g. valid partition map publishing) should be blocked.
+ */ +static uint32_t +exchange_orphan_transaction_block_timeout() +{ + return (uint32_t)as_clustering_quantum_interval() + * AS_EXCHANGE_REVERT_ORPHAN_INTERVALS; +} + +/** + * Handle the timer event and if we have been an orphan for too long, block + * client transactions. + */ +static void +exchange_orphan_timer_event_handle() +{ + uint32_t timeout = exchange_orphan_transaction_block_timeout(); + EXCHANGE_LOCK(); + if (!g_exchange.orphan_state_are_transactions_blocked + && g_exchange.orphan_state_start_time + timeout < cf_getms()) { + exchange_orphan_commit(); + } + EXCHANGE_UNLOCK(); +} + +/** + * Event processing in the orphan state. + */ +static void +exchange_orphan_event_handle(as_exchange_event* event) +{ + switch (event->type) { + case AS_EXCHANGE_EVENT_CLUSTER_CHANGE: + exchange_clustering_event_handle(event); + break; + case AS_EXCHANGE_EVENT_TIMER: + exchange_orphan_timer_event_handle(); + break; + default: + break; + } +} + +/* + * ---------------------------------------------------------------------------- + * Rest state event handling + * ---------------------------------------------------------------------------- + */ + +/** + * Process a message event when in rest state. + */ +static void +exchange_rest_msg_event_handle(as_exchange_event* msg_event) +{ + EXCHANGE_LOCK(); + + if (!exchange_msg_is_sane(msg_event->msg_source, msg_event->msg)) { + goto Exit; + } + + as_exchange_msg_type msg_type; + exchange_msg_type_get(msg_event->msg, &msg_type); + + if (exchange_self_is_principal() + && msg_type == AS_EXCHANGE_MSG_TYPE_READY_TO_COMMIT) { + // The commit message did not make it to the source node, hence it send + // us the ready to commit message. Resend the commit message. 
+ DEBUG("received a ready to commit message from %"PRIx64, + msg_event->msg_source); + exchange_commit_msg_send(msg_event->msg_source); + } + else { + DEBUG( + "rest state received unexpected mesage of type %d from node %"PRIx64, + msg_type, msg_event->msg_source); + + } + +Exit: + + EXCHANGE_UNLOCK(); +} + +/** + * Event processing in the rest state. + */ +static void +exchange_rest_event_handle(as_exchange_event* event) +{ + switch (event->type) { + case AS_EXCHANGE_EVENT_CLUSTER_CHANGE: + exchange_clustering_event_handle(event); + break; + case AS_EXCHANGE_EVENT_MSG: + exchange_rest_msg_event_handle(event); + break; + default: + break; + } +} + +/* + * ---------------------------------------------------------------------------- + * Exchanging state event handling + * ---------------------------------------------------------------------------- + */ + +/** + * Commit namespace payload for a node. + * Assumes the namespace vinfo and succession list have been zero set before. + */ +static void +exchange_namespace_payload_pre_commit_for_node(cf_node node, + as_exchange_node_namespace_data* namespace_data) +{ + as_namespace* ns = namespace_data->local_namespace; + + uint32_t sl_ix = ns->cluster_size++; + + ns->succession[sl_ix] = node; + + as_exchange_ns_vinfos_payload* ns_payload = + namespace_data->partition_versions; + uint8_t* read_ptr = (uint8_t*)ns_payload->vinfos; + + for (int i = 0; i < ns_payload->num_vinfos; i++) { + as_exchange_vinfo_payload* vinfo_payload = + (as_exchange_vinfo_payload*)read_ptr; + + for (int j = 0; j < vinfo_payload->num_pids; j++) { + memcpy(&ns->cluster_versions[sl_ix][vinfo_payload->pids[j]], + &vinfo_payload->vinfo, sizeof(vinfo_payload->vinfo)); + } + + read_ptr += sizeof(as_exchange_vinfo_payload) + + vinfo_payload->num_pids * sizeof(uint16_t); + } + + ns->rack_ids[sl_ix] = namespace_data->rack_id; + + if (namespace_data->roster_generation > ns->roster_generation) { + ns->roster_generation = namespace_data->roster_generation; + 
ns->roster_count = namespace_data->roster_count; + + memcpy(ns->roster, namespace_data->roster, + ns->roster_count * sizeof(cf_node)); + + if (namespace_data->roster_rack_ids) { + memcpy(ns->roster_rack_ids, namespace_data->roster_rack_ids, + ns->roster_count * sizeof(uint32_t)); + } + else { + memset(ns->roster_rack_ids, 0, ns->roster_count * sizeof(uint32_t)); + } + } + + if (namespace_data->eventual_regime > ns->eventual_regime) { + ns->eventual_regime = namespace_data->eventual_regime; + } + + ns->rebalance_regimes[sl_ix] = namespace_data->rebalance_regime; +} + +/** + * Commit exchange data for a given node. + */ +static void +exchange_data_pre_commit_for_node(cf_node node) +{ + EXCHANGE_LOCK(); + as_exchange_node_state node_state; + exchange_node_state_get_safe(node, &node_state); + + for (uint32_t i = 0; i < node_state.data->num_namespaces; i++) { + exchange_namespace_payload_pre_commit_for_node(node, + &node_state.data->namespace_data[i]); + } + + EXCHANGE_UNLOCK(); +} + +/** + * Check that there's not a mixture of AP and CP nodes in any namespace. + */ +static bool +exchange_data_pre_commit_ap_cp_check() +{ + for (uint32_t i = 0; i < g_config.n_namespaces; i++) { + as_namespace* ns = g_config.namespaces[i]; + + cf_node ap_node = (cf_node)0; + cf_node cp_node = (cf_node)0; + + for (uint32_t n = 0; n < ns->cluster_size; n++) { + if (ns->rebalance_regimes[n] == 0) { + ap_node = ns->succession[n]; + } + else { + cp_node = ns->succession[n]; + } + } + + if (ap_node != (cf_node)0 && cp_node != (cf_node)0) { + WARNING("{%s} has mixture of AP and SC nodes - for example %lx is AP and %lx is SC", + ns->name, ap_node, cp_node); + return false; + } + } + + return true; +} + +/** + * Pre commit namespace data anticipating a successful commit from the + * principal. This pre commit is to ensure regime advances in cp mode to cover + * the case where the principal commits exchange data but the commit to a + * non-principal is lost. 
+ */ +static bool +exchange_exchanging_pre_commit() +{ + EXCHANGE_LOCK(); + pthread_mutex_lock(&g_exchanged_info_lock); + + // Reset exchange data for all namespaces. + for (int i = 0; i < g_config.n_namespaces; i++) { + as_namespace* ns = g_config.namespaces[i]; + memset(ns->succession, 0, sizeof(ns->succession)); + + // Assuming zero to represent "null" partition. + memset(ns->cluster_versions, 0, sizeof(ns->cluster_versions)); + + memset(ns->rack_ids, 0, sizeof(ns->rack_ids)); + + ns->roster_generation = 0; + ns->roster_count = 0; + memset(ns->roster, 0, sizeof(ns->roster)); + memset(ns->roster_rack_ids, 0, sizeof(ns->roster_rack_ids)); + + ns->eventual_regime = 0; + // Note - not clearing ns->rebalance_regime - it's not set here. + memset(ns->rebalance_regimes, 0, sizeof(ns->rebalance_regimes)); + + // Reset ns cluster size to zero. + ns->cluster_size = 0; + } + + // Fill the namespace partition version info in succession list order. + int num_nodes = cf_vector_size(&g_exchange.succession_list); + for (int i = 0; i < num_nodes; i++) { + cf_node node; + cf_vector_get(&g_exchange.succession_list, i, &node); + exchange_data_pre_commit_for_node(node); + } + + // Collected all exchanged data - do final configuration consistency checks. + if (! exchange_data_pre_commit_ap_cp_check()) { + WARNING("abandoned exchange - fix configuration conflict"); + pthread_mutex_unlock(&g_exchanged_info_lock); + EXCHANGE_UNLOCK(); + return false; + } + + for (int i = 0; i < g_config.n_namespaces; i++) { + as_namespace* ns = g_config.namespaces[i]; + + if (ns->eventual_regime != 0) { + ns->eventual_regime += 2; + + // TODO - until future storage format change, we'll use partition 0 + // to save and restore ns->eventual_regime. + + // Ok to not take partition lock. 
+ as_partition* p = &ns->partitions[0]; + + as_storage_info_set(ns, p, true); + + INFO("{%s} eventual-regime %u ready", ns->name, + ns->eventual_regime); + } + } + + pthread_mutex_unlock(&g_exchanged_info_lock); + EXCHANGE_UNLOCK(); + + return true; +} + +/** + * Check to see if all exchange data is sent and received. If so switch to + * ready_to_commit state. + */ +static void +exchange_exchanging_check_switch_ready_to_commit() +{ + EXCHANGE_LOCK(); + + cf_vector* node_vector = cf_vector_stack_create(cf_node); + bool ready_to_commit = false; + + if (g_exchange.state == AS_EXCHANGE_STATE_REST + || g_exchange.cluster_key == 0) { + goto Exit; + } + + exchange_nodes_find_send_unacked(node_vector); + if (cf_vector_size(node_vector) > 0) { + // We still have unacked exchange send messages. + goto Exit; + } + + vector_clear(node_vector); + exchange_nodes_find_not_received(node_vector); + if (cf_vector_size(node_vector) > 0) { + // We still haven't received exchange messages from all nodes in the + // succession list. + goto Exit; + } + + g_exchange.state = AS_EXCHANGE_STATE_READY_TO_COMMIT; + + ready_to_commit = true; + + DEBUG("ready to commit exchange data for cluster key %"PRIx64, + g_exchange.cluster_key); + +Exit: + cf_vector_destroy(node_vector); + + if (ready_to_commit && exchange_exchanging_pre_commit()) { + exchange_ready_to_commit_msg_send(); + } + + EXCHANGE_UNLOCK(); +} + +/** + * Handle incoming data message. + * + * Assumes the message has been checked for sanity. 
+ */ +static void +exchange_exchanging_data_msg_handle(as_exchange_event* msg_event) +{ + EXCHANGE_LOCK(); + + DEBUG("received exchange data from node %"PRIx64, msg_event->msg_source); + + as_exchange_node_state node_state; + exchange_node_state_get_safe(msg_event->msg_source, &node_state); + + if (!node_state.received) { + uint32_t num_namespaces_sent = exchange_data_msg_get_num_namespaces( + msg_event); + + if (num_namespaces_sent == 0) { + WARNING("ignoring invalid exchange data from node %"PRIx64, + msg_event->msg_source); + goto Exit; + } + + cf_vector_define(namespace_list, sizeof(msg_buf_ele), + num_namespaces_sent, 0); + cf_vector_define(partition_versions, sizeof(msg_buf_ele), + num_namespaces_sent, 0); + uint32_t rack_ids[num_namespaces_sent]; + + uint32_t roster_generations[num_namespaces_sent]; + cf_vector_define(rosters, sizeof(msg_buf_ele), num_namespaces_sent, 0); + cf_vector_define(rosters_rack_ids, sizeof(msg_buf_ele), + num_namespaces_sent, 0); + + memset(roster_generations, 0, sizeof(roster_generations)); + + uint32_t eventual_regimes[num_namespaces_sent]; + uint32_t rebalance_regimes[num_namespaces_sent]; + + memset(eventual_regimes, 0, sizeof(eventual_regimes)); + memset(rebalance_regimes, 0, sizeof(rebalance_regimes)); + + if (!msg_msgpack_list_get_buf_array_presized(msg_event->msg, + AS_EXCHANGE_MSG_NAMESPACES, &namespace_list)) { + WARNING("received invalid namespaces from node %"PRIx64, + msg_event->msg_source); + goto Exit; + } + + if (!msg_msgpack_list_get_buf_array_presized(msg_event->msg, + AS_EXCHANGE_MSG_NS_PARTITION_VERSIONS, &partition_versions)) { + WARNING("received invalid partition versions from node %"PRIx64, + msg_event->msg_source); + goto Exit; + } + + uint32_t num_rack_ids = num_namespaces_sent; + + if (!msg_msgpack_list_get_uint32_array(msg_event->msg, + AS_EXCHANGE_MSG_NS_RACK_IDS, rack_ids, &num_rack_ids)) { + WARNING("received invalid cluster groups from node %"PRIx64, + msg_event->msg_source); + goto Exit; + } + + 
uint32_t num_roster_generations = num_namespaces_sent; + + if (msg_is_set(msg_event->msg, AS_EXCHANGE_MSG_NS_ROSTER_GENERATIONS) + && !msg_msgpack_list_get_uint32_array(msg_event->msg, + AS_EXCHANGE_MSG_NS_ROSTER_GENERATIONS, + roster_generations, &num_roster_generations)) { + WARNING("received invalid roster generations from node %"PRIx64, + msg_event->msg_source); + goto Exit; + } + + if (msg_is_set(msg_event->msg, AS_EXCHANGE_MSG_NS_ROSTERS) + && !msg_msgpack_list_get_buf_array_presized(msg_event->msg, + AS_EXCHANGE_MSG_NS_ROSTERS, &rosters)) { + WARNING("received invalid rosters from node %"PRIx64, + msg_event->msg_source); + goto Exit; + } + + if (msg_is_set(msg_event->msg, AS_EXCHANGE_MSG_NS_ROSTERS_RACK_IDS) + && !msg_msgpack_list_get_buf_array_presized(msg_event->msg, + AS_EXCHANGE_MSG_NS_ROSTERS_RACK_IDS, + &rosters_rack_ids)) { + WARNING("received invalid rosters-rack-ids from node %"PRIx64, + msg_event->msg_source); + goto Exit; + } + + uint32_t num_eventual_regimes = num_namespaces_sent; + + if (msg_is_set(msg_event->msg, AS_EXCHANGE_MSG_NS_EVENTUAL_REGIMES) + && !msg_msgpack_list_get_uint32_array(msg_event->msg, + AS_EXCHANGE_MSG_NS_EVENTUAL_REGIMES, eventual_regimes, + &num_eventual_regimes)) { + WARNING("received invalid eventual regimes from node %"PRIx64, + msg_event->msg_source); + goto Exit; + } + + uint32_t num_rebalance_regimes = num_namespaces_sent; + + if (msg_is_set(msg_event->msg, AS_EXCHANGE_MSG_NS_REBALANCE_REGIMES) + && !msg_msgpack_list_get_uint32_array(msg_event->msg, + AS_EXCHANGE_MSG_NS_REBALANCE_REGIMES, rebalance_regimes, + &num_rebalance_regimes)) { + WARNING("received invalid rebalance regimes from node %"PRIx64, + msg_event->msg_source); + goto Exit; + } + + node_state.data->num_namespaces = 0; + + for (uint32_t i = 0; i < num_namespaces_sent; i++) { + msg_buf_ele* namespace_name_element = cf_vector_getp( + &namespace_list, i); + + // Find a match for the namespace. 
+ as_namespace* matching_namespace = as_namespace_get_bybuf( + namespace_name_element->ptr, namespace_name_element->sz); + + if (!matching_namespace) { + continue; + } + + as_exchange_node_namespace_data* namespace_data = + &node_state.data->namespace_data[node_state.data->num_namespaces]; + node_state.data->num_namespaces++; + + namespace_data->local_namespace = matching_namespace; + namespace_data->rack_id = rack_ids[i]; + namespace_data->roster_generation = roster_generations[i]; + namespace_data->eventual_regime = eventual_regimes[i]; + namespace_data->rebalance_regime = rebalance_regimes[i]; + + // Copy partition versions. + msg_buf_ele* partition_versions_element = cf_vector_getp( + &partition_versions, i); + + if (!exchange_namespace_payload_is_valid( + (as_exchange_ns_vinfos_payload*)partition_versions_element->ptr, + partition_versions_element->sz)) { + WARNING( + "received invalid partition versions for namespace %s from node %"PRIx64, + matching_namespace->name, msg_event->msg_source); + goto Exit; + } + + namespace_data->partition_versions = cf_realloc( + namespace_data->partition_versions, + partition_versions_element->sz); + + memcpy(namespace_data->partition_versions, + partition_versions_element->ptr, + partition_versions_element->sz); + + // Copy rosters. + // TODO - make this piece a utility function? 
+ if (namespace_data->roster_generation == 0) { + namespace_data->roster_count = 0; + } + else { + msg_buf_ele* roster_ele = cf_vector_getp(&rosters, i); + + namespace_data->roster_count = roster_ele->sz / sizeof(cf_node); + + if (namespace_data->roster_count == 0 + || namespace_data->roster_count > AS_CLUSTER_SZ + || roster_ele->sz % sizeof(cf_node) != 0) { + WARNING( + "received invalid roster for namespace %s from node %"PRIx64, + matching_namespace->name, msg_event->msg_source); + goto Exit; + } + + namespace_data->roster = + cf_realloc(namespace_data->roster, roster_ele->sz); + + memcpy(namespace_data->roster, roster_ele->ptr, roster_ele->sz); + + uint32_t rri_ele_sz = 0; + + if (cf_vector_size(&rosters_rack_ids) != 0) { + msg_buf_ele* rri_ele = cf_vector_getp(&rosters_rack_ids, i); + + if (rri_ele->sz != 0) { + rri_ele_sz = rri_ele->sz; + + if (rri_ele_sz != + namespace_data->roster_count * sizeof(uint32_t)) { + WARNING( + "received invalid roster-rack-ids for namespace %s from node %"PRIx64, + matching_namespace->name, msg_event->msg_source); + goto Exit; + } + + namespace_data->roster_rack_ids = + cf_realloc(namespace_data->roster_rack_ids, + rri_ele_sz); + + memcpy(namespace_data->roster_rack_ids, rri_ele->ptr, + rri_ele_sz); + } + } + + if (rri_ele_sz == 0 && namespace_data->roster_rack_ids) { + cf_free(namespace_data->roster_rack_ids); + namespace_data->roster_rack_ids = NULL; + } + } + } + + // Mark exchange data received from the source. + node_state.received = true; + exchange_node_state_update(msg_event->msg_source, &node_state); + } + else { + // Duplicate pinfo received. Ignore. + INFO("received duplicate exchange data from node %"PRIx64, + msg_event->msg_source); + } + + // Send an acknowledgement. + exchange_data_ack_msg_send(msg_event->msg_source); + + // Check if we can switch to ready to commit state. + exchange_exchanging_check_switch_ready_to_commit(); + +Exit: + EXCHANGE_UNLOCK(); +} + +/** + * Handle incoming data ack message. 
+ * + * Assumes the message has been checked for sanity. + */ +static void +exchange_exchanging_data_ack_msg_handle(as_exchange_event* msg_event) +{ + EXCHANGE_LOCK(); + + DEBUG("received exchange data ack from node %"PRIx64, + msg_event->msg_source); + + as_exchange_node_state node_state; + exchange_node_state_get_safe(msg_event->msg_source, &node_state); + + if (!node_state.send_acked) { + // Mark send as acked in the node state. + node_state.send_acked = true; + exchange_node_state_update(msg_event->msg_source, &node_state); + } + else { + // Duplicate ack. Ignore. + DEBUG("received duplicate data ack from node %"PRIx64, + msg_event->msg_source); + } + + // We might have send and received all partition info. Check for completion. + exchange_exchanging_check_switch_ready_to_commit(); + + EXCHANGE_UNLOCK(); +} + +/** + * Process a message event when in exchanging state. + */ +static void +exchange_exchanging_msg_event_handle(as_exchange_event* msg_event) +{ + EXCHANGE_LOCK(); + + if (!exchange_msg_is_sane(msg_event->msg_source, msg_event->msg)) { + goto Exit; + } + + as_exchange_msg_type msg_type; + exchange_msg_type_get(msg_event->msg, &msg_type); + + switch (msg_type) { + case AS_EXCHANGE_MSG_TYPE_DATA: + exchange_exchanging_data_msg_handle(msg_event); + break; + case AS_EXCHANGE_MSG_TYPE_DATA_ACK: + exchange_exchanging_data_ack_msg_handle(msg_event); + break; + default: + DEBUG( + "exchanging state received unexpected mesage of type %d from node %"PRIx64, + msg_type, msg_event->msg_source); + } +Exit: + EXCHANGE_UNLOCK(); +} + +/** + * Process a message event when in exchanging state. + */ +static void +exchange_exchanging_timer_event_handle(as_exchange_event* msg_event) +{ + EXCHANGE_LOCK(); + bool send_data = false; + + cf_clock now = cf_getms(); + + // The timeout is a "linear" step function, where the timeout is constant + // for the step interval. 
+ cf_clock min_timeout = EXCHANGE_SEND_MIN_TIMEOUT(); + cf_clock max_timeout = EXCHANGE_SEND_MAX_TIMEOUT(); + uint32_t step_interval = EXCHANGE_SEND_STEP_INTERVAL(); + cf_clock timeout = MAX(min_timeout, + MIN(max_timeout, + min_timeout + * ((now - g_exchange.send_ts) / step_interval))); + + if (g_exchange.send_ts + timeout < now) { + send_data = true; + } + + EXCHANGE_UNLOCK(); + + if (send_data) { + exchange_data_msg_send_pending_ack(); + } +} + +/** + * Event processing in the exchanging state. + */ +static void +exchange_exchanging_event_handle(as_exchange_event* event) +{ + switch (event->type) { + case AS_EXCHANGE_EVENT_CLUSTER_CHANGE: + exchange_clustering_event_handle(event); + break; + case AS_EXCHANGE_EVENT_MSG: + exchange_exchanging_msg_event_handle(event); + break; + case AS_EXCHANGE_EVENT_TIMER: + exchange_exchanging_timer_event_handle(event); + break; + } +} + +/* + * ---------------------------------------------------------------------------- + * Ready_To_Commit state event handling + * ---------------------------------------------------------------------------- + */ + +/** + * Handle incoming ready to commit message. + * + * Assumes the message has been checked for sanity. + */ +static void +exchange_ready_to_commit_rtc_msg_handle(as_exchange_event* msg_event) +{ + if (!exchange_self_is_principal()) { + WARNING( + "non-principal self received ready to commit message from %"PRIx64" - ignoring", + msg_event->msg_source); + return; + } + + EXCHANGE_LOCK(); + + DEBUG("received ready to commit from node %"PRIx64, msg_event->msg_source); + + as_exchange_node_state node_state; + exchange_node_state_get_safe(msg_event->msg_source, &node_state); + + if (!node_state.is_ready_to_commit) { + // Mark as ready to commit in the node state. + node_state.is_ready_to_commit = true; + exchange_node_state_update(msg_event->msg_source, &node_state); + } + else { + // Duplicate ready to commit received. Ignore. 
+ INFO("received duplicate ready to commit message from node %"PRIx64, + msg_event->msg_source); + } + + cf_vector* node_vector = cf_vector_stack_create(cf_node); + exchange_nodes_find_not_ready_to_commit(node_vector); + + if (cf_vector_size(node_vector) <= 0) { + // Send a commit message to all nodes in succession list. + cf_node* node_list = NULL; + int num_node_list = 0; + cf_vector_to_stack_array(&g_exchange.succession_list, &node_list, + &num_node_list); + exchange_commit_msg_send_all(node_list, num_node_list); + } + + cf_vector_destroy(node_vector); + + EXCHANGE_UNLOCK(); +} + +/** + * Commit accumulated exchange data. + */ +static void +exchange_data_commit() +{ + EXCHANGE_LOCK(); + + INFO("data exchange completed with cluster key %"PRIx64, + g_exchange.cluster_key); + + // Exchange is done, use the current cluster details as the committed + // cluster details. + g_exchange.committed_cluster_key = g_exchange.cluster_key; + g_exchange.committed_cluster_size = g_exchange.cluster_size; + g_exchange.committed_principal = g_exchange.principal; + vector_clear(&g_exchange.committed_succession_list); + vector_copy(&g_exchange.committed_succession_list, + &g_exchange.succession_list); + + // Force an update of the skew, to ensure new nodes if any have been checked + // for skew. + as_skew_monitor_update(); + + // Must cover partition balance since it may manipulate ns->cluster_size. + pthread_mutex_lock(&g_exchanged_info_lock); + as_partition_balance(); + pthread_mutex_unlock(&g_exchanged_info_lock); + + EXCHANGE_UNLOCK(); +} + +/** + * Handle incoming data ack message. + * + * Assumes the message has been checked for sanity. 
+ */ +static void +exchange_ready_to_commit_commit_msg_handle(as_exchange_event* msg_event) +{ + EXCHANGE_LOCK(); + + if (msg_event->msg_source != g_exchange.principal) { + WARNING( + "ignoring commit message from node %"PRIx64" - expected message from %"PRIx64, + msg_event->msg_source, g_exchange.principal); + goto Exit; + } + + INFO("received commit command from principal node %"PRIx64, + msg_event->msg_source); + + // Commit exchanged data. + exchange_data_commit(); + + // Move to the rest state. + g_exchange.state = AS_EXCHANGE_STATE_REST; + + // Queue up a cluster change event for downstream sub systems. + as_exchange_cluster_changed_event cluster_change_event; + cluster_change_event.cluster_key = g_exchange.committed_cluster_key; + cluster_change_event.succession = vector_to_array( + &g_exchange.committed_succession_list); + cluster_change_event.cluster_size = g_exchange.committed_cluster_size; + + exchange_external_event_queue(&cluster_change_event); + +Exit: + EXCHANGE_UNLOCK(); +} + +/** + * Handle incoming data message in ready to commit stage. + * + * Assumes the message has been checked for sanity. + */ +static void +exchange_ready_to_commit_data_msg_handle(as_exchange_event* msg_event) +{ + EXCHANGE_LOCK(); + + DEBUG("received exchange data from node %"PRIx64, msg_event->msg_source); + + // The source must have missed self node's data ack. Send an + // acknowledgement. + exchange_data_ack_msg_send(msg_event->msg_source); + + EXCHANGE_UNLOCK(); +} + +/** + * Process a message event when in ready_to_commit state. 
+ */ +static void +exchange_ready_to_commit_msg_event_handle(as_exchange_event* msg_event) +{ + EXCHANGE_LOCK(); + + if (!exchange_msg_is_sane(msg_event->msg_source, msg_event->msg)) { + goto Exit; + } + + as_exchange_msg_type msg_type; + exchange_msg_type_get(msg_event->msg, &msg_type); + + switch (msg_type) { + case AS_EXCHANGE_MSG_TYPE_READY_TO_COMMIT: + exchange_ready_to_commit_rtc_msg_handle(msg_event); + break; + case AS_EXCHANGE_MSG_TYPE_COMMIT: + exchange_ready_to_commit_commit_msg_handle(msg_event); + break; + case AS_EXCHANGE_MSG_TYPE_DATA: + exchange_ready_to_commit_data_msg_handle(msg_event); + break; + default: + DEBUG( + "ready to commit state received unexpected message of type %d from node %"PRIx64, + msg_type, msg_event->msg_source); + } +Exit: + EXCHANGE_UNLOCK(); +} + +/** + * Process a message event when in ready_to_commit state. + */ +static void +exchange_ready_to_commit_timer_event_handle(as_exchange_event* msg_event) +{ + EXCHANGE_LOCK(); + + if (g_exchange.ready_to_commit_send_ts + EXCHANGE_READY_TO_COMMIT_TIMEOUT() + < cf_getms()) { + // Its been a while since ready to commit has been sent to the + // principal, retransmit it so that the principal gets it this time and + // supplies a commit message. + exchange_ready_to_commit_msg_send(); + } + EXCHANGE_UNLOCK(); +} + +/** + * Event processing in the ready_to_commit state. 
+ */ +static void +exchange_ready_to_commit_event_handle(as_exchange_event* event) +{ + switch (event->type) { + case AS_EXCHANGE_EVENT_CLUSTER_CHANGE: + exchange_clustering_event_handle(event); + break; + case AS_EXCHANGE_EVENT_MSG: + exchange_ready_to_commit_msg_event_handle(event); + break; + case AS_EXCHANGE_EVENT_TIMER: + exchange_ready_to_commit_timer_event_handle(event); + break; + } +} + +/* + * ---------------------------------------------------------------------------- + * Exchange core subsystem + * ---------------------------------------------------------------------------- + */ + +/** + * Dispatch an exchange event inline to the relevant state handler. + */ +static void +exchange_event_handle(as_exchange_event* event) +{ + EXCHANGE_LOCK(); + + switch (g_exchange.state) { + case AS_EXCHANGE_STATE_REST: + exchange_rest_event_handle(event); + break; + case AS_EXCHANGE_STATE_EXCHANGING: + exchange_exchanging_event_handle(event); + break; + case AS_EXCHANGE_STATE_READY_TO_COMMIT: + exchange_ready_to_commit_event_handle(event); + break; + case AS_EXCHANGE_STATE_ORPHANED: + exchange_orphan_event_handle(event); + break; + } + + EXCHANGE_UNLOCK(); +} + +/** + * Exchange timer event generator thread, to help with retries and retransmits + * across all states. + */ +static void* +exchange_timer_thr(void* arg) +{ + as_exchange_event timer_event; + memset(&timer_event, 0, sizeof(timer_event)); + timer_event.type = AS_EXCHANGE_EVENT_TIMER; + + while (EXCHANGE_IS_RUNNING()) { + // Wait for a while and retry. + usleep(EXCHANGE_TIMER_TICK_INTERVAL() * 1000); + exchange_event_handle(&timer_event); + } + return NULL; +} + +/** + * Handle incoming messages from fabric. + */ +static int +exchange_fabric_msg_listener(cf_node source, msg* msg, void* udata) +{ + if (!EXCHANGE_IS_RUNNING()) { + // Ignore this message. 
+ DEBUG("exchange stopped - ignoring message from %"PRIx64, source); + goto Exit; + } + + as_exchange_event msg_event; + memset(&msg_event, 0, sizeof(msg_event)); + msg_event.type = AS_EXCHANGE_EVENT_MSG; + msg_event.msg = msg; + msg_event.msg_source = source; + + exchange_event_handle(&msg_event); +Exit: + as_fabric_msg_put(msg); + return 0; +} + +/** + * Listener for cluster change events from clustering layer. + */ +void +exchange_clustering_event_listener(as_clustering_event* event) +{ + if (!EXCHANGE_IS_RUNNING()) { + // Ignore this message. + DEBUG("exchange stopped - ignoring cluster change event"); + return; + } + + as_exchange_event clustering_event; + memset(&clustering_event, 0, sizeof(clustering_event)); + clustering_event.type = AS_EXCHANGE_EVENT_CLUSTER_CHANGE; + clustering_event.clustering_event = event; + + // Dispatch the event. + exchange_event_handle(&clustering_event); +} + +/** + * Initialize the template to be used for exchange messages. + */ +static void +exchange_msg_init() +{ + // Register fabric exchange msg type with no processing function. + as_fabric_register_msg_fn(M_TYPE_EXCHANGE, exchange_msg_template, + sizeof(exchange_msg_template), AS_EXCHANGE_MSG_SCRATCH_SIZE, + exchange_fabric_msg_listener, NULL); +} + +/** + * Initialize exchange subsystem. + */ +static void +exchange_init() +{ + if (EXCHANGE_IS_INITIALIZED()) { + return; + } + + EXCHANGE_LOCK(); + + memset(&g_exchange, 0, sizeof(g_exchange)); + + // Start in the orphaned state. + g_exchange.state = AS_EXCHANGE_STATE_ORPHANED; + g_exchange.orphan_state_start_time = cf_getms(); + g_exchange.orphan_state_are_transactions_blocked = true; + + // Initialize the adjacencies. 
+ g_exchange.nodeid_to_node_state = cf_shash_create(cf_nodeid_shash_fn, + sizeof(cf_node), sizeof(as_exchange_node_state), + AS_EXCHANGE_CLUSTER_MAX_SIZE_SOFT, 0); + + cf_vector_init(&g_exchange.succession_list, sizeof(cf_node), + AS_EXCHANGE_CLUSTER_MAX_SIZE_SOFT, VECTOR_FLAG_INITZERO); + cf_vector_init(&g_exchange.committed_succession_list, sizeof(cf_node), + AS_EXCHANGE_CLUSTER_MAX_SIZE_SOFT, VECTOR_FLAG_INITZERO); + + // Initialize exchange fabric messaging. + exchange_msg_init(); + + // Initialize self exchange data dynamic buffers. + for (uint32_t ns_ix = 0; ns_ix < g_config.n_namespaces; ns_ix++) { + cf_dyn_buf_init_heap(&g_exchange.self_data_dyn_buf[ns_ix], + AS_EXCHANGE_SELF_DYN_BUF_SIZE()); + } + + // Initialize external event publishing. + exchange_external_event_publisher_init(); + + // Get partition versions from storage. + as_partition_balance_init(); + + DEBUG("exchange module initialized"); + + EXCHANGE_UNLOCK(); +} + +/** + * Stop exchange subsystem. + */ +static void +exchange_stop() +{ + if (!EXCHANGE_IS_RUNNING()) { + WARNING("exchange is already stopped"); + return; + } + + // Ungaurded state, but this should be ok. + g_exchange.sys_state = AS_EXCHANGE_SYS_STATE_SHUTTING_DOWN; + + // Wait for the relanabce send thread to finish. + pthread_join(g_exchange.timer_tid, NULL); + + EXCHANGE_LOCK(); + + g_exchange.sys_state = AS_EXCHANGE_SYS_STATE_STOPPED; + + DEBUG("exchange module stopped"); + + EXCHANGE_UNLOCK(); + + external_event_publisher_stop(); +} + +/** + * Start the exchange subsystem. + */ +static void +exchange_start() +{ + EXCHANGE_LOCK(); + + if (EXCHANGE_IS_RUNNING()) { + // Shutdown the exchange subsystem. + exchange_stop(); + } + + g_exchange.sys_state = AS_EXCHANGE_SYS_STATE_RUNNING; + + // Start the timer thread. 
+ if (0 + != pthread_create(&g_exchange.timer_tid, 0, exchange_timer_thr, + &g_exchange)) { + CRASH("could not create exchange thread: %s", cf_strerror(errno)); + } + + DEBUG("exchange module started"); + + EXCHANGE_UNLOCK(); + + exchange_external_event_publisher_start(); +} + +/* + * ---------------------------------------------------------------------------- + * Public API + * ---------------------------------------------------------------------------- + */ +/** + * Initialize exchange subsystem. + */ +void +as_exchange_init() +{ + exchange_init(); +} + +/** + * Start exchange subsystem. + */ +void +as_exchange_start() +{ + exchange_start(); +} + +/** + * Stop exchange subsystem. + */ +void +as_exchange_stop() +{ +} + +/** + * Register to receive cluster-changed events. + * TODO - may replace with simple static list someday. + */ +void +as_exchange_register_listener(as_exchange_cluster_changed_cb cb, void* udata) +{ + exchange_external_event_listener_register(cb, udata); +} + +/** + * Dump exchange state to log. + */ +void +as_exchange_dump(bool verbose) +{ + exchange_dump(CF_INFO, verbose); +} + +/** + * Member-access method. + */ +uint64_t +as_exchange_cluster_key() +{ + return (uint64_t)g_exchange.committed_cluster_key; +} + +/** + * Member-access method. + */ +uint32_t +as_exchange_cluster_size() +{ + return g_exchange.committed_cluster_size; +} + +/** + * Copy over the committed succession list. + * Ensure the input vector has enough capacity. + */ +void +as_exchange_succession(cf_vector* succession) +{ + EXCHANGE_LOCK(); + vector_copy(succession, &g_exchange.committed_succession_list); + EXCHANGE_UNLOCK(); +} + +/** + * Return the committed succession list. + */ +cf_node* +as_exchange_succession_unsafe() +{ + return vector_to_array(&g_exchange.committed_succession_list); +} + +/** + * Return the committed succession list as a string in a dyn-buf. 
+ */ +void +as_exchange_info_get_succession(cf_dyn_buf* db) +{ + EXCHANGE_LOCK(); + + cf_node* nodes = vector_to_array(&g_exchange.committed_succession_list); + + for (uint32_t i = 0; i < g_exchange.committed_cluster_size; i++) { + cf_dyn_buf_append_uint64_x(db, nodes[i]); + cf_dyn_buf_append_char(db, ','); + } + + if (g_exchange.committed_cluster_size != 0) { + cf_dyn_buf_chomp(db); + } + + // Always succeeds. + cf_dyn_buf_append_string(db, "\nok"); + + EXCHANGE_UNLOCK(); +} + +/** + * Member-access method. + */ +cf_node +as_exchange_principal() +{ + return g_exchange.committed_principal; +} + +/** + * Lock before setting or getting exchanged info from non-exchange thread. + */ +void +as_exchange_info_lock() +{ + pthread_mutex_lock(&g_exchanged_info_lock); +} + +/** + * Unlock after setting or getting exchanged info from non-exchange thread. + */ +void +as_exchange_info_unlock() +{ + pthread_mutex_unlock(&g_exchanged_info_lock); +} diff --git a/as/src/fabric/fabric.c b/as/src/fabric/fabric.c new file mode 100644 index 00000000..3974acda --- /dev/null +++ b/as/src/fabric/fabric.c @@ -0,0 +1,2943 @@ +/* + * fabric.c + * + * Copyright (C) 2008-2017 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. 
If not, see http://www.gnu.org/licenses/
 */

// Object Management:
// ------------------
//
// Node and FC objects are reference counted. Correct bookkeeping of object
// references is vital to system operations.
//
// Holders of FC references:
//     (1) node->fc_hash
//     (2) node->send_idle_fc_queue
//     (3) (epoll_event ev).data.ptr
//
// For sending, (2) and (3) are mutually exclusive.
// Refs between (2) and (3) are passed virtually whenever possible, without
// needing to explicitly call reserve/release.
// (3) takes ref on rearm.
// (3) gives ref to calling thread when epoll triggers, due to ONESHOT.
// Thread will either rearm or give ref to (2). Never do both.
//
// FCs are created in two methods: fabric_node_connect(), run_fabric_accept()
//
// Holders of Node references:
//     * fc->node
//     * g_fabric.node_hash


//==========================================================
// Includes.
//

#include "fabric/fabric.h"

// NOTE(review): the eight system header names below were lost when this text
// was extracted (the <...> parts are missing) - restore them from the
// original file before building.
#include
#include
#include
#include
#include
#include
#include
#include

#include "citrusleaf/alloc.h"
#include "citrusleaf/cf_atomic.h"
#include "citrusleaf/cf_clock.h"
#include "citrusleaf/cf_ll.h"
#include "citrusleaf/cf_queue.h"
#include "citrusleaf/cf_rchash.h"
#include "citrusleaf/cf_vector.h"

#include "fault.h"
#include "msg.h"
#include "node.h"
#include "shash.h"
#include "socket.h"
#include "tls.h"

#include "base/cfg.h"
#include "base/stats.h"
#include "fabric/endpoint.h"
#include "fabric/hb.h"


//==========================================================
// Typedefs & constants.
//

// Size of the buffer embedded in every fabric_buffer; larger payloads are
// heap allocated (see fabric_buffer_init() and fabric_buffer_resize()).
#define FABRIC_BUFFER_MEM_SZ		(1024 * 1024) // bytes
#define FABRIC_BUFFER_MAX_SZ		(128 * 1024 * 1024) // used simply for validation
#define FABRIC_EPOLL_SEND_EVENTS	16
#define FABRIC_EPOLL_RECV_EVENTS	1

// Fields of the M_TYPE_FABRIC message (see fabric_mt below).
typedef enum {
	// These values go on the wire, so mind backward compatibility if changing.
	FS_FIELD_NODE,
	FS_UNUSED1, // used to be FS_ADDR
	FS_UNUSED2, // used to be FS_PORT
	FS_UNUSED3, // used to be FS_ANV
	FS_UNUSED4, // used to be FS_ADDR_EX
	FS_CHANNEL,

	NUM_FS_FIELDS
} fs_msg_fields;

// Message template for M_TYPE_FABRIC - one entry per fs_msg_fields value,
// enforced by the COMPILER_ASSERT that follows.
static const msg_template fabric_mt[] = {
	{ FS_FIELD_NODE, M_FT_UINT64 },
	{ FS_UNUSED1, M_FT_UINT32 },
	{ FS_UNUSED2, M_FT_UINT32 },
	{ FS_UNUSED3, M_FT_BUF },
	{ FS_UNUSED4, M_FT_BUF },
	{ FS_CHANNEL, M_FT_UINT32 },
};

COMPILER_ASSERT(sizeof(fabric_mt) / sizeof(msg_template) == NUM_FS_FIELDS);

// Scratch size passed when registering M_TYPE_FABRIC in as_fabric_init().
#define FS_MSG_SCRATCH_SIZE 128

// Events OR'ed into every send/recv epoll registration. EPOLLONESHOT is what
// lets a triggered event hand its fc reference to the handling thread.
#define DEFAULT_EVENTS (EPOLLERR | EPOLLHUP | EPOLLRDHUP | EPOLLONESHOT)

// Block size for allocating fabric hb plugin data.
#define HB_PLUGIN_DATA_BLOCK_SIZE 128

// Per-channel receive thread pool: its threads, the poll instance they share
// and the pool's id (the channel index passed by as_fabric_init()).
typedef struct fabric_recv_thread_pool_s {
	cf_vector threads;
	cf_poll poll;
	uint32_t pool_id;
} fabric_recv_thread_pool;

// One entry per fabric send thread, linked into the list at
// g_fabric.send_head.
typedef struct send_entry_s {
	struct send_entry_s *next;
	uint32_t id; // index into g_fabric.sends and fabric_node.send_counts
	uint32_t count; // connections currently assigned to this entry
	cf_poll poll;
} send_entry;

// All fabric-wide state; the single instance is g_fabric.
typedef struct fabric_state_s {
	// Per msg type callback and its user data, set by
	// as_fabric_register_msg_fn().
	as_fabric_msg_fn	msg_cb[M_TYPE_MAX];
	void				*msg_udata[M_TYPE_MAX];

	cf_queue			msg_pool_queue[M_TYPE_MAX]; // a pool of reusable msgs
	cf_vector			fb_free; // recycled fabric_buffer objects

	fabric_recv_thread_pool recv_pool[AS_FABRIC_N_CHANNELS];

	pthread_mutex_t		send_lock; // held while editing the send_head list
	send_entry			*sends; // array, one element per send thread
	send_entry			*send_head; // list kept ordered by ascending count

	pthread_mutex_t		node_hash_lock;
	cf_rchash			*node_hash; // key is cf_node, value is (fabric_node *)
} fabric_state;

// I/O buffer. buf points at membuf, or at a heap block when the payload is
// larger than FABRIC_BUFFER_MEM_SZ.
typedef struct fabric_buffer_s {
	uint8_t			*buf;
	uint8_t			*progress; // current position within buf
	const uint8_t	*end; // one past the last byte expected in buf
	uint8_t			membuf[FABRIC_BUFFER_MEM_SZ];
} fabric_buffer;

// A remote node and everything needed to talk to it.
typedef struct fabric_node_s {
	cf_node		node_id; // remote node
	bool		live; // set to false on shutdown
	uint32_t	connect_count[AS_FABRIC_N_CHANNELS]; // fcs made via connect
	bool		connect_full; // every channel is at its connect limit

	pthread_mutex_t		connect_lock; // serializes fabric_node_connect()

	pthread_mutex_t		fc_hash_lock;
	cf_shash			*fc_hash; // key is (fabric_connection *), value unused

	pthread_mutex_t		send_idle_fc_queue_lock;
	cf_queue			send_idle_fc_queue[AS_FABRIC_N_CHANNELS];

	cf_queue			send_queue[AS_FABRIC_N_CHANNELS]; // msgs awaiting an fc

	// Flexible array, one counter per send thread - sized in
	// fabric_node_create().
	uint8_t		send_counts[];
} fabric_node;

// One socket to a remote node, plus its send and receive state.
typedef struct fabric_connection_s {
	cf_socket sock;
	cf_sock_addr peer;
	fabric_node *node; // NULL until attached by fabric_node_add_connection()

	bool failed;
	bool started_via_connect; // true if created by fabric_node_connect()

	// Send side.
	fabric_buffer	s_buf;
	msg				*s_msg_in_progress;
	size_t			s_count;

	// Receive side.
	fabric_buffer	*r_buf_in_progress;
	uint32_t		r_msg_size;
	msg_type		r_type;
	uint64_t		benchmark_time;

	// The send_ptr != NULL means that the FC's sock has registered with
	// send_poll. This is needed because epoll's API doesn't allow registering
	// a socket without event triggers (ERR and HUP are enabled even when
	// unspecified).
	send_entry *send_ptr;
	fabric_recv_thread_pool *pool;

	// Byte counters - presumably the *_last values are snapshots used by the
	// rate ticker; confirm against fabric_rate_fc_reduce_fn().
	uint64_t s_bytes;
	uint64_t s_bytes_last;
	uint64_t r_bytes;
	uint64_t r_bytes_last;
} fabric_connection;

// Fixed-capacity list of node ids, filled by fabric_get_node_list().
typedef struct node_list_s {
	uint32_t count;
	cf_node nodes[AS_CLUSTER_SZ]; // must support the maximum cluster size.
} node_list;

// Printable channel names, indexed by as_fabric_channel.
const char *CHANNEL_NAMES[] = {
		[AS_FABRIC_CHANNEL_RW] = "rw",
		[AS_FABRIC_CHANNEL_CTRL] = "ctrl",
		[AS_FABRIC_CHANNEL_BULK] = "bulk",
		[AS_FABRIC_CHANNEL_META] = "meta",
};

COMPILER_ASSERT(sizeof(CHANNEL_NAMES) / sizeof(const char *) ==
		AS_FABRIC_N_CHANNELS);


//==========================================================
// Globals.
//

cf_serv_cfg g_fabric_bind = { .n_cfgs = 0 };
cf_tls_info *g_fabric_tls;

static fabric_state g_fabric;
static cf_poll g_accept_poll;

// Endpoints advertised for this node, and the IPv4-only setting they were
// built with (see fabric_published_endpoints_refresh()).
static as_endpoint_list *g_published_endpoint_list;
static bool g_published_endpoint_list_ipv4_only;

// Max connections formed via connect. Others are formed via accept.
static uint32_t g_fabric_connect_limit[AS_FABRIC_N_CHANNELS];


//==========================================================
// Forward declarations.
//

// Support functions.
static void send_entry_insert(send_entry **se_pp, send_entry *se);

static void fabric_published_serv_cfg_fill(const cf_serv_cfg *bind_cfg, cf_serv_cfg *published_cfg, bool ipv4_only);
static bool fabric_published_endpoints_refresh(void);

// fabric_node
static fabric_node *fabric_node_create(cf_node node_id);
static fabric_node *fabric_node_get(cf_node node_id);
static fabric_node *fabric_node_get_or_create(cf_node node_id);
static fabric_node *fabric_node_pop(cf_node node_id);
static int fabric_node_disconnect_reduce_fn(const void *key, void *data, void *udata);
static void fabric_node_disconnect(cf_node node_id);

static fabric_connection *fabric_node_connect(fabric_node *node, uint32_t ch);
static int fabric_node_send(fabric_node *node, msg *m, as_fabric_channel channel);
static void fabric_node_connect_all(fabric_node *node);
static void fabric_node_destructor(void *pnode);
inline static void fabric_node_reserve(fabric_node *node);
inline static void fabric_node_release(fabric_node *node);
static bool fabric_node_add_connection(fabric_node *node, fabric_connection *fc);
static uint8_t fabric_node_find_min_send_count(const fabric_node *node);
static bool fabric_node_is_connect_full(const fabric_node *node);

static int fabric_get_node_list_fn(const void *key, uint32_t keylen, void *data, void *udata);
static uint32_t fabric_get_node_list(node_list *nl);

// fabric_buffer
static fabric_buffer *fabric_buffer_create(size_t sz);
static void fabric_buffer_init(fabric_buffer *fb, size_t sz);
static void fabric_buffer_destroy(fabric_buffer *fb);
inline static void fabric_buffer_free_extra(fabric_buffer *fb);
inline static bool fabric_buffer_resize(fabric_buffer *fb, size_t sz);

// fabric_connection
// Note - fabric_connection_create() is the only function in this group that
// is not declared static.
fabric_connection *fabric_connection_create(cf_socket *sock, cf_sock_addr *peer);
static bool fabric_connection_accept_tls(fabric_connection *fc);
static bool fabric_connection_connect_tls(fabric_connection *fc);
inline static void fabric_connection_reserve(fabric_connection *fc);
static void fabric_connection_release(fabric_connection *fc);
inline static cf_node fabric_connection_get_id(const fabric_connection *fc);

// Send/recv poll management.
static void fabric_connection_send_assign(fabric_connection *fc);
static void fabric_connection_send_unassign(fabric_connection *fc);
inline static void fabric_connection_recv_rearm(fabric_connection *fc);
inline static void fabric_connection_send_rearm(fabric_connection *fc);
static void fabric_connection_disconnect(fabric_connection *fc);
static void fabric_connection_set_keepalive_options(fabric_connection *fc);

// Send path.
static void fabric_connection_reroute_msg(fabric_connection *fc);
static void fabric_connection_send_progress(fabric_connection *fc, bool is_last);
static bool fabric_connection_process_writable(fabric_connection *fc);

// Receive path.
static bool fabric_connection_process_fabric_msg(fabric_connection *fc, const msg *m);
static bool fabric_connection_read_fabric_msg(fabric_connection *fc);

static bool fabric_connection_process_msg(fabric_connection *fc, bool do_rearm);
static bool fabric_connection_process_readable(fabric_connection *fc);

// fabric_recv_thread_pool
static void fabric_recv_thread_pool_init(fabric_recv_thread_pool *pool, uint32_t size, uint32_t pool_id);
static void fabric_recv_thread_pool_set_size(fabric_recv_thread_pool *pool, uint32_t size);
static void fabric_recv_thread_pool_add_fc(fabric_recv_thread_pool *pool, fabric_connection *fc);

// fabric_endpoint
static bool fabric_endpoint_list_get(cf_node nodeid, as_endpoint_list *endpoint_list, size_t *endpoint_list_size);
static bool fabric_connect_endpoint_filter(const as_endpoint *endpoint, void *udata);

// Thread functions.
static void *run_fabric_recv(void *arg);
static void run_fabric_recv_cleanup(void *arg);
static void *run_fabric_send(void *arg);
static void *run_fabric_accept(void *arg);

// Ticker helpers.
+static int fabric_rate_node_reduce_fn(const void *key, uint32_t keylen, void *data, void *udata); +static int fabric_rate_fc_reduce_fn(const void *key, void *data, void *udata); + +// Heartbeat. +static void fabric_hb_plugin_set_fn(msg *m); +static void fabric_hb_plugin_parse_data_fn(msg *m, cf_node source, as_hb_plugin_node_data *plugin_data); +static void fabric_heartbeat_event(int nevents, as_hb_event_node *events, void *udata); + + +//========================================================== +// Public API. +// + +//------------------------------------------------ +// msg +// + +msg * +as_fabric_msg_get(msg_type type) +{ + if (type >= M_TYPE_MAX) { + return NULL; + } + + msg *m = NULL; + + if (cf_queue_pop(&g_fabric.msg_pool_queue[type], &m, CF_QUEUE_NOWAIT) != + CF_QUEUE_OK) { + m = msg_create(type); + } + else { + msg_incr_ref(m); + } + + return m; +} + +void +as_fabric_msg_put(msg *m) +{ + int cnt = cf_rc_release(m); + + if (cnt == 0) { + msg_reset(m); + + if (cf_queue_sz(&g_fabric.msg_pool_queue[m->type]) > 128) { + msg_put(m); + } + else { + cf_queue_push(&g_fabric.msg_pool_queue[m->type], &m); + } + } + else if (cnt < 0) { + msg_dump(m, "extra put"); + cf_crash(AS_FABRIC, "extra put for msg type %d", m->type); + } +} + +// Log information about existing "msg" objects and queues. 
+void +as_fabric_msg_queue_dump() +{ + cf_info(AS_FABRIC, "All currently-existing msg types:"); + + int total_q_sz = 0; + int total_alloced_msgs = 0; + + for (int i = 0; i < M_TYPE_MAX; i++) { + int q_sz = cf_queue_sz(&g_fabric.msg_pool_queue[i]); + int num_of_type = cf_atomic_int_get(g_num_msgs_by_type[i]); + + total_alloced_msgs += num_of_type; + + if (q_sz || num_of_type) { + cf_info(AS_FABRIC, "|msgq[%d]| = %d ; alloc'd = %d", i, q_sz, num_of_type); + total_q_sz += q_sz; + } + } + + int num_msgs = cf_atomic_int_get(g_num_msgs); + + if (abs(num_msgs - total_alloced_msgs) > 2) { + cf_warning(AS_FABRIC, "num msgs (%d) != total alloc'd msgs (%d)", num_msgs, total_alloced_msgs); + } + + cf_info(AS_FABRIC, "Total num. msgs = %d ; Total num. queued = %d ; Delta = %d", num_msgs, total_q_sz, num_msgs - total_q_sz); +} + +//------------------------------------------------ +// as_fabric +// + +int +as_fabric_init() +{ + for (uint32_t i = 0; i < AS_FABRIC_N_CHANNELS; i++) { + g_fabric_connect_limit[i] = g_config.n_fabric_channel_fds[i]; + + fabric_recv_thread_pool_init(&g_fabric.recv_pool[i], + g_config.n_fabric_channel_recv_threads[i], i); + } + + pthread_mutex_init(&g_fabric.send_lock, 0); + + as_fabric_register_msg_fn(M_TYPE_FABRIC, fabric_mt, sizeof(fabric_mt), + FS_MSG_SCRATCH_SIZE, NULL, NULL); + + pthread_mutex_init(&g_fabric.node_hash_lock, 0); + + cf_rchash_create(&g_fabric.node_hash, cf_nodeid_rchash_fn, + fabric_node_destructor, sizeof(cf_node), 128, 0); + + for (int i = 0; i < M_TYPE_MAX; i++) { + cf_queue_init(&g_fabric.msg_pool_queue[i], sizeof(msg *), + CF_QUEUE_ALLOCSZ, true); + } + + cf_vector_init(&g_fabric.fb_free, sizeof(fabric_buffer *), 64, + VECTOR_FLAG_BIGLOCK); + + g_published_endpoint_list = NULL; + g_published_endpoint_list_ipv4_only = cf_ip_addr_legacy_only(); + + if (! 
fabric_published_endpoints_refresh()) { + cf_crash(AS_FABRIC, "error creating fabric published endpoint list"); + } + + as_hb_plugin fabric_plugin; + + memset(&fabric_plugin, 0, sizeof(fabric_plugin)); + fabric_plugin.id = AS_HB_PLUGIN_FABRIC; + fabric_plugin.wire_size_fixed = 0; // includes the size for the protocol version + as_endpoint_list_sizeof(g_published_endpoint_list, + &fabric_plugin.wire_size_fixed); + fabric_plugin.wire_size_per_node = 0; // size per node node in succession list + fabric_plugin.set_fn = fabric_hb_plugin_set_fn; + fabric_plugin.parse_fn = fabric_hb_plugin_parse_data_fn; + fabric_plugin.change_listener = NULL; + as_hb_plugin_register(&fabric_plugin); + + as_hb_register_listener(fabric_heartbeat_event, &g_fabric); + + as_fabric_transact_init(); + + return 0; +} + +int +as_fabric_start() +{ + pthread_t thread; + pthread_attr_t attrs; + + pthread_attr_init(&attrs); + pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED); + + g_fabric.sends = + cf_malloc(sizeof(send_entry) * g_config.n_fabric_send_threads); + g_fabric.send_head = g_fabric.sends; + + cf_info(AS_FABRIC, "starting %u fabric send threads", g_config.n_fabric_send_threads); + + for (int i = 0; i < g_config.n_fabric_send_threads; i++) { + cf_poll_create(&g_fabric.sends[i].poll); + g_fabric.sends[i].id = i; + g_fabric.sends[i].count = 0; + g_fabric.sends[i].next = g_fabric.sends + i + 1; + + if (pthread_create(&thread, &attrs, run_fabric_send, + &g_fabric.sends[i]) != 0) { + cf_crash(AS_FABRIC, "could not create fabric send thread"); + } + } + + g_fabric.sends[g_config.n_fabric_send_threads - 1].next = NULL; + + for (uint32_t i = 0; i < AS_FABRIC_N_CHANNELS; i++) { + cf_info(AS_FABRIC, "starting %u fabric %s channel recv threads", g_config.n_fabric_channel_recv_threads[i], CHANNEL_NAMES[i]); + + fabric_recv_thread_pool_set_size(&g_fabric.recv_pool[i], + g_config.n_fabric_channel_recv_threads[i]); + } + + cf_info(AS_FABRIC, "starting fabric accept thread"); + + if 
(pthread_create(&thread, &attrs, run_fabric_accept, NULL) != 0) { + cf_crash(AS_FABRIC, "could not create fabric accept thread"); + } + + return 0; +} + +void +as_fabric_set_recv_threads(as_fabric_channel channel, uint32_t count) +{ + g_config.n_fabric_channel_recv_threads[channel] = count; + + fabric_recv_thread_pool_set_size(&g_fabric.recv_pool[channel], count); +} + +int +as_fabric_send(cf_node node_id, msg *m, as_fabric_channel channel) +{ + m->benchmark_time = g_config.fabric_benchmarks_enabled ? cf_getns() : 0; + + if (g_config.self_node == node_id) { + cf_assert(g_fabric.msg_cb[m->type], AS_FABRIC, "m->type %d not registered", m->type); + (g_fabric.msg_cb[m->type])(node_id, m, g_fabric.msg_udata[m->type]); + + return AS_FABRIC_SUCCESS; + } + + fabric_node *node = fabric_node_get(node_id); + int ret = fabric_node_send(node, m, channel); + + if (node) { + fabric_node_release(node); // from fabric_node_get + } + + return ret; +} + +int +as_fabric_send_list(const cf_node *nodes, uint32_t node_count, msg *m, + as_fabric_channel channel) +{ + if (! nodes) { + node_list nl; + + fabric_get_node_list(&nl); + return as_fabric_send_list(nl.nodes, nl.count, m, channel); + } + + int ret = AS_FABRIC_SUCCESS; + + for (uint32_t i = 0; i < node_count; i++) { + msg_incr_ref(m); + + if ((ret = as_fabric_send(nodes[i], m, channel)) != AS_FABRIC_SUCCESS) { + // Leave the reference for the sake of caller. 
+ break; + } + } + + as_fabric_msg_put(m); // release main reference + + return ret; +} + +// TODO - make static registration +void +as_fabric_register_msg_fn(msg_type type, const msg_template *mt, size_t mt_sz, + size_t scratch_sz, as_fabric_msg_fn msg_cb, void *msg_udata) +{ + msg_type_register(type, mt, mt_sz, scratch_sz); + + g_fabric.msg_cb[type] = msg_cb; + g_fabric.msg_udata[type] = msg_udata; +} + +void +as_fabric_info_peer_endpoints_get(cf_dyn_buf *db) +{ + node_list nl; + fabric_get_node_list(&nl); + + for (uint32_t i = 0; i < nl.count; i++) { + if (nl.nodes[i] == g_config.self_node) { + continue; + } + + fabric_node *node = fabric_node_get(nl.nodes[i]); + + if (! node) { + cf_info(AS_FABRIC, "\tnode %lx not found in hash although reported available", nl.nodes[i]); + continue; + } + + size_t endpoint_list_capacity = 1024; + bool retry = true; + + while (true) { + uint8_t stack_mem[endpoint_list_capacity]; + as_endpoint_list *endpoint_list = (as_endpoint_list *)stack_mem; + + if (! fabric_endpoint_list_get(node->node_id, endpoint_list, + &endpoint_list_capacity)) { + if (errno == ENOENT) { + // No entry present for this node in heartbeat. + cf_detail(AS_FABRIC, "could not get endpoint list for %lx", node->node_id); + break; + } + + if (! retry) { + break; + } + + retry = false; + continue; + } + + cf_dyn_buf_append_string(db, "fabric.peer="); + cf_dyn_buf_append_string(db, "node-id="); + cf_dyn_buf_append_uint64_x(db, node->node_id); + cf_dyn_buf_append_string(db, ":"); + as_endpoint_list_info(endpoint_list, db); + cf_dyn_buf_append_string(db, ";"); + break; + } + + fabric_node_release(node); + } +} + +bool +as_fabric_is_published_endpoint_list(const as_endpoint_list *list) +{ + return as_endpoint_lists_are_equal(g_published_endpoint_list, list); +} + +// Used by heartbeat subsystem only, for duplicate node-id detection. 
+as_endpoint_list * +as_fabric_hb_plugin_get_endpoint_list(as_hb_plugin_node_data *plugin_data) +{ + return (plugin_data && plugin_data->data_size != 0) ? + (as_endpoint_list *)plugin_data->data : NULL; +} + +void +as_fabric_rate_capture(fabric_rate *rate) +{ + pthread_mutex_lock(&g_fabric.node_hash_lock); + cf_rchash_reduce(g_fabric.node_hash, fabric_rate_node_reduce_fn, rate); + pthread_mutex_unlock(&g_fabric.node_hash_lock); +} + +void +as_fabric_dump(bool verbose) +{ + node_list nl; + fabric_get_node_list(&nl); + + cf_info(AS_FABRIC, " Fabric Dump: nodes known %d", nl.count); + + for (uint32_t i = 0; i < nl.count; i++) { + if (nl.nodes[i] == g_config.self_node) { + cf_info(AS_FABRIC, "\tnode %lx is self", nl.nodes[i]); + continue; + } + + fabric_node *node = fabric_node_get(nl.nodes[i]); + + if (! node) { + cf_info(AS_FABRIC, "\tnode %lx not found in hash although reported available", nl.nodes[i]); + continue; + } + + pthread_mutex_lock(&node->fc_hash_lock); + cf_info(AS_FABRIC, "\tnode %lx fds {via_connect={h=%d m=%d l=%d} all=%d} live %d q {h=%d m=%d l=%d}", + node->node_id, + node->connect_count[AS_FABRIC_CHANNEL_CTRL], + node->connect_count[AS_FABRIC_CHANNEL_RW], + node->connect_count[AS_FABRIC_CHANNEL_BULK], + cf_shash_get_size(node->fc_hash), node->live, + cf_queue_sz(&node->send_queue[AS_FABRIC_CHANNEL_CTRL]), + cf_queue_sz(&node->send_queue[AS_FABRIC_CHANNEL_RW]), + cf_queue_sz(&node->send_queue[AS_FABRIC_CHANNEL_BULK])); + pthread_mutex_unlock(&node->fc_hash_lock); + + fabric_node_release(node); // node_get + } +} + + +//========================================================== +// Support functions. +// + +static void +send_entry_insert(send_entry **se_pp, send_entry *se) +{ + while (*se_pp && se->count > (*se_pp)->count) { + se_pp = &(*se_pp)->next; + } + + se->next = *se_pp; + *se_pp = se; +} + +// Get addresses to publish as serv config. Expand "any" addresses. 
+static void +fabric_published_serv_cfg_fill(const cf_serv_cfg *bind_cfg, + cf_serv_cfg *published_cfg, bool ipv4_only) +{ + cf_serv_cfg_init(published_cfg); + + cf_sock_cfg sock_cfg; + + for (int i = 0; i < bind_cfg->n_cfgs; i++) { + cf_sock_cfg_copy(&bind_cfg->cfgs[i], &sock_cfg); + + // Expand "any" address to all interfaces. + if (cf_ip_addr_is_any(&sock_cfg.addr)) { + cf_ip_addr all_addrs[CF_SOCK_CFG_MAX]; + uint32_t n_all_addrs = CF_SOCK_CFG_MAX; + + if (cf_inter_get_addr_all(all_addrs, &n_all_addrs) != 0) { + cf_warning(AS_FABRIC, "error getting all interface addresses"); + n_all_addrs = 0; + } + + for (int j = 0; j < n_all_addrs; j++) { + // Skip local address if any is specified. + if (cf_ip_addr_is_local(&all_addrs[j]) || + (ipv4_only && ! cf_ip_addr_is_legacy(&all_addrs[j]))) { + continue; + } + + cf_ip_addr_copy(&all_addrs[j], &sock_cfg.addr); + + if (cf_serv_cfg_add_sock_cfg(published_cfg, &sock_cfg)) { + cf_crash(AS_FABRIC, "error initializing published address list"); + } + } + } + else { + if (ipv4_only && ! cf_ip_addr_is_legacy(&bind_cfg->cfgs[i].addr)) { + continue; + } + + if (cf_serv_cfg_add_sock_cfg(published_cfg, &sock_cfg)) { + cf_crash(AS_FABRIC, "error initializing published address list"); + } + } + } +} + +// Refresh the fabric published endpoint list. +// Return true on success. +static bool +fabric_published_endpoints_refresh() +{ + if (g_published_endpoint_list && + g_published_endpoint_list_ipv4_only == cf_ip_addr_legacy_only()) { + return true; + } + + // The global flag has changed, refresh the published address list. + if (g_published_endpoint_list) { + // Free the obsolete list. 
+ cf_free(g_published_endpoint_list); + } + + cf_serv_cfg published_cfg; + fabric_published_serv_cfg_fill(&g_fabric_bind, &published_cfg, + g_published_endpoint_list_ipv4_only); + + g_published_endpoint_list = as_endpoint_list_from_serv_cfg(&published_cfg); + cf_assert(g_published_endpoint_list, AS_FABRIC, "error initializing mesh published address list"); + + g_published_endpoint_list_ipv4_only = cf_ip_addr_legacy_only(); + + if (g_published_endpoint_list->n_endpoints == 0) { + if (g_published_endpoint_list_ipv4_only) { + cf_warning(AS_FABRIC, "no IPv4 addresses configured for fabric"); + } + else { + cf_warning(AS_FABRIC, "no addresses configured for fabric"); + } + + return false; + } + + char endpoint_list_str[512]; + as_endpoint_list_to_string(g_published_endpoint_list, endpoint_list_str, + sizeof(endpoint_list_str)); + + cf_info(AS_FABRIC, "updated fabric published address list to {%s}", endpoint_list_str); + + return true; +} + + +//========================================================== +// fabric_node +// + +static fabric_node * +fabric_node_create(cf_node node_id) +{ + size_t size = sizeof(fabric_node) + + (sizeof(uint8_t) * g_config.n_fabric_send_threads); + fabric_node *node = cf_rc_alloc(size); + + memset(node, 0, size); + + node->node_id = node_id; + node->live = true; + + if (pthread_mutex_init(&node->send_idle_fc_queue_lock, NULL) != 0) { + cf_crash(AS_FABRIC, "fabric_node_create(%lx) failed to init send_idle_fc_queue_lock", node_id); + } + + for (int i = 0; i < AS_FABRIC_N_CHANNELS; i++) { + cf_queue_init(&node->send_idle_fc_queue[i], sizeof(fabric_connection *), + CF_QUEUE_ALLOCSZ, false); + + cf_queue_init(&node->send_queue[i], sizeof(msg *), CF_QUEUE_ALLOCSZ, + true); + } + + if (pthread_mutex_init(&node->connect_lock, NULL) != 0) { + cf_crash(AS_FABRIC, "fabric_node_create(%lx) failed to init connect_lock", node_id); + } + + if (pthread_mutex_init(&node->fc_hash_lock, NULL) != 0) { + cf_crash(AS_FABRIC, "fabric_node_create(%lx) failed to 
init fc_hash_lock", node_id); + } + + node->fc_hash = cf_shash_create(cf_shash_fn_ptr, + sizeof(fabric_connection *), 0, 32, 0); + + cf_detail(AS_FABRIC, "fabric_node_create(%lx) node %p", node_id, node); + + return node; +} + +static fabric_node * +fabric_node_get(cf_node node_id) +{ + fabric_node *node; + + pthread_mutex_lock(&g_fabric.node_hash_lock); + int rv = cf_rchash_get(g_fabric.node_hash, &node_id, sizeof(cf_node), + (void **)&node); + pthread_mutex_unlock(&g_fabric.node_hash_lock); + + if (rv != CF_RCHASH_OK) { + return NULL; + } + + return node; +} + +static fabric_node * +fabric_node_get_or_create(cf_node node_id) +{ + fabric_node *node; + + pthread_mutex_lock(&g_fabric.node_hash_lock); + + if (cf_rchash_get(g_fabric.node_hash, &node_id, sizeof(cf_node), + (void **)&node) == CF_RCHASH_OK) { + pthread_mutex_unlock(&g_fabric.node_hash_lock); + + fabric_node_connect_all(node); + + return node; + } + + node = fabric_node_create(node_id); + + if (cf_rchash_put_unique(g_fabric.node_hash, &node_id, sizeof(cf_node), + node) != CF_RCHASH_OK) { + cf_crash(AS_FABRIC, "fabric_node_get_or_create(%lx)", node_id); + } + + fabric_node_reserve(node); // for return + + pthread_mutex_unlock(&g_fabric.node_hash_lock); + + fabric_node_connect_all(node); + + return node; +} + +static fabric_node * +fabric_node_pop(cf_node node_id) +{ + fabric_node *node = NULL; + + pthread_mutex_lock(&g_fabric.node_hash_lock); + + if (cf_rchash_get(g_fabric.node_hash, &node_id, sizeof(cf_node), + (void **)&node) == CF_RCHASH_OK) { + if (cf_rchash_delete(g_fabric.node_hash, &node_id, sizeof(node_id)) != + CF_RCHASH_OK) { + cf_crash(AS_FABRIC, "fabric_node_pop(%lx)", node_id); + } + } + + pthread_mutex_unlock(&g_fabric.node_hash_lock); + + return node; +} + +static int +fabric_node_disconnect_reduce_fn(const void *key, void *data, void *udata) +{ + fabric_connection *fc = *(fabric_connection **)key; + + cf_assert(fc, AS_FABRIC, "fc == NULL, don't put NULLs into fc_hash"); + 
cf_socket_shutdown(&fc->sock); + fabric_connection_release(fc); // for delete from node->fc_hash + + return CF_SHASH_REDUCE_DELETE; +} + +static void +fabric_node_disconnect(cf_node node_id) +{ + fabric_node *node = fabric_node_pop(node_id); + + if (! node) { + cf_warning(AS_FABRIC, "fabric_node_disconnect(%lx) not connected", node_id); + return; + } + + cf_info(AS_FABRIC, "fabric_node_disconnect(%lx)", node_id); + + pthread_mutex_lock(&node->fc_hash_lock); + + node->live = false; + // Clean up all fc's attached to this node. + cf_shash_reduce(node->fc_hash, fabric_node_disconnect_reduce_fn, NULL); + + pthread_mutex_unlock(&node->fc_hash_lock); + + pthread_mutex_lock(&node->send_idle_fc_queue_lock); + + for (int i = 0; i < AS_FABRIC_N_CHANNELS; i++) { + while (true) { + fabric_connection *fc; + + int rv = cf_queue_pop(&node->send_idle_fc_queue[i], &fc, + CF_QUEUE_NOWAIT); + + if (rv != CF_QUEUE_OK) { + break; + } + + fabric_connection_send_unassign(fc); + fabric_connection_release(fc); + } + } + + pthread_mutex_unlock(&node->send_idle_fc_queue_lock); + + fabric_node_release(node); // from fabric_node_pop() +} + +static fabric_connection * +fabric_node_connect(fabric_node *node, uint32_t ch) +{ + cf_detail(AS_FABRIC, "fabric_node_connect(%p, %u)", node, ch); + + pthread_mutex_lock(&node->connect_lock); + + uint32_t fds = node->connect_count[ch] + 1; + + if (fds > g_fabric_connect_limit[ch]) { + pthread_mutex_unlock(&node->connect_lock); + return NULL; + } + + cf_socket sock; + cf_sock_addr addr; + size_t endpoint_list_capacity = 1024; + int tries_remaining = 3; + + while (tries_remaining--) { + uint8_t endpoint_list_mem[endpoint_list_capacity]; + as_endpoint_list *endpoint_list = (as_endpoint_list *)endpoint_list_mem; + + if (fabric_endpoint_list_get(node->node_id, endpoint_list, + &endpoint_list_capacity)) { + char endpoint_list_str[1024]; + + as_endpoint_list_to_string(endpoint_list, endpoint_list_str, + sizeof(endpoint_list_str)); + cf_detail(AS_FABRIC, 
"fabric_node_connect(%p, %u) node_id %lx with endpoints {%s}", node, ch, node->node_id, endpoint_list_str); + + // Initiate connect to the remote endpoint. + const as_endpoint *connected_endpoint = as_endpoint_connect_any( + endpoint_list, fabric_connect_endpoint_filter, NULL, 0, + &sock); + + if (! connected_endpoint) { + cf_detail(AS_FABRIC, "fabric_node_connect(%p, %u) node_id %lx failed for endpoints {%s}", node, ch, node->node_id, endpoint_list_str); + pthread_mutex_unlock(&node->connect_lock); + return NULL; + } + + as_endpoint_to_sock_addr(connected_endpoint, &addr); + + if (as_endpoint_capability_is_supported(connected_endpoint, + AS_ENDPOINT_TLS_MASK)) { + tls_socket_prepare_client(g_fabric_tls, &sock); + } + + break; // read success + } + + if (errno == ENOENT) { + // No entry present for this node in heartbeat. + cf_detail(AS_FABRIC, "fabric_node_connect(%p, %u) unknown remote node %lx", node, ch, node->node_id); + pthread_mutex_unlock(&node->connect_lock); + return NULL; + } + + // The list capacity was not enough. Retry with suggested list size. + } + + if (tries_remaining < 0) { + cf_warning(AS_FABRIC,"fabric_node_connect(%p, %u) List get error for remote node %lx", node, ch, node->node_id); + pthread_mutex_unlock(&node->connect_lock); + return NULL; + } + + msg *m = as_fabric_msg_get(M_TYPE_FABRIC); + + cf_atomic64_incr(&g_stats.fabric_connections_opened); + msg_set_uint64(m, FS_FIELD_NODE, g_config.self_node); + msg_set_uint32(m, FS_CHANNEL, ch); + m->benchmark_time = g_config.fabric_benchmarks_enabled ? cf_getns() : 0; + + fabric_connection *fc = fabric_connection_create(&sock, &addr); + + fc->s_msg_in_progress = m; + fc->started_via_connect = true; + fc->pool = &g_fabric.recv_pool[ch]; + + if (! 
fabric_node_add_connection(node, fc)) { + fabric_connection_release(fc); + pthread_mutex_unlock(&node->connect_lock); + return NULL; + } + + node->connect_count[ch]++; + node->connect_full = fabric_node_is_connect_full(node); + + pthread_mutex_unlock(&node->connect_lock); + + return fc; +} + +static int +fabric_node_send(fabric_node *node, msg *m, as_fabric_channel channel) +{ + if (! node || ! node->live) { + return AS_FABRIC_ERR_NO_NODE; + } + + while (true) { + // Sync with fabric_connection_process_writable() to avoid non-empty + // send_queue with every fc being in send_idle_fc_queue. + pthread_mutex_lock(&node->send_idle_fc_queue_lock); + + fabric_connection *fc; + int rv = cf_queue_pop(&node->send_idle_fc_queue[(int)channel], &fc, + CF_QUEUE_NOWAIT); + + if (rv != CF_QUEUE_OK) { + cf_queue_push(&node->send_queue[(int)channel], &m); + pthread_mutex_unlock(&node->send_idle_fc_queue_lock); + + if (! node->connect_full) { + fabric_node_connect_all(node); + } + + break; + } + + pthread_mutex_unlock(&node->send_idle_fc_queue_lock); + + if ((! cf_socket_exists(&fc->sock)) || fc->failed) { + fabric_connection_release(fc); // send_idle_fc_queue + continue; + } + + fc->s_msg_in_progress = m; + + // Wake up. + if (fc->send_ptr) { + fabric_connection_send_rearm(fc); // takes fc ref + } + else { + fabric_connection_send_assign(fc); // takes fc ref + } + + break; + } + + return AS_FABRIC_SUCCESS; +} + +static void +fabric_node_connect_all(fabric_node *node) +{ + if (! node->live) { + return; + } + + for (uint32_t ch = 0; ch < AS_FABRIC_N_CHANNELS; ch++) { + uint32_t n = g_fabric_connect_limit[ch] - node->connect_count[ch]; + + for (uint32_t i = 0; i < n; i++) { + fabric_connection *fc = fabric_node_connect(node, ch); + + if (! fc) { + break; + } + + // TLS connections are one-way. Outgoing connections are for + // outgoing data. 
+ if (fc->sock.state == CF_SOCKET_STATE_NON_TLS) { + fabric_recv_thread_pool_add_fc(&g_fabric.recv_pool[ch], fc); + cf_detail(AS_FABRIC, "{%16lX, %u} activated", fabric_connection_get_id(fc), fc->sock.fd); + } + + // Takes the remaining ref for send_poll and idle queue. + fabric_connection_send_assign(fc); + } + } +} + +static void +fabric_node_destructor(void *pnode) +{ + fabric_node *node = (fabric_node *)pnode; + cf_detail(AS_FABRIC, "fabric_node_destructor(%p)", node); + + for (int i = 0; i < AS_FABRIC_N_CHANNELS; i++) { + // send_idle_fc_queue section. + cf_assert(cf_queue_sz(&node->send_idle_fc_queue[i]) == 0, AS_FABRIC, "send_idle_fc_queue not empty as expected"); + cf_queue_destroy(&node->send_idle_fc_queue[i]); + + // send_queue section. + while (true) { + msg *m; + + if (cf_queue_pop(&node->send_queue[i], &m, CF_QUEUE_NOWAIT) != + CF_QUEUE_OK) { + break; + } + + as_fabric_msg_put(m); + } + + cf_queue_destroy(&node->send_queue[i]); + } + + pthread_mutex_destroy(&node->send_idle_fc_queue_lock); + + // connection_hash section. + cf_assert(cf_shash_get_size(node->fc_hash) == 0, AS_FABRIC, "fc_hash not empty as expected"); + cf_shash_destroy(node->fc_hash); + + pthread_mutex_destroy(&node->fc_hash_lock); +} + +inline static void +fabric_node_reserve(fabric_node *node) { + cf_rc_reserve(node); +} + +inline static void +fabric_node_release(fabric_node *node) +{ + int cnt = cf_rc_release(node); + + if (cnt == 0) { + fabric_node_destructor(node); + cf_rc_free(node); + } + else if (cnt < 0) { + cf_crash(AS_FABRIC, "fabric_node_release(%p) extra call", node); + } +} + +static bool +fabric_node_add_connection(fabric_node *node, fabric_connection *fc) +{ + pthread_mutex_lock(&node->fc_hash_lock); + + if (! 
node->live) { + pthread_mutex_unlock(&node->fc_hash_lock); + return false; + } + + fabric_node_reserve(node); + fc->node = node; + + fabric_connection_set_keepalive_options(fc); + fabric_connection_reserve(fc); // for put into node->fc_hash + + uint8_t value = 0; + int rv = cf_shash_put_unique(node->fc_hash, &fc, &value); + + cf_assert(rv == CF_SHASH_OK, AS_FABRIC, "fabric_node_add_connection(%p, %p) failed to add with rv %d", node, fc, rv); + + pthread_mutex_unlock(&node->fc_hash_lock); + + return true; +} + +static uint8_t +fabric_node_find_min_send_count(const fabric_node *node) +{ + uint8_t min = node->send_counts[0]; + + for (uint32_t i = 1; i < g_config.n_fabric_send_threads; i++) { + if (node->send_counts[i] < min) { + min = node->send_counts[i]; + } + } + + return min; +} + +static bool +fabric_node_is_connect_full(const fabric_node *node) +{ + for (int ch = 0; ch < AS_FABRIC_N_CHANNELS; ch++) { + if (node->connect_count[ch] < g_fabric_connect_limit[ch]) { + return false; + } + } + + return true; +} + + +static int +fabric_get_node_list_fn(const void *key, uint32_t keylen, void *data, + void *udata) +{ + node_list *nl = (node_list *)udata; + + if (nl->count == AS_CLUSTER_SZ) { + return 0; + } + + nl->nodes[nl->count] = *(const cf_node *)key; + nl->count++; + + return 0; +} + +// Get a list of all the nodes - use a dynamic array, which requires inline. 
+static uint32_t +fabric_get_node_list(node_list *nl) +{ + nl->count = 1; + nl->nodes[0] = g_config.self_node; + + pthread_mutex_lock(&g_fabric.node_hash_lock); + cf_rchash_reduce(g_fabric.node_hash, fabric_get_node_list_fn, nl); + pthread_mutex_unlock(&g_fabric.node_hash_lock); + + return nl->count; +} + + +//========================================================== +// fabric_buffer +// + +static fabric_buffer * +fabric_buffer_create(size_t sz) +{ + fabric_buffer *fb; + + if (cf_vector_pop(&g_fabric.fb_free, &fb) != 0) { + fb = cf_malloc(sizeof(fabric_buffer)); + } + + fabric_buffer_init(fb, sz); + + return fb; +} + +static void +fabric_buffer_init(fabric_buffer *fb, size_t sz) +{ + if (sz > FABRIC_BUFFER_MEM_SZ) { + fb->buf = (uint8_t *)cf_malloc(sz); + } + else { + fb->buf = fb->membuf; + } + + fb->progress = fb->buf; + fb->end = fb->buf + sz; +} + +static void +fabric_buffer_destroy(fabric_buffer *fb) +{ + fabric_buffer_free_extra(fb); + + if (cf_vector_size(&g_fabric.fb_free) > 64) { + cf_free(fb); + } + else if (cf_vector_append(&g_fabric.fb_free, &fb) != 0) { + cf_crash(AS_FABRIC, "push into %p failed on fb %p", &g_fabric.fb_free, fb); + } +} + +inline static void +fabric_buffer_free_extra(fabric_buffer *fb) +{ + if (fb->buf != fb->membuf) { + cf_free(fb->buf); + } +} + +// Resize fb after we know the msg_size. 
inline static bool
fabric_buffer_resize(fabric_buffer *fb, size_t sz)
{
	// Returns false only when sz exceeds FABRIC_BUFFER_MAX_SZ. Bytes already
	// received (buf up to progress) are preserved when moving off membuf.
	if (sz > FABRIC_BUFFER_MEM_SZ) {
		if (sz > FABRIC_BUFFER_MAX_SZ) {
			return false;
		}

		// Only valid while fb is still using its embedded storage.
		cf_assert(fb->buf == fb->membuf, AS_FABRIC, "function misuse");

		size_t old_sz = fb->progress - fb->membuf;

		fb->buf = (uint8_t *)cf_malloc(sz);

		memcpy(fb->buf, fb->membuf, old_sz);
		fb->progress = fb->buf + old_sz;
	}

	fb->end = fb->buf + sz;
	return true;
}


//==========================================================
// fabric_connection
//

// Allocate a zeroed, ref-counted fabric_connection holding copies of sock and
// peer, with a receive buffer sized for one msg_hdr and the expected type set
// to M_TYPE_FABRIC.
fabric_connection *
fabric_connection_create(cf_socket *sock, cf_sock_addr *peer)
{
	fabric_connection *fc = cf_rc_alloc(sizeof(fabric_connection));

	memset(fc, 0, sizeof(fabric_connection));

	cf_socket_copy(sock, &fc->sock);
	cf_sock_addr_copy(peer, &fc->peer);

	fc->r_buf_in_progress = fabric_buffer_create(sizeof(msg_hdr));
	fc->r_type = M_TYPE_FABRIC;

	return fc;
}

// Advance the server side of a TLS handshake on fc and re-register the socket
// with g_accept_poll for the events the handshake asks for.
// Returns false if the handshake reported EPOLLERR.
static bool
fabric_connection_accept_tls(fabric_connection *fc)
{
	int32_t tls_ev = tls_socket_accept(&fc->sock);

	if (tls_ev == EPOLLERR) {
		cf_warning(AS_FABRIC, "fabric TLS server handshake with %s failed", cf_sock_addr_print(&fc->peer));
		return false;
	}

	// A result of 0 is treated as "done": from here on wait for readability.
	if (tls_ev == 0) {
		tls_socket_must_not_have_data(&fc->sock, "fabric server handshake");
		tls_ev = EPOLLIN;
	}

	cf_poll_modify_socket(g_accept_poll, &fc->sock,
			tls_ev | EPOLLERR | EPOLLHUP | EPOLLRDHUP, fc);
	return true;
}

// Client-side counterpart of fabric_connection_accept_tls(): advance the
// handshake and re-register with the send thread's poll instance.
// Returns false if the handshake reported EPOLLERR.
static bool
fabric_connection_connect_tls(fabric_connection *fc)
{
	int32_t tls_ev = tls_socket_connect(&fc->sock);

	if (tls_ev == EPOLLERR) {
		cf_warning(AS_FABRIC, "fabric TLS client handshake with %s failed", cf_sock_addr_print(&fc->peer));
		return false;
	}

	// A result of 0 is treated as "done": from here on wait for writability.
	if (tls_ev == 0) {
		tls_socket_must_not_have_data(&fc->sock, "fabric client handshake");
		tls_ev = EPOLLOUT;
	}

	cf_poll_modify_socket(fc->send_ptr->poll, &fc->sock,
			tls_ev | DEFAULT_EVENTS, fc);
	return true;
}

// Take an additional reference on fc.
inline static void
fabric_connection_reserve(fabric_connection *fc)
{
	cf_rc_reserve(fc);
}

// Drop one reference on fc. When the count reaches zero the connection is torn
// down: any unsent message is requeued or dropped, the node reference is
// released, the socket is closed and the buffers are freed. A negative count
// is treated as a fatal accounting error.
static void
fabric_connection_release(fabric_connection *fc)
{
	int cnt = cf_rc_release(fc);

	if (cnt == 0) {
		if (fc->s_msg_in_progress) {
			// First message (s_count == 0) is initial M_TYPE_FABRIC message
			// and does not need to be saved.
			if (! fc->started_via_connect || fc->s_count != 0) {
				// NOTE(review): dereferences fc->node and fc->pool without a
				// NULL check - assumes both are set whenever a send is in
				// progress; confirm against callers.
				cf_queue_push(&fc->node->send_queue[fc->pool->pool_id],
						&fc->s_msg_in_progress);
			}
			else {
				as_fabric_msg_put(fc->s_msg_in_progress);
			}
		}

		if (fc->node) {
			fabric_node_release(fc->node);
			fc->node = NULL;
		}
		else {
			cf_detail(AS_FABRIC, "releasing fc %p not attached to a node", fc);
		}

		cf_socket_close(&fc->sock);
		cf_socket_term(&fc->sock);
		cf_atomic64_incr(&g_stats.fabric_connections_closed);

		// The receive buffer is a pooled object; the send buffer is embedded
		// in fc, so only its extra storage (if any) is freed.
		fabric_buffer_destroy(fc->r_buf_in_progress);
		fabric_buffer_free_extra(&fc->s_buf);

		cf_rc_free(fc);
	}
	else if (cnt < 0) {
		cf_crash(AS_FABRIC, "extra fabric_connection_release %p", fc);
	}
}

// Return the id of the node fc is attached to, or 0 if it has no node yet.
inline static cf_node
fabric_connection_get_id(const fabric_connection *fc)
{
	if (fc->node) {
		return fc->node->node_id;
	}

	return 0;
}

// epoll takes the reference of fc.
+static void +fabric_connection_send_assign(fabric_connection *fc) +{ + pthread_mutex_lock(&g_fabric.send_lock); + + send_entry **pp = &g_fabric.send_head; + uint8_t min = fabric_node_find_min_send_count(fc->node); + + while (true) { + uint32_t send_id = (*pp)->id; + + if (fc->node->send_counts[send_id] == min) { + break; + } + + cf_assert((*pp)->next, AS_FABRIC, "fabric_connection_send_assign() invalid send_count state"); + + pp = &(*pp)->next; + } + + send_entry *se = *pp; + + se->count++; + fc->node->send_counts[se->id]++; + + if (se->next && se->next->count < se->count) { + *pp = se->next; + send_entry_insert(pp, se); + } + + fc->send_ptr = se; + + pthread_mutex_unlock(&g_fabric.send_lock); + + cf_poll_add_socket(se->poll, &fc->sock, EPOLLOUT | DEFAULT_EVENTS, fc); +} + +static void +fabric_connection_send_unassign(fabric_connection *fc) +{ + pthread_mutex_lock(&g_fabric.send_lock); + + if (! fc->send_ptr) { + pthread_mutex_unlock(&g_fabric.send_lock); + return; + } + + send_entry **pp = &g_fabric.send_head; + send_entry *se = fc->send_ptr; + + while (*pp != se) { + cf_assert((*pp)->next, AS_FABRIC, "fabric_connection_send_unassign() invalid send_count state"); + + pp = &(*pp)->next; + } + + cf_assert(se->count != 0 || fc->node->send_counts[se->id] != 0, AS_FABRIC, "invalid send_count accounting se %p id %u count %u node send_count %u", + se, se->id, se->count, fc->node->send_counts[se->id]); + + se->count--; + fc->node->send_counts[se->id]--; + + *pp = se->next; + send_entry_insert(&g_fabric.send_head, se); + + fc->send_ptr = NULL; + + pthread_mutex_unlock(&g_fabric.send_lock); +} + +inline static void +fabric_connection_recv_rearm(fabric_connection *fc) +{ + cf_poll_modify_socket(fc->pool->poll, &fc->sock, + EPOLLIN | DEFAULT_EVENTS, fc); +} + +// epoll takes the reference of fc. 
inline static void
fabric_connection_send_rearm(fabric_connection *fc)
{
	// Re-register for EPOLLOUT with the poll instance of fc's send entry.
	cf_poll_modify_socket(fc->send_ptr->poll, &fc->sock,
			EPOLLOUT | DEFAULT_EVENTS, fc);
}

// Mark fc as failed, shut down its socket and detach it from its node: remove
// it from the node's fc_hash, adjust the outbound connect count if fc was
// started via connect, and remove it from the idle send queue. Each container
// it is removed from costs one reference. Returns early if fc has no node or
// is no longer in the node's fc_hash.
static void
fabric_connection_disconnect(fabric_connection *fc)
{
	fc->failed = true;
	cf_socket_shutdown(&fc->sock);

	fabric_node *node = fc->node;

	if (! node) {
		return;
	}

	pthread_mutex_lock(&node->fc_hash_lock);

	// Only the caller that actually removes fc from the hash continues.
	if (cf_shash_delete(node->fc_hash, &fc) != CF_SHASH_OK) {
		cf_detail(AS_FABRIC, "fc %p is not in (node %p)->fc_hash", fc, node);
		pthread_mutex_unlock(&node->fc_hash_lock);
		return;
	}

	pthread_mutex_unlock(&node->fc_hash_lock);

	if (fc->started_via_connect) {
		pthread_mutex_lock(&node->connect_lock);

		cf_atomic32_decr(&node->connect_count[fc->pool->pool_id]);
		node->connect_full = false;

		pthread_mutex_unlock(&node->connect_lock);
	}

	pthread_mutex_lock(&node->send_idle_fc_queue_lock);

	// NOTE(review): fc->pool is dereferenced here without a NULL check, while
	// the log statement below guards against a NULL pool - confirm whether a
	// connection can be in fc_hash before fc->pool is set.
	if (cf_queue_delete(&node->send_idle_fc_queue[fc->pool->pool_id], &fc,
			true) == CF_QUEUE_OK) {
		fabric_connection_release(fc); // for delete from send_idle_fc_queue
	}

	pthread_mutex_unlock(&node->send_idle_fc_queue_lock);

	cf_detail(AS_FABRIC, "fabric_connection_disconnect(%p) {pool=%u id=%lx fd=%u}",
			fc, fc->pool ? fc->pool->pool_id : 0,
			node ? node->node_id : (cf_node)0, fc->sock.fd);

	fabric_connection_release(fc); // for delete from node->fc_hash
}

// Apply the configured keep-alive time/interval/probes to fc's socket, if
// fabric keep-alive is enabled in the configuration.
static void
fabric_connection_set_keepalive_options(fabric_connection *fc)
{
	if (g_config.fabric_keepalive_enabled) {
		cf_socket_keep_alive(&fc->sock, g_config.fabric_keepalive_time,
				g_config.fabric_keepalive_intvl,
				g_config.fabric_keepalive_probes);
	}
}

// Hand fc's in-progress outbound message back to the node for sending on
// another connection. The message is released instead if it is the initial
// M_TYPE_FABRIC message or if the node refuses it. Either way fc no longer
// owns a message afterwards.
static void
fabric_connection_reroute_msg(fabric_connection *fc)
{
	if (! fc->s_msg_in_progress) {
		return;
	}

	// Don't reroute initial M_TYPE_FABRIC message.
	if ((fc->started_via_connect && fc->s_count == 0) ||
			fabric_node_send(fc->node, fc->s_msg_in_progress,
					fc->pool->pool_id) != AS_FABRIC_SUCCESS) {
		as_fabric_msg_put(fc->s_msg_in_progress);
	}

	fc->s_msg_in_progress = NULL;
}

// Send as much as possible of fc->s_msg_in_progress.
//
// A fresh message is first serialized into fc->s_buf; a partially sent one
// resumes from s_buf.progress. On a complete send the message is released and
// s_msg_in_progress is cleared; on a partial send only the progress pointer
// advances. A hard socket error marks fc failed and shuts down the write side.
// is_last == false adds MSG_MORE to the send flags.
static void
fabric_connection_send_progress(fabric_connection *fc, bool is_last)
{
	uint8_t *send_progress;
	size_t send_full;

	// s_buf.buf is reset to NULL after every complete send, so non-NULL means
	// a message is already serialized and partly sent.
	if (fc->s_buf.buf) {
		// Partially sent msg.
		send_progress = fc->s_buf.progress;
		send_full = fc->s_buf.end - send_progress;
	}
	else {
		// Fresh msg.
		msg *m = fc->s_msg_in_progress;

		send_full = msg_get_wire_size(m);
		fabric_buffer_init(&fc->s_buf, send_full);

		send_progress = fc->s_buf.progress;
		msg_to_wire(m, send_progress);

		if (m->benchmark_time != 0) {
			m->benchmark_time = histogram_insert_data_point(
					g_stats.fabric_send_init_hists[fc->pool->pool_id],
					m->benchmark_time);
		}
	}

	int32_t flags = MSG_NOSIGNAL | (is_last ? 0 : MSG_MORE);
	int32_t send_sz = cf_socket_send(&fc->sock, send_progress, send_full,
			flags);

	if (send_sz < 0) {
		if (errno != EAGAIN && errno != EWOULDBLOCK) {
			// Hard error - the message stays in s_msg_in_progress.
			fc->failed = true;
			cf_socket_write_shutdown(&fc->sock);
			return;
		}

		send_sz = 0; // treat as sending 0
	}

	if (fc->s_msg_in_progress->benchmark_time != 0) {
		fc->s_msg_in_progress->benchmark_time = histogram_insert_data_point(
				g_stats.fabric_send_fragment_hists[fc->pool->pool_id],
				fc->s_msg_in_progress->benchmark_time);
	}

	fc->s_bytes += send_sz;

	if ((size_t)send_sz == send_full) {
		// Complete send.
		as_fabric_msg_put(fc->s_msg_in_progress);
		fc->s_msg_in_progress = NULL;
		fabric_buffer_free_extra(&fc->s_buf);
		fc->s_buf.buf = NULL;
		fc->s_count++;
	}
	else {
		// Partial send.
		fc->s_buf.progress += send_sz;
	}
}

// Must rearm or place into idle queue on success.
//
// Drains the node's send queue for fc's channel through fc until either a send
// does not complete (re-arm for EPOLLOUT) or the queue is empty (park fc in
// the node's idle queue). Returns false if fc was entered without a message,
// or if the node is no longer live or fc has failed.
static bool
fabric_connection_process_writable(fabric_connection *fc)
{
	// Strategy with MSG_MORE to prevent small packets during migration.
	// Case 1 - socket buffer not full:
	//    Send all messages except last with MSG_MORE. Last message flushes
	//    buffer.
	// Case 2 - socket buffer full:
	//    All messages get sent with MSG_MORE but because buffer full, small
	//    packets still won't happen.
	fabric_node *node = fc->node;
	uint32_t pool = fc->pool->pool_id;

	if (! fc->s_msg_in_progress) {
		// TODO - Change to load op when atomic API is ready.
		// Also should be rare or not even happen in x86_64.
		cf_warning(AS_FABRIC, "fc(%p)->s_msg_in_progress NULL on entry", fc);
		return false;
	}

	while (fc->s_msg_in_progress) {
		msg *pending = NULL;

		// Peek at the next message so the current one can be flagged as last
		// (no MSG_MORE) when nothing follows it.
		cf_queue_pop(&node->send_queue[pool], &pending, CF_QUEUE_NOWAIT);
		fabric_connection_send_progress(fc, ! pending);

		if (fc->s_msg_in_progress) {
			// Current message not fully sent - put the look-ahead back at the
			// head of the queue and wait for the socket to become writable.
			if (pending) {
				cf_queue_push_head(&node->send_queue[pool], &pending);
			}

			fabric_connection_send_rearm(fc);
			return true;
		}

		fc->s_msg_in_progress = pending;
	}

	if (! fc->node->live || fc->failed) {
		return false;
	}

	// Try with bigger lock block to sync with as_fabric_send().
	pthread_mutex_lock(&node->send_idle_fc_queue_lock);

	// Re-check under the lock.
	if (! fc->node->live || fc->failed) {
		pthread_mutex_unlock(&node->send_idle_fc_queue_lock);
		return false;
	}

	if (cf_queue_pop(&node->send_queue[pool], &fc->s_msg_in_progress,
			CF_QUEUE_NOWAIT) == CF_QUEUE_EMPTY) {
		// Nothing left to send - park fc until a sender picks it up.
		cf_queue_push(&node->send_idle_fc_queue[pool], &fc);
		pthread_mutex_unlock(&node->send_idle_fc_queue_lock);
		return true;
	}

	pthread_mutex_unlock(&node->send_idle_fc_queue_lock);

	fabric_connection_send_rearm(fc);

	return true;
}

// Return true on success.
+static bool +fabric_connection_process_fabric_msg(fabric_connection *fc, const msg *m) +{ + cf_poll_delete_socket(g_accept_poll, &fc->sock); + + cf_node node_id; + + if (msg_get_uint64(m, FS_FIELD_NODE, &node_id) != 0) { + cf_warning(AS_FABRIC, "process_fabric_msg: failed to read M_TYPE_FABRIC node"); + return false; + } + + cf_detail(AS_FABRIC, "process_fabric_msg: M_TYPE_FABRIC from node %lx", node_id); + + fabric_node *node = fabric_node_get_or_create(node_id); + + if (! fabric_node_add_connection(node, fc)) { + fabric_node_release(node); // from cf_rchash_get + return false; + } + + uint32_t pool_id = AS_FABRIC_N_CHANNELS; // illegal value + + msg_get_uint32(m, FS_CHANNEL, &pool_id); + + if (pool_id >= AS_FABRIC_N_CHANNELS) { + fabric_node_release(node); // from cf_rchash_get + return false; + } + + fabric_buffer_free_extra(fc->r_buf_in_progress); + fabric_buffer_init(fc->r_buf_in_progress, sizeof(msg_hdr)); + fc->r_msg_size = 0; + + // fc->pool needs to be set before placing into send_idle_fc_queue. + fabric_recv_thread_pool_add_fc(&g_fabric.recv_pool[pool_id], fc); + + // TLS connections are one-way. Incoming connections are for + // incoming data. + if (fc->sock.state == CF_SOCKET_STATE_NON_TLS) { + pthread_mutex_lock(&node->send_idle_fc_queue_lock); + + if (node->live && ! 
fc->failed) { + fabric_connection_reserve(fc); // for send poll & idleQ + + if (cf_queue_pop(&node->send_queue[pool_id], &fc->s_msg_in_progress, + CF_QUEUE_NOWAIT) == CF_QUEUE_EMPTY) { + cf_queue_push(&node->send_idle_fc_queue[pool_id], &fc); + } + else { + fabric_connection_send_assign(fc); + } + } + + pthread_mutex_unlock(&node->send_idle_fc_queue_lock); + } + + fabric_node_release(node); // from cf_rchash_get + fabric_connection_release(fc); // from g_accept_poll + + return true; +} + +static bool +fabric_connection_read_fabric_msg(fabric_connection *fc) +{ + fabric_buffer *fb = fc->r_buf_in_progress; + + while (true) { + size_t recv_full = fb->end - fb->progress; + int32_t recv_sz = cf_socket_recv(&fc->sock, fb->progress, recv_full, 0); + + if (recv_sz < 0) { + if (errno != EAGAIN && errno != EWOULDBLOCK) { + cf_warning(AS_FABRIC, "fabric_connection_read_fabric_msg() recv_sz %d errno %d %s", recv_sz, errno, cf_strerror(errno)); + return false; + } + + break; + } + + if (recv_sz == 0) { + cf_detail(AS_FABRIC, "fabric_connection_read_fabric_msg(%p) fb=%p recv_sz == 0 / %zu", fc, fb, recv_full); + return false; + } + + fb->progress += recv_sz; + fc->r_bytes += recv_sz; + + if ((size_t)recv_sz < recv_full) { + tls_socket_must_not_have_data(&fc->sock, "partial fabric read"); + break; + } + + if (fc->r_msg_size == 0) { + size_t hdr_sz = fb->progress - fb->buf; + + if (msg_get_initial( + &fc->r_msg_size, &fc->r_type, fb->buf, hdr_sz) != 0) { + cf_crash(AS_FABRIC, "fb->end was not initialized correctly"); + } + + if (! 
fabric_buffer_resize(fb, fc->r_msg_size)) { + cf_warning(AS_FABRIC, "fabric_connection_read_fabric_msg(%p) invalid msg_size %u remote 0x%lx", fc, fc->r_msg_size, fabric_connection_get_id(fc)); + return false; + } + + continue; + } + + tls_socket_must_not_have_data(&fc->sock, "full fabric read"); + + if (fc->r_type != M_TYPE_FABRIC) { + cf_warning(AS_FABRIC, "fabric_connection_read_fabric_msg() expected type M_TYPE_FABRIC(%d) got type %d", M_TYPE_FABRIC, fc->r_type); + return false; + } + + msg *m = as_fabric_msg_get(M_TYPE_FABRIC); + + if (msg_parse(m, fb->buf, fc->r_msg_size) != 0) { + cf_warning(AS_FABRIC, "msg_parse failed for fc %p fb %p", fc, fb); + as_fabric_msg_put(m); + return false; + } + + bool ret = fabric_connection_process_fabric_msg(fc, m); + as_fabric_msg_put(m); + + return ret; + } + + return true; +} + +// Return true on success. +// Must have re-armed on success. +static bool +fabric_connection_process_msg(fabric_connection *fc, bool do_rearm) +{ + msg *m = as_fabric_msg_get(fc->r_type); + + if (! m) { + cf_warning(AS_FABRIC, "Failed to create message for type %d (max %d)", fc->r_type, M_TYPE_MAX); + return false; + } + + fabric_buffer *fb = fc->r_buf_in_progress; + + if (msg_parse(m, fb->buf, fc->r_msg_size) != 0) { + cf_warning(AS_FABRIC, "msg_parse failed for fc %p fb %p", fc, fb); + as_fabric_msg_put(m); + return false; + } + + cf_assert(fc->node, AS_FABRIC, "process_msg: no node assigned"); + + // Save some state for after re-arm. + cf_node node = fc->node->node_id; + uint64_t bt = fc->benchmark_time; + uint32_t ch = fc->pool->pool_id; + + fc->r_msg_size = 0; + + if (do_rearm) { + // Re-arm for next message (possibly handled in another thread). 
+ fc->r_buf_in_progress = fabric_buffer_create(sizeof(msg_hdr)); + fabric_connection_recv_rearm(fc); // do not use fc after this point + } + + if (g_fabric.msg_cb[m->type]) { + (g_fabric.msg_cb[m->type])(node, m, g_fabric.msg_udata[m->type]); + + if (bt != 0) { + histogram_insert_data_point(g_stats.fabric_recv_cb_hists[ch], bt); + } + } + else { + cf_warning(AS_FABRIC, "process_msg: could not deliver message type %d", m->type); + as_fabric_msg_put(m); + } + + if (do_rearm) { + fabric_buffer_destroy(fb); + } + + return true; +} + +// Return true on success. +// Must have re-armed on success. +static bool +fabric_connection_process_readable(fabric_connection *fc) +{ + fabric_buffer *fb = fc->r_buf_in_progress; + size_t recv_all = 0; + + while (true) { + size_t recv_full = fb->end - fb->progress; + int32_t recv_sz = cf_socket_recv(&fc->sock, fb->progress, recv_full, 0); + + if (recv_sz < 0) { + if (errno != EAGAIN && errno != EWOULDBLOCK) { + cf_warning(AS_FABRIC, "fabric_connection_process_readable() recv_sz %d errno %d %s", recv_sz, errno, cf_strerror(errno)); + return false; + } + + break; + } + + if (recv_sz == 0) { + cf_detail(AS_FABRIC, "fabric_connection_process_readable(%p) fb=%p recv_sz == 0 / %zu", fc, fb, recv_full); + return false; + } + + fb->progress += recv_sz; + fc->r_bytes += recv_sz; + recv_all += recv_sz; + + if (fc->r_msg_size == 0) { + fc->benchmark_time = g_config.fabric_benchmarks_enabled ? + cf_getns() : 0; + } + + if ((size_t)recv_sz < recv_full) { + if (fc->benchmark_time != 0) { + fc->benchmark_time = histogram_insert_data_point( + g_stats.fabric_recv_fragment_hists[fc->pool->pool_id], + fc->benchmark_time); + } + + break; + } + + if (fc->r_msg_size == 0) { + size_t hdr_sz = fb->progress - fb->buf; + + if (msg_get_initial( + &fc->r_msg_size, &fc->r_type, fb->buf, hdr_sz) != 0) { + cf_crash(AS_FABRIC, "fb->end was not initialized correctly"); + } + + if (! 
fabric_buffer_resize(fb, fc->r_msg_size)) { + cf_warning(AS_FABRIC, "fabric_connection_process_readable(%p) invalid msg_size %u remote 0x%lx", fc, fc->r_msg_size, fabric_connection_get_id(fc)); + return false; + } + + continue; + } + + bool do_rearm = recv_all > (size_t)g_config.fabric_recv_rearm_threshold; + + if (! fabric_connection_process_msg(fc, do_rearm)) { + return false; + } + + if (do_rearm) { + // Already rearmed. + return true; + } + + fabric_buffer_free_extra(fc->r_buf_in_progress); + fabric_buffer_init(fc->r_buf_in_progress, sizeof(msg_hdr)); + } + + fabric_connection_recv_rearm(fc); + return true; +} + + +//========================================================== +// fabric_recv_thread_pool +// + +static void +fabric_recv_thread_pool_init(fabric_recv_thread_pool *pool, uint32_t size, + uint32_t pool_id) +{ + cf_vector_init(&pool->threads, sizeof(pthread_t), size, 0); + cf_poll_create(&pool->poll); + pool->pool_id = pool_id; +} + +// Called only at startup or under set-config lock. Caller has checked size. 
+static void +fabric_recv_thread_pool_set_size(fabric_recv_thread_pool *pool, uint32_t size) +{ + while (size < cf_vector_size(&pool->threads)) { + pthread_t th; + cf_vector_pop(&pool->threads, &th); + pthread_cancel(th); + } + + pthread_t thread; + pthread_attr_t attrs; + + pthread_attr_init(&attrs); + pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED); + + while (size > cf_vector_size(&pool->threads)) { + if (pthread_create(&thread, &attrs, run_fabric_recv, pool) != 0) { + cf_crash(AS_FABRIC, "could not create fabric recv thread"); + } + + cf_vector_append(&pool->threads, &thread); + } +} + +static void +fabric_recv_thread_pool_add_fc(fabric_recv_thread_pool *pool, + fabric_connection *fc) +{ + fabric_connection_reserve(fc); // extra ref for poll + fc->pool = pool; + + uint32_t recv_events = EPOLLIN | DEFAULT_EVENTS; + + cf_poll_add_socket(pool->poll, &fc->sock, recv_events, fc); +} + + +//========================================================== +// fabric_endpoint +// + +// Get the endpoint list to connect to the remote node. +// Returns true on success where errno will be set to ENOENT if there is no +// endpoint list could be obtained for this node and ENOMEM if the input +// endpoint_list_size is less than actual size. Var endpoint_list_size will be +// updated with the required capacity. +static bool +fabric_endpoint_list_get(cf_node nodeid, as_endpoint_list *endpoint_list, + size_t *endpoint_list_size) +{ + as_hb_plugin_node_data plugin_data = { + .data_capacity = *endpoint_list_size, + .data = endpoint_list, + .data_size = 0, + }; + + if (as_hb_plugin_data_get(nodeid, AS_HB_PLUGIN_FABRIC, &plugin_data, NULL, + NULL) == 0) { + return plugin_data.data_size != 0; + } + + if (errno == ENOENT) { + return false; + } + + // Not enough allocated memory. + *endpoint_list_size = plugin_data.data_size; + + return false; +} + +// Filter out endpoints not matching this node's capabilities. 
+static bool +fabric_connect_endpoint_filter(const as_endpoint *endpoint, void *udata) +{ + if (cf_ip_addr_legacy_only() && + endpoint->addr_type == AS_ENDPOINT_ADDR_TYPE_IPv6) { + return false; + } + + // If we don't offer TLS, then we won't connect via TLS, either. + if (g_config.tls_fabric.bind_port == 0 && + as_endpoint_capability_is_supported(endpoint, + AS_ENDPOINT_TLS_MASK)) { + return false; + } + + return true; +} + + +//========================================================== +// Thread functions. +// + +static void * +run_fabric_recv(void *arg) +{ + int oldstate; + pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &oldstate); + + fabric_recv_thread_pool *pool = (fabric_recv_thread_pool *)arg; + static int worker_id_counter = 0; + uint64_t worker_id = worker_id_counter++; + cf_poll poll = pool->poll; + + cf_detail(AS_FABRIC, "run_fabric_recv() created index %lu", worker_id); + + pthread_cleanup_push(run_fabric_recv_cleanup, (void *)worker_id); + + while (true) { + pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, &oldstate); + pthread_testcancel(); + pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &oldstate); + + cf_poll_event events[FABRIC_EPOLL_RECV_EVENTS]; + int32_t n = cf_poll_wait(poll, events, FABRIC_EPOLL_RECV_EVENTS, -1); + + for (int32_t i = 0; i < n; i++) { + fabric_connection *fc = events[i].data; + + if (fc->node && ! fc->node->live) { + fabric_connection_disconnect(fc); + fabric_connection_release(fc); + continue; + } + + // Handle remote close, socket errors. + // Also triggered by call to cf_socket_shutdown(fc->sock), but only + // first call. + // Not triggered by cf_socket_close(fc->sock), which automatically + // does EPOLL_CTL_DEL. + if (events[i].events & (EPOLLERR | EPOLLHUP | EPOLLRDHUP)) { + cf_detail(AS_FABRIC, "%lu: epoll : error, will close: fc %p fd %d errno %d signal {err:%d, hup:%d, rdhup:%d}", + worker_id, + fc, CSFD(&fc->sock), errno, + ((events[i].events & EPOLLERR) ? 1 : 0), + ((events[i].events & EPOLLHUP) ? 
1 : 0), + ((events[i].events & EPOLLRDHUP) ? 1 : 0)); + fabric_connection_disconnect(fc); + fabric_connection_release(fc); + continue; + } + + cf_assert(events[i].events == EPOLLIN, AS_FABRIC, "epoll not setup correctly for %p", fc); + + if (! fabric_connection_process_readable(fc)) { + fabric_connection_disconnect(fc); + fabric_connection_release(fc); + continue; + } + } + } + + pthread_cleanup_pop(0); + return NULL; +} + +static void +run_fabric_recv_cleanup(void *arg) +{ + uint64_t worker_id = (uint64_t)arg; + + cf_detail(AS_FABRIC, "run_fabric_recv() canceling index %lu", worker_id); +} + +static void * +run_fabric_send(void *arg) +{ + send_entry *se = (send_entry *)arg; + cf_poll poll = se->poll; + + cf_detail(AS_FABRIC, "run_fabric_send() fd %d id %u", poll.fd, se->id); + + while (true) { + cf_poll_event events[FABRIC_EPOLL_SEND_EVENTS]; + int32_t n = cf_poll_wait(poll, events, FABRIC_EPOLL_SEND_EVENTS, -1); + + for (int32_t i = 0; i < n; i++) { + fabric_connection *fc = events[i].data; + + if (fc->node && ! fc->node->live) { + fabric_connection_disconnect(fc); + fabric_connection_send_unassign(fc); + fabric_connection_release(fc); + continue; + } + + // Handle remote close, socket errors. Also triggered by call to + // cf_socket_shutdown(fb->sock), but only first call. Not triggered + // by cf_socket_close(fb->sock), which automatically EPOLL_CTL_DEL. + if (events[i].events & (EPOLLERR | EPOLLHUP | EPOLLRDHUP)) { + cf_detail(AS_FABRIC, "epoll : error, will close: fc %p fd %d errno %d signal {err:%d, hup:%d, rdhup:%d}", + fc, CSFD(&fc->sock), errno, + ((events[i].events & EPOLLERR) ? 1 : 0), + ((events[i].events & EPOLLHUP) ? 1 : 0), + ((events[i].events & EPOLLRDHUP) ? 1 : 0)); + fabric_connection_disconnect(fc); + fabric_connection_send_unassign(fc); + fabric_connection_reroute_msg(fc); + fabric_connection_release(fc); + continue; + } + + if (tls_socket_needs_handshake(&fc->sock)) { + if (! 
fabric_connection_connect_tls(fc)) { + fabric_connection_disconnect(fc); + fabric_connection_send_unassign(fc); + fabric_connection_reroute_msg(fc); + fabric_connection_release(fc); + } + + continue; + } + + cf_assert(events[i].events == EPOLLOUT, AS_FABRIC, "epoll not setup correctly for %p", fc); + + if (! fabric_connection_process_writable(fc)) { + fabric_connection_disconnect(fc); + fabric_connection_send_unassign(fc); + fabric_connection_reroute_msg(fc); + fabric_connection_release(fc); + continue; + } + } + } + + return 0; +} + +static void * +run_fabric_accept(void *arg) +{ + cf_sockets sockset; + + if (cf_socket_init_server(&g_fabric_bind, &sockset) < 0) { + cf_crash(AS_FABRIC, "Could not create fabric listener socket - check configuration"); + } + + cf_poll_create(&g_accept_poll); + cf_poll_add_sockets(g_accept_poll, &sockset, EPOLLIN | EPOLLERR | EPOLLHUP); + cf_socket_show_server(AS_FABRIC, "fabric", &sockset); + + while (true) { + // Accept new connections on the service socket. 
+ cf_poll_event events[64]; + int32_t n = cf_poll_wait(g_accept_poll, events, 64, -1); + + for (int32_t i = 0; i < n; i++) { + cf_socket *ssock = events[i].data; + + if (cf_sockets_has_socket(&sockset, ssock)) { + cf_socket csock; + cf_sock_addr sa; + + if (cf_socket_accept(ssock, &csock, &sa) < 0) { + if (errno == EMFILE) { + cf_warning(AS_FABRIC, "low on file descriptors"); + continue; + } + else { + cf_crash(AS_FABRIC, "cf_socket_accept: %d %s", errno, cf_strerror(errno)); + } + } + + cf_detail(AS_FABRIC, "fabric_accept: accepting new sock %d", CSFD(&csock)); + cf_atomic64_incr(&g_stats.fabric_connections_opened); + + fabric_connection *fc = fabric_connection_create(&csock, &sa); + + cf_sock_cfg *cfg = ssock->cfg; + + if (cfg->owner == CF_SOCK_OWNER_FABRIC_TLS) { + tls_socket_prepare_server(g_fabric_tls, &fc->sock); + } + + uint32_t events = EPOLLIN | EPOLLERR | EPOLLHUP | EPOLLRDHUP; + cf_poll_add_socket(g_accept_poll, &fc->sock, events, fc); + } + else { + fabric_connection *fc = events[i].data; + + if (events[i].events & (EPOLLERR | EPOLLHUP | EPOLLRDHUP)) { + fabric_connection_release(fc); + continue; + } + + if (tls_socket_needs_handshake(&fc->sock)) { + if (! fabric_connection_accept_tls(fc)) { + fabric_connection_release(fc); + } + + continue; + } + + if (! fabric_connection_read_fabric_msg(fc)) { + fabric_connection_release(fc); + continue; + } + } + } + } + + return 0; +} + +static int +fabric_rate_node_reduce_fn(const void *key, uint32_t keylen, void *data, + void *udata) +{ + fabric_node *node = (fabric_node *)data; + fabric_rate *rate = (fabric_rate *)udata; + + pthread_mutex_lock(&node->fc_hash_lock); + cf_shash_reduce(node->fc_hash, fabric_rate_fc_reduce_fn, rate); + pthread_mutex_unlock(&node->fc_hash_lock); + + return 0; +} + +static int +fabric_rate_fc_reduce_fn(const void *key, void *data, void *udata) +{ + fabric_connection *fc = *(fabric_connection **)key; + fabric_rate *rate = (fabric_rate *)udata; + + if (! 
fc->pool) { + return 0; + } + + uint32_t pool_id = fc->pool->pool_id; + uint64_t r_bytes = fc->r_bytes; + uint64_t s_bytes = fc->s_bytes; + + rate->r_bytes[pool_id] += r_bytes - fc->r_bytes_last; + rate->s_bytes[pool_id] += s_bytes - fc->s_bytes_last; + + fc->r_bytes_last = r_bytes; + fc->s_bytes_last = s_bytes; + + return 0; +} + + +//========================================================== +// Heartbeat. +// + +// Set the fabric advertised endpoints. +static void +fabric_hb_plugin_set_fn(msg *m) +{ + if (m->type == M_TYPE_HEARTBEAT_V2) { + // In v1 and v2 fabric does not advertise its endpoints and they + // do not support plugged in data. + return; + } + + if (! fabric_published_endpoints_refresh()) { + cf_warning(AS_FABRIC, "No publish addresses found for fabric."); + return; + } + + size_t payload_size = 0; + + if (as_endpoint_list_sizeof( + g_published_endpoint_list, &payload_size) != 0) { + cf_crash(AS_FABRIC, "Error getting endpoint list size for published addresses."); + } + + if (msg_set_buf(m, AS_HB_MSG_FABRIC_DATA, + (uint8_t *)g_published_endpoint_list, payload_size, + MSG_SET_COPY) != 0) { + cf_crash(AS_FABRIC, "Error setting succession list on msg."); + } +} + +// Plugin function that parses succession list out of a heartbeat pulse message. +static void +fabric_hb_plugin_parse_data_fn(msg *m, cf_node source, + as_hb_plugin_node_data *plugin_data) +{ + if (m->type == M_TYPE_HEARTBEAT_V2) { + plugin_data->data_size = 0; + return; + } + + uint8_t *payload = NULL; + size_t payload_size = 0; + + if (msg_get_buf(m, AS_HB_MSG_FABRIC_DATA, &payload, &payload_size, + MSG_GET_DIRECT) != 0) { + cf_warning(AS_FABRIC, "Unable to read fabric published endpoint list from heartbeat from node %lx", source); + return; + } + + if (payload_size > plugin_data->data_capacity) { + // Round up to nearest multiple of block size to prevent very frequent + // reallocation. 
+ size_t data_capacity = ((payload_size + HB_PLUGIN_DATA_BLOCK_SIZE - 1) / + HB_PLUGIN_DATA_BLOCK_SIZE) * HB_PLUGIN_DATA_BLOCK_SIZE; + + // Reallocate since we have outgrown existing capacity. + plugin_data->data = cf_realloc(plugin_data->data, data_capacity); + + plugin_data->data_capacity = data_capacity; + } + + plugin_data->data_size = payload_size; + + memcpy(plugin_data->data, payload, payload_size); +} + +// Function is called when a new node created or destroyed on the heartbeat +// system. +// This will insert a new element in the hashtable that keeps track of all TCP +// connections. +static void +fabric_heartbeat_event(int nevents, as_hb_event_node *events, void *udata) +{ + if ((nevents < 1) || (nevents > AS_CLUSTER_SZ) || ! events) { + cf_warning(AS_FABRIC, "fabric: received event count of %d", nevents); + return; + } + + for (int i = 0; i < nevents; i++) { + switch (events[i].evt) { + case AS_HB_NODE_ARRIVE: { + fabric_node *node = fabric_node_get_or_create(events[i].nodeid); + fabric_node_release(node); // for node_get_or_create() + + cf_info(AS_FABRIC, "fabric: node %lx arrived", events[i].nodeid); + } + break; + case AS_HB_NODE_DEPART: + cf_info(AS_FABRIC, "fabric: node %lx departed", events[i].nodeid); + fabric_node_disconnect(events[i].nodeid); + break; + case AS_HB_NODE_ADJACENCY_CHANGED: + // Not relevant to fabric. + break; + default: + cf_warning(AS_FABRIC, "fabric: received unknown event type %d %lx", events[i].evt, events[i].nodeid); + break; + } + } +} + + +//============================================================================== +// Fabric transact. +// + +//========================================================== +// Constants and typedefs. +// + +typedef enum { + TRANSACT_CODE_REQUEST = 1, + TRANSACT_CODE_RESPONSE = 2, +} transact_code; + +// Operation to be performed on transaction in retransmission hash. 
typedef enum {
	TRANSACT_OP_TIMEOUT = 1,
	TRANSACT_OP_RETRANSMIT = 2,
} transact_op;

// State of one outbound transaction, ref-counted and stored in
// g_fabric_transact_xmit_hash keyed by tid.
typedef struct fabric_transact_xmit_s {
	uint64_t tid; // transaction id, without the request/response code
	cf_node node_id; // destination node
	msg *m; // request message, released by the destructor
	pthread_mutex_t lock; // guards cb

	uint64_t deadline_ms; // start time + caller's timeout
	uint64_t retransmit_ms; // start time + retransmit_wait at creation
	int retransmit_wait; // initialized to 10 (ms)

	as_fabric_transact_complete_fn cb; // set to NULL once invoked
	void *udata; // passed back to cb
} fabric_transact_xmit;

// Handle given to the receive callback for an inbound request; consumed by
// as_fabric_transact_reply() to address the response.
typedef struct fabric_transact_recv_s {
	cf_node node_id; // where it came from
	uint64_t tid; // inbound tid
} fabric_transact_recv;

typedef struct transact_recv_key_s {
	uint64_t tid;
	cf_node node_id;
} __attribute__ ((__packed__)) transact_recv_key;

// Linked list element naming a transaction (tid) and an operation to apply to
// it. NOTE(review): op presumably holds a transact_op value - confirm.
typedef struct ll_fabric_transact_xmit_element_s {
	cf_ll_element ll_e;
	int op;
	uint64_t tid;
} ll_fabric_transact_xmit_element;


//==========================================================
// Globals.
//

// Source of transaction ids - incremented for every started transaction.
static cf_atomic64 g_fabric_transact_tid = 0;
// Outstanding outbound transactions, keyed by tid.
static cf_rchash *g_fabric_transact_xmit_hash = NULL;
// Per message type receive callback and its udata, set by
// as_fabric_transact_register().
static as_fabric_transact_recv_fn fabric_transact_recv_cb[M_TYPE_MAX] = { 0 };
static void *fabric_transact_recv_udata[M_TYPE_MAX] = { 0 };


//==========================================================
// Forward declarations and inlines.
+// + +static void fabric_transact_xmit_destructor(void *object); +static void fabric_transact_xmit_release(fabric_transact_xmit *ft); +static int fabric_transact_msg_fn(cf_node node_id, msg *m, void *udata); +static void *run_fabric_transact(void *arg); +static void ll_ftx_destructor_fn(cf_ll_element *e); +static int fabric_transact_xmit_reduce_fn(const void *key, uint32_t keylen, void *o, void *udata); +static int ll_ftx_reduce_fn(cf_ll_element *le, void *udata); + +inline static transact_code +tid_code_get(uint64_t tid) +{ + return (transact_code)(tid >> 56); +} + +inline static uint64_t +tid_code_set(uint64_t tid, transact_code code) +{ + return tid | (((uint64_t)code) << 56); +} + +inline static uint64_t +tid_code_clear(uint64_t tid) +{ + return tid & 0xFFffffFFFFffff; +} + + +//========================================================== +// Public API. +// + +void +as_fabric_transact_init() +{ + cf_rchash_create(&g_fabric_transact_xmit_hash, cf_rchash_fn_u32, + fabric_transact_xmit_destructor, sizeof(uint64_t), 64, + CF_RCHASH_MANY_LOCK); + + pthread_t thread; + pthread_attr_t attrs; + + pthread_attr_init(&attrs); + pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED); + + if (pthread_create(&thread, &attrs, run_fabric_transact, NULL) != 0) { + cf_crash(AS_FABRIC, "could not create fabric transact thread"); + } +} + +void +as_fabric_transact_start(cf_node dest, msg *m, int timeout_ms, + as_fabric_transact_complete_fn cb, void *udata) +{ + // TODO - could check it against the list of global message ids. 
+ + if (msg_field_get_type(m, 0) != M_FT_UINT64) { + // error + cf_warning(AS_FABRIC, "as_fabric_transact: first field must be int64"); + (cb)(NULL, udata, AS_FABRIC_ERR_UNKNOWN); + return; + } + + fabric_transact_xmit *ft = cf_rc_alloc(sizeof(fabric_transact_xmit)); + + ft->tid = cf_atomic64_incr(&g_fabric_transact_tid); + ft->node_id = dest; + ft->m = m; + + pthread_mutex_init(&ft->lock, NULL); + uint64_t now = cf_getms(); + + ft->deadline_ms = now + timeout_ms; + ft->retransmit_wait = 10; // 10 ms start + ft->retransmit_ms = now + ft->retransmit_wait; // hard start at 10 milliseconds + ft->cb = cb; + ft->udata = udata; + + uint64_t xmit_tid = tid_code_set(ft->tid, TRANSACT_CODE_REQUEST); + + // Set message tid. + msg_set_uint64(m, 0, xmit_tid); + + // Put will take the reference, need to keep one around for the send. + cf_rc_reserve(ft); + cf_rchash_put(g_fabric_transact_xmit_hash, &ft->tid, sizeof(ft->tid), ft); + + // Transmit the initial message. + msg_incr_ref(m); + + if (as_fabric_send(ft->node_id, ft->m, AS_FABRIC_CHANNEL_META) != + AS_FABRIC_SUCCESS) { + as_fabric_msg_put(m); + } + + fabric_transact_xmit_release(ft); + + return; +} + +// Registers all of this message type as a +// transaction type message, which means the main message. +int +as_fabric_transact_register(msg_type type, const msg_template *mt, size_t mt_sz, + size_t scratch_sz, as_fabric_transact_recv_fn cb, void *udata) +{ + // Put details in the global structure. + fabric_transact_recv_cb[type] = cb; + fabric_transact_recv_udata[type] = udata; + + // Register my internal callback with the main message callback. + as_fabric_register_msg_fn(type, mt, mt_sz, scratch_sz, + fabric_transact_msg_fn, NULL); + + return 0; +} + +int +as_fabric_transact_reply(msg *m, void *transact_data) +{ + fabric_transact_recv *ftr = (fabric_transact_recv *)transact_data; + + // This is a response - overwrite tid with response code etc. 
+ uint64_t xmit_tid = tid_code_set(ftr->tid, TRANSACT_CODE_RESPONSE); + msg_set_uint64(m, 0, xmit_tid); + + if (as_fabric_send(ftr->node_id, m, AS_FABRIC_CHANNEL_META) != + AS_FABRIC_SUCCESS) { + as_fabric_msg_put(m); + } + + return 0; +} + + +//========================================================== +// Local helpers - various initializers and destructors. +// + +static void +fabric_transact_xmit_release(fabric_transact_xmit *ft) +{ + if (cf_rc_release(ft) == 0) { + fabric_transact_xmit_destructor(ft); + cf_rc_free(ft); + } +} + +// Received a message. Could be a response to an outgoing message, or a new +// incoming transaction message. +static int +fabric_transact_msg_fn(cf_node node_id, msg *m, void *udata) +{ + // Assume m->type is correct. + + // Received a message, make sure we have a registered callback. + if (fabric_transact_recv_cb[m->type] == 0) { + cf_warning(AS_FABRIC, "transact: received message for transact with bad type %d, internal error", m->type); + as_fabric_msg_put(m); // return to pool unexamined + return 0; + } + + // Check to see that we have an outstanding request (only cb once!). + uint64_t tid = 0; + + if (msg_get_uint64(m, 0 /*field_id*/, &tid) != 0) { + cf_warning(AS_FABRIC, "transact_msg: received message with no tid"); + as_fabric_msg_put(m); + return 0; + } + + transact_code code = tid_code_get(tid); + tid = tid_code_clear(tid); + + // If it's a response, check against what you sent. 
+ if (code == TRANSACT_CODE_RESPONSE) { + fabric_transact_xmit *ft; + + if (cf_rchash_get(g_fabric_transact_xmit_hash, &tid, sizeof(tid), + (void **)&ft) != CF_RCHASH_OK) { + cf_detail(AS_FABRIC, "transact_msg: {%lu} no fabric transmit structure in global hash", tid); + as_fabric_msg_put(m); + return 0; + } + + if (cf_rchash_delete(g_fabric_transact_xmit_hash, &tid, sizeof(tid)) == + CF_RCHASH_ERR_NOT_FOUND) { + cf_detail(AS_FABRIC, "transact_msg: {%lu} concurrent thread has already removed transaction", tid); + fabric_transact_xmit_release(ft); + as_fabric_msg_put(m); + return 0; + } + + pthread_mutex_lock(&ft->lock); + + // Make sure we haven't notified some other way, then notify caller. + if (ft->cb) { + (ft->cb)(m, ft->udata, AS_FABRIC_SUCCESS); + ft->cb = NULL; + } + + pthread_mutex_unlock(&ft->lock); + + // This will often be the final release. + fabric_transact_xmit_release(ft); + } + else if (code == TRANSACT_CODE_REQUEST) { + fabric_transact_recv *ftr = cf_malloc(sizeof(fabric_transact_recv)); + + ftr->tid = tid; // has already been cleared + ftr->node_id = node_id; + + // Notify caller - they will likely respond inline. + (*fabric_transact_recv_cb[m->type])(node_id, m, ftr, + fabric_transact_recv_udata[m->type]); + cf_free(ftr); + } + else { + cf_warning(AS_FABRIC, "transact_msg: {%lu} bad code on incoming message: %d", tid, code); + as_fabric_msg_put(m); + } + + return 0; +} + +static void +fabric_transact_xmit_destructor(void *object) +{ + fabric_transact_xmit *ft = object; + as_fabric_msg_put(ft->m); +} + +// Long running thread for transaction maintenance. +static void * +run_fabric_transact(void *arg) +{ + // Create a list of transactions to be processed in each pass. + cf_ll ll_fabric_transact_xmit; + // Initialize list to empty list. + // This list is processed by single thread. No need of a lock. 
+ cf_ll_init(&ll_fabric_transact_xmit, &ll_ftx_destructor_fn, false); + + while (true) { + usleep(10000); // 10 ms for now + + // Visit each entry in g_fabric_transact_xmit_hash and select entries to + // be retransmitted or timed out. Add that transaction id (tid) in the + // linked list 'll_fabric_transact_xmit'. + cf_rchash_reduce(g_fabric_transact_xmit_hash, + fabric_transact_xmit_reduce_fn, + (void *)&ll_fabric_transact_xmit); + + if (cf_ll_size(&ll_fabric_transact_xmit)) { + // There are transactions to be processed. + // Process each transaction in list. + cf_ll_reduce(&ll_fabric_transact_xmit, true /*forward*/, + ll_ftx_reduce_fn, NULL); + } + } + + return 0; +} + +static void +ll_ftx_destructor_fn(cf_ll_element *e) +{ + cf_free(e); +} + +static int +fabric_transact_xmit_reduce_fn(const void *key, uint32_t keylen, void *o, + void *udata) +{ + fabric_transact_xmit *ftx = (fabric_transact_xmit *)o; + int op = 0; + + uint64_t now = cf_getms(); + + pthread_mutex_lock(&ftx->lock); + + if (now > ftx->deadline_ms) { + // Expire and remove transactions that are timed out. + // Need to call application: we've timed out. + op = (int)TRANSACT_OP_TIMEOUT; + } + else if (now > ftx->retransmit_ms) { + // Retransmit, update time counters, etc. + ftx->retransmit_ms = now + ftx->retransmit_wait; + ftx->retransmit_wait *= 2; + op = (int)TRANSACT_OP_RETRANSMIT; + } + + if (op > 0) { + // Add the transaction in linked list of transactions to be processed. + // Process such transactions *outside* retransmit hash lock, because... + // + // Fabric short circuits the message to self by directly calling + // receiver function of corresponding module. Receiver constructs + // "reply" and hands over to fabric to deliver. + // + // On receiving "reply", fabric removes original message, for which + // this is a reply, from retransmit hash. + // + // "fabric_transact_xmit_reduce_fn" is invoked by reduce_delete, which + // holds the lock over corresponding hash (here "retransmit hash"). 
If + // the message, sent by this function, is short circuited by fabric, + // the same thread will again try to get lock over "retransmit hash", + // resulting in deadlock. + + cf_ll *ll_fabric_transact_xmit = (cf_ll *)udata; + + // Create new node for list. + ll_fabric_transact_xmit_element *ll_ftx_ele = + (ll_fabric_transact_xmit_element *) + cf_malloc(sizeof(ll_fabric_transact_xmit_element)); + + ll_ftx_ele->tid = ftx->tid; + ll_ftx_ele->op = op; + // Append into list. + cf_ll_append(ll_fabric_transact_xmit, (cf_ll_element *)ll_ftx_ele); + } + + pthread_mutex_unlock(&ftx->lock); + + return 0; +} + +static int +ll_ftx_reduce_fn(cf_ll_element *le, void *udata) +{ + const ll_fabric_transact_xmit_element *ll_ftx_ele = + (const ll_fabric_transact_xmit_element *)le; + fabric_transact_xmit *ftx; + uint64_t tid = ll_ftx_ele->tid; + + // cf_rchash_get increments ref count on transaction ftx. + int rv = cf_rchash_get(g_fabric_transact_xmit_hash, &tid, sizeof(tid), + (void **)&ftx); + + if (rv != 0) { + cf_warning(AS_FABRIC, "No fabric transmit structure in global hash for fabric transaction-id %lu", tid); + return CF_LL_REDUCE_DELETE; + } + + if (ll_ftx_ele->op == (int)TRANSACT_OP_TIMEOUT) { + // Call application: we've timed out. + if (ftx->cb) { + (ftx->cb)(0, ftx->udata, AS_FABRIC_ERR_TIMEOUT); + ftx->cb = NULL; + } + + cf_detail(AS_FABRIC, "fabric transact: %lu timed out", tid); + // cf_rchash_delete removes ftx from hash and decrements ref count. + cf_rchash_delete(g_fabric_transact_xmit_hash, &tid, sizeof(tid)); + // It should be final release of transaction ftx. On final release, it + // also decrements message ref count, taken by initial fabric_send(). 
+ fabric_transact_xmit_release(ftx); + } + else if (ll_ftx_ele->op == (int)TRANSACT_OP_RETRANSMIT) { + if (ftx->m) { + msg_incr_ref(ftx->m); + + msg *m = ftx->m; + cf_node node = ftx->node_id; + + if (as_fabric_send(node, m, AS_FABRIC_CHANNEL_META) != 0) { + cf_detail(AS_FABRIC, "fabric: transact: %lu retransmit send failed", tid); + as_fabric_msg_put(m); + } + else { + cf_detail(AS_FABRIC, "fabric: transact: %lu retransmit send success", tid); + } + } + + // Decrement ref count, incremented by cf_rchash_get. + fabric_transact_xmit_release(ftx); + } + + // Remove it from linked list. + return CF_LL_REDUCE_DELETE; +} diff --git a/as/src/fabric/hb.c b/as/src/fabric/hb.c new file mode 100644 index 00000000..b293846f --- /dev/null +++ b/as/src/fabric/hb.c @@ -0,0 +1,9055 @@ +/* + * hb.c + * + * Copyright (C) 2012-2017 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. 
If not, see http://www.gnu.org/licenses/ + */ + +#include "fabric/hb.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "citrusleaf/alloc.h" +#include "citrusleaf/cf_atomic.h" +#include "citrusleaf/cf_clock.h" +#include "citrusleaf/cf_hash_math.h" +#include "citrusleaf/cf_queue.h" + +#include "fault.h" +#include "node.h" +#include "shash.h" +#include "socket.h" + +#include "base/cfg.h" +#include "base/stats.h" +#include "base/thr_info.h" +#include "fabric/endpoint.h" +#include "fabric/fabric.h" +#include "fabric/partition_balance.h" + +/* + * Overview + * ======== + * The heartbeat subsystem is a core clustering module that discovers nodes in + * the cluster and monitors connectivity to them. This subsystem maintains an + * "adjacency list", which is the list of nodes deemed to be alive and connected + * at any instant in time. + * + * The heartbeat subsystem is divided into four sub modules + * 1. Config + * 2. Channel + * 3. Mesh + * 4. Main + * + * Config + * ------ + * This sub module deals with overall heartbeat subsystem configuration and + * dynamic updates to configuration. + * + * Channel + * ------- + * This sub module is responsible for maintaining a channel between this node + * and all known nodes. The channel sub module provides the ability to broadcast + * or uni cast messages to known nodes. + * + * Other modules interact with the channel sub module primarily through events + * raised by the channel sub module. The events help other sub modules infer + * connectivity status to known nodes and react to incoming heartbeat message + * from other nodes. + * + * Depending on the configured mode (mesh, multicast) the channels between this + * node and other nodes could be + * 1. TCP and hence unicast. One per pair of nodes. + * 2. Multicast with UDP. One per cluster. + * + * Mesh + * ---- + * This sub module is responsible for discovering cluster members.
New nodes are + * discovered via adjacency lists published in the heartbeats of known nodes. + * The mesh module boots up using configured seed nodes. + * + * Main + * ---- + * This sub module orchestrates other modules and hence main. Its primary + * responsibility is to maintain the adjacency list. + * + * Heartbeat messages + * ================== + * + * Every heartbeat message contains + * 1. the source node's nodeid + * 2. the source node's published ip address + * 3. the source node's published port. + * + * There are the following types of heartbeat messages + * 1. Pulse - messages sent at periodic intervals. Will contain current + * adjacency lists + * 2. Info request - message sent in the mesh mode, to a known mesh node, + * in order to get ip address and port of a newly discovered node. + * 3. Info reply - message sent in response to an info request. Returns + * the node's ip address and port. + * + * Message conventions + * ------------------- + * 1. Published adjacency will always contain the source node. + * + * Design philosophy + * ================= + * + * Locking vs single threaded event loop. + * -------------------------------------- + * This first cut leans toward using locks instead of single threaded event + * loops to protect critical data. The choice is driven by the fact that + * synchronous external and inter-sub module interaction looked like more work + * with single threaded event loops. The design chooses simplicity over + * performance given the lower volumes of events that need to be processed here + * as compared to the transaction processing code. The locks are coarse, one per + * sub module and re-entrant. They are used generously and no function makes an + * assumption of prior locks being held. + * + * Inter-module interactions in some cases are via synchronous function calls, + * which run the risk of deadlocks. For now, deadlocks should not happen.
+ * However, if this ideology complicates code, inter-module interaction will be + * rewritten to use asynchronous event queues. + * + * Locking policy + * ============== + * + * 1. Lock as much as you can. The locks are re-entrant. This is not a critical + * high volume code path, and hence correctness with simplicity is preferred. + * Any read / write access to module state should be under a lock. + * 2. Preventing deadlocks + * a. The enforced lock order is + * 1. Protocol lock (SET_PROTOCOL_LOCK) Used to ensure protocol set is + * atomic. + * 2. Main module (HB_LOCK) + * 3. Mesh and multicast modules (MESH_LOCK) + * 4. Channel (CHANNEL_LOCK) + * 5. Config (HB_CONFIG_LOCK) + * Always make sure every thread acquires locks in this order ONLY. In terms + * of function calls, only lower numbered modules can call functions from the + * higher numbered modules while holding onto their locks. + * 3. Events raised / messages passed to listeners should be outside the + * module's lock. + * + * Guidelines for message plugins + * ============================== + * The parse data functions should NOT hold any locks and thus avert deadlocks. + * + * TODO + * ==== + * 1. Extend to allow hostnames in mesh mode across the board. + */ + +/* + * ---------------------------------------------------------------------------- + * Macros + * ---------------------------------------------------------------------------- + */ + +/* + * ---------------------------------------------------------------------------- + * Channel + * ---------------------------------------------------------------------------- + */ + +/** + * Size of the poll events set. + */ +#define POLL_SZ 1024 + +/** + * The number of bytes for the message length on the wire. + */ +#define MSG_WIRE_LENGTH_SIZE 4 + +/** + * Channel idle interval after which check for inactive channel is triggered.
+ */ +#define CHANNEL_IDLE_CHECK_PERIOD (CHANNEL_NODE_READ_IDLE_TIMEOUT() / 2) + +/** + * A channel times out if there is no msg received from a node in this interval. + * Set to a fraction of node timeout so that a new channel could be set up to + * recover from a potentially bad connection before the node times out. + * Computed as the pulse transmit interval times the larger of 2 and a third of + * the configured max intervals missed. + */ +#define CHANNEL_NODE_READ_IDLE_TIMEOUT() \ +(PULSE_TRANSMIT_INTERVAL() \ + * MAX(2, config_max_intervals_missed_get() / 3)) + +/** + * Acquire a lock on the entire channel sub module. + */ +#define CHANNEL_LOCK() (pthread_mutex_lock(&g_channel_lock)) + +/** + * Relinquish the lock on the entire channel sub module. + */ +#define CHANNEL_UNLOCK() (pthread_mutex_unlock(&g_channel_lock)) + +/* + * ---------------------------------------------------------------------------- + * Mesh and Multicast + * ---------------------------------------------------------------------------- + */ + +/** + * Read write timeout (in ms). + */ +#define MESH_RW_TIMEOUT 5 + +/** + * Size of the network header. + * + * Maximum size of IPv4 header - 20 bytes (assuming no variable length fields) + * Fixed size of IPv6 header - 40 bytes (assuming no extension headers) + * Maximum size of TCP header - 60 Bytes + * Size of UDP header (fixed) - 8 bytes + * So maximum size of empty TCP datagram - 60 + 20 = 80 bytes + * So maximum size of empty IPv4 UDP datagram - 20 + 8 = 28 bytes + * So maximum size of empty IPv6 UDP datagram - 40 + 8 = 48 bytes + * + * Being conservative and assuming 30 bytes for IPv4 UDP header and 50 bytes for + * IPv6 UDP header. + */ +#define UDP_HEADER_SIZE_MAX 50 + +/** + * Expected ratio - (input size) / (compressed size). Assuming 40% decrease in + * size after compression. + */ +#define MSG_COMPRESSION_RATIO (1.0 / 0.60) + +/** + * Mesh timeout for pending nodes. + */ +#define MESH_PENDING_TIMEOUT (CONNECT_TIMEOUT()) + +/** + * Mesh inactive timeout after which a mesh node will be forgotten.
+ */ +#define MESH_INACTIVE_TIMEOUT (10 * HB_NODE_TIMEOUT()) + +/** + * Mesh timeout for getting the endpoint for a node after which this node will + * be forgotten. + */ +#define MESH_ENDPOINT_UNKNOWN_TIMEOUT (HB_NODE_TIMEOUT()) + +/** + * Interval at which the mesh tender runs. + */ +#define MESH_TEND_INTERVAL (PULSE_TRANSMIT_INTERVAL()) + +/** + * Interval at which attempts to resolve an unresolved seed hostname will be + * made. + */ +#define MESH_SEED_RESOLVE_ATTEMPT_INTERVAL() (HB_NODE_TIMEOUT()) + +/** + * Interval at which conflict checks are enabled. + */ +#define MESH_CONFLICT_CHECK_INTERVAL() (5 * HB_NODE_TIMEOUT()) + +/** + * Duration for which conflicts are checked. + */ +#define MESH_CONFLICT_CHECK_DURATION() (MESH_CONFLICT_CHECK_INTERVAL() / 5) + +/** + * Acquire a lock on the entire mesh sub module. + */ +#define MESH_LOCK() (pthread_mutex_lock(&g_mesh_lock)) + +/** + * Relinquish the lock on the entire mesh sub module. + */ +#define MESH_UNLOCK() (pthread_mutex_unlock(&g_mesh_lock)) + +/** + * Acquire a lock on the entire multicast sub module. + */ +#define MULTICAST_LOCK() (pthread_mutex_lock(&g_multicast_lock)) + +/** + * Relinquish the lock on the entire multicast sub module. + */ +#define MULTICAST_UNLOCK() (pthread_mutex_unlock(&g_multicast_lock)) + +/* + * ---------------------------------------------------------------------------- + * Main + * ---------------------------------------------------------------------------- + */ + +/** + * The identifier for heartbeat protocol version 3. + */ +#define HB_PROTOCOL_V3_IDENTIFIER 0x6864 + +/** + * Maximum length of hb protocol string. + */ +#define HB_PROTOCOL_STR_MAX_LEN 16 + +/** + * Default allocation size for plugin data. + */ +#define HB_PLUGIN_DATA_DEFAULT_SIZE 128 + +/** + * Block size for allocating node plugin data. Ensure the allocation is in + * multiples of 128 bytes, allowing expansion to 16 nodes without reallocating.
+ */ +#define HB_PLUGIN_DATA_BLOCK_SIZE 128 + +/** + * Message scratch size for v3 HB messages. To accommodate 64 node cluster. + */ +#define AS_HB_MSG_SCRATCH_SIZE 1024 + +/** + * A soft limit for the maximum cluster size. Meant to optimize hash and list + * data structures and not as a limit on the number of nodes. + */ +#define AS_HB_CLUSTER_MAX_SIZE_SOFT 200 + +/** + * Maximum event listeners. + */ +#define AS_HB_EVENT_LISTENER_MAX 7 + +/** + * Maximum permissible cluster-name mismatch per node. + */ +#define CLUSTER_NAME_MISMATCH_MAX 2 + +/** + * Timeout for deeming a node dead based on received heartbeats. + */ +#define HB_NODE_TIMEOUT() \ +((config_max_intervals_missed_get() * config_tx_interval_get())) + +/** + * Interval at which heartbeats are sent. + */ +#define PULSE_TRANSMIT_INTERVAL() \ +(MAX(config_tx_interval_get(), AS_HB_TX_INTERVAL_MS_MIN)) + +/** + * Interval at which the adjacency tender runs. + */ +#define ADJACENCY_TEND_INTERVAL (PULSE_TRANSMIT_INTERVAL()) + +/** + * Interval at which the adjacency tender runs in anticipation of additional + * node depart events. + */ +#define ADJACENCY_FAST_TEND_INTERVAL (MIN(ADJACENCY_TEND_INTERVAL, 10)) + +/** + * Acquire a lock on the external event publisher. + */ +#define EXTERNAL_EVENT_PUBLISH_LOCK() \ +(pthread_mutex_lock(&g_external_event_publish_lock)) + +/** + * Relinquish the lock on the external event publisher. + */ +#define EXTERNAL_EVENT_PUBLISH_UNLOCK() \ +(pthread_mutex_unlock(&g_external_event_publish_lock)) + +/** + * Acquire a lock on the heartbeat main module. + */ +#define HB_LOCK() (pthread_mutex_lock(&g_hb_lock)) + +/** + * Relinquish the lock on the heartbeat main module. + */ +#define HB_UNLOCK() (pthread_mutex_unlock(&g_hb_lock)) + +/** + * Weightage of current latency over current moving average. For now weigh + * recent values heavily over older values.
+ */ +#define ALPHA (0.65) + +/* + * ---------------------------------------------------------------------------- + * Common + * ---------------------------------------------------------------------------- + */ + +/** + * The default MTU for multicast in case device discovery fails. + */ +#define DEFAULT_MIN_MTU 1500 + +/** + * Maximum memory size allocated on the call stack. + */ +#define STACK_ALLOC_LIMIT (16 * 1024) + +/** + * Max string length for an endpoint list converted to a string. + */ +#define ENDPOINT_LIST_STR_SIZE 1024 + +/** + * A hard limit on the buffer size for parsing incoming messages. + */ +#define MSG_BUFFER_MAX_SIZE (10 * 1024 * 1024) + +#ifndef ASC +#define ASC (2 << 2) +#endif + +/** + * Connection initiation timeout, capped at 100 ms. + */ +#define CONNECT_TIMEOUT() (MIN(100, config_tx_interval_get())) + +/** + * Allocate a buffer for heart beat messages. Larger buffers are heap allocated + * to prevent stack overflows. Sizes up to STACK_ALLOC_LIMIT are allocated with + * alloca, larger sizes with cf_malloc. Evaluates to NULL if size exceeds + * MSG_BUFFER_MAX_SIZE. + */ +#define MSG_BUFF_ALLOC(size) ( \ + (size) <= MSG_BUFFER_MAX_SIZE ? \ + (((size) > STACK_ALLOC_LIMIT) ? \ + cf_malloc(size) : alloca(size)) : NULL) + +/** + * Allocate a buffer for heart beat messages. Larger buffers are heap allocated + * to prevent stack overflows. Crashes the process on failure to allocate the + * buffer. + */ +#define MSG_BUFF_ALLOC_OR_DIE(size, crash_msg, ...) \ +({ \ + uint8_t* retval = MSG_BUFF_ALLOC((size)); \ + if (!retval) { \ + CRASH(crash_msg, ##__VA_ARGS__); \ + } \ + retval; \ +}) + +/** + * Free the buffer allocated by MSG_BUFF_ALLOC. Pass the same size that was + * passed to MSG_BUFF_ALLOC - only buffers larger than STACK_ALLOC_LIMIT are + * freed here, since smaller ones came from alloca. + * NOTE: expands to a bare if statement, so do not use it as the unbraced body + * of an if that has an else. + */ +#define MSG_BUFF_FREE(buffer, size) \ +if (((size) > STACK_ALLOC_LIMIT) && buffer) {cf_free(buffer);} + +/** + * Acquire a lock on the entire config sub module. + */ +#define HB_CONFIG_LOCK() (pthread_mutex_lock(&g_hb_config_lock)) + +/** + * Relinquish the lock on the entire config sub module. + */ +#define HB_CONFIG_UNLOCK() (pthread_mutex_unlock(&g_hb_config_lock)) + +/** + * Acquire a lock while setting heartbeat protocol dynamically.
+ */ +#define SET_PROTOCOL_LOCK() (pthread_mutex_lock(&g_set_protocol_lock)) + +/** + * Relinquish the lock after setting heartbeat protocol dynamically. + */ +#define SET_PROTOCOL_UNLOCK() (pthread_mutex_unlock(&g_set_protocol_lock)) + +/** + * Logging macros. + */ +#define CRASH(format, ...) cf_crash(AS_HB, format, ##__VA_ARGS__) +#define CRASH_NOSTACK(format, ...) cf_crash_nostack(AS_HB, format, ##__VA_ARGS__) +#define WARNING(format, ...) cf_warning(AS_HB, format, ##__VA_ARGS__) +#define TICKER_WARNING(format, ...) \ +cf_ticker_warning(AS_HB, format, ##__VA_ARGS__) +#define INFO(format, ...) cf_info(AS_HB, format, ##__VA_ARGS__) +#define DEBUG(format, ...) cf_debug(AS_HB, format, ##__VA_ARGS__) +#define DETAIL(format, ...) cf_detail(AS_HB, format, ##__VA_ARGS__) + +/** + * Soft assertion: logs a warning when the expression is false. It does not + * abort or otherwise alter control flow. + */ +#define ASSERT(expression, message, ...) \ +if (!(expression)) {WARNING(message, ##__VA_ARGS__);} + +/* + * ---------------------------------------------------------------------------- + * Private internal data structures + * ---------------------------------------------------------------------------- + */ + +/* + * ---------------------------------------------------------------------------- + * Common + * ---------------------------------------------------------------------------- + */ + +/** + * Heartbeat subsystem state. + */ +typedef enum +{ + AS_HB_STATUS_UNINITIALIZED, + AS_HB_STATUS_RUNNING, + AS_HB_STATUS_SHUTTING_DOWN, + AS_HB_STATUS_STOPPED +} as_hb_status; + +/* + * ---------------------------------------------------------------------------- + * Mesh related + * ---------------------------------------------------------------------------- + */ + +/** + * Mesh node status enum. + */ +typedef enum +{ + /** + * The mesh node has an active channel. + */ + AS_HB_MESH_NODE_CHANNEL_ACTIVE, + + /** + * The mesh node is waiting for an active channel. + */ + AS_HB_MESH_NODE_CHANNEL_PENDING, + + /** + * The mesh node does not have an active channel.
+ */ + AS_HB_MESH_NODE_CHANNEL_INACTIVE, + + /** + * The ip address and port for this node are not yet known. + */ + AS_HB_MESH_NODE_ENDPOINT_UNKNOWN, + + /** + * The sentinel value. Should be the last in the enum. + */ + AS_HB_MESH_NODE_STATUS_SENTINEL +} as_hb_mesh_node_status; + +/** + * The info payload for a single node. + */ +typedef struct as_hb_mesh_info_reply_s +{ + /** + * The nodeid of the node for which info reply is sent. + */ + cf_node nodeid; + + /** + * The advertised endpoint list for this node. List to allow variable size + * endpoint list. Always access as reply.endpoint_list[0]. + */ + as_endpoint_list endpoint_list[]; +}__attribute__((__packed__)) as_hb_mesh_info_reply; + +/** + * Mesh tend reduce function udata. + */ +typedef struct as_hb_mesh_tend_reduce_udata_s +{ + /** + * The new endpoint lists to connect to. Each list has endpoints for a + * single remote peer. + */ + as_endpoint_list** to_connect; + + /** + * The capacity of the to connect array. + */ + size_t to_connect_capacity; + + /** + * The count of endpoints to connect. + */ + size_t to_connect_count; + + /** + * Pointers to seeds that need matching. + */ + cf_vector* inactive_seeds_p; +} as_hb_mesh_tend_reduce_udata; + +/** + * Mesh endpoint search udata. + */ +typedef struct +{ + /** + * The endpoint to search. + */ + cf_sock_addr* to_search; + + /** + * Indicates if a match is found. + */ + bool found; +} as_hb_endpoint_list_addr_find_udata; + +/** + * Mesh endpoint list search udata. + */ +typedef struct as_hb_mesh_endpoint_list_reduce_udata_s +{ + /** + * The endpoint to search. + */ + as_endpoint_list* to_search; + + /** + * Indicates if a match is found. + */ + bool found; + + /** + * The matched key if found. + */ + cf_node* matched_nodeid; +} as_hb_mesh_endpoint_list_reduce_udata; + +/** + * Information maintained for configured mesh seed nodes. + */ +typedef struct as_hb_mesh_seed_s +{ + /** + * The name / ip address of this seed mesh host.
+ */ + char seed_host_name[HOST_NAME_MAX]; + + /** + * The port of this seed mesh host. + */ + cf_ip_port seed_port; + + /** + * Identifies TLS mesh seed hosts. + */ + bool seed_tls; + + /** + * The heap allocated end point list for this seed host resolved using the + * seed's hostname. + * Will be null if the endpoint list cannot be resolved. + */ + as_endpoint_list* resolved_endpoint_list; + + /** + * Timestamp when the seed hostname was resolved into the endpoint list. + * Used to perform periodic refresh of the endpoint list. + */ + cf_clock resolved_endpoint_list_ts; + + /** + * The state of this seed in terms of established channel. + */ + as_hb_mesh_node_status status; + + /** + * The last time the state of this node was updated. + */ + cf_clock last_status_updated; + + /** + * The node id for a matching mesh node entry. A zero will indicate that + * there exists no matching mesh node entry. + */ + cf_node mesh_nodeid; + + /** + * Timestamp indicating when the matching mesh node's endpoint was updated. + * Used to detect endpoint changes to the matching mesh node entry if it + * exists. + */ + as_hlc_timestamp mesh_node_endpoint_change_ts; +} as_hb_mesh_seed; + +/** + * Information maintained for discovered mesh end points. + */ +typedef struct as_hb_mesh_node_s +{ + /** + * The heap allocated end point list for this mesh host. Should be freed + * once the last mesh entry is removed from the mesh state. + */ + as_endpoint_list* endpoint_list; + + /** + * Timestamp when the mesh node was last updated. + */ + as_hlc_timestamp endpoint_change_ts; + + /** + * The state of this node in terms of established channel. + */ + as_hb_mesh_node_status status; + + /** + * The last time the state of this node was updated. + */ + cf_clock last_status_updated; + + /** + * The time this node's channel became inactive. + */ + cf_clock inactive_since; +} as_hb_mesh_node; + +/** + * State maintained for the mesh mode.
+ */ +typedef struct as_hb_mesh_state_s +{ + /** + * The sockets on which this instance accepts heartbeat tcp connections. + */ + cf_sockets listening_sockets; + + /** + * Indicates if the published endpoint list is ipv4 only. + */ + bool published_endpoint_list_ipv4_only; + + /** + * The published endpoint list. + */ + as_endpoint_list* published_endpoint_list; + + /** + * Mesh seed data. + */ + cf_vector seeds; + + /** + * A map from a cf_node key to a mesh node. + */ + cf_shash* nodeid_to_mesh_node; + + /** + * Thread id for the mesh tender thread. + */ + pthread_t mesh_tender_tid; + + /** + * The status of the mesh module. + */ + as_hb_status status; + + /** + * The mtu on the listening device. This is extrapolated to all nodes and + * paths in the cluster. This limits the cluster size possible. + */ + int min_mtu; + + /** + * Indicates if new nodes are discovered. Optimization to start mesh tend + * earlier than normal tend interval on discovering new nodes. + */ + bool nodes_discovered; +} as_hb_mesh_state; + +/* + * ---------------------------------------------------------------------------- + * Multicast data structures + * ---------------------------------------------------------------------------- + */ + +/** + * State maintained for the multicast mode. + */ +typedef struct as_hb_multicast_state_s +{ + /** + * The sockets associated with multicast mode. + */ + cf_mserv_cfg cfg; + + /** + * Multicast listening sockets. + */ + cf_sockets listening_sockets; + + /** + * The mtu on the listening device. This is extrapolated to all nodes and + * paths in the cluster. This limits the cluster size possible. + */ + int min_mtu; +} as_hb_multicast_state; + +/* + * ---------------------------------------------------------------------------- + * Channel state + * ---------------------------------------------------------------------------- + */ + +/** + * The type of a channel event.
+ */ +typedef enum +{ + /** + * The endpoint has a channel tx/rx channel associated with it. + */ + AS_HB_CHANNEL_NODE_CONNECTED, + + /** + * The endpoint had a tx/rx channel that went down. + */ + AS_HB_CHANNEL_NODE_DISCONNECTED, + + /** + * A message was received on a connected channel. The message in the event, + * is guaranteed to have passed basic sanity check like have protocol id, + * type and source nodeid. + */ + AS_HB_CHANNEL_MSG_RECEIVED, + + /** + * Channel found node whose cluster name does not match. + */ + AS_HB_CHANNEL_CLUSTER_NAME_MISMATCH +} as_hb_channel_event_type; + +/** + * Status for reads from a channel. + */ +typedef enum +{ + /** + * The message was read successfully and parser. + */ + AS_HB_CHANNEL_MSG_READ_SUCCESS, + + /** + * The message read successfully but parsing failed. + */ + AS_HB_CHANNEL_MSG_PARSE_FAIL, + + /** + * The message read failed network io. + */ + AS_HB_CHANNEL_MSG_CHANNEL_FAIL, + + /** + * Sentinel default value. + */ + AS_HB_CHANNEL_MSG_READ_UNDEF +} as_hb_channel_msg_read_status; + +typedef struct +{ + /** + * The endpoint address to search channel by. + */ + as_endpoint_list* endpoint_list; + + /** + * Indicates if the endpoint was found. + */ + bool found; + + /** + * The matching socket, if found. + */ + cf_socket* socket; +} as_hb_channel_endpoint_reduce_udata; + +typedef struct +{ + /** + * The endpoint address to search channel by. + */ + cf_sock_addr* addr_to_search; + + /** + * Indicates if the endpoint was found. + */ + bool found; +} as_hb_channel_endpoint_iterate_udata; + +typedef struct +{ + /** + * The message buffer to send. + */ + uint8_t* buffer; + + /** + * The buffer length. + */ + size_t buffer_len; +} as_hb_channel_buffer_udata; + +/** + * A channel represents a medium to send and receive messages. + */ +typedef struct as_hb_channel_s +{ + /** + * Indicates if this channel is a multicast channel. + */ + bool is_multicast; + + /** + * Indicates if this channel is inbound. 
Not relevant for multicast + * channels. + */ + bool is_inbound; + + /** + * The id of the associated node. In mesh / unicast case this will initially + * be zero and filled in when the nodeid for the node at the other end is + * learnt. In multicast case this will be zero. + */ + cf_node nodeid; + + /** + * The address of the peer. Will always be specified for outbound channels. + */ + cf_sock_addr endpoint_addr; + + /** + * The last time a message was received from this node. + */ + cf_clock last_received; + + /** + * Time when this channel won a socket resolution. Zero if this channel + * never won resolution. In compatibility mode with older code its possible + * we will keep allowing the same socket to win and enter an infinite loop + * of closing the sockets. + */ + cf_clock resolution_win_ts; +} as_hb_channel; + +/** + * State maintained per heartbeat channel. + */ +typedef struct as_hb_channel_state_s +{ + /** + * The poll handle. All IO wait across all heartbeat connections happens on + * this handle. + */ + cf_poll poll; + + /** + * Channel status. + */ + as_hb_status status; + + /** + * Maps a socket to an as_hb_channel. + */ + cf_shash* socket_to_channel; + + /** + * Maps a nodeid to a channel specific node data structure. This association + * will be made only on receiving the first heartbeat message from the node + * on a channel. + */ + cf_shash* nodeid_to_socket; + + /** + * Sockets accumulated by the channel tender to close at the end of every + * epoll loop. + */ + cf_queue socket_close_queue; + + /** + * The sockets on which heartbeat subsystem listens. + */ + cf_sockets* listening_sockets; + + /** + * Clock to keep track of last time idle connections were checked. + */ + cf_clock last_channel_idle_check; + + /** + * Enables / disables publishing channel events. Events should be disabled + * only when the state changes are temporary / transient and hence would not + * change the overall channel state from an external perspective. 
+ */ + bool events_enabled; + + /** + * Events are batched and published to reduce cluster transitions. Queue of + * unpublished heartbeat events. + */ + cf_queue events_queue; + + /** + * Thread id for the socket tender thread. + */ + pthread_t channel_tender_tid; +} as_hb_channel_state; + +/** + * Entry queued up for socket close. + */ +typedef struct as_hb_channel_socket_close_entry_s +{ + /** + * The socket to be closed. + */ + cf_socket* socket; + /** + * Indicates if this close is a remote close. + */ + bool is_remote; + /** + * True if close of this entry should generate a disconnect event. + */ + bool raise_close_event; +} as_hb_channel_socket_close_entry; + +/** + * An event generated by the channel sub module. + */ +typedef struct as_hb_channel_event_s +{ + /** + * The channel event type. + */ + as_hb_channel_event_type type; + + /** + * The node for which this event was generated. + */ + cf_node nodeid; + + /** + * The message received over this endpoint, if any. Valid for incoming + * message type events. The message, if not NULL, should never be edited or + * copied over. + */ + msg* msg; + + /** + * The hlc timestamp for message receipt. + */ + as_hlc_msg_timestamp msg_hlc_ts; +} as_hb_channel_event; + +/* + * ---------------------------------------------------------------------------- + * Main sub module state + * ---------------------------------------------------------------------------- + */ + +/** + * Heartbeat message types. + */ +typedef enum +{ + AS_HB_MSG_TYPE_PULSE, + AS_HB_MSG_TYPE_INFO_REQUEST, + AS_HB_MSG_TYPE_INFO_REPLY, + AS_HB_MSG_TYPE_COMPRESSED +} as_hb_msg_type; + +/** + * Internal event types of the heartbeat subsystem, as passed to hb_event_queue(). + */ +typedef enum +{ + AS_HB_INTERNAL_NODE_ARRIVE, + AS_HB_INTERNAL_NODE_DEPART, + AS_HB_INTERNAL_NODE_EVICT, + AS_HB_INTERNAL_NODE_ADJACENCY_CHANGED +} as_hb_internal_event_type; + +/** + * State maintained by the heartbeat subsystem for the selected mode.
+ */ +typedef struct as_hb_mode_state_s +{ + /** + * The mesh / multicast state. Being a union, only the member for the + * selected mode is meaningful. + */ + union + { + as_hb_mesh_state mesh_state; + as_hb_multicast_state multicast_state; + }; +} as_hb_mode_state; + +/** + * Plugin data iterate reduce udata. NB: the type name misspells adjacency; + * the spelling is part of the identifier, so it is left untouched here. + */ +typedef struct +{ + /** + * The plugin id. + */ + as_hb_plugin_id pluginid; + + /** + * The iterate function. + */ + as_hb_plugin_data_iterate_fn iterate_fn; + + /** + * The udata for the iterate function. + */ + void* udata; +} as_hb_adjacecny_iterate_reduce_udata; + +/** + * Information tracked for an adjacent node. + */ +typedef struct as_hb_adjacent_node_s +{ + /** + * The heartbeat protocol version. + */ + uint32_t protocol_version; + + /** + * The remote node's endpoint list. + */ + as_endpoint_list* endpoint_list; + + /** + * Used to cycle between the two copies of plugin data. + */ + int plugin_data_cycler; + + /** + * Plugin specific data accumulated by the heartbeat subsystem. The data is + * heap allocated and should be destroyed the moment this element entry is + * unused. There are two copies of the plugin data, one the current copy and + * one the previous copy. Previous copy is used to generate data change + * notifications. + */ + as_hb_plugin_node_data plugin_data[AS_HB_PLUGIN_SENTINEL][2]; + + /** + * The monotonic local time at which node information was last updated. + */ + cf_clock last_updated_monotonic_ts; + + /** + * HLC timestamp for the last pulse message. + */ + as_hlc_msg_timestamp last_msg_hlc_ts; + + /** + * Track number of consecutive cluster-name mismatches. + */ + uint32_t cluster_name_mismatch_count; + + /** + * Moving average of the latency in ms. + */ + uint64_t avg_latency; + + /** + * A shift register tracking change of endpoints. On receipt of a heartbeat, + * if the source node's endpoints changed, 1 is inserted at the LSB, else 0 + * is inserted at the LSB. + */ + uint64_t endpoint_change_tracker; +} as_hb_adjacent_node; + +/** + * Internal storage for external event listeners.
+ */ +typedef struct as_hb_event_listener_s +{ + /** + * Registered callback function. + */ + as_hb_event_fn event_callback; + + /** + * Opaque argument registered along with the callback. + */ + void* udata; +} as_hb_event_listener; + +/** + * Heartbeat subsystem internal state. + */ +typedef struct as_hb_s +{ + /** + * The status of the subsystem. + */ + as_hb_status status; + + /** + * The adjacency dictionary. The key is the nodeid. The value is an instance + * of as_hb_adjacent_node. + */ + cf_shash* adjacency; + + /** + * The probation dictionary having nodes that display unexpected behavior. + * Nodeids under probation and adjacency hash are always exclusive. The key + * is the nodeid. The value is an instance of as_hb_adjacent_node. + */ + cf_shash* on_probation; + + /** + * Temporary nodeid to index hash used to compute nodes to evict from a + * clique. + */ + cf_shash* nodeid_to_index; + + /** + * The mode specific state. + */ + as_hb_mode_state mode_state; + + /** + * The channel state. + */ + as_hb_channel_state channel_state; + + /** + * Self node accumulated stats used primarily to detect duplicate node-ids. + */ + as_hb_adjacent_node self_node; + + /** + * Indicates self node-id has duplicates. + */ + bool self_is_duplicate; + + /** + * Monotonic timestamp of when a self duplicate was detected. + */ + cf_clock self_duplicate_detected_ts; + + /** + * The plugin table: a fixed array of AS_HB_PLUGIN_SENTINEL as_hb_plugin + * instances (presumably indexed by as_hb_plugin_id - TODO confirm). + */ + as_hb_plugin plugins[AS_HB_PLUGIN_SENTINEL]; + + /** + * Thread id for the transmitter thread. + */ + pthread_t transmitter_tid; + + /** + * Thread id for the thread expiring nodes from the adjacency list. + */ + pthread_t adjacency_tender_tid; +} as_hb; + +/** + * Registered external heartbeat event listeners and their unpublished events. + */ +typedef struct as_hb_external_events_s +{ + /** + * Events are batched and published. Queue of unpublished heartbeat events. + */ + cf_queue external_events_queue; + + /** + * Count of event listeners.
+ */ + int event_listener_count; + + /** + * External event listeners. + */ + as_hb_event_listener event_listeners[AS_HB_EVENT_LISTENER_MAX]; +} as_hb_external_events; + +/** + * Udata for the shash reduce function reading the current adjacency list. + */ +typedef struct as_hb_adjacency_reduce_udata_s +{ + /** + * The target adjacency list. + */ + cf_node* adj_list; + + /** + * Count of elements in the adjacency list. + */ + int adj_count; +} as_hb_adjacency_reduce_udata; + +/** + * Udata for finding nodes in the adjacency list not in the input succession + * list. + */ +typedef struct +{ + /** + * Number of events generated. + */ + int event_count; + + /** + * List of generated events. + */ + as_hb_event_node* events; + + /** + * Limit on number of generated events. + */ + int max_events; + + /** + * Current succession list. + */ + cf_node* succession; + + /** + * Number of nodes in succession list. + */ + int succession_size; +} as_hb_find_new_nodes_reduce_udata; + +/** + * Udata for the adjacency tender, collecting expired and evicted nodes. + */ +typedef struct as_hb_adjacency_tender_udata_s +{ + /** + * The list of expired nodes. + */ + cf_node* dead_nodes; + + /** + * Count of elements in the dead node list. + */ + int dead_node_count; + + /** + * The list of evicted nodes, e.g. due to cluster name mismatch. + */ + cf_node* evicted_nodes; + + /** + * Count of elements in the evicted node list. + */ + int evicted_node_count; +} as_hb_adjacency_tender_udata; + +/** + * Udata for tip clear. + */ +typedef struct as_hb_mesh_tip_clear_udata_s +{ + /** + * Host IP or DNS name to be cleared from seed list. + */ + char host[HOST_NAME_MAX]; + /** + * Listening port of the host. + */ + int port; + + /** + * Node id if a specific node-id needs to be removed as well. + */ + cf_node nodeid; + + /** + * Tip-clear status, i.e. whether an entry was deleted (going by the name). + */ + bool entry_deleted; +} as_hb_mesh_tip_clear_udata; + +/** + * Convert endpoint list to string in a process function.
+ */ +typedef struct endpoint_list_to_string_udata_s +{ + /** + * Output buffer receiving the endpoint list in string format. + */ + char* endpoint_list_str; + + /** + * The capacity of the endpoint_list_str buffer. + */ + size_t endpoint_list_str_capacity; +} endpoint_list_to_string_udata; + +/** + * Udata to fill an endpoint list into a message. + */ +typedef struct endpoint_list_to_msg_udata_s +{ + /** + * The target message. + */ + msg* msg; + + /** + * Indicates if we are running in mesh mode. + */ + bool is_mesh; +} endpoint_list_to_msg_udata; + +/** + * Udata to test if this endpoint list is equal to another endpoint list. + */ +typedef struct endpoint_list_equal_check_udata_s +{ + /** + * The endpoint list of the new node. + */ + as_endpoint_list* other; + + /** + * Output. Indicates if the lists are equal. + */ + bool are_equal; +} endpoint_list_equal_check_udata; + +/** + * Endpoint list process function. + * @param endpoint_list the endpoint list to process. + * @param udata udata passed through from the invoker of the process function. + */ +typedef void +(*endpoint_list_process_fn)(const as_endpoint_list* endpoint_list, void* udata); + +/** + * Seed host list reduce udata. + */ +typedef struct as_hb_seed_host_list_udata_s +{ + /** + * The buffer to receive the list. + */ + cf_dyn_buf* db; + + /** + * Selects TLS seed nodes. + */ + bool tls; +} as_hb_seed_host_list_udata; + +/* + * ---------------------------------------------------------------------------- + * Globals + * ---------------------------------------------------------------------------- + */ + +/** + * Global heartbeat instance. + */ +static as_hb g_hb; + +/** + * Global heartbeat events listener instance. + */ +static as_hb_external_events g_hb_event_listeners; + +/** + * The big fat lock for all external event publishing. This ensures that a batch + * of external events are published atomically to preserve the order of external + * events.
+ */ +static pthread_mutex_t g_external_event_publish_lock = + PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP; + +/** + * Global lock to serialize all read and writes to the heartbeat subsystem. + */ +static pthread_mutex_t g_hb_lock = PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP; + +/** + * The big fat lock for all channel state. + */ +static pthread_mutex_t g_channel_lock = PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP; + +/** + * The big fat lock for all mesh state. + */ +static pthread_mutex_t g_mesh_lock = PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP; + +/** + * The big fat lock for all multicast state. + */ +static pthread_mutex_t g_multicast_lock = PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP; + +/** + * The global lock for all heartbeat configuration. + */ +static pthread_mutex_t g_hb_config_lock = PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP; + +/** + * The lock used while setting heartbeat protocol. + */ +static pthread_mutex_t g_set_protocol_lock = + PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP; + +/** + * Message templates for heartbeat messages. + */ +static msg_template g_hb_msg_template[] = { + +{ AS_HB_MSG_ID, M_FT_UINT32 }, + +{ AS_HB_MSG_TYPE, M_FT_UINT32 }, + +{ AS_HB_MSG_NODE, M_FT_UINT64 }, + +{ AS_HB_MSG_CLUSTER_NAME, M_FT_STR }, + +{ AS_HB_MSG_HLC_TIMESTAMP, M_FT_UINT64 }, + +{ AS_HB_MSG_ENDPOINTS, M_FT_BUF }, + +{ AS_HB_MSG_COMPRESSED_PAYLOAD, M_FT_BUF }, + +{ AS_HB_MSG_INFO_REQUEST, M_FT_BUF }, + +{ AS_HB_MSG_INFO_REPLY, M_FT_BUF }, + +{ AS_HB_MSG_FABRIC_DATA, M_FT_BUF }, + +{ AS_HB_MSG_HB_DATA, M_FT_BUF }, + +{ AS_HB_MSG_PAXOS_DATA, M_FT_BUF }, + +{ AS_HB_MSG_SKEW_MONITOR_DATA, M_FT_UINT64 } }; + +/* + * ---------------------------------------------------------------------------- + * Private internal function forward declarations. 
+ * ---------------------------------------------------------------------------- + */ + +static void info_append_addrs(cf_dyn_buf *db, const char *name, const cf_addr_list *list); +static uint32_t round_up_pow2(uint32_t v); +static int vector_find(cf_vector* vector, const void* element); + +static void endpoint_list_copy(as_endpoint_list** dest, as_endpoint_list* src); +static void endpoint_list_to_string_process(const as_endpoint_list* endpoint_list, void* udata); +static void endpoint_list_equal_process(const as_endpoint_list* endpoint_list, void* udata); + +static int msg_compression_threshold(int mtu); +static int msg_endpoint_list_get(msg* msg, as_endpoint_list** endpoint_list); +static int msg_id_get(msg* msg, uint32_t* id); +static int msg_nodeid_get(msg* msg, cf_node* nodeid); +static int msg_send_hlc_ts_get(msg* msg, as_hlc_timestamp* send_ts); +static int msg_type_get(msg* msg, as_hb_msg_type* type); +static int msg_cluster_name_get(msg* msg, char** cluster_name); +static int msg_node_list_get(msg* msg, int field_id, cf_node** adj_list, size_t* adj_length); +static int msg_adjacency_get(msg* msg, cf_node** adj_list, size_t* adj_length); +static void msg_node_list_set(msg* msg, int field_id, cf_node* node_list, size_t node_length); +static void msg_adjacency_set(msg* msg, cf_node* adj_list, size_t adj_length); +static int msg_info_reply_get(msg* msg, as_hb_mesh_info_reply** reply, size_t* reply_count); +static void msg_published_endpoints_fill(const as_endpoint_list* published_endpoint_list, void* udata); +static void msg_src_fields_fill(msg* msg); +static void msg_type_set(msg* msg, as_hb_msg_type msg_type); + +static int config_mcsize(); +static const cf_serv_cfg* config_bind_cfg_get(); +static const cf_mserv_cfg* config_multicast_group_cfg_get(); +static uint32_t config_tx_interval_get(); +static void config_tx_interval_set(uint32_t new_interval); +static uint32_t config_override_mtu_get(); +static void config_override_mtu_set(uint32_t mtu); +static 
uint32_t config_max_intervals_missed_get(); +static void config_max_intervals_missed_set(uint32_t new_max); +static unsigned char config_multicast_ttl_get(); +static as_hb_protocol config_protocol_get(); +static void config_protocol_set(as_hb_protocol new_protocol); +static cf_node config_self_nodeid_get(); +static as_hb_mode config_mode_get(); +static void config_bind_serv_cfg_expand(const cf_serv_cfg* bind_cfg, cf_serv_cfg* published_cfg, bool ipv4_only); +static bool config_binding_is_valid(char** error, as_hb_protocol protocol); + +static void channel_init_channel(as_hb_channel* channel); +static void channel_event_init(as_hb_channel_event* event); +static bool channel_is_running(); +static bool channel_is_stopped(); +static uint32_t channel_win_grace_ms(); +static void channel_events_enabled_set(bool enabled); +static bool channel_are_events_enabled(); +static void channel_event_queue(as_hb_channel_event* event); +static void channel_event_publish_pending(); +static int channel_get_channel(cf_socket* socket, as_hb_channel* result); +static void channel_socket_shutdown(cf_socket* socket); +static int channel_socket_get(cf_node nodeid, cf_socket** socket); +static bool channel_cf_sockets_contains(cf_sockets* sockets, cf_socket* to_find); +static void channel_socket_destroy(cf_socket* sock); +static void channel_socket_close(cf_socket* socket, bool remote_close, bool raise_close_event); +static void channel_sockets_close(cf_vector* sockets); +static void channel_socket_close_queue(cf_socket* socket, bool is_remote_close, bool raise_close_event); +static void channel_socket_close_pending(); +static void channel_socket_register(cf_socket* socket, bool is_multicast, bool is_inbound, cf_sock_addr* endpoint_addr); +static void channel_accept_connection(cf_socket* lsock); +static as_hb_channel_msg_read_status channel_compressed_message_parse(msg* msg, void* buffer, int buffer_content_len); +static void channel_endpoint_find_iterate_fn(const as_endpoint* endpoint, void* 
udata); +static int channel_endpoint_search_reduce(const void* key, void* data, void* udata); +static bool channel_endpoint_is_connected(as_endpoint_list* endpoint_list); +static as_hb_channel_msg_read_status channel_multicast_msg_read(cf_socket* socket, msg* msg); +static as_hb_channel_msg_read_status channel_mesh_msg_read(cf_socket* socket, msg* msg); +static void channel_node_attach(cf_socket* socket, as_hb_channel* channel, cf_node nodeid); +static bool channel_socket_should_live(cf_socket* socket, as_hb_channel* channel); +static cf_socket* channel_socket_resolve(cf_socket* socket1, cf_socket* socket2); +static int channel_msg_sanity_check(as_hb_channel_event* msg_event); +static int channel_msg_event_process(cf_socket* socket, as_hb_channel_event* event); +static void channel_msg_read(cf_socket* socket); +static void channel_channels_idle_check(); +void* channel_tender(void* arg); +static bool channel_mesh_endpoint_filter(const as_endpoint* endpoint, void* udata); +static void channel_mesh_channel_establish(as_endpoint_list** endpoint_lists, int endpoint_list_count); +static int channel_node_disconnect(cf_node nodeid); +static void channel_mesh_listening_socks_register(cf_sockets* listening_sockets); +static void channel_mesh_listening_socks_deregister(cf_sockets* listening_sockets); +static void channel_multicast_listening_socks_register(cf_sockets* listening_sockets); +static void channel_multicast_listening_socks_deregister(cf_sockets* listening_sockets); +static void channel_init(); +static void channel_start(); +static int channel_sockets_get_reduce(const void* key, void* data, void* udata); +static void channel_stop(); +static int channel_mesh_msg_send(cf_socket* socket, uint8_t* buff, size_t buffer_length); +static int channel_multicast_msg_send(cf_socket* socket, uint8_t* buff, size_t buffer_length); +static bool channel_msg_is_compression_required(msg* msg, int wire_size, int mtu); +static int channel_msg_buffer_size_get(int wire_size, int mtu); 
+static size_t channel_msg_buffer_fill(msg* original_msg, int wire_size, int mtu, uint8_t* buffer, size_t buffer_len); +static int channel_msg_unicast(cf_node dest, msg* msg); +static int channel_msg_broadcast_reduce(const void* key, void* data, void* udata); +static int channel_msg_broadcast(msg* msg); +static void channel_clear(); +static int channel_dump_reduce(const void* key, void* data, void* udata); +static void channel_dump(bool verbose); + +static bool mesh_is_running(); +static bool mesh_is_stopped(); +static void mesh_published_endpoints_process(endpoint_list_process_fn process_fn, void* udata); +static const char* mesh_node_status_string(as_hb_mesh_node_status status); +static int mesh_seed_delete_unsafe(int seed_index); +static int mesh_seed_find_unsafe(char* host, int port); +static void mesh_tend_udata_capacity_ensure(as_hb_mesh_tend_reduce_udata* tend_reduce_udata, int mesh_node_count); +static void mesh_node_status_change(as_hb_mesh_node* mesh_node, as_hb_mesh_node_status new_status); +static void mesh_listening_sockets_close(); +static void mesh_seed_host_list_get(cf_dyn_buf* db, bool tls); +static void mesh_seed_inactive_refresh_get_unsafe(cf_vector* inactive_seeds_p); +static void mesh_stop(); +static int mesh_tend_reduce(const void* key, void* data, void* udata); +void* mesh_tender(void* arg); +static void mesh_node_destroy(as_hb_mesh_node* mesh_node); +static void mesh_endpoint_addr_find_iterate(const as_endpoint* endpoint, void* udata); +static bool mesh_node_is_discovered(cf_node nodeid); +static bool mesh_node_endpoint_list_is_valid(cf_node nodeid); +static int mesh_node_get(cf_node nodeid, as_hb_mesh_node* mesh_node); +static void mesh_channel_on_node_disconnect(as_hb_channel_event* event); +static bool mesh_node_check_fix_self_msg(as_hb_channel_event* event); +static void mesh_node_data_update(as_hb_channel_event* event); +static int mesh_info_reply_sizeof(as_hb_mesh_info_reply* reply, int reply_count, size_t* reply_size); +static void 
mesh_nodes_send_info_reply(cf_node dest, as_hb_mesh_info_reply* reply, size_t reply_count); +static msg* mesh_info_msg_init(as_hb_msg_type msg_type); +static void mesh_nodes_send_info_request(msg* in_msg, cf_node dest, cf_node* to_discover, size_t to_discover_count); +static void mesh_channel_on_pulse(msg* msg); +static void mesh_channel_on_info_request(msg* msg); +static void mesh_channel_on_info_reply(msg* msg); +static int mesh_tip(char* host, int port, bool tls); +static void mesh_channel_event_process(as_hb_channel_event* event); +static void mesh_init(); +static int mesh_free_node_data_reduce(const void* key, void* data, void* udata); +static int mesh_tip_clear_reduce(const void* key, void* data, void* udata); +static int mesh_peer_endpoint_reduce(const void* key, void* data, void* udata); +static void mesh_clear(); +static void mesh_listening_sockets_open(); +static void mesh_start(); +static int mesh_dump_reduce(const void* key, void* data, void* udata); +static void mesh_dump(bool verbose); + +static void multicast_init(); +static void multicast_clear(); +static void multicast_listening_sockets_open(); +static void multicast_start(); +static void multicast_listening_sockets_close(); +static void multicast_stop(); +static void multicast_dump(bool verbose); +static int multicast_supported_cluster_size_get(); + +static bool hb_is_initialized(); +static bool hb_is_running(); +static bool hb_is_stopped(); +static void hb_mode_init(); +static void hb_mode_start(); +static int hb_mtu(); +static void hb_msg_init(); +static uint32_t hb_protocol_identifier_get(); +static cf_clock hb_node_depart_time(cf_clock detect_time); +static bool hb_is_mesh(); +static void hb_event_queue(as_hb_internal_event_type event_type, const cf_node* nodes, int node_count); +static void hb_event_publish_pending(); +static int hb_adjacency_free_data_reduce(const void* key, void* data, void* udata); +static void hb_clear(); +static int hb_adjacency_iterate_reduce(const void* key, void* 
data, void* udata); +static void hb_plugin_set_fn(msg* msg); +static void hb_plugin_parse_data_fn(msg* msg, cf_node source, as_hb_plugin_node_data* plugin_data); +static msg* hb_msg_get(); +static void hb_msg_return(msg* msg); +static void hb_plugin_msg_fill(msg* msg); +static void hb_plugin_msg_parse(msg* msg, as_hb_adjacent_node* adjacent_node, as_hb_plugin* plugins, bool plugin_data_changed[]); +static void hb_plugin_init(); +void* hb_transmitter(void* arg); +static int hb_adjacent_node_get(cf_node nodeid, as_hb_adjacent_node* adjacent_node); +static void hb_adjacent_node_plugin_data_get(as_hb_adjacent_node* adjacent_node, as_hb_plugin_id plugin_id, void** plugin_data, size_t* plugin_data_size); +static void hb_adjacent_node_adjacency_get(as_hb_adjacent_node* adjacent_node, cf_node** adjacency_list, size_t* adjacency_length); +static bool hb_node_has_expired(cf_node nodeid, as_hb_adjacent_node* adjacent_node); +static bool hb_self_is_duplicate(); +static void hb_self_duplicate_update(); +static void hb_adjacent_node_destroy(as_hb_adjacent_node* adjacent_node); +static int hb_adjacency_tend_reduce(const void* key, void* data, void* udata); +void* hb_adjacency_tender(void* arg); +static void hb_tx_start(); +static void hb_tx_stop(); +static void hb_adjacency_tender_start(); +static void hb_adjacency_tender_stop(); +static void hb_init(); +static void hb_start(); +static void hb_stop(); +static void hb_plugin_register(as_hb_plugin* plugin); +static bool hb_msg_is_obsolete(as_hb_channel_event* event, as_hlc_timestamp send_ts); +static void hb_endpoint_change_tracker_update(uint64_t* tracker, bool endpoint_changed); +static bool hb_endpoint_change_tracker_is_normal(uint64_t tracker); +static bool hb_endpoint_change_tracker_has_changed(uint64_t tracker); +static void hb_adjacent_node_update(as_hb_channel_event* msg_event, as_hb_adjacent_node* adjacent_node, bool plugin_data_changed[]); +static bool hb_node_can_consider_adjacent(as_hb_adjacent_node* adjacent_node); 
+static void hb_channel_on_self_pulse(as_hb_channel_event* msg_event); +static void hb_channel_on_pulse(as_hb_channel_event* msg_event); +static void hb_channel_on_msg_rcvd(as_hb_channel_event* event); +static void hb_handle_cluster_name_mismatch(as_hb_channel_event* event); +static void hb_channel_event_process(as_hb_channel_event* event); +static void hb_mode_dump(bool verbose); +static int hb_dump_reduce(const void* key, void* data, void* udata); +static void hb_dump(bool verbose); +static void hb_adjacency_graph_invert(cf_vector* nodes, uint8_t** inverted_graph); +static void hb_maximal_clique_evict(cf_vector* nodes, cf_vector* nodes_to_evict); +static int hb_plugin_data_iterate_reduce(const void* key, void* data, void* udata); +static void hb_plugin_data_iterate_all(as_hb_plugin_id pluginid, + as_hb_plugin_data_iterate_fn iterate_fn, void* udata); + +/* + * ---------------------------------------------------------------------------- + * Public functions. + * ---------------------------------------------------------------------------- + */ +/** + * Initialize the heartbeat subsystem. + */ +void +as_hb_init() +{ + // Initialize hb subsystem. + hb_init(); + + // Add the mesh seed nodes. + // Using one time seed config outside the config module. + if (hb_is_mesh()) { + for (int i = 0; i < AS_CLUSTER_SZ; i++) { + if (g_config.hb_config.mesh_seed_addrs[i]) { + mesh_tip(g_config.hb_config.mesh_seed_addrs[i], + g_config.hb_config.mesh_seed_ports[i], + g_config.hb_config.mesh_seed_tls[i]); + } + else { + break; + } + } + } +} + +/** + * Start the heartbeat subsystem. + */ +void +as_hb_start() +{ + hb_start(); +} + +/** + * Shut down the heartbeat subsystem. + */ +void +as_hb_shutdown() +{ + hb_stop(); +} + +/** + * Indicates if self node is a duplicate + */ +bool +as_hb_self_is_duplicate() +{ + return hb_self_is_duplicate(); +} + +/** + * Free the data structures of heart beat. + */ +void +as_hb_destroy() +{ + // Destroy the main module. 
+ hb_clear(); +} + +/** + * Return a string representation of a heartbeat protocol type. + * + * @param protocol for which the string is computed + * @param protocol_s string representation of protocol + */ +void +as_hb_protocol_get_s(as_hb_protocol protocol, char* protocol_s) +{ + char *str; + switch (protocol) { + case AS_HB_PROTOCOL_V3: + str = "v3"; + break; + case AS_HB_PROTOCOL_NONE: + str = "none"; + break; + case AS_HB_PROTOCOL_RESET: + str = "reset"; + break; + default: + str = "undefined"; + } + + sprintf(protocol_s, "%s", str); +} + +/** + * Set heartbeat protocol version. + */ +as_hb_protocol +as_hb_protocol_get() +{ + return config_protocol_get(); +} + +/** + * Set heartbeat protocol version. + */ +int +as_hb_protocol_set(as_hb_protocol new_protocol) +{ + SET_PROTOCOL_LOCK(); + int rv = 0; + if (config_protocol_get() == new_protocol) { + INFO("no heartbeat protocol change needed"); + rv = 0; + goto Exit; + } + char old_protocol_s[HB_PROTOCOL_STR_MAX_LEN]; + char new_protocol_s[HB_PROTOCOL_STR_MAX_LEN]; + as_hb_protocol_get_s(config_protocol_get(), old_protocol_s); + as_hb_protocol_get_s(new_protocol, new_protocol_s); + switch (new_protocol) { + case AS_HB_PROTOCOL_V3: + if (hb_is_running()) { + INFO("disabling current heartbeat protocol %s", old_protocol_s); + hb_stop(); + } + INFO("setting heartbeat protocol version number to %s", new_protocol_s); + config_protocol_set(new_protocol); + hb_start(); + INFO("heartbeat protocol version set to %s", new_protocol_s); + break; + + case AS_HB_PROTOCOL_NONE: + INFO("setting heartbeat protocol version to none"); + hb_stop(); + config_protocol_set(new_protocol); + INFO("heartbeat protocol set to none"); + break; + + case AS_HB_PROTOCOL_RESET: + if (config_protocol_get() == AS_HB_PROTOCOL_NONE) { + INFO("heartbeat messaging disabled ~~ not resetting"); + rv = -1; + goto Exit; + } + + // NB: "protocol" is never actually set to "RESET" ~~ + // it is simply a trigger for the reset action. 
+ INFO("resetting heartbeat messaging"); + + hb_stop(); + + hb_clear(); + + hb_start(); + + break; + + default: + WARNING("unknown heartbeat protocol version number: %d", new_protocol); + rv = -1; + goto Exit; + } + +Exit: + SET_PROTOCOL_UNLOCK(); + return rv; +} + +/** + * Register a heartbeat plugin. + */ +void +as_hb_plugin_register(as_hb_plugin* plugin) +{ + if (!hb_is_initialized()) { + WARNING( + "main heartbeat module uninitialized - not registering the plugin"); + return; + } + hb_plugin_register(plugin); +} + +/** + * Register a heartbeat node event listener. + */ +void +as_hb_register_listener(as_hb_event_fn event_callback, void* udata) +{ + if (!hb_is_initialized()) { + WARNING( + "main heartbeat module uninitialized - not registering the listener"); + return; + } + + HB_LOCK(); + + if (g_hb_event_listeners.event_listener_count >= + AS_HB_EVENT_LISTENER_MAX) { + CRASH("cannot register more than %d event listeners", + AS_HB_EVENT_LISTENER_MAX); + } + + g_hb_event_listeners.event_listeners[g_hb_event_listeners.event_listener_count].event_callback = + event_callback; + g_hb_event_listeners.event_listeners[g_hb_event_listeners.event_listener_count].udata = + udata; + g_hb_event_listeners.event_listener_count++; + + HB_UNLOCK(); +} + +/** + * Validate heartbeat config. + */ +void +as_hb_config_validate() +{ + char *error; + // Validate clustering and heartbeat version compatibility. + as_hb_protocol hb_protocol = config_protocol_get(); + + if (hb_protocol != AS_HB_PROTOCOL_V3 + && hb_protocol != AS_HB_PROTOCOL_NONE) { + CRASH_NOSTACK("clustering protocol v5 requires hearbeat version v3"); + } + + if (!config_binding_is_valid(&error, hb_protocol)) { + CRASH_NOSTACK("%s", error); + } +} + +/** + * Override the computed MTU for the network interface used by heartbeat. + */ +void +as_hb_override_mtu_set(int mtu) +{ + config_override_mtu_set(mtu); +} + +/** + * Get the heartbeat pulse transmit interval. 
+ */ +uint32_t +as_hb_tx_interval_get() +{ + return config_tx_interval_get(); +} + +/** + * Set the heartbeat pulse transmit interval. + * + * @param new_interval the new transmit interval. Must lie within + * [AS_HB_TX_INTERVAL_MS_MIN, AS_HB_TX_INTERVAL_MS_MAX]. + * @return 0 on success, -1 if new_interval is out of range, in which case a + * warning is logged and the current interval is left unchanged. + */ +int +as_hb_tx_interval_set(uint32_t new_interval) +{ + if (new_interval < AS_HB_TX_INTERVAL_MS_MIN + || new_interval > AS_HB_TX_INTERVAL_MS_MAX) { + WARNING("heartbeat interval must be >= %u and <= %u - ignoring %u", + AS_HB_TX_INTERVAL_MS_MIN, AS_HB_TX_INTERVAL_MS_MAX, + new_interval); + return (-1); + } + config_tx_interval_set(new_interval); + return (0); +} + +/** + * Set the maximum number of missed heartbeat intervals after which a node is + * considered expired. + * + * @param new_max the new maximum. Must be at least + * AS_HB_MAX_INTERVALS_MISSED_MIN. + * @return 0 on success, -1 if new_max is below the minimum, in which case a + * warning is logged and the current setting is left unchanged. + */ +int +as_hb_max_intervals_missed_set(uint32_t new_max) +{ + if (new_max < AS_HB_MAX_INTERVALS_MISSED_MIN) { + WARNING("heartbeat timeout must be >= %u - ignoring %u", + AS_HB_MAX_INTERVALS_MISSED_MIN, new_max); + return (-1); + } + config_max_intervals_missed_set(new_max); + return (0); +} + +/** + * Get the timeout interval to consider a node dead / expired in milliseconds if + * no heartbeat pulse messages are received. + * + * @return the timeout as computed by HB_NODE_TIMEOUT(). + */ +uint32_t +as_hb_node_timeout_get() +{ + return HB_NODE_TIMEOUT(); +} + +/** + * Populate the buffer with heartbeat configuration.
+ */
+void
+as_hb_info_config_get(cf_dyn_buf* db)
+{
+	if (hb_is_mesh()) {
+		info_append_string(db, "heartbeat.mode", "mesh");
+		info_append_addrs(db, "heartbeat.address", &g_config.hb_serv_spec.bind);
+		info_append_uint32(db, "heartbeat.port",
+				(uint32_t)g_config.hb_serv_spec.bind_port);
+		// Non-TLS seeds first, then the TLS settings followed by TLS seeds.
+		mesh_seed_host_list_get(db, false);
+		info_append_addrs(db, "heartbeat.tls-address",
+				&g_config.hb_tls_serv_spec.bind);
+		info_append_uint32(db, "heartbeat.tls-port",
+				g_config.hb_tls_serv_spec.bind_port);
+		info_append_string_safe(db, "heartbeat.tls-name",
+				g_config.hb_tls_serv_spec.tls_our_name);
+		mesh_seed_host_list_get(db, true);
+	}
+	else {
+		info_append_string(db, "heartbeat.mode", "multicast");
+		info_append_addrs(db, "heartbeat.address", &g_config.hb_serv_spec.bind);
+		info_append_addrs(db, "heartbeat.multicast-group",
+				&g_config.hb_multicast_groups);
+		info_append_uint32(db, "heartbeat.port",
+				(uint32_t)g_config.hb_serv_spec.bind_port);
+	}
+
+	// Settings common to both modes.
+	info_append_uint32(db, "heartbeat.interval", config_tx_interval_get());
+	info_append_uint32(db, "heartbeat.timeout",
+			config_max_intervals_missed_get());
+
+	info_append_int(db, "heartbeat.mtu", hb_mtu());
+
+	char protocol_s[HB_PROTOCOL_STR_MAX_LEN];
+	as_hb_protocol_get_s(config_protocol_get(), protocol_s);
+
+	info_append_string(db, "heartbeat.protocol", protocol_s);
+}
+
+/**
+ * Populate heartbeat endpoints: the bound ports and addresses (plain and TLS),
+ * followed by the mesh peer endpoints in mesh mode or the multicast groups in
+ * multicast mode.
+ *
+ * @param db the dynamic buffer to append to. Nothing is appended if there are
+ * no bind configurations.
+ */
+void
+as_hb_info_endpoints_get(cf_dyn_buf* db)
+{
+	const cf_serv_cfg *cfg = config_bind_cfg_get();
+
+	if (cfg->n_cfgs == 0) {
+		// Will never happen in practice.
+		return;
+	}
+
+	info_append_int(db, "heartbeat.port", g_config.hb_serv_spec.bind_port);
+
+	// The bind strings are heap allocated and owned by us.
+	char *string = as_info_bind_to_string(cfg, CF_SOCK_OWNER_HEARTBEAT);
+	info_append_string(db, "heartbeat.addresses", string);
+	cf_free(string);
+
+	info_append_int(db, "heartbeat.tls-port",
+			g_config.hb_tls_serv_spec.bind_port);
+
+	string = as_info_bind_to_string(cfg, CF_SOCK_OWNER_HEARTBEAT_TLS);
+	info_append_string(db, "heartbeat.tls-addresses", string);
+	cf_free(string);
+
+	if (hb_is_mesh()) {
+		MESH_LOCK();
+		cf_shash_reduce(g_hb.mode_state.mesh_state.nodeid_to_mesh_node,
+				mesh_peer_endpoint_reduce, db);
+		MESH_UNLOCK();
+		return;
+	}
+
+	// Output multicast groups.
+	const cf_mserv_cfg* multicast_cfg = config_multicast_group_cfg_get();
+	if (multicast_cfg->n_cfgs == 0) {
+		return;
+	}
+
+	cf_dyn_buf_append_string(db, "heartbeat.multicast-groups=");
+	for (uint32_t i = 0; i < multicast_cfg->n_cfgs; ++i) {
+		// Comma separate every group after the first.
+		if (i > 0) {
+			cf_dyn_buf_append_char(db, ',');
+		}
+
+		cf_dyn_buf_append_string(db,
+				cf_ip_addr_print(&multicast_cfg->cfgs[i].addr));
+	}
+	cf_dyn_buf_append_char(db, ';');
+}
+
+/**
+ * Generate a string for listening address and port in format ip_address:port
+ * and return the heartbeat mode.
+ *
+ * @param mode (output) current heartbeat subsystem mode.
+ * @param addr_port (output) listening ip address and port formatted as
+ * ip_address:port. Multiple multicast groups are comma separated. Always NULL
+ * terminated when addr_port_capacity is non-zero.
+ * @param addr_port_capacity the capacity of the addr_port input. If zero (or
+ * addr_port is NULL) only the mode is returned.
+ */
+void
+as_hb_info_listen_addr_get(as_hb_mode* mode, char* addr_port,
+		size_t addr_port_capacity)
+{
+	bool is_mesh = hb_is_mesh();
+
+	*mode = is_mesh ? AS_HB_MODE_MESH : AS_HB_MODE_MULTICAST;
+
+	if (addr_port == NULL || addr_port_capacity == 0) {
+		// No room even for the terminator. Writing would overrun the buffer.
+		return;
+	}
+
+	// Start with an empty string so the output is terminated on every path,
+	// including the case where no endpoint gets written below.
+	addr_port[0] = 0;
+
+	if (is_mesh) {
+		endpoint_list_to_string_udata udata;
+		udata.endpoint_list_str = addr_port;
+		udata.endpoint_list_str_capacity = addr_port_capacity;
+		mesh_published_endpoints_process(endpoint_list_to_string_process,
+				&udata);
+		return;
+	}
+
+	const cf_mserv_cfg* multicast_cfg = config_multicast_group_cfg_get();
+
+	char* write_ptr = addr_port;
+	int remaining = addr_port_capacity;
+
+	// Ensure we leave space for the terminating NULL delimiter.
+	for (uint32_t i = 0; i < multicast_cfg->n_cfgs && remaining > 1; i++) {
+		cf_sock_addr temp;
+		cf_ip_addr_copy(&multicast_cfg->cfgs[i].addr, &temp.addr);
+		temp.port = multicast_cfg->cfgs[i].port;
+		int rv = cf_sock_addr_to_string(&temp, write_ptr, remaining);
+		if (rv <= 0) {
+			// We exhausted the write buffer.
+			// Ensure NULL termination.
+			addr_port[addr_port_capacity - 1] = 0;
+			return;
+		}
+
+		write_ptr += rv;
+		remaining -= rv;
+
+		// Separate from the next group, if there is one and there is room.
+		if (i != multicast_cfg->n_cfgs - 1 && remaining > 1) {
+			*write_ptr = ',';
+			write_ptr++;
+			remaining--;
+		}
+	}
+
+	// Ensure NULL termination.
+	*write_ptr = 0;
+}
+
+/**
+ * Populate the buffer with duplicate nodeids.
+ */
+void
+as_hb_info_duplicates_get(cf_dyn_buf* db)
+{
+	cf_dyn_buf_append_string(db, "cluster_duplicate_nodes=");
+
+	HB_LOCK();
+	bool self_is_duplicate = hb_self_is_duplicate();
+	int num_probation = cf_shash_get_size(g_hb.on_probation);
+
+	// One extra slot in case this node itself is a duplicate.
+	cf_node duplicate_list[num_probation + 1];
+
+	if (self_is_duplicate || num_probation != 0) {
+		// Collect the node ids currently on probation.
+		as_hb_adjacency_reduce_udata reduce_udata = { duplicate_list, 0 };
+
+		cf_shash_reduce(g_hb.on_probation, hb_adjacency_iterate_reduce,
+				&reduce_udata);
+
+		if (hb_self_is_duplicate()) {
+			duplicate_list[reduce_udata.adj_count++] =
+					config_self_nodeid_get();
+		}
+
+		int n_duplicates = reduce_udata.adj_count;
+		qsort(duplicate_list, n_duplicates, sizeof(cf_node),
+				cf_node_compare_desc);
+
+		// Emit a comma separated list, then drop the trailing comma.
+		for (int idx = 0; idx < n_duplicates; idx++) {
+			cf_dyn_buf_append_uint64_x(db, duplicate_list[idx]);
+			cf_dyn_buf_append_char(db, ',');
+		}
+		cf_dyn_buf_chomp(db);
+	}
+	else {
+		// Nothing to report.
+		cf_dyn_buf_append_string(db, "null");
+	}
+
+	HB_UNLOCK();
+	cf_dyn_buf_append_char(db, ';');
+}
+
+/*
+ * -----------------------------------------------------------------
+ * Mesh mode public API
+ * -----------------------------------------------------------------
+ */
+
+/**
+ * Add an aerospike instance to the mesh seed list.
+ *
+ * @param host the seed host.
+ * @param port the seed port.
+ * @param tls passed through to mesh_tip.
+ * @return the result of mesh_tip, or -1 if not running in mesh mode.
+ */
+int
+as_hb_mesh_tip(char* host, int port, bool tls)
+{
+	if (hb_is_mesh()) {
+		return mesh_tip(host, port, tls);
+	}
+
+	WARNING("tip not applicable for multicast");
+	return -1;
+}
+
+/**
+ * Remove a mesh node instance from the mesh list.
+ *
+ * @param host the seed host name. Must be non-NULL, non-empty and shorter than
+ * HOST_NAME_MAX.
+ * @param port the seed port.
+ * @return 0 if a mesh entry or a seed entry was deleted, -1 otherwise
+ * (including when not running in mesh mode or when the host is invalid).
+ */
+int
+as_hb_mesh_tip_clear(char* host, int port)
+{
+	if (!hb_is_mesh()) {
+		WARNING("tip clear not applicable for multicast");
+		return (-1);
+	}
+
+	// Reject missing, empty or over-long host names up front.
+	if (host == NULL || host[0] == '\0'
+			|| strnlen(host, HOST_NAME_MAX) == HOST_NAME_MAX) {
+		WARNING("incorrect host or port");
+		return (-1);
+	}
+
+	MESH_LOCK();
+	DETAIL("executing tip clear for %s:%d", host, port);
+
+	// FIXME: Remove the mesh host entry and close channel was done to meet
+	// AER-5241 ???
+	// tip-clear is not a mechanism to throw a connected node out of the
+	// cluster.
+	// We should not be required to use this mechanism now.
+	// tip-clear should only be used to cleanup seed list after decommissioning
+	// an ip.
+	as_hb_mesh_tip_clear_udata mesh_tip_clear_reduce_udata;
+	// The length check above guarantees host is shorter than HOST_NAME_MAX, so
+	// strncpy also copies the terminator.
+	// NOTE(review): assumes udata.host holds at least HOST_NAME_MAX bytes -
+	// confirm against the struct definition.
+	strncpy(mesh_tip_clear_reduce_udata.host, host, HOST_NAME_MAX);
+	mesh_tip_clear_reduce_udata.port = port;
+	mesh_tip_clear_reduce_udata.entry_deleted = false;
+	mesh_tip_clear_reduce_udata.nodeid = 0;
+
+	// If the host:port is a known seed, record the nodeid stored on it for the
+	// reduce below.
+	int seed_index = mesh_seed_find_unsafe(host, port);
+	if (seed_index >= 0) {
+		as_hb_mesh_seed* seed = cf_vector_getp(
+				&g_hb.mode_state.mesh_state.seeds, seed_index);
+		mesh_tip_clear_reduce_udata.nodeid = seed->mesh_nodeid;
+	}
+
+	// Refresh the mapping between the seeds and the mesh hosts.
+	mesh_seed_inactive_refresh_get_unsafe(NULL);
+	cf_shash_reduce(g_hb.mode_state.mesh_state.nodeid_to_mesh_node,
+			mesh_tip_clear_reduce, &mesh_tip_clear_reduce_udata);
+
+	// Remove the seed entry in case we do not find a matching mesh entry.
+	// Will happen trivially if this seed could not be connected.
+	// The seed is looked up again rather than reusing seed_index.
+	mesh_tip_clear_reduce_udata.entry_deleted |= mesh_seed_delete_unsafe(
+			mesh_seed_find_unsafe(host, port)) == 0;
+
+	MESH_UNLOCK();
+	return mesh_tip_clear_reduce_udata.entry_deleted ? 0 : -1;
+}
+
+/**
+ * Clear the entire mesh list.
+ *
+ * @param cleared (output) set to the number of mesh node entries present
+ * before clearing.
+ * @return 0 on success, -1 if not running in mesh mode.
+ */
+int
+as_hb_mesh_tip_clear_all(uint32_t* cleared)
+{
+	if (!hb_is_mesh()) {
+		WARNING("tip clear not applicable for multicast");
+		return -1;
+	}
+
+	MESH_LOCK();
+	*cleared = cf_shash_get_size(
+			g_hb.mode_state.mesh_state.nodeid_to_mesh_node);
+
+	// Refresh the mapping between the seeds and the mesh hosts, then run the
+	// tip clear reduce over every mesh node (NULL udata).
+	mesh_seed_inactive_refresh_get_unsafe(NULL);
+	cf_shash_reduce(g_hb.mode_state.mesh_state.nodeid_to_mesh_node,
+			mesh_tip_clear_reduce, NULL);
+
+	// Remove all seed entries that did not have a matching mesh endpoint.
+	cf_vector* seeds = &g_hb.mode_state.mesh_state.seeds;
+	int n_seeds = cf_vector_size(seeds);
+	for (int idx = 0; idx < n_seeds; idx++) {
+		if (mesh_seed_delete_unsafe(idx) != 0) {
+			// Should not happen in practice.
+			as_hb_mesh_seed* seed = cf_vector_getp(seeds, idx);
+			CRASH("error deleting mesh seed entry %s:%d", seed->seed_host_name,
+					seed->seed_port);
+			continue;
+		}
+
+		// Entry removed: revisit the same index with one fewer element.
+		idx--;
+		n_seeds--;
+	}
+
+	MESH_UNLOCK();
+	return 0;
+}
+
+/**
+ * Read the plugin data for a node in the adjacency list. The plugin_data->data
+ * input param should be pre allocated and plugin_data->data_capacity should
+ * indicate its capacity.
+ *
+ * @param nodeid the node id
+ * @param plugin the plugin identifier.
+ * @param plugin_data (input/output) on success plugin_data->data will hold a
+ * copy of the plugin's data for the node and plugin_data->data_size will be the
+ * data size. data_size is set to zero if there is no plugin data.
+ * @param msg_hlc_ts (output) if not NULL will be filled with the timestamp of
+ * when the hb message for this data was received. Zeroed if there is no plugin
+ * data.
+ * @param recv_monotonic_ts (output) if not NULL will be filled with monotonic
+ * wall clock receive timestamp for this plugin data. Zero if there is no plugin
+ * data.
+ * @return 0 on success and -1 on error, where errno will be set to ENOENT if
+ * there is no entry for this node and ENOMEM if the input plugin data's
+ * capacity is less than plugin's data. In ENOMEM case plugin_data->data_size
+ * will be set to the required capacity.
+ */
+int
+as_hb_plugin_data_get(cf_node nodeid, as_hb_plugin_id plugin,
+		as_hb_plugin_node_data* plugin_data, as_hlc_msg_timestamp* msg_hlc_ts,
+		cf_clock* recv_monotonic_ts)
+{
+	int rv = 0;
+	as_hb_adjacent_node node;
+
+	HB_LOCK();
+
+	if (hb_adjacent_node_get(nodeid, &node) != 0) {
+		// Unknown node.
+		plugin_data->data_size = 0;
+		errno = ENOENT;
+		rv = -1;
+	}
+	else {
+		// Select the slot picked by the cycler's parity.
+		as_hb_plugin_node_data* stored =
+				&node.plugin_data[plugin][node.plugin_data_cycler % 2];
+
+		if (!(stored->data && stored->data_size)) {
+			// No plugin data set.
+			plugin_data->data_size = 0;
+			if (recv_monotonic_ts) {
+				*recv_monotonic_ts = 0;
+			}
+			if (msg_hlc_ts) {
+				memset(msg_hlc_ts, 0, sizeof(as_hlc_msg_timestamp));
+			}
+		}
+		else {
+			// Report the size first, so the caller learns the required
+			// capacity even if the copy below is not possible.
+			plugin_data->data_size = stored->data_size;
+
+			if (stored->data_size > plugin_data->data_capacity) {
+				errno = ENOMEM;
+				rv = -1;
+			}
+			else {
+				// Copy over the stored copy of the plugin data.
+				memcpy(plugin_data->data, stored->data, stored->data_size);
+
+				// Copy the message timestamp.
+				if (msg_hlc_ts) {
+					memcpy(msg_hlc_ts, &node.last_msg_hlc_ts,
+							sizeof(as_hlc_msg_timestamp));
+				}
+
+				if (recv_monotonic_ts) {
+					*recv_monotonic_ts = node.last_updated_monotonic_ts;
+				}
+			}
+		}
+	}
+
+	HB_UNLOCK();
+	return rv;
+}
+
+/**
+ * Call the iterate method on plugin data for the nodes in the input vector.
+ * Entries that are missing or hold a zero node id are skipped. For every other
+ * entry the iterate function is invoked, even if the node is not in the
+ * adjacency list; in that case plugin data is NULL with size zero.
+ *
+ * @param nodes the vector of node ids to iterate on.
+ * @param plugin the plugin identifier.
+ * @param iterate_fn the iterate function invoked for plugin data for every
+ * node.
+ * @param udata passed as is to the iterate function.
+ * Useful for getting results out of the iteration.
+ */
+void
+as_hb_plugin_data_iterate(cf_vector* nodes, as_hb_plugin_id plugin,
+		as_hb_plugin_data_iterate_fn iterate_fn, void* udata)
+{
+	HB_LOCK();
+
+	int size = cf_vector_size(nodes);
+
+	for (int i = 0; i < size; i++) {
+		cf_node* nodeid = cf_vector_getp(nodes, i);
+
+		// Skip missing entries and the zero (invalid) node id.
+		if (nodeid == NULL || *nodeid == 0) {
+			continue;
+		}
+
+		as_hb_adjacent_node nodeinfo;
+
+		if (hb_adjacent_node_get(*nodeid, &nodeinfo) == 0) {
+			size_t data_size = 0;
+			void* data = NULL;
+
+			hb_adjacent_node_plugin_data_get(&nodeinfo, plugin, &data,
+					&data_size);
+
+			iterate_fn(*nodeid, data, data_size,
+					nodeinfo.last_updated_monotonic_ts,
+					&nodeinfo.last_msg_hlc_ts, udata);
+		}
+		else {
+			// This node is not known to the heartbeat subsystem.
+			iterate_fn(*nodeid, NULL, 0, 0, NULL, udata);
+		}
+	}
+
+	HB_UNLOCK();
+}
+
+/**
+ * Call the iterate method on all nodes in current adjacency list. Note plugin
+ * data can still be NULL if the plugin failed to parse the plugin data.
+ *
+ * @param pluginid the plugin identifier.
+ * @param iterate_fn the iterate function invoked for plugin data for every
+ * node.
+ * @param udata passed as is to the iterate function. Useful for getting results
+ * out of the iteration.
+ */
+void
+as_hb_plugin_data_iterate_all(as_hb_plugin_id pluginid,
+		as_hb_plugin_data_iterate_fn iterate_fn, void* udata)
+{
+	hb_plugin_data_iterate_all(pluginid, iterate_fn, udata);
+}
+
+/**
+ * Log the state of the heartbeat module: configuration first, then mode
+ * specific state, channel state and the adjacency list.
+ *
+ * @param verbose passed through to the sub module dump functions.
+ */
+void
+as_hb_dump(bool verbose)
+{
+	INFO("Heartbeat Dump:");
+
+	as_hb_mode mode;
+	char endpoint_list_str[ENDPOINT_LIST_STR_SIZE];
+
+	// Never log an uninitialized buffer, even if nothing gets written to it.
+	endpoint_list_str[0] = 0;
+	as_hb_info_listen_addr_get(&mode, endpoint_list_str,
+			sizeof(endpoint_list_str));
+
+	// Dump the config.
+	INFO("HB Mode: %s (%d)",
+			(mode == AS_HB_MODE_MULTICAST ?
+					"multicast" :
+					(mode == AS_HB_MODE_MESH ? "mesh" : "undefined")), mode);
+
+	INFO("HB Addresses: {%s}", endpoint_list_str);
+	INFO("HB MTU: %d", hb_mtu());
+
+	// Interval and timeout are uint32_t, hence %u.
+	INFO("HB Interval: %u", config_tx_interval_get());
+	INFO("HB Timeout: %u", config_max_intervals_missed_get());
+	char protocol_s[HB_PROTOCOL_STR_MAX_LEN];
+	as_hb_protocol_get_s(config_protocol_get(), protocol_s);
+	INFO("HB Protocol: %s (%d)", protocol_s, config_protocol_get());
+
+	// dump mode specific state.
+	hb_mode_dump(verbose);
+
+	// Dump the channel state.
+	channel_dump(verbose);
+
+	// Dump the adjacency list.
+	hb_dump(verbose);
+}
+
+/**
+ * Indicates if a node is alive. This node itself is always alive; any other
+ * node is alive if it has an adjacency entry.
+ */
+bool
+as_hb_is_alive(cf_node nodeid)
+{
+	bool is_alive;
+	HB_LOCK();
+
+	as_hb_adjacent_node adjacent_node;
+	is_alive = (nodeid == config_self_nodeid_get())
+			|| (hb_adjacent_node_get(nodeid, &adjacent_node) == 0);
+
+	HB_UNLOCK();
+	return is_alive;
+}
+
+/**
+ * Compute the nodes to evict from the input nodes so that remaining nodes form
+ * a clique, based on adjacency lists. Self nodeid is never considered for
+ * eviction.
+ *
+ * @param nodes input cf_node vector.
+ * @param nodes_to_evict output cf_node clique array, that is initialized.
+ */
+void
+as_hb_maximal_clique_evict(cf_vector* nodes, cf_vector* nodes_to_evict)
+{
+	hb_maximal_clique_evict(nodes, nodes_to_evict);
+}
+
+/**
+ * Read the hlc timestamp for the message.
+ * Note: A protected API for the sole benefit of skew monitor.
+ *
+ * @param msg the incoming message.
+ * @param send_ts the output hlc timestamp.
+ * @return 0 if the time stamp could be parsed -1 on failure.
+ */
+int
+as_hb_msg_send_hlc_ts_get(msg* msg, as_hlc_timestamp* send_ts)
+{
+	return msg_send_hlc_ts_get(msg, send_ts);
+}
+
+/*
+ * ----------------------------------------------------------------------------
+ * Common sub module.
+ * ----------------------------------------------------------------------------
+ */
+
+/*
+ * ----------------------------------------------------------------------------
+ * Utility
+ * ----------------------------------------------------------------------------
+ */
+
+/**
+ * Round the input up to the nearest power of two. A value that already is a
+ * power of two is returned unchanged.
+ */
+static uint32_t
+round_up_pow2(uint32_t v)
+{
+	v--;
+
+	// Smear the highest set bit into every lower bit position.
+	for (uint32_t shift = 1; shift < 32; shift <<= 1) {
+		v |= v >> shift;
+	}
+
+	return v + 1;
+}
+
+/**
+ * Hash function for a cf_socket pointer key. Hashes the bytes of the pointer
+ * stored at key.
+ */
+static uint32_t
+hb_socket_hash_fn(const void* key)
+{
+	return cf_hash_jen32((const uint8_t*)key, sizeof(cf_socket*));
+}
+
+/**
+ * Reduce function that requests deletion of every entry it visits.
+ */
+static int
+hb_delete_all_reduce(const void* key, void* data, void* udata)
+{
+	return CF_SHASH_REDUCE_DELETE;
+}
+
+/*
+ * ----------------------------------------------------------------------------
+ * Info call related
+ * ----------------------------------------------------------------------------
+ */
+
+/**
+ * Append every address in an address list to a cf_dyn_buf, each under the same
+ * name.
+ */
+static void
+info_append_addrs(cf_dyn_buf *db, const char *name, const cf_addr_list *list)
+{
+	uint32_t idx = 0;
+
+	while (idx < list->n_addrs) {
+		info_append_string(db, name, list->addrs[idx]);
+		++idx;
+	}
+}
+
+/*
+ * ----------------------------------------------------------------------------
+ * Vector operations
+ * ----------------------------------------------------------------------------
+ */
+
+/**
+ * TODO: Move this to cf_vector.
+ * Find the index of an element in the vector. Equality is based on mem compare
+ * over the vector's element size.
+ *
+ * @param vector the source vector.
+ * @param element the element to find.
+ * @return the index if the element is found, -1 otherwise.
+ */
+static int
+vector_find(cf_vector* vector, const void* element)
+{
+	size_t element_size = cf_vector_element_size(vector);
+	int n_elements = cf_vector_size(vector);
+
+	for (int idx = 0; idx < n_elements; idx++) {
+		// The index is within bounds; the pointer is checked regardless.
+		void* candidate = cf_vector_getp(vector, idx);
+
+		if (candidate == NULL) {
+			continue;
+		}
+
+		if (memcmp(element, candidate, element_size) == 0) {
+			return idx;
+		}
+	}
+
+	return -1;
+}
+
+/*
+ * ----------------------------------------------------------------------------
+ * Endpoint list related
+ * ----------------------------------------------------------------------------
+ */
+
+/**
+ * Copy an endpoint list to the destination, reallocating the destination to
+ * the size of the source.
+ * @param dest the double pointer to the destination list, because it might need
+ * reallocation to accommodate a larger source list.
+ * @param src the source endpoint list. Crashes if its size cannot be computed.
+ */
+static void
+endpoint_list_copy(as_endpoint_list** dest, as_endpoint_list* src)
+{
+	size_t n_bytes;
+
+	if (as_endpoint_list_sizeof(src, &n_bytes) != 0) {
+		// Bad endpoint list passed.
+		CRASH("invalid adjacency list passed for copying");
+	}
+
+	as_endpoint_list* resized = cf_realloc(*dest, n_bytes);
+
+	*dest = resized;
+	memcpy(resized, src, n_bytes);
+}
+
+/**
+ * Process function to convert endpoint list to a string. The udata is an
+ * endpoint_list_to_string_udata naming the output buffer and its capacity.
+ */
+static void
+endpoint_list_to_string_process(const as_endpoint_list* endpoint_list,
+		void* udata)
+{
+	endpoint_list_to_string_udata* out = udata;
+
+	as_endpoint_list_to_string(endpoint_list, out->endpoint_list_str,
+			out->endpoint_list_str_capacity);
+}
+
+/**
+ * Process function to check if an endpoint list equals a reference list.
+ * The result is OR-ed into the udata, so it stays set once any list matched.
+ */
+static void
+endpoint_list_equal_process(const as_endpoint_list* endpoint_list, void* udata)
+{
+	endpoint_list_equal_check_udata* check = udata;
+
+	check->are_equal |= as_endpoint_lists_are_equal(endpoint_list,
+			check->other);
+}
+
+/*
+ * ----------------------------------------------------------------------------
+ * Message related
+ * ----------------------------------------------------------------------------
+ */
+
+/**
+ * The size of a buffer beyond which compression should be applied. For now set
+ * to 60% of the interface mtu, truncated to an integer.
+ */
+static int
+msg_compression_threshold(int mtu)
+{
+	int threshold = (int)(mtu * 0.6);
+
+	return threshold;
+}
+
+/**
+ * Read advertised endpoint list from an incoming message.
+ * @param msg the incoming message.
+ * @param endpoint_list the output endpoint list. It points into the input
+ * message's internal storage and should not be freed.
+ * @return 0 on success, -1 if the field is absent or its parsed size does not
+ * match the size of the field.
+ */
+static int
+msg_endpoint_list_get(msg* msg, as_endpoint_list** endpoint_list)
+{
+	size_t field_size;
+
+	if (msg_get_buf(msg, AS_HB_MSG_ENDPOINTS, (uint8_t**)endpoint_list,
+			&field_size, MSG_GET_DIRECT) != 0) {
+		return -1;
+	}
+
+	size_t parsed_size;
+
+	// The list must parse within the field...
+	if (as_endpoint_list_nsizeof(*endpoint_list, &parsed_size, field_size)) {
+		return -1;
+	}
+
+	// ...and account for every byte of it.
+	if (parsed_size != field_size) {
+		return -1;
+	}
+
+	return 0;
+}
+
+/**
+ * Read the protocol identifier for this heartbeat message. These functions can
+ * get called multiple times for a single message. Hence they do not increment
+ * error counters.
+ *
+ * @param msg the incoming message.
+ * @param id the output id.
+ * @return 0 if the id could be parsed -1 on failure.
+ */
+static int
+msg_id_get(msg* msg, uint32_t* id)
+{
+	return msg_get_uint32(msg, AS_HB_MSG_ID, id) != 0 ? -1 : 0;
+}
+
+/**
+ * Read the source nodeid for a node. These functions can get called multiple
+ * times for a single message.
+ * Hence they do not increment error counters.
+ * @param msg the incoming message.
+ * @param nodeid the output nodeid.
+ * @return 0 if the nodeid could be parsed -1 on failure.
+ */
+static int
+msg_nodeid_get(msg* msg, cf_node* nodeid)
+{
+	return msg_get_uint64(msg, AS_HB_MSG_NODE, nodeid) != 0 ? -1 : 0;
+}
+
+/**
+ * Read the HLC send timestamp for the message. May be called multiple times
+ * for a single message, hence does not increment error counters.
+ * @param msg the incoming message.
+ * @param send_ts the output hlc timestamp.
+ * @return 0 if the time stamp could be parsed -1 on failure.
+ */
+static int
+msg_send_hlc_ts_get(msg* msg, as_hlc_timestamp* send_ts)
+{
+	return msg_get_uint64(msg, AS_HB_MSG_HLC_TIMESTAMP, send_ts) != 0 ? -1 : 0;
+}
+
+/**
+ * Read the message type. May be called multiple times for a single message,
+ * hence does not increment error counters.
+ * @param msg the incoming message.
+ * @param type the output message type.
+ * @return 0 if the type could be parsed -1 on failure.
+ */
+static int
+msg_type_get(msg* msg, as_hb_msg_type* type)
+{
+	return msg_get_uint32(msg, AS_HB_MSG_TYPE, type) != 0 ? -1 : 0;
+}
+
+/**
+ * Read the cluster name.
+ * @param msg the incoming message.
+ * @param cluster_name the output cluster name. Fetched with MSG_GET_DIRECT.
+ * @return 0 if the cluster name could be parsed -1 on failure.
+ */
+static int
+msg_cluster_name_get(msg* msg, char** cluster_name)
+{
+	int rc = msg_get_str(msg, AS_HB_MSG_CLUSTER_NAME, cluster_name, NULL,
+			MSG_GET_DIRECT);
+
+	return rc != 0 ? -1 : 0;
+}
+
+/**
+ * Get a pointer to a node list in the message.
+ *
+ * @param msg the incoming message.
+ * @param field_id the field id.
+ * @param adj_list output. on success will point to the node list in the
+ * message.
+ * @param adj_length output. on success will contain the number of nodes in the
+ * list.
+ * @return 0 on success. -1 if the node list is absent.
+ */
+static int
+msg_node_list_get(msg* msg, int field_id, cf_node** adj_list,
+		size_t* adj_length)
+{
+	int rc = msg_get_buf(msg, field_id, (uint8_t**)adj_list, adj_length,
+			MSG_GET_DIRECT);
+
+	if (rc != 0) {
+		return -1;
+	}
+
+	// msg_get_buf reported a byte count; convert it to a node count.
+	*adj_length = *adj_length / sizeof(cf_node);
+
+	return 0;
+}
+
+/**
+ * Get a pointer to the adjacency list in the message.
+ *
+ * @param msg the incoming message.
+ * @param adj_list output. on success will point to the adjacency list in the
+ * message.
+ * @param adj_length output. on success will contain the length of the adjacency
+ * list.
+ * @return 0 on success. -1 if the adjacency list is absent.
+ */
+static int
+msg_adjacency_get(msg* msg, cf_node** adj_list, size_t* adj_length)
+{
+	return msg_node_list_get(msg, AS_HB_MSG_HB_DATA, adj_list, adj_length);
+}
+
+/**
+ * Set a node list on an outgoing messages for a field. The list is copied
+ * into the message. Crashes if the field cannot be set.
+ *
+ * @param msg the outgoing message.
+ * @param field_id the id of the list field.
+ * @param node_list the node list to set.
+ * @param node_length the number of nodes in the list.
+ */
+static void
+msg_node_list_set(msg* msg, int field_id, cf_node* node_list,
+		size_t node_length)
+{
+	int rc = msg_set_buf(msg, field_id, (uint8_t*)node_list,
+			sizeof(cf_node) * node_length, MSG_SET_COPY);
+
+	if (rc != 0) {
+		CRASH("error setting adjacency list on msg");
+	}
+}
+
+/**
+ * Set the adjacency list on an outgoing messages.
+ *
+ * @param msg the outgoing message.
+ * @param adj_list the adjacency list to set.
+ * @param adj_length the length of the adjacency list.
+ */
+static void
+msg_adjacency_set(msg* msg, cf_node* adj_list, size_t adj_length)
+{
+	msg_node_list_set(msg, AS_HB_MSG_HB_DATA, adj_list, adj_length);
+}
+
+/**
+ * Set the info reply on an outgoing messages.
+ *
+ * @param msg the outgoing message.
+ * @param response the response list to set.
+ * @param response_count the length of the response list.
+ */
+static void
+msg_info_reply_set(msg* msg, as_hb_mesh_info_reply* response,
+		size_t response_count)
+{
+	size_t n_bytes = 0;
+
+	if (mesh_info_reply_sizeof(response, response_count, &n_bytes)) {
+		CRASH("error setting info reply on msg");
+	}
+
+	int rc = msg_set_buf(msg, AS_HB_MSG_INFO_REPLY, (uint8_t*)response,
+			n_bytes, MSG_SET_COPY);
+
+	if (rc != 0) {
+		CRASH("error setting info reply on msg");
+	}
+}
+
+/**
+ * Get a pointer to the info reply list in the message.
+ *
+ * @param msg the incoming message.
+ * @param reply output. on success will point to the reply list in the message.
+ * @param reply_count output. on success will contain the length of the reply
+ * list. Reset to zero if the list turns out to be garbled.
+ * @return 0 on success. -1 if the reply list is absent or garbled.
+ */
+static int
+msg_info_reply_get(msg* msg, as_hb_mesh_info_reply** reply, size_t* reply_count)
+{
+	size_t field_size;
+
+	if (msg_get_buf(msg, AS_HB_MSG_INFO_REPLY, (uint8_t**)reply, &field_size,
+			MSG_GET_DIRECT) != 0) {
+		return -1;
+	}
+
+	*reply_count = 0;
+
+	// Walk the replies back to back, validating each endpoint list and
+	// counting the replies. Each reply is a fixed header followed by a
+	// variable sized endpoint list.
+	uint8_t* cursor = (uint8_t*)*reply;
+	int64_t bytes_left = field_size;
+	size_t n_replies = 0;
+
+	while (bytes_left > 0) {
+		as_hb_mesh_info_reply* current = (as_hb_mesh_info_reply*)cursor;
+
+		cursor += sizeof(as_hb_mesh_info_reply);
+		bytes_left -= sizeof(as_hb_mesh_info_reply);
+
+		if (bytes_left <= 0) {
+			// Incomplete / garbled info reply message.
+			return -1;
+		}
+
+		size_t list_size = 0;
+
+		if (as_endpoint_list_nsizeof(current->endpoint_list, &list_size,
+				bytes_left) != 0) {
+			// Incomplete / garbled info reply message.
+			return -1;
+		}
+
+		cursor += list_size;
+		bytes_left -= list_size;
+		n_replies++;
+	}
+
+	*reply_count = n_replies;
+	return 0;
+}
+
+/**
+ * Fill a message with an endpoint list.
+ * The udata is an endpoint_list_to_msg_udata naming the message and the mode.
+ */
+static void
+msg_published_endpoints_fill(const as_endpoint_list* published_endpoint_list,
+		void* udata)
+{
+	endpoint_list_to_msg_udata* fill_udata = udata;
+
+	if (!published_endpoint_list) {
+		if (fill_udata->is_mesh) {
+			// Something is messed up. Except for v3 multicast,
+			// published list should not be empty.
+			WARNING("published endpoint list is empty");
+		}
+		return;
+	}
+
+	// Makes sense only for mesh.
+	if (!fill_udata->is_mesh) {
+		return;
+	}
+
+	// Set the source address
+	size_t list_size = 0;
+	as_endpoint_list_sizeof(published_endpoint_list, &list_size);
+
+	int rc = msg_set_buf(fill_udata->msg, AS_HB_MSG_ENDPOINTS,
+			(uint8_t*)published_endpoint_list, list_size, MSG_SET_COPY);
+
+	if (rc != 0) {
+		CRASH("error setting heartbeat address on msg");
+	}
+}
+
+/**
+ * Fill source fields for the message: protocol identifier, source node id,
+ * published endpoints (mesh mode only) and the HLC send timestamp. Crashes if
+ * any of the directly set fields cannot be set.
+ * @param msg the message to fill the source fields into.
+ */
+static void
+msg_src_fields_fill(msg* msg)
+{
+	bool mesh_mode = hb_is_mesh();
+
+	// Set the hb protocol id / version.
+	if (msg_set_uint32(msg, AS_HB_MSG_ID, hb_protocol_identifier_get()) != 0) {
+		CRASH("error setting heartbeat protocol on msg");
+	}
+
+	// Set the source node.
+	if (msg_set_uint64(msg, AS_HB_MSG_NODE, config_self_nodeid_get()) != 0) {
+		CRASH("error setting node id on msg");
+	}
+
+	if (mesh_mode) {
+		// Endpoint list only valid for mesh mode.
+		endpoint_list_to_msg_udata fill_udata;
+		fill_udata.msg = msg;
+		fill_udata.is_mesh = mesh_mode;
+
+		mesh_published_endpoints_process(msg_published_endpoints_fill,
+				&fill_udata);
+	}
+
+	// Set the send hlc timestamp
+	if (msg_set_uint64(msg, AS_HB_MSG_HLC_TIMESTAMP, as_hlc_timestamp_now())
+			!= 0) {
+		CRASH("error setting send timestamp on msg");
+	}
+}
+
+/**
+ * Set the type for an outgoing message.
+ * @param msg the outgoing message.
+ * @param msg_type the type to set.
+ */
+static void
+msg_type_set(msg* msg, as_hb_msg_type msg_type)
+{
+	// Set the message type.
+	if (msg_set_uint32(msg, AS_HB_MSG_TYPE, msg_type) != 0) {
+		CRASH("error setting type on msg");
+	}
+}
+
+/*
+ * ----------------------------------------------------------------------------
+ * Config sub module.
+ * ----------------------------------------------------------------------------
+ */
+
+/**
+ * Get the maximum cluster size supported in the current mode, capped at ASC.
+ */
+static int
+config_mcsize()
+{
+	int mode_cluster_size = 0;
+	if (hb_is_mesh()) {
+		// Only bounded by available memory. But let's say its infinite.
+		mode_cluster_size = INT_MAX;
+	}
+	else {
+		mode_cluster_size = multicast_supported_cluster_size_get();
+	}
+
+	// Ensure we are always upper bounded by the absolute max cluster size.
+	int supported_cluster_size = MIN(ASC, mode_cluster_size);
+
+	DETAIL("supported cluster size %d", supported_cluster_size);
+	return supported_cluster_size;
+}
+
+/**
+ * Get the binding addresses for the heartbeat subsystem.
+ */
+static const cf_serv_cfg*
+config_bind_cfg_get()
+{
+	// Not protected by config_lock because it is not changed.
+	return &g_config.hb_config.bind_cfg;
+}
+
+/**
+ * Get the multicast groups for the multicast mode.
+ */
+static const cf_mserv_cfg*
+config_multicast_group_cfg_get()
+{
+	// Not protected by config_lock. Never updated after config parsing..
+	return &g_config.hb_config.multicast_group_cfg;
+}
+
+/**
+ * Get the heartbeat pulse transmit interval. Read under the config lock.
+ */
+static uint32_t
+config_tx_interval_get()
+{
+	HB_CONFIG_LOCK();
+	uint32_t interval = g_config.hb_config.tx_interval;
+	HB_CONFIG_UNLOCK();
+	return interval;
+}
+
+/**
+ * Set the heartbeat pulse transmit interval. Logs the old and the new value
+ * and updates the config under the config lock. No range validation is done
+ * here.
+ */
+static void
+config_tx_interval_set(uint32_t new_interval)
+{
+	HB_CONFIG_LOCK();
+	// Both values are uint32_t, hence %u.
+	INFO("changing value of interval from %u to %u ",
+			g_config.hb_config.tx_interval, new_interval);
+	g_config.hb_config.tx_interval = new_interval;
+	HB_CONFIG_UNLOCK();
+}
+
+/**
+ * Get the configured override mtu. Read under the config lock.
+ */
+static uint32_t
+config_override_mtu_get()
+{
+	HB_CONFIG_LOCK();
+	uint32_t override_mtu = g_config.hb_config.override_mtu;
+	HB_CONFIG_UNLOCK();
+	return override_mtu;
+}
+
+/**
+ * Set the override mtu. Logs the old and the new value, updates the config
+ * under the config lock and then logs the resulting max supported cluster
+ * size.
+ */
+static void
+config_override_mtu_set(uint32_t mtu)
+{
+	HB_CONFIG_LOCK();
+	// Both values are uint32_t, hence %u.
+	INFO("changing value of override mtu from %u to %u ",
+			g_config.hb_config.override_mtu, mtu);
+	g_config.hb_config.override_mtu = mtu;
+	HB_CONFIG_UNLOCK();
+	// Logged after releasing the config lock.
+	INFO("max supported cluster size is %d", config_mcsize());
+}
+
+/**
+ * Get the maximum number of missed heartbeat intervals after which a node is
+ * considered expired. Read under the config lock.
+ */
+static uint32_t
+config_max_intervals_missed_get()
+{
+	uint32_t rv = 0;
+	HB_CONFIG_LOCK();
+	rv = g_config.hb_config.max_intervals_missed;
+	HB_CONFIG_UNLOCK();
+	return rv;
+}
+
+/**
+ * Get the number intervals endpoints should be tracked for.
+ */
+static uint32_t
+config_endpoint_track_intervals_get()
+{
+	// Allow a grace period of half heartbeat timeout, but lower bounded to at
+	// least 3.
+	return MAX(3, config_max_intervals_missed_get() / 2);
+}
+
+/**
+ * Get the maximum number of allowed changes, per endpoint track intervals.
+ */
+static uint32_t
+config_endpoint_changes_allowed_get()
+{
+	// Allow no change to the endpoint list for now.
+	return 0;
+}
+
+/**
+ * Set the maximum number of missed heartbeat intervals after which a node is
+ * considered expired. Logs the old and the new value and updates the config
+ * under the config lock. No range validation is done here.
+ */
+static void
+config_max_intervals_missed_set(uint32_t new_max)
+{
+	HB_CONFIG_LOCK();
+	// Both values are uint32_t, hence %u.
+	INFO("changing value of timeout from %u to %u ",
+			g_config.hb_config.max_intervals_missed, new_max);
+	g_config.hb_config.max_intervals_missed = new_max;
+	HB_CONFIG_UNLOCK();
+}
+
+/**
+ * Return ttl for multicast packets. Set to zero for default TTL.
+ */
+static unsigned char
+config_multicast_ttl_get()
+{
+	return g_config.hb_config.multicast_ttl;
+}
+
+/**
+ * Return the current heartbeat protocol.
+ */
+static as_hb_protocol
+config_protocol_get()
+{
+	as_hb_protocol rv = 0;
+	HB_CONFIG_LOCK();
+	rv = g_config.hb_config.protocol;
+	HB_CONFIG_UNLOCK();
+	return rv;
+}
+
+/**
+ * Set the current heartbeat protocol. Updated under the config lock.
+ */
+static void
+config_protocol_set(as_hb_protocol new_protocol)
+{
+	HB_CONFIG_LOCK();
+	g_config.hb_config.protocol = new_protocol;
+	HB_CONFIG_UNLOCK();
+}
+
+/**
+ * The nodeid for this node.
+ */
+static cf_node
+config_self_nodeid_get()
+{
+	// Not protected by config_lock. Never updated after config parsing..
+	return g_config.self_node;
+}
+
+/**
+ * Return the heartbeat subsystem mode.
+ */
+static as_hb_mode
+config_mode_get()
+{
+	// Not protected by config_lock. Never updated after config parsing..
+	return g_config.hb_config.mode;
+}
+
+/**
+ * Expand "any" binding addresses to actual interface addresses. Bind entries
+ * with a specific address are copied as is.
+ * @param bind_cfg the binding configuration.
+ * @param published_cfg (output) the expanded server configuration. It is
+ * re-initialized first.
+ * @param ipv4_only indicates if only legacy addresses should be allowed.
+ * Non-legacy addresses are skipped when set.
+ */
+static void
+config_bind_serv_cfg_expand(const cf_serv_cfg* bind_cfg,
+		cf_serv_cfg* published_cfg, bool ipv4_only)
+{
+	cf_serv_cfg_init(published_cfg);
+	cf_sock_cfg sock_cfg;
+
+	// Unsigned indices: both counts they are compared against are unsigned.
+	for (uint32_t i = 0; i < bind_cfg->n_cfgs; i++) {
+		cf_sock_cfg_copy(&bind_cfg->cfgs[i], &sock_cfg);
+
+		// Expand "any" address to all interfaces.
+		if (cf_ip_addr_is_any(&sock_cfg.addr)) {
+			cf_ip_addr all_addrs[CF_SOCK_CFG_MAX];
+			uint32_t n_all_addrs = CF_SOCK_CFG_MAX;
+			if (cf_inter_get_addr_all(all_addrs, &n_all_addrs) != 0) {
+				WARNING("error getting all interface addresses");
+				n_all_addrs = 0;
+			}
+
+			for (uint32_t j = 0; j < n_all_addrs; j++) {
+				// Skip local address if any is specified.
+				if (cf_ip_addr_is_local(&all_addrs[j])
+						|| (ipv4_only && !cf_ip_addr_is_legacy(&all_addrs[j]))) {
+					continue;
+				}
+
+				// Reuse the copied socket config, swapping in the concrete
+				// interface address.
+				cf_ip_addr_copy(&all_addrs[j], &sock_cfg.addr);
+				if (cf_serv_cfg_add_sock_cfg(published_cfg, &sock_cfg)) {
+					CRASH("error initializing published address list");
+				}
+			}
+
+			// TODO: Does not look like the right warning or the right message.
+			if (published_cfg->n_cfgs == 0) {
+				WARNING(
+						"no network interface addresses detected for heartbeat access");
+			}
+		}
+		else {
+			if (ipv4_only && !cf_ip_addr_is_legacy(&bind_cfg->cfgs[i].addr)) {
+				continue;
+			}
+
+			if (cf_serv_cfg_add_sock_cfg(published_cfg, &sock_cfg)) {
+				CRASH("error initializing published address list");
+			}
+		}
+	}
+}
+
+/**
+ * Checks if the heartbeat binding configuration is valid.
+ * @param error pointer to a static error message if validation fails, else will
+ * be set to NULL.
+ */
+static bool
+config_binding_is_valid(char** error, as_hb_protocol protocol)
+{
+	const cf_serv_cfg* bind_cfg = config_bind_cfg_get();
+	const cf_mserv_cfg* multicast_group_cfg = config_multicast_group_cfg_get();
+
+	if (hb_is_mesh()) {
+		if (bind_cfg->n_cfgs == 0) {
+			// Should not happen in practice.
+			*error = "no bind addresses found for heartbeat";
+			return false;
+		}
+
+		// Ensure we have a valid port for all bind endpoints.
+		for (int i = 0; i < bind_cfg->n_cfgs; i++) {
+			if (bind_cfg->cfgs[i].port == 0) {
+				*error = "invalid mesh listening port";
+				return false;
+			}
+		}
+
+		cf_serv_cfg publish_serv_cfg;
+		cf_serv_cfg_init(&publish_serv_cfg);
+
+		if (multicast_group_cfg->n_cfgs != 0) {
+			*error =
+					"invalid config option: multicast-group not supported in mesh mode";
+			return false;
+		}
+	}
+	else {
+		const cf_mserv_cfg* multicast_group_cfg =
+				config_multicast_group_cfg_get();
+
+		if (multicast_group_cfg->n_cfgs == 0) {
+			*error = "no multicast groups specified";
+			return false;
+		}
+
+		// Ensure multicast groups have valid ports.
+		// TODO: We could check if the address is valid multicast.
+ for (int i = 0; i < multicast_group_cfg->n_cfgs; i++) { + if (multicast_group_cfg->cfgs[i].port == 0) { + *error = "invalid multicast port"; + return false; + } + } + + if (g_config.hb_config.mesh_seed_addrs[0]) { + *error = + "invalid config option: mesh-seed-address-port not supported for multicast mode"; + return false; + } + + cf_serv_cfg publish_serv_cfg; + cf_serv_cfg_init(&publish_serv_cfg); + } + + *error = NULL; + return true; +} + +/* + * ---------------------------------------------------------------------------- + * Channel sub module. + * ---------------------------------------------------------------------------- + */ + +/** + * Initialize the channel structure. + */ +static void +channel_init_channel(as_hb_channel* channel) +{ + memset(channel, 0, sizeof(as_hb_channel)); + cf_ip_addr_set_any(&channel->endpoint_addr.addr); +} + +/** + * Initialize the channel event structure. + */ +static void +channel_event_init(as_hb_channel_event* event) +{ + memset(event, 0, sizeof(as_hb_channel_event)); +} + +/** + * Is channel running. + */ +static bool +channel_is_running() +{ + CHANNEL_LOCK(); + bool retval = + (g_hb.channel_state.status == AS_HB_STATUS_RUNNING) ? true : false; + CHANNEL_UNLOCK(); + return retval; +} + +/** + * Is channel stopped. + */ +static bool +channel_is_stopped() +{ + CHANNEL_LOCK(); + bool retval = + (g_hb.channel_state.status == AS_HB_STATUS_STOPPED) ? true : false; + CHANNEL_UNLOCK(); + return retval; +} + +/** + * Keep a winning socket as a winner for at least this amount of time to prevent + * constant flip flopping and give the winning socket a chance to send + * heartbeats. + */ +static uint32_t +channel_win_grace_ms() +{ + return 3 * config_tx_interval_get(); +} + +/** + * Enable / disable events. + */ +static void +channel_events_enabled_set(bool enabled) +{ + CHANNEL_LOCK(); + g_hb.channel_state.events_enabled = enabled; + CHANNEL_UNLOCK(); +} + +/** + * Know if events are enabled. 
+ */ +static bool +channel_are_events_enabled() +{ + bool result; + CHANNEL_LOCK(); + result = g_hb.channel_state.events_enabled; + CHANNEL_UNLOCK(); + return result; +} + +/** + * Discard an event that has been processed. + */ +static void +channel_event_discard(as_hb_channel_event* event) +{ + // Free the message structure for message received events. + if (event->type == AS_HB_CHANNEL_MSG_RECEIVED) { + hb_msg_return(event->msg); + } +} + +/** + * Queues a channel event for publishing by the channel tender. + */ +static void +channel_event_queue(as_hb_channel_event* event) +{ + if (!channel_are_events_enabled()) { + channel_event_discard(event); + DETAIL( + "events disabled. Ignoring event of type %d with nodeid %" PRIx64, + event->type, event->nodeid); + return; + } + + DETAIL("queuing channel event of type %d for node %" PRIx64, event->type, + event->nodeid); + cf_queue_push(&g_hb.channel_state.events_queue, event); +} + +/** + * Publish queued up channel events. Should be called outside a channel lock to + * prevent deadlocks. + */ +static void +channel_event_publish_pending() +{ + // No channel lock here to prevent deadlocks. + as_hb_channel_event event; + while (cf_queue_pop(&g_hb.channel_state.events_queue, &event, 0) + == CF_QUEUE_OK) { + // Nothing elaborate, using hardcoded list of event recipients. + mesh_channel_event_process(&event); + hb_channel_event_process(&event); + + channel_event_discard(&event); + } +} + +/** + * Return the endpoint associated with this socket if it exists. + * + * @param socket the socket to query for. + * @param result the output result. + * @return 0 if the socket was found and the result value is filled. -1 if a + * mapping for the socket could not be found. 
+ */ +static int +channel_get_channel(cf_socket* socket, as_hb_channel* result) +{ + int status; + CHANNEL_LOCK(); + + if (cf_shash_get(g_hb.channel_state.socket_to_channel, &socket, result) + == CF_SHASH_OK) { + status = 0; + } + else { + status = -1; + } + + CHANNEL_UNLOCK(); + return status; +} + +/** + * Shutdown a channel socket without closing, forcing the channel tender to + * cleanup associated data structures. + */ +static void +channel_socket_shutdown(cf_socket* socket) +{ + cf_socket_shutdown(socket); +} + +/** + * Return the socket associated with this node. + * Returns 0 on success and -1 if there is no socket attached to this node. + */ +static int +channel_socket_get(cf_node nodeid, cf_socket** socket) +{ + int rv = -1; + CHANNEL_LOCK(); + if (cf_shash_get(g_hb.channel_state.nodeid_to_socket, &nodeid, socket) + == CF_SHASH_ERR_NOT_FOUND) { + rv = -1; + } + else { + rv = 0; + } + + CHANNEL_UNLOCK(); + return rv; +} + +/** + * Indicate if a socket is present in a sockets list. + */ +static bool +channel_cf_sockets_contains(cf_sockets* sockets, cf_socket* to_find) +{ + for (int i = 0; i < sockets->n_socks; i++) { + if (&sockets->socks[i] == to_find) { + return true; + } + } + + return false; +} + +/** + * Destroy an allocated socket. + */ +static void +channel_socket_destroy(cf_socket* sock) +{ + cf_socket_close(sock); + cf_socket_term(sock); + cf_free(sock); +} + +/** + * Close a channel socket. Precondition is that the socket is registered with + * the channel module using channel_socket_register. + */ +static void +channel_socket_close(cf_socket* socket, bool remote_close, + bool raise_close_event) +{ + if (remote_close) { + DEBUG("remote close: fd %d event", CSFD(socket)); + } + + CHANNEL_LOCK(); + + if (channel_cf_sockets_contains(g_hb.channel_state.listening_sockets, + socket)) { + // Listening sockets will be closed by the mode (mesh/multicast + // ) modules. + goto Exit; + } + + // Clean up data structures. 
+ as_hb_channel channel; + int status = channel_get_channel(socket, &channel); + + if (status == 0) { + if (channel.nodeid != 0) { + cf_socket* node_socket; + if (channel_socket_get(channel.nodeid, &node_socket) == 0 + && node_socket == socket) { + // Remove associated node for this socket. + cf_shash_delete(g_hb.channel_state.nodeid_to_socket, + &channel.nodeid); + + if (!channel.is_multicast && raise_close_event) { + as_hb_channel_event event; + channel_event_init(&event); + + // Notify others that this node is no longer connected. + event.type = AS_HB_CHANNEL_NODE_DISCONNECTED; + event.nodeid = channel.nodeid; + event.msg = NULL; + + channel_event_queue(&event); + } + } + } + + DETAIL("removed channel associated with fd %d polarity %s Type: %s", + CSFD(socket), channel.is_inbound ? "inbound" : "outbound", + channel.is_multicast ? "multicast" : "mesh"); + // Remove associated channel. + cf_shash_delete(g_hb.channel_state.socket_to_channel, &socket); + } + else { + // Will only happen if we are closing this socket twice. Cannot + // deference the underlying fd because the socket has been freed. + WARNING("found a socket %p without an associated channel", socket); + goto Exit; + } + + static int32_t err_ok[] = { ENOENT, EBADF, EPERM }; + int32_t err = cf_poll_delete_socket_forgiving(g_hb.channel_state.poll, + socket, sizeof(err_ok) / sizeof(int32_t), err_ok); + + if (err == ENOENT) { + // There is no valid code path where epoll ctl should fail. + CRASH("unable to remove fd %d from epoll fd list: %s", CSFD(socket), + cf_strerror(errno)); + goto Exit; + } + + cf_atomic_int_incr(&g_stats.heartbeat_connections_closed); + DEBUG("closing channel with fd %d", CSFD(socket)); + + channel_socket_destroy(socket); + +Exit: + CHANNEL_UNLOCK(); +} + +/** + * Close multiple sockets. Should be invoked only by channel stop. + * @param sockets the vector consisting of sockets to be closed. 
+ */ +static void +channel_sockets_close(cf_vector* sockets) +{ + uint32_t socket_count = cf_vector_size(sockets); + for (int index = 0; index < socket_count; index++) { + cf_socket* socket; + if (cf_vector_get(sockets, index, &socket) != 0) { + WARNING("error finding the fd %d to be deleted", CSFD(socket)); + continue; + } + channel_socket_close(socket, false, true); + } +} + +/** + * Queues a socket for closing by the channel tender. Should be used by all code + * paths other than the channel stop code path. + */ +static void +channel_socket_close_queue(cf_socket* socket, bool is_remote_close, + bool raise_close_event) +{ + as_hb_channel_socket_close_entry close_entry = { + socket, + is_remote_close, + raise_close_event }; + DETAIL("queuing close of fd %d", CSFD(socket)); + cf_queue_push(&g_hb.channel_state.socket_close_queue, &close_entry); +} + +/** + * Close queued up sockets. + */ +static void +channel_socket_close_pending() +{ + // No channel lock required here. + as_hb_channel_socket_close_entry close_entry; + while (cf_queue_pop(&g_hb.channel_state.socket_close_queue, &close_entry, 0) + == CF_QUEUE_OK) { + channel_socket_close(close_entry.socket, close_entry.is_remote, + close_entry.raise_close_event); + } +} + +/** + * Register a new socket. + * + * @param socket the socket. + * @param is_multicast indicates if this socket is a multicast socket. + * @param is_inbound indicates if this socket is an inbound / outbound. + * @param endpoint peer endpoint this socket connects to. Will be NULL for + * inbound sockets. + */ +static void +channel_socket_register(cf_socket* socket, bool is_multicast, bool is_inbound, + cf_sock_addr* endpoint_addr) +{ + CHANNEL_LOCK(); + + as_hb_channel channel; + channel_init_channel(&channel); + + // This socket should not be part of the socket to channel map. 
+ ASSERT(channel_get_channel(socket, &channel) == -1, + "error the channel already exists for fd %d", CSFD(socket)); + + channel.is_multicast = is_multicast; + channel.is_inbound = is_inbound; + channel.last_received = cf_getms(); + + if (endpoint_addr) { + memcpy(&channel.endpoint_addr, endpoint_addr, sizeof(*endpoint_addr)); + } + + // Add socket to poll list + cf_poll_add_socket(g_hb.channel_state.poll, socket, + EPOLLIN | EPOLLERR | EPOLLRDHUP, socket); + + cf_shash_put(g_hb.channel_state.socket_to_channel, &socket, &channel); + + DEBUG("channel created for fd %d - polarity %s type: %s", CSFD(socket), + channel.is_inbound ? "inbound" : "outbound", + channel.is_multicast ? "multicast" : "mesh"); + + CHANNEL_UNLOCK(); +} + +/** + * Accept an incoming tcp connection. For now this is relevant only to the mesh + * mode. + * @param lsock the listening socket that received the connection. + */ +static void +channel_accept_connection(cf_socket* lsock) +{ + if (!hb_is_mesh()) { + // We do not accept connections in non mesh modes. + return; + } + + cf_socket csock; + cf_sock_addr caddr; + + if (cf_socket_accept(lsock, &csock, &caddr) < 0) { + if ((errno == EMFILE) || (errno == ENFILE) || (errno == ENOMEM) + || (errno == ENOBUFS)) { + TICKER_WARNING( + "failed to accept heartbeat connection due to error : %s", + cf_strerror(errno)); + // We are in an extreme situation where we ran out of system + // resources (file/mem). We should rather lie low and not do too + // much activity. So, sleep. We should not sleep too long as this + // same function is supposed to send heartbeat also. + usleep(MAX(AS_HB_TX_INTERVAL_MS_MIN, 1) * 1000); + return; + } + else { + // TODO: Find what there errors are. + WARNING("accept failed: %s", cf_strerror(errno)); + return; + } + } + + // Update the stats to reflect to a new connection opened. 
+ cf_atomic_int_incr(&g_stats.heartbeat_connections_opened); + + char caddr_str[HOST_NAME_MAX]; + cf_sock_addr_to_string_safe(&caddr, caddr_str, sizeof(caddr_str)); + DEBUG("new connection from %s", caddr_str); + + cf_sock_cfg *cfg = lsock->cfg; + + if (cfg->owner == CF_SOCK_OWNER_HEARTBEAT_TLS) { + tls_socket_prepare_server(g_config.hb_config.tls, &csock); + + if (tls_socket_accept_block(&csock) != 1) { + WARNING("heartbeat TLS server handshake with %s failed", caddr_str); + cf_socket_close(&csock); + cf_socket_term(&csock); + + cf_atomic_int_incr(&g_stats.heartbeat_connections_closed); + return; + } + } + + // Allocate a new socket. + cf_socket* sock = cf_malloc(sizeof(cf_socket)); + cf_socket_init(sock); + cf_socket_copy(&csock, sock); + + // Register this socket with the channel subsystem. + channel_socket_register(sock, false, true, NULL); +} + +/** + * Parse compressed buffer into a message. + * + * @param msg the input parsed compressed message and also the output heartbeat + * message. + * @param buffer the input buffer. + * @param buffer_content_len the length of the content in the buffer. + * @return the status of parsing the message. + */ +static as_hb_channel_msg_read_status +channel_compressed_message_parse(msg* msg, void* buffer, int buffer_content_len) +{ + // This is a direct pointer inside the buffer parameter. No allocation + // required. + uint8_t* compressed_buffer = NULL; + size_t compressed_buffer_length = 0; + int parsed = AS_HB_CHANNEL_MSG_PARSE_FAIL; + void* uncompressed_buffer = NULL; + size_t uncompressed_buffer_length = 0; + + if (msg_get_buf(msg, AS_HB_MSG_COMPRESSED_PAYLOAD, &compressed_buffer, + &compressed_buffer_length, MSG_GET_DIRECT) != 0) { + parsed = AS_HB_CHANNEL_MSG_PARSE_FAIL; + goto Exit; + } + + // Assume compression ratio of 3. We will expand the buffer if needed. + uncompressed_buffer_length = round_up_pow2(3 * compressed_buffer_length); + + // Keep trying till we allocate enough memory for the uncompressed buffer. 
+ while (true) { + uncompressed_buffer = MSG_BUFF_ALLOC_OR_DIE(uncompressed_buffer_length, + "error allocating memory size %zu for decompressing message", + uncompressed_buffer_length); + + int uncompress_rv = uncompress(uncompressed_buffer, + &uncompressed_buffer_length, compressed_buffer, + compressed_buffer_length); + + if (uncompress_rv == Z_OK) { + // Decompression was successful. + break; + } + + if (uncompress_rv == Z_BUF_ERROR) { + // The uncompressed buffer is not large enough. Free current buffer + // and allocate a new buffer. + MSG_BUFF_FREE(uncompressed_buffer, uncompressed_buffer_length); + + // Give uncompressed buffer more space. + uncompressed_buffer_length *= 2; + continue; + } + + // Decompression failed. Clean up and exit. + parsed = AS_HB_CHANNEL_MSG_PARSE_FAIL; + goto Exit; + } + + // Reset the message to prepare for parsing the uncompressed buffer. We have + // no issues losing the compressed buffer because we have an uncompressed + // copy. + msg_reset(msg); + + // Parse the uncompressed buffer. + parsed = + msg_parse(msg, uncompressed_buffer, uncompressed_buffer_length) + == 0 ? + AS_HB_CHANNEL_MSG_READ_SUCCESS : + AS_HB_CHANNEL_MSG_PARSE_FAIL; + + if (parsed == AS_HB_CHANNEL_MSG_READ_SUCCESS) { + // Copying the buffer content to ensure that the message and the buffer + // can have separate life cycles and we never get into races. The + // frequency of heartbeat messages is low enough to make this not matter + // much unless we have massive clusters. + msg_preserve_all_fields(msg); + } + +Exit: + MSG_BUFF_FREE(uncompressed_buffer, uncompressed_buffer_length); + return parsed; +} + +/** + * Parse the buffer into a message. + * + * @param msg the output heartbeat message. + * @param buffer the input buffer. + * @param buffer_content_len the length of the content in the buffer. + * @return the status of parsing the message. 
+ */ +static as_hb_channel_msg_read_status +channel_message_parse(msg* msg, void* buffer, int buffer_content_len) +{ + // Peek into the buffer to get hold of the message type. + msg_type type = 0; + uint32_t msg_size = 0; + if (msg_get_initial(&msg_size, &type, (uint8_t*)buffer, buffer_content_len) + != 0 || type != msg->type) { + // Pre check because msg_parse considers this a warning but this would + // be common when protocol version between nodes do not match. + DEBUG("message type mismatch - expected:%d received:%d", msg->type, + type); + return AS_HB_CHANNEL_MSG_PARSE_FAIL; + } + + bool parsed = msg_parse(msg, buffer, buffer_content_len) == 0; + + if (parsed) { + if (msg_is_set(msg, AS_HB_MSG_COMPRESSED_PAYLOAD)) { + // This is a compressed message. + return channel_compressed_message_parse(msg, buffer, + buffer_content_len); + } + + // This is an uncompressed message. Copying the buffer content to ensure + // that the message and the buffer can have separate life cycles and we + // never get into races. The frequency of heartbeat messages is low + // enough to make this not matter much unless we have massive clusters. + msg_preserve_all_fields(msg); + } + + return parsed ? + AS_HB_CHANNEL_MSG_READ_SUCCESS : AS_HB_CHANNEL_MSG_PARSE_FAIL; +} + +/** + * Iterate over a endpoint list and see if there is a matching socket address. + */ +static void +channel_endpoint_find_iterate_fn(const as_endpoint* endpoint, void* udata) +{ + cf_sock_addr sock_addr; + as_hb_channel_endpoint_iterate_udata* iterate_data = + (as_hb_channel_endpoint_iterate_udata*)udata; + if (as_endpoint_to_sock_addr(endpoint, &sock_addr) != 0) { + return; + } + + if (cf_sock_addr_is_any(&sock_addr)) { + return; + } + + iterate_data->found |= (cf_sock_addr_compare(&sock_addr, + iterate_data->addr_to_search) == 0); +} + +/** + * Reduce function to find a matching endpoint. 
+ */ +static int +channel_endpoint_search_reduce(const void* key, void* data, void* udata) +{ + cf_socket** socket = (cf_socket**)key; + as_hb_channel* channel = (as_hb_channel*)data; + as_hb_channel_endpoint_reduce_udata* endpoint_reduce_udata = + (as_hb_channel_endpoint_reduce_udata*)udata; + + as_hb_channel_endpoint_iterate_udata iterate_udata; + iterate_udata.addr_to_search = &channel->endpoint_addr; + iterate_udata.found = false; + + as_endpoint_list_iterate(endpoint_reduce_udata->endpoint_list, + channel_endpoint_find_iterate_fn, &iterate_udata); + + if (iterate_udata.found) { + endpoint_reduce_udata->found = true; + endpoint_reduce_udata->socket = *socket; + // Stop the reduce, we have found a match. + return CF_SHASH_ERR_FOUND; + } + + return CF_SHASH_OK; +} + +/** + * Indicates if any endpoint from the input endpoint list is already connected. + * @param endpoint_list the endpoint list to check. + * @return true if at least one endpoint is already connected to, false + * otherwise. + */ +static bool +channel_endpoint_is_connected(as_endpoint_list* endpoint_list) +{ + CHANNEL_LOCK(); + // Linear search. This will in practice not be a very frequent operation. + as_hb_channel_endpoint_reduce_udata udata; + memset(&udata, 0, sizeof(udata)); + udata.endpoint_list = endpoint_list; + + cf_shash_reduce(g_hb.channel_state.socket_to_channel, + channel_endpoint_search_reduce, &udata); + + CHANNEL_UNLOCK(); + return udata.found; +} + +/** + * Read a message from the multicast socket. + * + * @param socket the multicast socket to read from. + * @param msg the message to read into. + * + * @return the status the read operation. 
+ */ +static as_hb_channel_msg_read_status +channel_multicast_msg_read(cf_socket* socket, msg* msg) +{ + CHANNEL_LOCK(); + + as_hb_channel_msg_read_status rv = AS_HB_CHANNEL_MSG_READ_UNDEF; + + int buffer_len = MAX(hb_mtu(), STACK_ALLOC_LIMIT); + uint8_t* buffer = MSG_BUFF_ALLOC(buffer_len); + + if (!buffer) { + WARNING( + "error allocating space for multicast recv buffer of size %d on fd %d", + buffer_len, CSFD(socket)); + goto Exit; + } + + cf_sock_addr from; + + int num_rcvd = cf_socket_recv_from(socket, buffer, buffer_len, 0, &from); + + if (num_rcvd <= 0) { + DEBUG("multicast packed read failed on fd %d", CSFD(socket)); + rv = AS_HB_CHANNEL_MSG_CHANNEL_FAIL; + goto Exit; + } + + rv = channel_message_parse(msg, buffer, num_rcvd); + if (rv != AS_HB_CHANNEL_MSG_READ_SUCCESS) { + goto Exit; + } + + rv = AS_HB_CHANNEL_MSG_READ_SUCCESS; + +Exit: + MSG_BUFF_FREE(buffer, buffer_len); + + CHANNEL_UNLOCK(); + return rv; +} + +/** + * Read a message from the a tcp mesh socket. + * + * @param socket the tcp socket to read from. + * @param msg the message to read into. + * + * @return status of the read operation. 
+ */ +static as_hb_channel_msg_read_status +channel_mesh_msg_read(cf_socket* socket, msg* msg) +{ + CHANNEL_LOCK(); + + uint32_t buffer_len = 0; + uint8_t* buffer = NULL; + + as_hb_channel_msg_read_status rv = AS_HB_CHANNEL_MSG_READ_UNDEF; + uint8_t len_buff[MSG_WIRE_LENGTH_SIZE]; + + if (cf_socket_recv_all(socket, len_buff, MSG_WIRE_LENGTH_SIZE, 0, + MESH_RW_TIMEOUT) < 0) { + WARNING("mesh size recv failed fd %d : %s", CSFD(socket), + cf_strerror(errno)); + rv = AS_HB_CHANNEL_MSG_CHANNEL_FAIL; + goto Exit; + } + + buffer_len = ntohl(*((uint32_t*)len_buff)) + 6; + + buffer = MSG_BUFF_ALLOC(buffer_len); + + if (!buffer) { + WARNING( + "error allocating space for mesh recv buffer of size %d on fd %d", + buffer_len, CSFD(socket)); + goto Exit; + } + + memcpy(buffer, len_buff, MSG_WIRE_LENGTH_SIZE); + + if (cf_socket_recv_all(socket, buffer + MSG_WIRE_LENGTH_SIZE, + buffer_len - MSG_WIRE_LENGTH_SIZE, 0, MESH_RW_TIMEOUT) < 0) { + DETAIL("mesh recv failed fd %d : %s", CSFD(socket), cf_strerror(errno)); + rv = AS_HB_CHANNEL_MSG_CHANNEL_FAIL; + goto Exit; + } + + DETAIL("mesh recv success fd %d message size %d", CSFD(socket), buffer_len); + + rv = channel_message_parse(msg, buffer, buffer_len); + +Exit: + MSG_BUFF_FREE(buffer, buffer_len); + + CHANNEL_UNLOCK(); + return rv; +} + +/** + * Associate a socket with a nodeid and notify listeners about a node being + * connected, effective only for mesh channels. + * + * For multicast channels this function is a no-op. The reason being additional + * machinery would be required to clean up the node to channel mapping on node + * expiry. + * + * @param socket the socket. + * @param channel the channel to associate. + * @param nodeid the nodeid associated with this socket. + */ +static void +channel_node_attach(cf_socket* socket, as_hb_channel* channel, cf_node nodeid) +{ + // For now node to socket mapping is not maintained for multicast channels. 
+ if (channel->is_multicast) { + return; + } + + CHANNEL_LOCK(); + + // Update the node information for the channel. + // This is the first time this node has a connection. Record the mapping. + cf_shash_put(g_hb.channel_state.nodeid_to_socket, &nodeid, &socket); + + channel->nodeid = nodeid; + cf_shash_put(g_hb.channel_state.socket_to_channel, &socket, channel); + + DEBUG("attached fd %d to node %" PRIx64, CSFD(socket), nodeid); + + CHANNEL_UNLOCK(); + + // Publish an event to let know that a new node has a channel now. + as_hb_channel_event node_connected_event; + channel_event_init(&node_connected_event); + node_connected_event.nodeid = nodeid; + node_connected_event.type = AS_HB_CHANNEL_NODE_CONNECTED; + channel_event_queue(&node_connected_event); +} + +/** + * Indicates if a channel should be allowed to continue to win and live because + * of a winning grace period. + */ +static bool +channel_socket_should_live(cf_socket* socket, as_hb_channel* channel) +{ + if (channel->resolution_win_ts > 0 + && channel->resolution_win_ts + channel_win_grace_ms() + > cf_getms()) { + // Losing socket was a previous winner. Allow it time to do some work + // before knocking it off. + INFO("giving %d unresolved fd some grace time", CSFD(socket)); + return true; + } + return false; +} + +/** + * Selects one out give two sockets connected to same remote node. The algorithm + * is deterministic and ensures the remote node also chooses a socket that drops + * the same connection. + * + * @param socket1 one of the sockets + * @param socket2 one of the sockets + * @return resolved socket on success, NULL if resolution fails. + */ +static cf_socket* +channel_socket_resolve(cf_socket* socket1, cf_socket* socket2) +{ + cf_socket* rv = NULL; + CHANNEL_LOCK(); + + DEBUG("resolving between fd %d and %d", CSFD(socket1), CSFD(socket2)); + + as_hb_channel channel1; + if (channel_get_channel(socket1, &channel1) < 0) { + // Should not happen in practice. 
+ WARNING("resolving fd %d without channel", CSFD(socket1)); + rv = socket2; + goto Exit; + } + + as_hb_channel channel2; + if (channel_get_channel(socket2, &channel2) < 0) { + // Should not happen in practice. + WARNING("resolving fd %d without channel", CSFD(socket2)); + rv = socket1; + goto Exit; + } + + if (channel_socket_should_live(socket1, &channel1)) { + rv = socket1; + goto Exit; + } + + if (channel_socket_should_live(socket2, &channel2)) { + rv = socket2; + goto Exit; + } + + cf_node remote_nodeid = + channel1.nodeid != 0 ? channel1.nodeid : channel2.nodeid; + + if (remote_nodeid == 0) { + // Should not happen in practice. + WARNING("remote node id unknown for fds %d and %d", CSFD(socket1), + CSFD(socket2)); + rv = NULL; + goto Exit; + } + + // Choose the socket with the highest acceptor nodeid. + cf_node acceptor_nodeid1 = + channel1.is_inbound ? config_self_nodeid_get() : remote_nodeid; + cf_node acceptor_nodeid2 = + channel2.is_inbound ? config_self_nodeid_get() : remote_nodeid; + + as_hb_channel* winner_channel = NULL; + cf_socket* winner_socket = NULL; + if (acceptor_nodeid1 > acceptor_nodeid2) { + winner_channel = &channel1; + winner_socket = socket1; + } + else if (acceptor_nodeid1 < acceptor_nodeid2) { + winner_channel = &channel2; + winner_socket = socket2; + } + else { + // Both connections have the same acceptor. Should not happen in + // practice. Despair and report resolution failure. + INFO( + "found redundant connections to same node, fds %d %d - choosing at random", + CSFD(socket1), CSFD(socket2)); + + if (cf_getms() % 2 == 0) { + winner_channel = &channel1; + winner_socket = socket1; + } + else { + winner_channel = &channel2; + winner_socket = socket2; + } + } + + cf_clock now = cf_getms(); + if (winner_channel->resolution_win_ts == 0) { + winner_channel->resolution_win_ts = now; + // Update the winning count of the winning channel in the channel data + // structures. 
+ cf_shash_put(g_hb.channel_state.socket_to_channel, &winner_socket, + winner_channel); + } + + if (winner_channel->resolution_win_ts > now + channel_win_grace_ms()) { + // The winner has been winning a lot, most likely the other side has us + // with a seed address different from our published address. + // + // Break the cycle here and choose the loosing channel as the winner. + INFO("breaking socket resolve loop dropping winning fd %d", + CSFD(winner_socket)); + winner_channel = (winner_channel == &channel1) ? &channel2 : &channel1; + winner_socket = (socket1 == winner_socket) ? socket2 : socket1; + } + + rv = winner_socket; + +Exit: + CHANNEL_UNLOCK(); + return rv; +} + +/** + * Basic sanity check for a message. + * @param msg_event the message event. + * @return 0 if the message passes basic sanity tests. -1 on failure. + */ +static int +channel_msg_sanity_check(as_hb_channel_event* msg_event) +{ + msg* msg = msg_event->msg; + uint32_t id = 0; + + as_hb_msg_type type = 0; + cf_node src_nodeid = 0; + + int rv = 0; + + if (msg_nodeid_get(msg, &src_nodeid) != 0) { + TICKER_WARNING("received message without a source node"); + rv = -1; + } + + // Validate the fact that we have a valid source nodeid. + if (src_nodeid == 0) { + // Event nodeid is zero. Not a valid source nodeid. This will happen in + // compatibility mode if the info request from a new node arrives before + // the pulse message. Can be ignored. + TICKER_WARNING("received a message from node with unknown nodeid"); + rv = -1; + } + + if (msg_id_get(msg, &id) != 0) { + TICKER_WARNING( + "received message without heartbeat protocol identifier from node %" PRIx64, + src_nodeid); + rv = -1; + } + else { + DETAIL( + "received message with heartbeat protocol identifier %d from node %" PRIx64, + id, src_nodeid); + + // Ignore the message if the protocol of the incoming message does not + // match. 
+ if (id != hb_protocol_identifier_get()) { + TICKER_WARNING( + "received message with different heartbeat protocol identifier from node %" PRIx64, + src_nodeid); + rv = -1; + } + } + + if (msg_type_get(msg, &type) != 0) { + TICKER_WARNING( + "received message without message type from node %" PRIx64, + src_nodeid); + rv = -1; + } + + as_endpoint_list* endpoint_list; + if (hb_is_mesh()) { + // Check only applies to v3 mesh. + // v3 multicast protocol does not advertise endpoint list. + if (msg_endpoint_list_get(msg, &endpoint_list) != 0 + || endpoint_list->n_endpoints <= 0) { + TICKER_WARNING( + "received message without address/port from node %" PRIx64, + src_nodeid); + rv = -1; + } + } + + as_hlc_timestamp send_ts; + if (msg_send_hlc_ts_get(msg, &send_ts) != 0) { + TICKER_WARNING("received message without HLC time from node %" PRIx64, + src_nodeid); + rv = -1; + } + + if (type == AS_HB_MSG_TYPE_PULSE) { + char* remote_cluster_name = NULL; + if (msg_cluster_name_get(msg, &remote_cluster_name) != 0) { + remote_cluster_name = ""; + } + + if (!as_config_cluster_name_matches(remote_cluster_name)) { + // Generate cluster-name mismatch event. + as_hb_channel_event mismatch_event; + channel_event_init(&mismatch_event); + + // Notify hb about cluster-name mismatch. + mismatch_event.type = AS_HB_CHANNEL_CLUSTER_NAME_MISMATCH; + mismatch_event.nodeid = src_nodeid; + mismatch_event.msg = NULL; + memcpy(&mismatch_event.msg_hlc_ts, &msg_event->msg_hlc_ts, + sizeof(msg_event->msg_hlc_ts)); + + channel_event_queue(&mismatch_event); + + TICKER_WARNING("ignoring message from %"PRIX64" with different cluster name(%s)", + src_nodeid, remote_cluster_name[0] == '\0' ? "null" : remote_cluster_name ); + rv = -1; + } + } + + DETAIL("received message of type %d from node %" PRIx64, type, src_nodeid); + + return rv; +} + +/** + * Process incoming message to possibly update channel state. + * + * @param socket the socket on which the message is received. 
+ * @param event the message wrapped around in a channel event. + * @return 0 if the message can be further processed, -1 if the message should + * be discarded. + */ +static int +channel_msg_event_process(cf_socket* socket, as_hb_channel_event* event) +{ + // Basic sanity check for the inbound message. + if (channel_msg_sanity_check(event) != 0) { + DETAIL("sanity check failed for message on fd %d", CSFD(socket)); + return -1; + } + + int rv = -1; + CHANNEL_LOCK(); + + as_hb_channel channel; + if (channel_get_channel(socket, &channel) < 0) { + // This is a bug and should not happen. Be paranoid and try fixing it ? + WARNING("received a message on an unregistered fd %d - closing the fd", + CSFD(socket)); + channel_socket_close_queue(socket, false, true); + rv = -1; + goto Exit; + } + + if (channel.is_multicast) { + rv = 0; + goto Exit; + } + + cf_node nodeid = event->nodeid; + + if (channel.nodeid != 0 && channel.nodeid != nodeid) { + // The event nodeid does not match previously know event id. Something + // seriously wrong here. + WARNING("received a message from node with incorrect nodeid - expected %" PRIx64 " received %" PRIx64 "on fd %d", + channel.nodeid, nodeid, CSFD(socket)); + rv = -1; + goto Exit; + } + + // Update the last received time for this node + channel.last_received = cf_getms(); + + cf_shash_put(g_hb.channel_state.socket_to_channel, &socket, &channel); + + cf_socket* existing_socket; + int get_result = cf_shash_get(g_hb.channel_state.nodeid_to_socket, &nodeid, + &existing_socket); + + if (get_result == CF_SHASH_ERR_NOT_FOUND) { + // Associate this socket with the node. + channel_node_attach(socket, &channel, nodeid); + } + else if (existing_socket != socket) { + // Somehow the other node and this node discovered each other together + // both connected via two tcp connections. Choose one and close the + // other. 
+ cf_socket* resolved = channel_socket_resolve(socket, existing_socket); + + if (!resolved) { + DEBUG( + "resolving between fd %d and %d failed - closing both connections", + CSFD(socket), CSFD(existing_socket)); + + // Resolution failed. Should not happen but there is a window where + // the same node initiated two connections. + // Close both connections and try again. + channel_socket_close_queue(socket, false, true); + channel_socket_close_queue(existing_socket, false, true); + + // Nothing wrong with the message. Let it through. + rv = 0; + goto Exit; + } + + DEBUG("resolved fd %d between redundant fd %d and %d for node %" PRIx64, + CSFD(resolved), CSFD(socket), CSFD(existing_socket), nodeid); + + if (resolved == existing_socket) { + // The node to socket mapping is correct, just close this socket and + // this node will still be connected to the remote node. Do not + // raise any event for this closure. + channel_socket_close_queue(socket, false, false); + } + else { + // We need to close the existing socket. Disable channel events + // because we make the node appear to be not connected. Do not raise + // any event for this closure. + channel_socket_close_queue(existing_socket, false, false); + // Associate this socket with the node. + channel_node_attach(socket, &channel, nodeid); + } + } + + rv = 0; + +Exit: + CHANNEL_UNLOCK(); + return rv; +} + +/** + * Read a message from a socket that has data. + * @param socket the socket having data to be read. + */ +static void +channel_msg_read(cf_socket* socket) +{ + CHANNEL_LOCK(); + + as_hb_channel_msg_read_status status; + as_hb_channel channel; + + bool free_msg = true; + + msg* msg = hb_msg_get(); + + if (channel_get_channel(socket, &channel) != 0) { + // Would happen if the channel was closed in the same epoll loop. 
+ DEBUG("error the channel does not exist for fd %d", CSFD(socket)); + goto Exit; + } + + if (channel.is_multicast) { + status = channel_multicast_msg_read(socket, msg); + } + else { + status = channel_mesh_msg_read(socket, msg); + } + + switch (status) { + case AS_HB_CHANNEL_MSG_READ_SUCCESS: { + break; + } + + case AS_HB_CHANNEL_MSG_PARSE_FAIL: { + TICKER_WARNING("unable to parse heartbeat message on fd %d", + CSFD(socket)); + goto Exit; + } + + case AS_HB_CHANNEL_MSG_CHANNEL_FAIL: // Falling through + default: { + DEBUG("could not read message from fd %d", CSFD(socket)); + if (!channel.is_multicast) { + // Shut down only mesh socket. + channel_socket_shutdown(socket); + } + goto Exit; + } + } + + as_hb_channel_event event; + channel_event_init(&event); + + if (msg_get_uint64(msg, AS_HB_MSG_NODE, &event.nodeid) < 0) { + // Node id missing from the message. Assume this message to be corrupt. + TICKER_WARNING("message with invalid nodeid received on fd %d", + CSFD(socket)); + goto Exit; + } + + event.msg = msg; + event.type = AS_HB_CHANNEL_MSG_RECEIVED; + + // Update hlc and store update message timestamp for the event. + as_hlc_timestamp send_ts = 0; + msg_send_hlc_ts_get(msg, &send_ts); + as_hlc_timestamp_update(event.nodeid, send_ts, &event.msg_hlc_ts); + + // Process received message to update channel state. + if (channel_msg_event_process(socket, &event) == 0) { + // The message needs to be delivered to the listeners. Prevent a free. + free_msg = false; + channel_event_queue(&event); + } + +Exit: + CHANNEL_UNLOCK(); + + // release the message. + if (free_msg) { + hb_msg_return(msg); + } +} + +/** + * Reduce function to remove faulty channels / nodes. Shutdown associated socket + * to have channel tender cleanup. 
+ */ +static int +channel_channels_tend_reduce(const void* key, void* data, void* udata) +{ + cf_socket** socket = (cf_socket**)key; + as_hb_channel* channel = (as_hb_channel*)data; + + DETAIL("tending channel fd %d for node %" PRIx64 " - last received %" PRIu64 " endpoint %s", + CSFD(*socket), channel->nodeid, channel->last_received, + cf_sock_addr_print(&channel->endpoint_addr)); + + if (channel->last_received + CHANNEL_NODE_READ_IDLE_TIMEOUT() + < cf_getms()) { + // Shutdown associated socket if it is not a multicast socket. + if (!channel->is_multicast) { + DEBUG("channel shutting down idle fd %d for node %" PRIx64 " - last received %" PRIu64 " endpoint %s", + CSFD(*socket), channel->nodeid, channel->last_received, + cf_sock_addr_print(&channel->endpoint_addr)); + channel_socket_shutdown(*socket); + } + } + + return CF_SHASH_OK; +} + +/** + * Tend channel specific node information to remove channels that are faulty (or + * TODO: attached to misbehaving nodes). + */ +static void +channel_channels_idle_check() +{ + CHANNEL_LOCK(); + + cf_clock now = cf_getms(); + if (g_hb.channel_state.last_channel_idle_check + CHANNEL_IDLE_CHECK_PERIOD + <= now) { + cf_shash_reduce(g_hb.channel_state.socket_to_channel, + channel_channels_tend_reduce, NULL); + g_hb.channel_state.last_channel_idle_check = now; + } + + CHANNEL_UNLOCK(); +} + +/** + * Socket tending thread. Manages heartbeat receive as well. + */ +void* +channel_tender(void* arg) +{ + DETAIL("channel tender started"); + + while (channel_is_running()) { + cf_poll_event events[POLL_SZ]; + int32_t nevents = cf_poll_wait(g_hb.channel_state.poll, events, POLL_SZ, + AS_HB_TX_INTERVAL_MS_MIN); + + DETAIL("tending channel"); + + for (int32_t i = 0; i < nevents; i++) { + cf_socket* socket = events[i].data; + if (channel_cf_sockets_contains( + g_hb.channel_state.listening_sockets, socket) + && hb_is_mesh()) { + // Accept a new connection. 
+ channel_accept_connection(socket); + } + else if (events[i].events & (EPOLLRDHUP | EPOLLERR | EPOLLHUP)) { + channel_socket_close_queue(socket, true, true); + } + else if (events[i].events & EPOLLIN) { + // Read a message for the socket that is ready. + channel_msg_read(socket); + } + } + + // Tend channels to discard stale channels. + channel_channels_idle_check(); + + // Close queued up socket. + channel_socket_close_pending(); + + // Publish pending events. Should be outside channel lock. + channel_event_publish_pending(); + + DETAIL("done tending channel"); + } + + DETAIL("channel tender shut down"); + return NULL; +} + +/* + * ---------------------------------------------------------------------------- + * Channel public API + * ---------------------------------------------------------------------------- + */ + +/** + * Filter out endpoints not matching this node's capabilities. + */ +static bool +channel_mesh_endpoint_filter(const as_endpoint* endpoint, void* udata) +{ + if ((cf_ip_addr_legacy_only()) + && endpoint->addr_type == AS_ENDPOINT_ADDR_TYPE_IPv6) { + return false; + } + + // If we don't offer TLS, then we won't connect via TLS, either. + if (g_config.hb_tls_serv_spec.bind_port == 0 + && as_endpoint_capability_is_supported(endpoint, + AS_ENDPOINT_TLS_MASK)) { + return false; + } + + return true; +} + +/** + * Try and connect to a set of endpoint_lists. 
+ */ +static void +channel_mesh_channel_establish(as_endpoint_list** endpoint_lists, + int endpoint_list_count) +{ + for (int i = 0; i < endpoint_list_count; i++) { + char endpoint_list_str[ENDPOINT_LIST_STR_SIZE]; + as_endpoint_list_to_string(endpoint_lists[i], endpoint_list_str, + sizeof(endpoint_list_str)); + + if (channel_endpoint_is_connected(endpoint_lists[i])) { + DEBUG( + "duplicate endpoint connect request - ignoring endpoint list {%s}", + endpoint_list_str); + continue; + } + + DEBUG("attempting to connect mesh host at {%s}", endpoint_list_str); + + cf_socket* sock = (cf_socket*)cf_malloc(sizeof(cf_socket)); + + const as_endpoint* connected_endpoint = as_endpoint_connect_any( + endpoint_lists[i], channel_mesh_endpoint_filter, NULL, + CONNECT_TIMEOUT(), sock); + + if (connected_endpoint) { + cf_atomic_int_incr(&g_stats.heartbeat_connections_opened); + + cf_sock_addr endpoint_addr; + memset(&endpoint_addr, 0, sizeof(endpoint_addr)); + cf_ip_addr_set_any(&endpoint_addr.addr); + if (as_endpoint_to_sock_addr(connected_endpoint, &endpoint_addr) + != 0) { + // Should never happen in practice. + WARNING("error converting endpoint to socket address"); + channel_socket_destroy(sock); + sock = NULL; + + cf_atomic_int_incr(&g_stats.heartbeat_connections_closed); + continue; + } + + if (as_endpoint_capability_is_supported(connected_endpoint, + AS_ENDPOINT_TLS_MASK)) { + tls_socket_prepare_client(g_config.hb_config.tls, sock); + + if (tls_socket_connect_block(sock) != 1) { + WARNING("heartbeat TLS client handshake with {%s} failed", + endpoint_list_str); + channel_socket_destroy(sock); + sock = NULL; + + cf_atomic_int_incr(&g_stats.heartbeat_connections_closed); + return; + } + } + + channel_socket_register(sock, false, false, &endpoint_addr); + } + else { + TICKER_WARNING("could not create heartbeat connection to node {%s}", + endpoint_list_str); + if (sock) { + cf_free(sock); + sock = NULL; + } + } + } +} + +/** + * Disconnect a node from the channel list. 
+ * @param nodeid the nodeid of the node whose channel should be disconnected. + * @return 0 if the node had a channel and was disconnected. -1 otherwise. + */ +static int +channel_node_disconnect(cf_node nodeid) +{ + int rv = -1; + + CHANNEL_LOCK(); + + cf_socket* socket; + if (channel_socket_get(nodeid, &socket) != 0) { + // not found + rv = -1; + goto Exit; + } + + DEBUG("disconnecting the channel attached to node %" PRIx64, nodeid); + + channel_socket_close_queue(socket, false, true); + + rv = 0; + +Exit: + CHANNEL_UNLOCK(); + + return rv; +} + +/** + * Register mesh listening sockets. + */ +static void +channel_mesh_listening_socks_register(cf_sockets* listening_sockets) +{ + CHANNEL_LOCK(); + g_hb.channel_state.listening_sockets = listening_sockets; + + cf_poll_add_sockets(g_hb.channel_state.poll, + g_hb.channel_state.listening_sockets, + EPOLLIN | EPOLLERR | EPOLLHUP); + cf_socket_show_server(AS_HB, "mesh heartbeat", + g_hb.channel_state.listening_sockets); + + // We do not need a separate channel to cover this socket because IO will + // not happen on these sockets. + CHANNEL_UNLOCK(); +} + +/** + * Deregister mesh listening socket from epoll event. + * @param socket the listening socket socket. + */ +static void +channel_mesh_listening_socks_deregister(cf_sockets* listening_sockets) +{ + CHANNEL_LOCK(); + cf_poll_delete_sockets(g_hb.channel_state.poll, listening_sockets); + CHANNEL_UNLOCK(); +} + +/** + * Register the multicast listening socket. + * @param socket the listening socket. + * @param endpoint the endpoint on which multicast io happens. + */ +static void +channel_multicast_listening_socks_register(cf_sockets* listening_sockets) +{ + CHANNEL_LOCK(); + g_hb.channel_state.listening_sockets = listening_sockets; + + // Create a new multicast channel for each multicast socket. 
+ for (uint32_t i = 0; + i < g_hb.mode_state.multicast_state.listening_sockets.n_socks; + ++i) { + channel_socket_register(&g_hb.channel_state.listening_sockets->socks[i], + true, false, NULL); + } + + cf_socket_mcast_show(AS_HB, "multicast heartbeat", + g_hb.channel_state.listening_sockets); + CHANNEL_UNLOCK(); +} + +/** + * Deregister multicast listening socket from epoll event. + * @param socket the listening socket socket. + */ +static void +channel_multicast_listening_socks_deregister(cf_sockets* listening_sockets) +{ + CHANNEL_LOCK(); + cf_poll_delete_sockets(g_hb.channel_state.poll, listening_sockets); + CHANNEL_UNLOCK(); +} + +/** + * Initialize the channel sub module. + */ +static void +channel_init() +{ + CHANNEL_LOCK(); + + // Disable events till initialization is complete. + channel_events_enabled_set(false); + + // Initialize unpublished event queue. + cf_queue_init(&g_hb.channel_state.events_queue, sizeof(as_hb_channel_event), + AS_HB_CLUSTER_MAX_SIZE_SOFT, true); + + // Initialize sockets to close queue. + cf_queue_init(&g_hb.channel_state.socket_close_queue, + sizeof(as_hb_channel_socket_close_entry), + AS_HB_CLUSTER_MAX_SIZE_SOFT, true); + + // Initialize the nodeid to socket hash. + g_hb.channel_state.nodeid_to_socket = cf_shash_create(cf_nodeid_shash_fn, + sizeof(cf_node), sizeof(cf_socket*), AS_HB_CLUSTER_MAX_SIZE_SOFT, + 0); + + // Initialize the socket to channel state hash. + g_hb.channel_state.socket_to_channel = cf_shash_create(hb_socket_hash_fn, + sizeof(cf_socket*), sizeof(as_hb_channel), + AS_HB_CLUSTER_MAX_SIZE_SOFT, 0); + + g_hb.channel_state.status = AS_HB_STATUS_STOPPED; + + CHANNEL_UNLOCK(); +} + +/** + * Start channel sub module. Kicks off the channel tending thread. + */ +static void +channel_start() +{ + CHANNEL_LOCK(); + + if (channel_is_running()) { + WARNING("heartbeat channel already started"); + goto Exit; + } + + // create the epoll socket. 
+ cf_poll_create(&g_hb.channel_state.poll); + + DEBUG("created epoll fd %d", CEFD(g_hb.channel_state.poll)); + + // Disable events till initialization is complete. + channel_events_enabled_set(false); + + // Data structures have been initialized. + g_hb.channel_state.status = AS_HB_STATUS_RUNNING; + + // Initialization complete enable events. + channel_events_enabled_set(true); + + // Start the channel tender. + if (pthread_create(&g_hb.channel_state.channel_tender_tid, 0, + channel_tender, &g_hb) != 0) { + CRASH("could not create channel tender thread: %s", cf_strerror(errno)); + } + +Exit: + CHANNEL_UNLOCK(); +} + +/** + * Get all sockets. + */ +static int +channel_sockets_get_reduce(const void* key, void* data, void* udata) +{ + cf_vector* sockets = (cf_vector*)udata; + cf_vector_append(sockets, key); + return CF_SHASH_OK; +} + +/** + * Stop the channel sub module called on hb_stop. + */ +static void +channel_stop() +{ + if (!channel_is_running()) { + WARNING("heartbeat channel already stopped"); + return; + } + + DEBUG("stopping the channel"); + + // Unguarded state change but this should be OK. + g_hb.channel_state.status = AS_HB_STATUS_SHUTTING_DOWN; + + // Wait for the channel tender thread to finish. + pthread_join(g_hb.channel_state.channel_tender_tid, NULL); + + CHANNEL_LOCK(); + + cf_vector sockets; + cf_socket buff[cf_shash_get_size(g_hb.channel_state.socket_to_channel)]; + cf_vector_init_smalloc(&sockets, sizeof(cf_socket*), (uint8_t*)buff, + sizeof(buff), VECTOR_FLAG_INITZERO); + + cf_shash_reduce(g_hb.channel_state.socket_to_channel, + channel_sockets_get_reduce, &sockets); + + channel_sockets_close(&sockets); + + // Disable events. + channel_events_enabled_set(false); + + cf_vector_destroy(&sockets); + + // Close epoll socket. + cf_poll_destroy(g_hb.channel_state.poll); + EFD(g_hb.channel_state.poll) = -1; + + // Disable the channel thread. 
+ g_hb.channel_state.status = AS_HB_STATUS_STOPPED; + + DEBUG("channel Stopped"); + + CHANNEL_UNLOCK(); +} + +/** + * Send heartbeat protocol message retries in case of EAGAIN and EWOULDBLOCK + * @param socket the socket to send the buffer over. + * @param buff the data buffer. + * @param buffer_length the number of bytes in the buffer to send. + * @return 0 on successful send -1 on failure + */ +static int +channel_mesh_msg_send(cf_socket* socket, uint8_t* buff, size_t buffer_length) +{ + CHANNEL_LOCK(); + int rv; + + if (cf_socket_send_all(socket, buff, buffer_length, 0, + MESH_RW_TIMEOUT) < 0) { + as_hb_channel channel; + if (channel_get_channel(socket, &channel) == 0) { + // Would happen if the channel was closed in the same epoll loop. + TICKER_WARNING("sending mesh message to %"PRIx64" on fd %d failed : %s", + channel.nodeid, CSFD(socket), cf_strerror(errno)); + } + else { + TICKER_WARNING("sending mesh message on fd %d failed : %s", + CSFD(socket), cf_strerror(errno)); + } + + channel_socket_shutdown(socket); + rv = -1; + } + else { + rv = 0; + } + + CHANNEL_UNLOCK(); + return rv; +} + +/** + * Send heartbeat protocol message retries in case of EAGAIN and EWOULDBLOCK + * @param socket the socket to send the buffer over. + * @param buff the data buffer. + * @param buffer_length the number of bytes in the buffer to send. 
+ * @return 0 on successful send -1 on failure + */ +static int +channel_multicast_msg_send(cf_socket* socket, uint8_t* buff, + size_t buffer_length) +{ + CHANNEL_LOCK(); + int rv = 0; + DETAIL("sending udp heartbeat to fd %d: msg size %zu", CSFD(socket), + buffer_length); + + int mtu = hb_mtu(); + if (buffer_length > mtu) { + TICKER_WARNING("mtu breach, sending udp heartbeat to fd %d: mtu %d", + CSFD(socket), mtu); + } + + cf_msock_cfg* socket_cfg = (cf_msock_cfg*)(socket->cfg); + cf_sock_addr dest; + dest.port = socket_cfg->port; + cf_ip_addr_copy(&socket_cfg->addr, &dest.addr); + + if (cf_socket_send_to(socket, buff, buffer_length, 0, &dest) < 0) { + TICKER_WARNING("multicast message send failed on fd %d %s", + CSFD(socket), cf_strerror(errno)); + rv = -1; + } + CHANNEL_UNLOCK(); + return rv; +} + +/** + * Indicates if this msg requires compression. + */ +static bool +channel_msg_is_compression_required(msg* msg, int wire_size, int mtu) +{ + return wire_size > msg_compression_threshold(mtu); +} + +/** + * Estimate the size of the buffer required to fill out the serialized message. + * @param msg the input message. + * @param mtu the underlying network mtu. + * @return the size of the buffer required. + */ +static int +channel_msg_buffer_size_get(int wire_size, int mtu) +{ + return round_up_pow2(MAX(wire_size, compressBound(wire_size))); +} + +/** + * Fills the buffer with the serialized message. + * @param original_msg the original message to serialize. + * @param wire_size the message wire size. + * @param mtu the underlying network mtu. + * @param buffer the destination buffer. + * @param buffer_len the buffer length. + * + * @return length of the serialized message. + */ +static size_t +channel_msg_buffer_fill(msg* original_msg, int wire_size, int mtu, + uint8_t* buffer, size_t buffer_len) +{ + // This is output by msg_to_wire. Using a separate variable so that we do + // not lose the actual buffer length needed for compression later on. 
+ size_t msg_size = msg_to_wire(original_msg, buffer); + + if (channel_msg_is_compression_required(original_msg, msg_size, mtu)) { + // Compression is required. + const size_t compressed_buffer_len = buffer_len; + uint8_t* compressed_buffer = MSG_BUFF_ALLOC_OR_DIE( + compressed_buffer_len, + "error allocating memory size %zu for compressing message", + compressed_buffer_len); + + size_t compressed_msg_size = compressed_buffer_len; + int compress_rv = compress2(compressed_buffer, &compressed_msg_size, + buffer, wire_size, Z_BEST_COMPRESSION); + + if (compress_rv == Z_BUF_ERROR) { + // Compression result going to be larger than original input buffer. + // Skip compression and try to send the message as is. + DETAIL( + "skipping compression - compressed size larger than input size %zu", + msg_size); + } + else { + msg* temp_msg = hb_msg_get(); + + msg_set_buf(temp_msg, AS_HB_MSG_COMPRESSED_PAYLOAD, + compressed_buffer, compressed_msg_size, MSG_SET_COPY); + msg_size = msg_to_wire(temp_msg, buffer); + + hb_msg_return(temp_msg); + } + + MSG_BUFF_FREE(compressed_buffer, compressed_buffer_len); + + } + + return msg_size; +} + +/** + * Send a message to a destination node. + */ +static int +channel_msg_unicast(cf_node dest, msg* msg) +{ + size_t buffer_len = 0; + uint8_t* buffer = NULL; + if (!hb_is_mesh()) { + // Can't send a unicast message in the multicast mode. + WARNING("ignoring sending unicast message in multicast mode"); + return -1; + } + + CHANNEL_LOCK(); + + int rv = -1; + cf_socket* connected_socket; + + if (channel_socket_get(dest, &connected_socket) != 0) { + DEBUG("failing message send to disconnected node %" PRIx64, dest); + rv = -1; + goto Exit; + } + + // Read the message to a buffer. 
+ int mtu = hb_mtu(); + int wire_size = msg_get_wire_size(msg); + buffer_len = channel_msg_buffer_size_get(wire_size, mtu); + buffer = + MSG_BUFF_ALLOC_OR_DIE(buffer_len, + "error allocating memory size %zu for sending message to node %" PRIx64, + buffer_len, dest); + + size_t msg_size = channel_msg_buffer_fill(msg, wire_size, mtu, buffer, + buffer_len); + + // Send over the buffer. + rv = channel_mesh_msg_send(connected_socket, buffer, msg_size); + +Exit: + MSG_BUFF_FREE(buffer, buffer_len); + CHANNEL_UNLOCK(); + return rv; +} + +/** + * Shash reduce function to walk over the socket to channel hash and broadcast + * the message in udata. + */ +static int +channel_msg_broadcast_reduce(const void* key, void* data, void* udata) +{ + CHANNEL_LOCK(); + cf_socket** socket = (cf_socket**)key; + as_hb_channel* channel = (as_hb_channel*)data; + as_hb_channel_buffer_udata* buffer_udata = + (as_hb_channel_buffer_udata*)udata; + + if (!channel->is_multicast) { + DETAIL( + "broadcasting message of length %zu on channel %d assigned to node %" PRIx64, + buffer_udata->buffer_len, CSFD(*socket), channel->nodeid); + + channel_mesh_msg_send(*socket, buffer_udata->buffer, + buffer_udata->buffer_len); + } + else { + channel_multicast_msg_send(*socket, buffer_udata->buffer, + buffer_udata->buffer_len); + } + + CHANNEL_UNLOCK(); + + return CF_SHASH_OK; +} + +/** + * Broadcast a message over all channels. + */ +static int +channel_msg_broadcast(msg* msg) +{ + CHANNEL_LOCK(); + + int rv = -1; + + // Read the message to a buffer. + int mtu = hb_mtu(); + int wire_size = msg_get_wire_size(msg); + size_t buffer_len = channel_msg_buffer_size_get(wire_size, mtu); + uint8_t* buffer = MSG_BUFF_ALLOC_OR_DIE(buffer_len, + "error allocating memory size %zu for sending broadcast message", + buffer_len); + + as_hb_channel_buffer_udata udata; + udata.buffer = buffer; + + // Note this is the length of buffer to send. 
+ udata.buffer_len = channel_msg_buffer_fill(msg, wire_size, mtu, buffer, + buffer_len); + + cf_shash_reduce(g_hb.channel_state.socket_to_channel, + channel_msg_broadcast_reduce, &udata); + + MSG_BUFF_FREE(buffer, buffer_len); + CHANNEL_UNLOCK(); + return rv; +} + +/** + * Clear all channel state. + */ +static void +channel_clear() +{ + if (!channel_is_stopped()) { + WARNING("attempted channel clear without stopping the channel"); + return; + } + + CHANNEL_LOCK(); + + // Free the unpublished event queue. + cf_queue_delete_all(&g_hb.channel_state.events_queue); + + // Delete nodeid to socket hash. + cf_shash_reduce(g_hb.channel_state.nodeid_to_socket, hb_delete_all_reduce, + NULL); + + // Delete the socket_to_channel hash. + cf_shash_reduce(g_hb.channel_state.socket_to_channel, hb_delete_all_reduce, + NULL); + + DETAIL("cleared channel information"); + CHANNEL_UNLOCK(); +} + +/** + * Reduce function to dump channel node info to log file. + */ +static int +channel_dump_reduce(const void* key, void* data, void* udata) +{ + cf_socket** socket = (cf_socket**)key; + as_hb_channel* channel = (as_hb_channel*)data; + + INFO("\tHB Channel (%s): node-id %" PRIx64 " fd %d endpoint %s polarity %s last-received %" PRIu64, + channel->is_multicast ? "multicast" : "mesh", channel->nodeid, + CSFD(*socket), (cf_sock_addr_is_any(&channel->endpoint_addr)) + ? "unknown" + : cf_sock_addr_print(&channel->endpoint_addr), + channel->is_inbound ? "inbound" : "outbound", + channel->last_received); + + return CF_SHASH_OK; +} + +/** + * Dump channel state to logs. + * @param verbose enables / disables verbose logging. 
+ */ +static void +channel_dump(bool verbose) +{ + CHANNEL_LOCK(); + + INFO("HB Channel Count %d", + cf_shash_get_size(g_hb.channel_state.socket_to_channel)); + + if (verbose) { + cf_shash_reduce(g_hb.channel_state.socket_to_channel, + channel_dump_reduce, NULL); + } + + CHANNEL_UNLOCK(); +} + +/* + * ---------------------------------------------------------------------------- + * Mesh sub module. + * ---------------------------------------------------------------------------- + */ + +/** + * Is mesh running. + */ +static bool +mesh_is_running() +{ + MESH_LOCK(); + bool retval = + (g_hb.mode_state.mesh_state.status == AS_HB_STATUS_RUNNING) ? + true : false; + MESH_UNLOCK(); + return retval; +} + +/** + * Is mesh stopped. + */ +static bool +mesh_is_stopped() +{ + MESH_LOCK(); + bool retval = + (g_hb.mode_state.mesh_state.status == AS_HB_STATUS_STOPPED) ? + true : false; + MESH_UNLOCK(); + return retval; +} + +/** + * Refresh the mesh published endpoint list. + * @return 0 on successful list creation, -1 otherwise. + */ +static int +mesh_published_endpoint_list_refresh() +{ + int rv = -1; + MESH_LOCK(); + + // TODO: Add interface addresses change detection logic here as well. + if (g_hb.mode_state.mesh_state.published_endpoint_list != NULL + && g_hb.mode_state.mesh_state.published_endpoint_list_ipv4_only + == cf_ip_addr_legacy_only()) { + rv = 0; + goto Exit; + } + + // The global flag has changed, refresh the published address list. + if (g_hb.mode_state.mesh_state.published_endpoint_list) { + // Free the obsolete list. 
+ cf_free(g_hb.mode_state.mesh_state.published_endpoint_list); + } + + const cf_serv_cfg* bind_cfg = config_bind_cfg_get(); + cf_serv_cfg published_cfg; + + config_bind_serv_cfg_expand(bind_cfg, &published_cfg, + g_hb.mode_state.mesh_state.published_endpoint_list_ipv4_only); + + g_hb.mode_state.mesh_state.published_endpoint_list = + as_endpoint_list_from_serv_cfg(&published_cfg); + + if (!g_hb.mode_state.mesh_state.published_endpoint_list) { + CRASH("error initializing mesh published address list"); + } + + g_hb.mode_state.mesh_state.published_endpoint_list_ipv4_only = + cf_ip_addr_legacy_only(); + + rv = 0; + + char endpoint_list_str[ENDPOINT_LIST_STR_SIZE]; + as_endpoint_list_to_string( + g_hb.mode_state.mesh_state.published_endpoint_list, + endpoint_list_str, sizeof(endpoint_list_str)); + INFO("updated heartbeat published address list to {%s}", endpoint_list_str); + +Exit: + MESH_UNLOCK(); + return rv; +} + +/** + * Read the published endpoint list via a callback. The call back pattern is to + * prevent access to the published list outside the mesh lock. + * @param process_fn the list process function. The list passed to the process + * function can be NULL. + * @param udata passed as is to the process function. + */ +static void +mesh_published_endpoints_process(endpoint_list_process_fn process_fn, + void* udata) +{ + MESH_LOCK(); + + as_endpoint_list* rv = NULL; + if (mesh_published_endpoint_list_refresh()) { + WARNING("error creating mesh published endpoint list"); + rv = NULL; + } + else { + rv = g_hb.mode_state.mesh_state.published_endpoint_list; + } + + (process_fn)(rv, udata); + + MESH_UNLOCK(); +} + +/** + * Convert mesh status to a string. 
+ */ +static const char* +mesh_node_status_string(as_hb_mesh_node_status status) +{ + static char* status_str[] = { + "active", + "pending", + "inactive", + "endpoint-unknown" }; + + if (status > AS_HB_MESH_NODE_STATUS_SENTINEL) { + return "corrupted"; + } + return status_str[status]; +} + +/** + * Change the state of a mesh node. Note: memset the mesh_nodes to zero before + * calling state change for the first time. + */ +static void +mesh_seed_status_change(as_hb_mesh_seed* seed, + as_hb_mesh_node_status new_status) +{ + seed->status = new_status; + seed->last_status_updated = cf_getms(); +} + +/** + * Destroy a mesh seed node. + */ +static void +mesh_seed_destroy(as_hb_mesh_seed* seed) +{ + MESH_LOCK(); + if (seed->resolved_endpoint_list) { + cf_free(seed->resolved_endpoint_list); + seed->resolved_endpoint_list = NULL; + } + MESH_UNLOCK(); +} + +/** + * Fill the endpoint list for a mesh seed using the mesh seed hostname and port. + * returns the + * @param mesh_node the mesh node + * @return 0 on success. -1 if a valid endpoint list does not exist and it could + * not be generated. + */ +static int +mesh_seed_endpoint_list_fill(as_hb_mesh_seed* seed) +{ + if (seed->resolved_endpoint_list != NULL + && seed->resolved_endpoint_list->n_endpoints > 0) { + // A valid endpoint list already exists. For now we resolve only once. + return 0; + } + + cf_clock now = cf_getms(); + if (now + < seed->resolved_endpoint_list_ts + + MESH_SEED_RESOLVE_ATTEMPT_INTERVAL()) { + // We have just resolved this seed entry unsuccessfully. Don't try again + // for sometime. + return -1; + } + + uint32_t n_resolved_addresses = CF_SOCK_CFG_MAX; + cf_ip_addr resolved_addresses[n_resolved_addresses]; + + // Resolve and get all IPv4/IPv6 ip addresses. 
+ seed->resolved_endpoint_list_ts = now; + if (cf_ip_addr_from_string_multi(seed->seed_host_name, resolved_addresses, + &n_resolved_addresses) != 0 || n_resolved_addresses == 0) { + TICKER_WARNING("failed resolving mesh seed hostname %s", + seed->seed_host_name); + + // Hostname resolution failed. + return -1; + } + + // Convert resolved addresses to an endpoint list. + cf_serv_cfg temp_serv_cfg; + cf_serv_cfg_init(&temp_serv_cfg); + + cf_sock_cfg sock_cfg; + cf_sock_cfg_init(&sock_cfg, + seed->seed_tls ? + CF_SOCK_OWNER_HEARTBEAT_TLS : CF_SOCK_OWNER_HEARTBEAT); + sock_cfg.port = seed->seed_port; + + for (int i = 0; i < n_resolved_addresses; i++) { + cf_ip_addr_copy(&resolved_addresses[i], &sock_cfg.addr); + if (cf_serv_cfg_add_sock_cfg(&temp_serv_cfg, &sock_cfg)) { + CRASH("error initializing resolved address list"); + } + + DETAIL("resolved mesh node hostname %s to %s", seed->seed_host_name, + cf_ip_addr_print(&resolved_addresses[i])); + } + + seed->resolved_endpoint_list = as_endpoint_list_from_serv_cfg( + &temp_serv_cfg); + return seed->resolved_endpoint_list != NULL ? 0 : -1; +} + +/** + * Find a mesh seed in the seed list that has an overlapping endpoint and return + * an internal pointer. Assumes this function is called within mesh lock to + * prevent invalidating the returned index after function return. + * + * @param endpoint_list the endpoint list to find the endpoint by. + * @return index to matching seed entry if found, else -1 + */ +static int +mesh_seed_endpoint_list_overlapping_find_unsafe(as_endpoint_list* endpoint_list) +{ + MESH_LOCK(); + + int match_index = -1; + if (!endpoint_list) { + // Null / empty endpoint list. + goto Exit; + } + cf_vector* seeds = &g_hb.mode_state.mesh_state.seeds; + int element_count = cf_vector_size(seeds); + for (int i = 0; i < element_count; i++) { + as_hb_mesh_seed* seed = cf_vector_getp(seeds, i); + + // Ensure the seed hostname is resolved. 
+ mesh_seed_endpoint_list_fill(seed); + + if (as_endpoint_lists_are_overlapping(endpoint_list, + seed->resolved_endpoint_list, true)) { + match_index = i; + break; + } + } + +Exit: + MESH_UNLOCK(); + return match_index; +} + +/** + * Remove a seed entry from the seed list. + * Assumes this function is called within mesh lock to prevent invalidating the + * used index during a function call. + * @param seed_index the index of the seed element. + * @return 0 on success -1 on failure. + */ +static int +mesh_seed_delete_unsafe(int seed_index) +{ + int rv = -1; + MESH_LOCK(); + cf_vector* seeds = &g_hb.mode_state.mesh_state.seeds; + if (seed_index >= 0) { + as_hb_mesh_seed* seed = cf_vector_getp(seeds, seed_index); + mesh_seed_destroy(seed); + rv = cf_vector_delete(seeds, seed_index); + if (rv == 0) { + INFO("removed mesh seed host:%s port %d", seed->seed_host_name, + seed->seed_port); + } + } + MESH_UNLOCK(); + return rv; +} + +/** + * Find a mesh seed in the seed list with exactly matching hostname and port. + * Assumes this function is called within mesh lock to prevent invalidating the + * returned index after function return. + * + * @param host the seed hostname + * @param port the seed port + * @return index to matching seed entry if found, else -1 + */ +static int +mesh_seed_find_unsafe(char* host, int port) +{ + MESH_LOCK(); + + int match_index = -1; + cf_vector* seeds = &g_hb.mode_state.mesh_state.seeds; + int element_count = cf_vector_size(seeds); + for (int i = 0; i < element_count; i++) { + as_hb_mesh_seed* seed = cf_vector_getp(seeds, i); + if (strncmp(seed->seed_host_name, host, sizeof(seed->seed_host_name)) + == 0 && seed->seed_port == port) { + match_index = i; + break; + } + } + + MESH_UNLOCK(); + return match_index; +} + +/** + * Endure mesh tend udata has enough space for current mesh nodes. 
+ */ +static void +mesh_tend_udata_capacity_ensure(as_hb_mesh_tend_reduce_udata* tend_reduce_udata, + int mesh_node_count) +{ + // Ensure capacity for nodes to connect. + if (tend_reduce_udata->to_connect_capacity < mesh_node_count) { + uint32_t alloc_size = round_up_pow2( + mesh_node_count * sizeof(as_endpoint_list*)); + int old_capacity = tend_reduce_udata->to_connect_capacity; + tend_reduce_udata->to_connect_capacity = alloc_size + / sizeof(as_endpoint_list*); + tend_reduce_udata->to_connect = cf_realloc( + tend_reduce_udata->to_connect, alloc_size); + + // NULL out newly allocated elements. + for (int i = old_capacity; i < tend_reduce_udata->to_connect_capacity; + i++) { + tend_reduce_udata->to_connect[i] = NULL; + } + } +} + +/** + * Change the state of a mesh node. Note: memset the mesh_nodes to zero before + * calling state change for the first time. + */ +static void +mesh_node_status_change(as_hb_mesh_node* mesh_node, + as_hb_mesh_node_status new_status) +{ + as_hb_mesh_node_status old_status = mesh_node->status; + mesh_node->status = new_status; + + if ((new_status != AS_HB_MESH_NODE_CHANNEL_ACTIVE + && old_status == AS_HB_MESH_NODE_CHANNEL_ACTIVE) + || mesh_node->last_status_updated == 0) { + mesh_node->inactive_since = cf_getms(); + } + mesh_node->last_status_updated = cf_getms(); + return; +} + +/** + * Close mesh listening sockets. + */ +static void +mesh_listening_sockets_close() +{ + MESH_LOCK(); + INFO("closing mesh heartbeat sockets"); + cf_sockets_close(&g_hb.mode_state.mesh_state.listening_sockets); + DEBUG("closed mesh heartbeat sockets"); + MESH_UNLOCK(); +} + +/** + * Populate the buffer with mesh seed list. 
+ */ +static void +mesh_seed_host_list_get(cf_dyn_buf* db, bool tls) +{ + if (!hb_is_mesh()) { + return; + } + + MESH_LOCK(); + + cf_vector* seeds = &g_hb.mode_state.mesh_state.seeds; + int element_count = cf_vector_size(seeds); + for (int i = 0; i < element_count; i++) { + as_hb_mesh_seed* seed = cf_vector_getp(seeds, i); + const char* info_key = + seed->seed_tls ? + "heartbeat.tls-mesh-seed-address-port=" : + "heartbeat.mesh-seed-address-port="; + + cf_dyn_buf_append_string(db, info_key); + cf_dyn_buf_append_string(db, seed->seed_host_name); + cf_dyn_buf_append_char(db, ':'); + cf_dyn_buf_append_uint32(db, seed->seed_port); + cf_dyn_buf_append_char(db, ';'); + } + + MESH_UNLOCK(); +} + +/** + * Checks if the match between a mesh seed and a mesh node is valid. + * The matching would be invalid if the mesh node's endpoint has been updated + * after the match was made or there has been no match. + */ +static bool +mesh_seed_mesh_node_check(as_hb_mesh_seed* seed) +{ + if (seed->status != AS_HB_MESH_NODE_CHANNEL_ACTIVE) { + return false; + } + + as_hb_mesh_node node; + if (mesh_node_get(seed->mesh_nodeid, &node) != 0) { + // The matched node has vanished. + return false; + } + + return seed->mesh_node_endpoint_change_ts == node.endpoint_change_ts; +} + +/** + * Refresh the matching between seeds and mesh nodes and get inactive seeds. + * Should be invoked under a mesh lock to ensure the validity of returned + * pointers. + * @param inactive_seeds_p output vector of inactive seed pointers. Can be NULL + * if inactive nodes need not be returned. + */ +static void +mesh_seed_inactive_refresh_get_unsafe(cf_vector* inactive_seeds_p) +{ + MESH_LOCK(); + + cf_vector* seeds = &g_hb.mode_state.mesh_state.seeds; + int element_count = cf_vector_size(seeds); + if (inactive_seeds_p) { + cf_vector_clear(inactive_seeds_p); + } + + // Mark seeds that do not have a matching mesh node and transitively do not + // have a matching channel. 
+ cf_clock now = cf_getms(); + for (int i = 0; i < element_count; i++) { + as_hb_mesh_seed* seed = cf_vector_getp(seeds, i); + if (mesh_seed_mesh_node_check(seed)) { + continue; + } + + seed->mesh_nodeid = 0; + seed->mesh_node_endpoint_change_ts = 0; + + // The mesh node is being connected. Skip. + if (seed->status == AS_HB_MESH_NODE_CHANNEL_PENDING) { + if (seed->last_status_updated + MESH_PENDING_TIMEOUT > now) { + // Spare the pending seeds, since we are attempting to connect + // to the seed host. + continue; + } + + // Flip to inactive if we have been in pending state for a long + // time. + mesh_seed_status_change(seed, AS_HB_MESH_NODE_CHANNEL_INACTIVE); + } + + if (seed->status != AS_HB_MESH_NODE_CHANNEL_PENDING) { + mesh_seed_status_change(seed, AS_HB_MESH_NODE_CHANNEL_INACTIVE); + if (inactive_seeds_p) { + cf_vector_append(inactive_seeds_p, &seed); + } + } + } + + MESH_UNLOCK(); +} + +/** + * Match input seeds to a mesh node using its endpoint address and + */ +static void +mesh_seeds_mesh_node_match_update(cf_vector* inactive_seeds_p, + as_hb_mesh_node* mesh_node, cf_node mesh_nodeid) +{ + if (mesh_node->status + == AS_HB_MESH_NODE_ENDPOINT_UNKNOWN|| mesh_node->endpoint_list == NULL) { + return; + } + + int element_count = cf_vector_size(inactive_seeds_p); + for (int i = 0; i < element_count; i++) { + // No null check required since we are iterating under a lock and within + // vector bounds. + as_hb_mesh_seed* seed = *(as_hb_mesh_seed**)cf_vector_getp( + inactive_seeds_p, i); + if (as_endpoint_lists_are_overlapping(seed->resolved_endpoint_list, + mesh_node->endpoint_list, true)) { + // We found a matching mesh node for the seed, flip its status to + // active. 
+ seed->mesh_nodeid = mesh_nodeid; + seed->mesh_node_endpoint_change_ts = mesh_node->endpoint_change_ts; + mesh_seed_status_change(seed, AS_HB_MESH_NODE_CHANNEL_ACTIVE); + DEBUG("seed entry %s:%d connected", seed->seed_host_name, + seed->seed_port); + } + } +} + +/** + * Determines if a mesh entry should be connected to or expired and deleted. + */ +static int +mesh_tend_reduce(const void* key, void* data, void* udata) +{ + MESH_LOCK(); + + int rv = CF_SHASH_OK; + cf_node nodeid = *(cf_node*)key; + as_hb_mesh_node* mesh_node = (as_hb_mesh_node*)data; + as_hb_mesh_tend_reduce_udata* tend_reduce_udata = + (as_hb_mesh_tend_reduce_udata*)udata; + + DETAIL("tending mesh node %"PRIx64" with status %s", nodeid, + mesh_node_status_string(mesh_node->status)); + + mesh_seeds_mesh_node_match_update(tend_reduce_udata->inactive_seeds_p, + mesh_node, nodeid); + + if (mesh_node->status == AS_HB_MESH_NODE_CHANNEL_ACTIVE) { + // The mesh node is connected. Skip. + goto Exit; + } + + cf_clock now = cf_getms(); + + if (!mesh_node->endpoint_list) { + // Will happen if node discover and disconnect happen close together. + mesh_node_status_change(mesh_node, AS_HB_MESH_NODE_ENDPOINT_UNKNOWN); + } + + if (mesh_node->inactive_since + MESH_INACTIVE_TIMEOUT <= now) { + DEBUG("mesh forgetting node %" PRIx64" because it could not be connected since %" PRIx64, + nodeid, mesh_node->inactive_since); + rv = CF_SHASH_REDUCE_DELETE; + goto Exit; + } + + if (mesh_node->status == AS_HB_MESH_NODE_ENDPOINT_UNKNOWN) { + if (mesh_node->last_status_updated + MESH_ENDPOINT_UNKNOWN_TIMEOUT + > now) { + DEBUG("mesh forgetting node %"PRIx64" ip address/port undiscovered since %"PRIu64, + nodeid, mesh_node->last_status_updated); + + rv = CF_SHASH_REDUCE_DELETE; + } + // Skip connecting with a node with unknown endpoint. + goto Exit; + } + + if (mesh_node->status == AS_HB_MESH_NODE_CHANNEL_PENDING) { + // The mesh node is being connected. Skip. 
+ if (mesh_node->last_status_updated + MESH_PENDING_TIMEOUT > now) { + goto Exit; + } + + // Flip to inactive if we have been in pending state for a long time. + mesh_node_status_change(mesh_node, AS_HB_MESH_NODE_CHANNEL_INACTIVE); + } + + // Channel for this node is inactive. Prompt the channel sub module to + // connect to this node. + if (tend_reduce_udata->to_connect_count + >= tend_reduce_udata->to_connect_capacity) { + // New nodes found but we are out of capacity. Ultra defensive coding. + // This will never happen under the locks. + WARNING("skipping connecting to node %" PRIx64" - not enough memory allocated", + nodeid); + goto Exit; + } + + endpoint_list_copy( + &tend_reduce_udata->to_connect[tend_reduce_udata->to_connect_count], + mesh_node->endpoint_list); + tend_reduce_udata->to_connect_count++; + + // Flip status to pending. + mesh_node_status_change(mesh_node, AS_HB_MESH_NODE_CHANNEL_PENDING); + +Exit: + if (rv == CF_SHASH_REDUCE_DELETE) { + // Clear all internal allocated memory. + mesh_node_destroy(mesh_node); + } + + MESH_UNLOCK(); + + return rv; +} + +/** + * Add inactive seeds to to_connect array. + * Should be invoked under mesh lock to prevent invalidating the array of seed + * node pointers. + * @param seed_p vector of seed pointers. + * @param tend reduce udata having the to connect endpoint list. + */ +void +mesh_seeds_inactive_add_to_connect(cf_vector* seeds_p, + as_hb_mesh_tend_reduce_udata* tend_reduce_udata) +{ + MESH_LOCK(); + int element_count = cf_vector_size(seeds_p); + for (int i = 0; i < element_count; i++) { + as_hb_mesh_seed* seed = *(as_hb_mesh_seed**)cf_vector_getp(seeds_p, i); + if (seed->status != AS_HB_MESH_NODE_CHANNEL_INACTIVE) { + continue; + } + + // Channel for this node is inactive. Prompt the channel sub module to + // connect to this node. + if (tend_reduce_udata->to_connect_count + >= tend_reduce_udata->to_connect_capacity) { + // New nodes found but we are out of capacity. Ultra defensive + // coding. 
+ // This will never happen under the locks. + WARNING( + "skipping connecting to %s:%d - not enough memory allocated", + seed->seed_host_name, seed->seed_port); + return; + } + + // Ensure the seed hostname is resolved. + if (mesh_seed_endpoint_list_fill(seed) != 0) { + continue; + } + + endpoint_list_copy( + &tend_reduce_udata->to_connect[tend_reduce_udata->to_connect_count], + seed->resolved_endpoint_list); + tend_reduce_udata->to_connect_count++; + + // Flip status to pending. + mesh_seed_status_change(seed, AS_HB_MESH_NODE_CHANNEL_PENDING); + } + MESH_UNLOCK(); +} + +/** + * Tends the mesh host list, to discover and remove nodes. Should never invoke a + * channel call while holding a mesh lock. + */ +void* +mesh_tender(void* arg) +{ + DETAIL("mesh tender started"); + // Figure out which nodes need to be connected to. + // collect nodes to connect to and remove dead nodes. + as_hb_mesh_tend_reduce_udata tend_reduce_udata = { NULL, 0, 0 }; + + // Vector of pointer to inactive seeds. + cf_vector inactive_seeds_p; + cf_vector_init(&inactive_seeds_p, sizeof(as_hb_mesh_seed*), + AS_HB_CLUSTER_MAX_SIZE_SOFT, VECTOR_FLAG_INITZERO); + + cf_clock last_time = 0; + + while (hb_is_mesh() && mesh_is_running()) { + cf_clock curr_time = cf_getms(); + + // Unlocked access but this should be alright Set the discovered flag. + bool nodes_discovered = g_hb.mode_state.mesh_state.nodes_discovered; + if ((curr_time - last_time) < MESH_TEND_INTERVAL && !nodes_discovered) { + // Interval has not been reached for sending heartbeats + usleep(MIN(AS_HB_TX_INTERVAL_MS_MIN, (last_time + + MESH_TEND_INTERVAL) - curr_time) * 1000); + continue; + } + last_time = curr_time; + + DETAIL("tending mesh list"); + + MESH_LOCK(); + // Unset the discovered flag. + g_hb.mode_state.mesh_state.nodes_discovered = false; + + // Update the list of inactive seeds. + mesh_seed_inactive_refresh_get_unsafe(&inactive_seeds_p); + + // Make sure the udata has enough capacity. 
+ int connect_count_max = cf_shash_get_size( + g_hb.mode_state.mesh_state.nodeid_to_mesh_node) + + cf_vector_size(&inactive_seeds_p); + mesh_tend_udata_capacity_ensure(&tend_reduce_udata, connect_count_max); + + tend_reduce_udata.to_connect_count = 0; + tend_reduce_udata.inactive_seeds_p = &inactive_seeds_p; + cf_shash_reduce(g_hb.mode_state.mesh_state.nodeid_to_mesh_node, + mesh_tend_reduce, &tend_reduce_udata); + + // Add inactive seeds for connection. + mesh_seeds_inactive_add_to_connect(&inactive_seeds_p, + &tend_reduce_udata); + + MESH_UNLOCK(); + + // Connect can be time consuming, especially in failure cases. + // Connect outside of the mesh lock and prevent hogging the lock. + if (tend_reduce_udata.to_connect_count > 0) { + // Try connecting the newer nodes. + channel_mesh_channel_establish(tend_reduce_udata.to_connect, + tend_reduce_udata.to_connect_count); + } + + DETAIL("done tending mesh list"); + } + + if (tend_reduce_udata.to_connect) { + // Free space allocated for endpoint lists. + for (int i = 0; i < tend_reduce_udata.to_connect_capacity; i++) { + if (tend_reduce_udata.to_connect[i]) { + cf_free(tend_reduce_udata.to_connect[i]); + } + } + cf_free(tend_reduce_udata.to_connect); + } + + cf_vector_destroy(&inactive_seeds_p); + + DETAIL("mesh tender shut down"); + return NULL; +} + +/** + * Add or update a mesh node to mesh node list. + */ +static void +mesh_node_add_update(cf_node nodeid, as_hb_mesh_node* mesh_node) +{ + MESH_LOCK(); + cf_shash_put(g_hb.mode_state.mesh_state.nodeid_to_mesh_node, &nodeid, + mesh_node); + MESH_UNLOCK(); +} + +/** + * Destroy a mesh node. + */ +static void +mesh_node_destroy(as_hb_mesh_node* mesh_node) +{ + MESH_LOCK(); + if (mesh_node->endpoint_list) { + cf_free(mesh_node->endpoint_list); + mesh_node->endpoint_list = NULL; + } + MESH_UNLOCK(); +} + +/** + * Endpoint list iterate function find endpoint matching sock addr. 
+ */ +static void +mesh_endpoint_addr_find_iterate(const as_endpoint* endpoint, void* udata) +{ + cf_sock_addr endpoint_addr; + if (as_endpoint_to_sock_addr(endpoint, &endpoint_addr) != 0) { + return; + } + + as_hb_endpoint_list_addr_find_udata* endpoint_reduce_udata = + (as_hb_endpoint_list_addr_find_udata*)udata; + + if (cf_sock_addr_compare(&endpoint_addr, endpoint_reduce_udata->to_search) + == 0) { + endpoint_reduce_udata->found = true; + } +} + +/** + * Indicates if a give node is discovered. + * @param nodeid the input nodeid. + * @return true if discovered, false otherwise. + */ +static bool +mesh_node_is_discovered(cf_node nodeid) +{ + if (nodeid == config_self_nodeid_get()) { + // Assume this node knows itself. + return true; + } + + as_hb_mesh_node mesh_node; + return mesh_node_get(nodeid, &mesh_node) == 0; +} + +/** + * Indicates if a give node has a valid endpoint list. + * @param nodeid the input nodeid. + * @return true if node has valid endpoint list, false otherwise. + */ +static bool +mesh_node_endpoint_list_is_valid(cf_node nodeid) +{ + if (nodeid == config_self_nodeid_get()) { + // Assume this node knows itself. + return true; + } + + as_hb_mesh_node mesh_node; + return mesh_node_get(nodeid, &mesh_node) == 0 + && mesh_node.status != AS_HB_MESH_NODE_ENDPOINT_UNKNOWN + && mesh_node.endpoint_list; +} + +/** + * Get the mesh node associated with this node. + * @param nodeid the nodeid to search for. + * @param is_real_nodeid indicates if the query is for a real or fake nodeid. + * @param mesh_node the output mesh node. + * @return 0 on success -1 if there is mesh node attached. + */ +static int +mesh_node_get(cf_node nodeid, as_hb_mesh_node* mesh_node) +{ + int rv = -1; + + MESH_LOCK(); + if (cf_shash_get(g_hb.mode_state.mesh_state.nodeid_to_mesh_node, &nodeid, + mesh_node) == CF_SHASH_OK) { + rv = 0; + } + else { + // The node not found. 
+ rv = -1; + } + MESH_UNLOCK(); + return rv; +} + +/** + * Handle the event when the channel reports a node as disconnected. + */ +static void +mesh_channel_on_node_disconnect(as_hb_channel_event* event) +{ + MESH_LOCK(); + + as_hb_mesh_node mesh_node; + if (mesh_node_get(event->nodeid, &mesh_node) != 0) { + // Again should not happen in practice. But not really bad. + DEBUG("unknown mesh node disconnected %" PRIx64, event->nodeid); + goto Exit; + } + + DEBUG("mesh setting node %" PRIx64" status as inactive on loss of channel", + event->nodeid); + + // Mark this node inactive and move on. Mesh tender should remove this node + // after it has been inactive for a while. + mesh_node_status_change(&mesh_node, AS_HB_MESH_NODE_CHANNEL_INACTIVE); + + // Update the mesh entry. + mesh_node_add_update(event->nodeid, &mesh_node); + +Exit: + MESH_UNLOCK(); +} + +/** + * Check and fix the case where we received a self incoming message probably + * because one of our non loop back interfaces was used as a seed address. + * + * @return true if this message is a self message, false otherwise. + */ +static bool +mesh_node_check_fix_self_msg(as_hb_channel_event* event) +{ + if (event->nodeid == config_self_nodeid_get()) { + // Handle self message. Will happen if the seed node address on this + // node does not match the listen / publish address. + as_endpoint_list* msg_endpoint_list; + msg_endpoint_list_get(event->msg, &msg_endpoint_list); + + MESH_LOCK(); + + // Check if this node has published an endpoint list matching self node. + endpoint_list_equal_check_udata udata = { 0 }; + udata.are_equal = false; + udata.other = msg_endpoint_list; + mesh_published_endpoints_process(endpoint_list_equal_process, &udata); + + if (udata.are_equal) { + // Definitely pulse message from self node. 
+ int self_seed_index = + mesh_seed_endpoint_list_overlapping_find_unsafe( + msg_endpoint_list); + if (self_seed_index >= 0) { + as_hb_mesh_seed* self_seed = cf_vector_getp( + &g_hb.mode_state.mesh_state.seeds, self_seed_index); + INFO("removing self seed entry host:%s port:%d", + self_seed->seed_host_name, self_seed->seed_port); + as_hb_mesh_tip_clear(self_seed->seed_host_name, + self_seed->seed_port); + } + } + MESH_UNLOCK(); + return true; + } + return false; +} + +/** + * Update mesh node status based on an incoming message. + */ +static void +mesh_node_data_update(as_hb_channel_event* event) +{ + if (mesh_node_check_fix_self_msg(event)) { + // Message from self, can be ignored. + return; + } + + MESH_LOCK(); + as_hb_mesh_node existing_mesh_node = { 0 }; + as_endpoint_list* msg_endpoint_list = NULL; + msg_endpoint_list_get(event->msg, &msg_endpoint_list); + + // Search for existing entry. + bool needs_update = mesh_node_get(event->nodeid, &existing_mesh_node) != 0; + + // Update the endpoint list to be the message endpoint list if the seed ip + // list and the published ip list differ + if (!as_endpoint_lists_are_equal(existing_mesh_node.endpoint_list, + msg_endpoint_list)) { + char endpoint_list_str1[ENDPOINT_LIST_STR_SIZE]; + endpoint_list_str1[0] = 0; + + as_endpoint_list_to_string(existing_mesh_node.endpoint_list, + endpoint_list_str1, sizeof(endpoint_list_str1)); + + char endpoint_list_str2[ENDPOINT_LIST_STR_SIZE]; + as_endpoint_list_to_string(msg_endpoint_list, endpoint_list_str2, + sizeof(endpoint_list_str2)); + + if (existing_mesh_node.endpoint_list) { + INFO("for node %"PRIx64" updating mesh endpoint address from {%s} to {%s}",event->nodeid, + endpoint_list_str1, endpoint_list_str2); + } + + // Update the endpoints. 
+ endpoint_list_copy(&existing_mesh_node.endpoint_list, + msg_endpoint_list); + existing_mesh_node.endpoint_change_ts = as_hlc_timestamp_now(); + + needs_update = true; + } + + if (existing_mesh_node.status != AS_HB_MESH_NODE_CHANNEL_ACTIVE) { + // Update status to active. + mesh_node_status_change(&existing_mesh_node, + AS_HB_MESH_NODE_CHANNEL_ACTIVE); + needs_update = true; + } + + if (needs_update) { + // Apply the update. + mesh_node_add_update(event->nodeid, &existing_mesh_node); + } + + MESH_UNLOCK(); +} + +/** + * Return the in memory and on wire size of an info reply array. + * @param reply the info reply. + * @param reply_count the number of replies. + * @param reply_size the wire size of the message. + * @return 0 on successful reply count computation, -1 otherwise, + */ +static int +mesh_info_reply_sizeof(as_hb_mesh_info_reply* reply, int reply_count, + size_t* reply_size) +{ + // Go over reply and compute the count of replies and also validate the + // endpoint lists. + uint8_t* start_ptr = (uint8_t*)reply; + *reply_size = 0; + + for (int i = 0; i < reply_count; i++) { + as_hb_mesh_info_reply* reply_ptr = (as_hb_mesh_info_reply*)start_ptr; + *reply_size += sizeof(as_hb_mesh_info_reply); + start_ptr += sizeof(as_hb_mesh_info_reply); + + size_t endpoint_list_size = 0; + if (as_endpoint_list_sizeof(&reply_ptr->endpoint_list[0], + &endpoint_list_size)) { + // Incomplete / garbled info reply message. + *reply_size = 0; + return -1; + } + + *reply_size += endpoint_list_size; + start_ptr += endpoint_list_size; + } + + return 0; +} + +/** + * Send a info reply in reply to an info request. + * @param dest the destination node to send the info reply to. + * @param reply array of node ids and endpoints + * @param reply_count the count of replies. + */ +static void +mesh_nodes_send_info_reply(cf_node dest, as_hb_mesh_info_reply* reply, + size_t reply_count) +{ + // Create the discover message. 
+ msg* msg = mesh_info_msg_init(AS_HB_MSG_TYPE_INFO_REPLY); + + // Set the reply. + msg_info_reply_set(msg, reply, reply_count); + + DEBUG("sending info reply to node %" PRIx64, dest); + + // Send the info reply. + if (channel_msg_unicast(dest, msg) != 0) { + TICKER_WARNING("error sending info reply message to node %" PRIx64, + dest); + } + + hb_msg_return(msg); +} + +/** + * Initialize the info request msg buffer + */ +static msg* +mesh_info_msg_init(as_hb_msg_type msg_type) +{ + msg* msg = hb_msg_get(); + msg_src_fields_fill(msg); + msg_type_set(msg, msg_type); + return msg; +} + +/** + * Send a info request for all undiscovered nodes. + * @param dest the destination node to send the discover message to. + * @param to_discover array of node ids to discover. + * @param to_discover_count the count of nodes in the array. + */ +static void +mesh_nodes_send_info_request(msg* in_msg, cf_node dest, cf_node* to_discover, + size_t to_discover_count) +{ + // Create the discover message. + msg* info_req = mesh_info_msg_init(AS_HB_MSG_TYPE_INFO_REQUEST); + + // Set the list of nodes to discover. + msg_node_list_set(info_req, AS_HB_MSG_INFO_REQUEST, to_discover, + to_discover_count); + + DEBUG("sending info request to node %" PRIx64, dest); + + // Send the info request. + if (channel_msg_unicast(dest, info_req) != 0) { + TICKER_WARNING("error sending info request message to node %" PRIx64, + dest); + } + hb_msg_return(info_req); +} + +/** + * Handle an incoming pulse message to discover new neighbours. + */ +static void +mesh_channel_on_pulse(msg* msg) +{ + cf_node* adj_list; + size_t adj_length; + + cf_node source; + + // Channel has validated the source. Don't bother checking here. + msg_nodeid_get(msg, &source); + if (msg_adjacency_get(msg, &adj_list, &adj_length) != 0) { + // Adjacency list absent. 
+ WARNING("received message from %" PRIx64" without adjacency list", + source); + return; + } + + cf_node to_discover[adj_length]; + size_t num_to_discover = 0; + + // TODO: Track already queried nodes so that we do not retry immediately. + // Will need a separate state, pending query. + MESH_LOCK(); + + // Try and discover new nodes from this message's adjacency list. + for (int i = 0; i < adj_length; i++) { + if (!mesh_node_is_discovered(adj_list[i])) { + DEBUG("discovered new mesh node %" PRIx64, adj_list[i]); + + as_hb_mesh_node new_node; + memset(&new_node, 0, sizeof(new_node)); + mesh_node_status_change(&new_node, + AS_HB_MESH_NODE_ENDPOINT_UNKNOWN); + + // Add as a new node + mesh_node_add_update(adj_list[i], &new_node); + } + + if (!mesh_node_endpoint_list_is_valid(adj_list[i])) { + to_discover[num_to_discover++] = adj_list[i]; + } + } + + MESH_UNLOCK(); + + // Discover these nodes outside a lock. + if (num_to_discover) { + mesh_nodes_send_info_request(msg, source, to_discover, num_to_discover); + } +} + +/** + * Handle an incoming info message. + */ +static void +mesh_channel_on_info_request(msg* msg) +{ + cf_node* query_nodeids; + size_t query_count; + + cf_node source; + msg_nodeid_get(msg, &source); + + if (msg_node_list_get(msg, AS_HB_MSG_INFO_REQUEST, &query_nodeids, + &query_count) != 0) { + TICKER_WARNING("got an info request without query nodes from %" PRIx64, + source); + return; + } + + MESH_LOCK(); + + // Compute the entire response size. 
+ size_t reply_size = 0; + + for (int i = 0; i < query_count; i++) { + as_hb_mesh_node mesh_node; + + if (mesh_node_get(query_nodeids[i], &mesh_node) == 0) { + if (mesh_node.status != AS_HB_MESH_NODE_ENDPOINT_UNKNOWN + && mesh_node.endpoint_list) { + size_t endpoint_list_size = 0; + as_endpoint_list_sizeof(mesh_node.endpoint_list, + &endpoint_list_size); + reply_size += sizeof(as_hb_mesh_info_reply) + + endpoint_list_size; + } + } + } + + as_hb_mesh_info_reply* replies = alloca(reply_size); + uint8_t* reply_ptr = (uint8_t*)replies; + size_t reply_count = 0; + + DEBUG("received info request from node : %" PRIx64, source); + DEBUG("preparing a reply for %zu requests", query_count); + + for (int i = 0; i < query_count; i++) { + as_hb_mesh_node mesh_node; + + DEBUG("mesh received info request for node %" PRIx64, query_nodeids[i]); + + if (mesh_node_get(query_nodeids[i], &mesh_node) == 0) { + if (mesh_node.status != AS_HB_MESH_NODE_ENDPOINT_UNKNOWN + && mesh_node.endpoint_list) { + as_hb_mesh_info_reply* reply = (as_hb_mesh_info_reply*)reply_ptr; + + reply->nodeid = query_nodeids[i]; + + size_t endpoint_list_size = 0; + as_endpoint_list_sizeof(mesh_node.endpoint_list, + &endpoint_list_size); + + memcpy(&reply->endpoint_list[0], mesh_node.endpoint_list, + endpoint_list_size); + + reply_ptr += sizeof(as_hb_mesh_info_reply) + endpoint_list_size; + + reply_count++; + } + } + } + + MESH_UNLOCK(); + + // Send the reply + if (reply_count > 0) { + mesh_nodes_send_info_reply(source, replies, reply_count); + } +} + +/** + * Handle an incoming info reply. 
+ */ +static void +mesh_channel_on_info_reply(msg* msg) +{ + as_hb_mesh_info_reply* reply = NULL; + size_t reply_count = 0; + cf_node source = 0; + msg_nodeid_get(msg, &source); + if (msg_info_reply_get(msg, &reply, &reply_count) != 0 + || reply_count == 0) { + TICKER_WARNING( + "got an info reply from without query nodes from %" PRIx64, + source); + return; + } + + DEBUG("received info reply from node %" PRIx64, source); + + MESH_LOCK(); + + uint8_t *start_ptr = (uint8_t*)reply; + for (int i = 0; i < reply_count; i++) { + as_hb_mesh_info_reply* reply_ptr = (as_hb_mesh_info_reply*)start_ptr; + as_hb_mesh_node existing_node; + if (mesh_node_get(reply_ptr->nodeid, &existing_node) != 0) { + // Somehow the node was removed from the mesh hash. Maybe a timeout. + goto NextReply; + } + + // Update the state of this node. + if (existing_node.status == AS_HB_MESH_NODE_ENDPOINT_UNKNOWN) { + // Update the endpoint. + endpoint_list_copy(&existing_node.endpoint_list, + reply_ptr->endpoint_list); + + mesh_node_status_change(&existing_node, + AS_HB_MESH_NODE_CHANNEL_INACTIVE); + // Set the discovered flag. + g_hb.mode_state.mesh_state.nodes_discovered = true; + + char endpoint_list_str[ENDPOINT_LIST_STR_SIZE]; + as_endpoint_list_to_string(existing_node.endpoint_list, + endpoint_list_str, sizeof(endpoint_list_str)); + + DEBUG("for node %" PRIx64" discovered endpoints {%s}", + reply_ptr->nodeid, endpoint_list_str); + + // Update the hash. + mesh_node_add_update(reply_ptr->nodeid, &existing_node); + } + + NextReply: + start_ptr += sizeof(as_hb_mesh_info_reply); + size_t endpoint_list_size = 0; + as_endpoint_list_sizeof(reply_ptr->endpoint_list, &endpoint_list_size); + start_ptr += endpoint_list_size; + } + + MESH_UNLOCK(); +} + +/** + * Handle the case when a message is received on a channel. + */ +static void +mesh_channel_on_msg_rcvd(as_hb_channel_event* event) +{ + // Update the mesh node status. 
+ mesh_node_data_update(event); + + as_hb_msg_type msg_type; + msg_type_get(event->msg, &msg_type); + + switch (msg_type) { + case AS_HB_MSG_TYPE_PULSE: // A pulse message. Try and discover new nodes. + mesh_channel_on_pulse(event->msg); + break; + case AS_HB_MSG_TYPE_INFO_REQUEST: // Send back an info reply. + mesh_channel_on_info_request(event->msg); + break; + case AS_HB_MSG_TYPE_INFO_REPLY: // Update the list of mesh nodes, if this is an undiscovered node. + mesh_channel_on_info_reply(event->msg); + break; + default: + WARNING("received a message of unknown type from"); + // Ignore other messages. + break; + } +} + +/* + * ---------------------------------------------------------------------------- + * Mesh public API + * ---------------------------------------------------------------------------- + */ + +/** + * Add a host / port to the mesh seed list. + * @param host the seed node hostname / ip address + * @param port the seed node port. + * @param tls indicates TLS support. + * @return CF_SHASH_OK, CF_SHASH_ERR, CF_SHASH_ERR_FOUND. + */ +static int +mesh_tip(char* host, int port, bool tls) +{ + MESH_LOCK(); + + int rv = -1; + as_hb_mesh_seed new_seed = { { 0 } }; + + // Check validity of hostname and port. + int hostname_len = strnlen(host, HOST_NAME_MAX); + if (hostname_len <= 0 || hostname_len == HOST_NAME_MAX) { + // Invalid hostname. + WARNING("mesh seed host %s exceeds allowed %d characters", host, + HOST_NAME_MAX); + goto Exit; + } + if (port <= 0 || port > USHRT_MAX) { + WARNING("mesh seed port %s:%d exceeds should be between 0 to %d", host, + port, USHRT_MAX); + goto Exit; + } + + // Check if we already have a match for this seed. 
+ if (mesh_seed_find_unsafe(host, port) >= 0) { + WARNING("mesh seed host %s:%d already in seed list", host, port); + goto Exit; + } + + mesh_seed_status_change(&new_seed, AS_HB_MESH_NODE_CHANNEL_INACTIVE); + strncpy(new_seed.seed_host_name, host, sizeof(new_seed.seed_host_name)); + new_seed.seed_port = port; + new_seed.seed_tls = tls; + + cf_vector_append(&g_hb.mode_state.mesh_state.seeds, &new_seed); + + INFO("added new mesh seed %s:%d", host, port); + rv = 0; + +Exit: + if (rv != 0) { + // Ensure endpoint allocated space is freed. + mesh_seed_destroy(&new_seed); + } + + MESH_UNLOCK(); + return rv; +} + +/** + * Handle a channel event on an endpoint. + */ +static void +mesh_channel_event_process(as_hb_channel_event* event) +{ + // Skip if we are not in mesh mode. + if (!hb_is_mesh()) { + return; + } + + MESH_LOCK(); + switch (event->type) { + case AS_HB_CHANNEL_NODE_CONNECTED: + // Ignore this event. The subsequent message event will be use for + // determining mesh node active status. + break; + case AS_HB_CHANNEL_NODE_DISCONNECTED: + mesh_channel_on_node_disconnect(event); + break; + case AS_HB_CHANNEL_MSG_RECEIVED: + mesh_channel_on_msg_rcvd(event); + break; + case AS_HB_CHANNEL_CLUSTER_NAME_MISMATCH: // Ignore this event. HB module will handle it. + break; + } + + MESH_UNLOCK(); +} + +/** + * Initialize mesh mode data structures. + */ +static void +mesh_init() +{ + if (!hb_is_mesh()) { + return; + } + + MESH_LOCK(); + + g_hb.mode_state.mesh_state.status = AS_HB_STATUS_STOPPED; + + // Initialize the mesh node hash. + g_hb.mode_state.mesh_state.nodeid_to_mesh_node = cf_shash_create( + cf_nodeid_shash_fn, sizeof(cf_node), sizeof(as_hb_mesh_node), + AS_HB_CLUSTER_MAX_SIZE_SOFT, 0); + + // Initialize the seed list. + cf_vector_init(&g_hb.mode_state.mesh_state.seeds, sizeof(as_hb_mesh_seed), + AS_HB_CLUSTER_MAX_SIZE_SOFT, VECTOR_FLAG_INITZERO); + + MESH_UNLOCK(); +} + +/** + * Delete the shash entries only if they are not seed entries. 
+ */
+static int
+mesh_free_node_data_reduce(const void* key, void* data, void* udata)
+{
+	// key and udata are unused. Every entry is destroyed and the hash is asked
+	// to drop it by returning CF_SHASH_REDUCE_DELETE.
+	as_hb_mesh_node* mesh_node = (as_hb_mesh_node*)data;
+	mesh_node_destroy(mesh_node);
+	return CF_SHASH_REDUCE_DELETE;
+}
+
+/**
+ * Reduce function that removes a host / port from the mesh list.
+ *
+ * An entry is removed when any of the following holds:
+ *  - udata is NULL (clear all),
+ *  - the entry's node id equals the requested node id,
+ *  - one of the addresses the requested host resolves to, combined with the
+ *    requested port, is present in the entry's endpoint list.
+ *
+ * On removal the seeds pointing at this node are deleted, the channel to the
+ * node is disconnected and the mesh node is destroyed.
+ *
+ * @param key pointer to the cf_node id of the mesh entry.
+ * @param data the as_hb_mesh_node stored for the entry.
+ * @param udata an as_hb_mesh_tip_clear_udata, or NULL to clear every entry.
+ * When non-NULL, its entry_deleted flag is set if the entry is removed.
+ * @return CF_SHASH_REDUCE_DELETE if the entry is removed, CF_SHASH_OK
+ * otherwise.
+ */
+static int
+mesh_tip_clear_reduce(const void* key, void* data, void* udata)
+{
+	int rv = CF_SHASH_OK;
+
+	MESH_LOCK();
+
+	cf_node nodeid = *(cf_node*)key;
+	as_hb_mesh_node* mesh_node = (as_hb_mesh_node*)data;
+	as_hb_mesh_tip_clear_udata* tip_clear_udata =
+			(as_hb_mesh_tip_clear_udata*)udata;
+
+	if (tip_clear_udata == NULL || nodeid == tip_clear_udata->nodeid) {
+		// Handling tip clear all or clear of a specific node.
+		rv = CF_SHASH_REDUCE_DELETE;
+		goto Exit;
+	}
+
+	// See if the address matches any one of the endpoints in the node's
+	// endpoint list.
+	cf_ip_addr addrs[CF_SOCK_CFG_MAX];
+	uint32_t n_addrs = CF_SOCK_CFG_MAX;
+
+	if (cf_ip_addr_from_string_multi(tip_clear_udata->host, addrs, &n_addrs)
+			== 0) {
+		// The index is unsigned to match n_addrs.
+		for (uint32_t i = 0; i < n_addrs; i++) {
+			cf_sock_addr sock_addr;
+			cf_ip_addr_copy(&addrs[i], &sock_addr.addr);
+			sock_addr.port = tip_clear_udata->port;
+
+			// Named find_udata so that it does not shadow the udata
+			// parameter of this function.
+			as_hb_endpoint_list_addr_find_udata find_udata;
+			find_udata.found = false;
+			find_udata.to_search = &sock_addr;
+
+			as_endpoint_list_iterate(mesh_node->endpoint_list,
+					mesh_endpoint_addr_find_iterate, &find_udata);
+
+			if (find_udata.found) {
+				rv = CF_SHASH_REDUCE_DELETE;
+				goto Exit;
+			}
+		}
+
+		// Not found by endpoint.
+		rv = CF_SHASH_OK;
+	}
+
+Exit:
+	if (rv == CF_SHASH_REDUCE_DELETE) {
+		// Find all seed entries matching this mesh entry and delete them.
+		cf_vector* seeds = &g_hb.mode_state.mesh_state.seeds;
+		int element_count = cf_vector_size(seeds);
+		for (int i = 0; i < element_count; i++) {
+			as_hb_mesh_seed* seed = cf_vector_getp(seeds, i);
+			if (seed->mesh_nodeid != nodeid) {
+				// Does not match this mesh entry.
+				continue;
+			}
+			if (mesh_seed_delete_unsafe(i) == 0) {
+				// One element fewer to scan; step back so that index i is
+				// examined again on the next iteration.
+				i--;
+				element_count--;
+			}
+			else {
+				// Should not happen in practice.
+				CRASH("error deleting mesh seed entry %s:%d",
+						seed->seed_host_name, seed->seed_port);
+			}
+		}
+
+		if (channel_node_disconnect(nodeid) != 0) {
+			WARNING("unable to disconnect the channel to node %" PRIx64,
+					nodeid);
+		}
+
+		mesh_node_destroy(mesh_node);
+		if (tip_clear_udata != NULL) {
+			tip_clear_udata->entry_deleted = true;
+		}
+	}
+
+	MESH_UNLOCK();
+	return rv;
+}
+
+/**
+ * Reduce function that outputs the heartbeat endpoints of a peer.
+ *
+ * Appends "heartbeat.peer=node-id=<hex id>:<endpoint list>;" to the dynamic
+ * buffer.
+ *
+ * @param key pointer to the cf_node id of the mesh entry.
+ * @param data the as_hb_mesh_node stored for the entry.
+ * @param udata the cf_dyn_buf to append to.
+ * @return CF_SHASH_OK always.
+ */
+static int
+mesh_peer_endpoint_reduce(const void* key, void* data, void* udata)
+{
+	int rv = CF_SHASH_OK;
+	MESH_LOCK();
+	cf_node nodeid = *(cf_node*)key;
+	as_hb_mesh_node* mesh_node = (as_hb_mesh_node*)data;
+	cf_dyn_buf* db = (cf_dyn_buf*)udata;
+
+	cf_dyn_buf_append_string(db, "heartbeat.peer=");
+	cf_dyn_buf_append_string(db, "node-id=");
+	cf_dyn_buf_append_uint64_x(db, nodeid);
+	cf_dyn_buf_append_string(db, ":");
+	as_endpoint_list_info(mesh_node->endpoint_list, db);
+	cf_dyn_buf_append_string(db, ";");
+
+	MESH_UNLOCK();
+	return rv;
+}
+
+/**
+ * Free the mesh mode data structures.
+ *
+ * Only acts when the mesh module is stopped; otherwise logs a warning and
+ * returns. Removes every entry of the node id to mesh node hash and resets
+ * every seed to the inactive state with no associated node id.
+ */
+static void
+mesh_clear()
+{
+	if (!mesh_is_stopped()) {
+		WARNING(
+				"attempted clearing mesh module without stopping it - skip mesh clear!");
+		return;
+	}
+
+	MESH_LOCK();
+	// Delete the elements from the map.
+	cf_shash_reduce(g_hb.mode_state.mesh_state.nodeid_to_mesh_node,
+			mesh_free_node_data_reduce, NULL);
+
+	// Reset the seeds to inactive state
+	cf_vector* seeds = &g_hb.mode_state.mesh_state.seeds;
+	int element_count = cf_vector_size(seeds);
+	for (int i = 0; i < element_count; i++) {
+		as_hb_mesh_seed* seed = cf_vector_getp(seeds, i);
+		seed->mesh_nodeid = 0;
+		mesh_seed_status_change(seed, AS_HB_MESH_NODE_CHANNEL_INACTIVE);
+	}
+
+	MESH_UNLOCK();
+}
+
+/**
+ * Open mesh listening socket. Crashes if open failed.
+ *
+ * Also records the smallest MTU across the configured bind interfaces in the
+ * mesh state, falling back to DEFAULT_MIN_MTU when none could be determined.
+ */
+static void
+mesh_listening_sockets_open()
+{
+	MESH_LOCK();
+
+	const cf_serv_cfg* bind_cfg = config_bind_cfg_get();
+
+	// Compute min MTU across all binding interfaces.
+	int min_mtu = -1;
+	char addr_string[HOST_NAME_MAX];
+	for (uint32_t i = 0; i < bind_cfg->n_cfgs; ++i) {
+		const cf_sock_cfg* sock_cfg = &bind_cfg->cfgs[i];
+		cf_ip_addr_to_string_safe(&sock_cfg->addr, addr_string,
+				sizeof(addr_string));
+
+		INFO("initializing mesh heartbeat socket: %s:%d", addr_string,
+				sock_cfg->port);
+
+		// A wildcard bind address has no single interface, so the minimum
+		// across interfaces is used instead.
+		int bind_interface_mtu =
+				!cf_ip_addr_is_any(&sock_cfg->addr) ?
+						cf_inter_mtu(&sock_cfg->addr) : cf_inter_min_mtu();
+
+		if (min_mtu == -1 || min_mtu > bind_interface_mtu) {
+			min_mtu = bind_interface_mtu;
+		}
+	}
+
+	if (cf_socket_init_server((cf_serv_cfg*)bind_cfg,
+			&g_hb.mode_state.mesh_state.listening_sockets) != 0) {
+		CRASH("couldn't initialize unicast heartbeat sockets");
+	}
+
+	for (uint32_t i = 0;
+			i < g_hb.mode_state.mesh_state.listening_sockets.n_socks; ++i) {
+		DEBUG("opened mesh heartbeat socket: %d",
+				CSFD(&g_hb.mode_state.mesh_state.listening_sockets.socks[i]));
+	}
+
+	if (min_mtu == -1) {
+		WARNING("error getting the min MTU - using the default %d",
+				DEFAULT_MIN_MTU);
+		min_mtu = DEFAULT_MIN_MTU;
+	}
+
+	g_hb.mode_state.mesh_state.min_mtu = min_mtu;
+	INFO("mtu of the network is %d", min_mtu);
+
+	MESH_UNLOCK();
+}
+
+/**
+ * Start mesh threads.
+ *
+ * Does nothing unless the mode is mesh. Opens and registers the listening
+ * sockets, marks the mesh module as running and starts the mesh tender
+ * thread. Crashes if the thread cannot be created.
+ */
+static void
+mesh_start()
+{
+	if (!hb_is_mesh()) {
+		return;
+	}
+
+	MESH_LOCK();
+
+	mesh_listening_sockets_open();
+	channel_mesh_listening_socks_register(
+			&g_hb.mode_state.mesh_state.listening_sockets);
+
+	g_hb.mode_state.mesh_state.status = AS_HB_STATUS_RUNNING;
+
+	// Start the mesh tender thread. pthread_create() reports failure through
+	// its return value and does not set errno, so the returned error number is
+	// the one to describe.
+	int create_rv = pthread_create(&g_hb.mode_state.mesh_state.mesh_tender_tid,
+			0, mesh_tender, &g_hb);
+	if (create_rv != 0) {
+		CRASH("could not create mesh tender thread: %s",
+				cf_strerror(create_rv));
+	}
+
+	MESH_UNLOCK();
+}
+
+/**
+ * Stop the mesh module.
+ *
+ * Logs a warning and returns if the mesh module is not running. Otherwise
+ * flags shutdown, waits for the mesh tender thread, deregisters and closes the
+ * listening sockets and frees the published endpoint list.
+ */
+static void
+mesh_stop()
+{
+	if (!mesh_is_running()) {
+		WARNING("mesh is already stopped");
+		return;
+	}
+
+	// Unguarded state, but this should be OK.
+	g_hb.mode_state.mesh_state.status = AS_HB_STATUS_SHUTTING_DOWN;
+
+	// Wait for the mesh tender thread to finish.
+	pthread_join(g_hb.mode_state.mesh_state.mesh_tender_tid, NULL);
+
+	MESH_LOCK();
+
+	channel_mesh_listening_socks_deregister(
+			&g_hb.mode_state.mesh_state.listening_sockets);
+
+	mesh_listening_sockets_close();
+
+	g_hb.mode_state.mesh_state.status = AS_HB_STATUS_STOPPED;
+
+	// Clear allocated state if any.
+	if (g_hb.mode_state.mesh_state.published_endpoint_list) {
+		cf_free(g_hb.mode_state.mesh_state.published_endpoint_list);
+		g_hb.mode_state.mesh_state.published_endpoint_list = NULL;
+	}
+
+	MESH_UNLOCK();
+}
+
+/**
+ * Reduce function to dump mesh node info to log file.
+ *
+ * @param key pointer to the cf_node id of the mesh entry.
+ * @param data the as_hb_mesh_node stored for the entry.
+ * @param udata unused.
+ * @return CF_SHASH_OK always.
+ */
+static int
+mesh_dump_reduce(const void* key, void* data, void* udata)
+{
+	cf_node nodeid = *(cf_node*)key;
+	as_hb_mesh_node* mesh_node = (as_hb_mesh_node*)data;
+
+	char endpoint_list_str[ENDPOINT_LIST_STR_SIZE];
+	as_endpoint_list_to_string(mesh_node->endpoint_list, endpoint_list_str,
+			sizeof(endpoint_list_str));
+
+	INFO("\tHB Mesh Node: node-id %" PRIx64" status %s last-updated %" PRIu64 " endpoints {%s}",
+			nodeid, mesh_node_status_string(mesh_node->status),
+			mesh_node->last_status_updated, endpoint_list_str);
+
+	return CF_SHASH_OK;
+}
+
+/**
+ * Dump mesh state to logs.
+ * @param verbose enables / disables verbose logging.
+ */
+static void
+mesh_dump(bool verbose)
+{
+	// Only report in mesh mode and only when verbose output was requested.
+	if (!(hb_is_mesh() && verbose)) {
+		return;
+	}
+
+	MESH_LOCK();
+
+	// Log one line per configured seed.
+	cf_vector* seed_vec = &g_hb.mode_state.mesh_state.seeds;
+	int n_seeds = cf_vector_size(seed_vec);
+	INFO("HB Seed Count %d", n_seeds);
+
+	int seed_ix = 0;
+	while (seed_ix < n_seeds) {
+		as_hb_mesh_seed* seed = cf_vector_getp(seed_vec, seed_ix);
+
+		char endpoints[ENDPOINT_LIST_STR_SIZE];
+		as_endpoint_list_to_string(seed->resolved_endpoint_list, endpoints,
+				sizeof(endpoints));
+
+		INFO("\tHB Mesh Seed: host %s port %d node-id %" PRIx64" status %s endpoints {%s}",
+				seed->seed_host_name, seed->seed_port, seed->mesh_nodeid,
+				mesh_node_status_string(seed->status), endpoints);
+
+		seed_ix++;
+	}
+
+	// Log one line per known mesh node.
+	INFO("HB Mesh Nodes Count %d", cf_shash_get_size(g_hb.mode_state.mesh_state.nodeid_to_mesh_node));
+	cf_shash_reduce(g_hb.mode_state.mesh_state.nodeid_to_mesh_node,
+			mesh_dump_reduce, NULL);
+
+	MESH_UNLOCK();
+}
+
+/*
+ * ----------------------------------------------------------------------------
+ * Multicast sub module.
+ * ----------------------------------------------------------------------------
+ */
+
+/**
+ * Initialize multicast data structures. Currently there is nothing to set up.
+ */
+static void
+multicast_init()
+{
+}
+
+/**
+ * Clear multicast data structures.
+ */
+static void
+multicast_clear()
+{
+	// Free multicast data structures. Nothing to do.
+}
+
+/**
+ * Open multicast sockets. Crashes if open failed.
+ *
+ * Also stores the smallest MTU across the configured multicast interfaces in
+ * the multicast state, using DEFAULT_MIN_MTU when none could be determined.
+ */
+static void
+multicast_listening_sockets_open()
+{
+	MULTICAST_LOCK();
+
+	const cf_mserv_cfg* mserv_cfg = config_multicast_group_cfg_get();
+
+	// Compute min MTU across all binding interfaces.
+	int smallest_mtu = -1;
+	char addr_text[HOST_NAME_MAX];
+
+	for (uint32_t cfg_ix = 0; cfg_ix < mserv_cfg->n_cfgs; ++cfg_ix) {
+		const cf_msock_cfg* sock_cfg = &mserv_cfg->cfgs[cfg_ix];
+
+		cf_ip_addr_to_string_safe(&sock_cfg->addr, addr_text,
+				sizeof(addr_text));
+		INFO("initializing multicast heartbeat socket: %s:%d", addr_text,
+				sock_cfg->port);
+
+		// A wildcard interface address has no single interface, so the
+		// minimum across interfaces is used instead.
+		int interface_mtu;
+		if (cf_ip_addr_is_any(&sock_cfg->if_addr)) {
+			interface_mtu = cf_inter_min_mtu();
+		}
+		else {
+			interface_mtu = cf_inter_mtu(&sock_cfg->if_addr);
+		}
+
+		if (smallest_mtu == -1 || smallest_mtu > interface_mtu) {
+			smallest_mtu = interface_mtu;
+		}
+	}
+
+	if (cf_socket_mcast_init((cf_mserv_cfg*)mserv_cfg,
+			&g_hb.mode_state.multicast_state.listening_sockets) != 0) {
+		CRASH("couldn't initialize multicast heartbeat socket: %s",
+				cf_strerror(errno));
+	}
+
+	for (uint32_t sock_ix = 0;
+			sock_ix < g_hb.mode_state.multicast_state.listening_sockets.n_socks;
+			++sock_ix) {
+		DEBUG("opened multicast socket %d",
+				CSFD(
+						&g_hb.mode_state.multicast_state.listening_sockets.socks[sock_ix]));
+	}
+
+	if (smallest_mtu == -1) {
+		WARNING("error getting the min mtu - using the default %d",
+				DEFAULT_MIN_MTU);
+		smallest_mtu = DEFAULT_MIN_MTU;
+	}
+
+	g_hb.mode_state.multicast_state.min_mtu = smallest_mtu;
+
+	INFO("mtu of the network is %d", smallest_mtu);
+	MULTICAST_UNLOCK();
+}
+
+/**
+ * Start multicast module: open the listening sockets and register them with
+ * the channel sub module.
+ */
+static void
+multicast_start()
+{
+	MULTICAST_LOCK();
+
+	multicast_listening_sockets_open();
+	channel_multicast_listening_socks_register(
+			&g_hb.mode_state.multicast_state.listening_sockets);
+
+	MULTICAST_UNLOCK();
+}
+
+/**
+ * Close multicast listening socket.
+ */
+static void
+multicast_listening_sockets_close()
+{
+	MULTICAST_LOCK();
+
+	INFO("closing multicast heartbeat sockets");
+	cf_sockets_close(&g_hb.mode_state.multicast_state.listening_sockets);
+	DEBUG("closed multicast heartbeat socket");
+
+	MULTICAST_UNLOCK();
+}
+
+/**
+ * Stop Multicast.
+ */
+static void
+multicast_stop()
+{
+	MULTICAST_LOCK();
+
+	// Deregister from the channel sub module first, then close the sockets.
+	channel_multicast_listening_socks_deregister(
+			&g_hb.mode_state.multicast_state.listening_sockets);
+	multicast_listening_sockets_close();
+
+	MULTICAST_UNLOCK();
+}
+
+/**
+ * Dump multicast state to logs.
+ * @param verbose enables / disables verbose logging. Not consulted by this
+ * function.
+ */
+static void
+multicast_dump(bool verbose)
+{
+	if (!hb_is_mesh()) {
+		// Mode is multicast.
+		INFO("HB Multicast TTL: %d", config_multicast_ttl_get());
+	}
+}
+
+/**
+ * Find the maximum cluster size based on MTU of the network.
+ *
+ * num_nodes is computed so that
+ *
+ * MTU = (fixed_size + num_nodes * per_node_size) / compression_ratio
+ * where,
+ * fixed_size = udp_header_size + msg_header_size +
+ * sigma(per_plugin_fixed_size)
+ * per_node_size = sigma(per_plugin_per_node_size).
+ *
+ * @return the supported cluster size, never less than 1.
+ */
+static int
+multicast_supported_cluster_size_get()
+{
+	// Start from the fixed size of the message template and add what every
+	// plugin contributes, both per message and per node.
+	size_t fixed_sz = msg_get_template_fixed_sz(g_hb_msg_template,
+			sizeof(g_hb_msg_template) / sizeof(msg_template));
+	size_t per_node_sz = 0;
+
+	for (int plugin_ix = 0; plugin_ix < AS_HB_PLUGIN_SENTINEL; plugin_ix++) {
+		fixed_sz += g_hb.plugins[plugin_ix].wire_size_fixed;
+		per_node_sz += g_hb.plugins[plugin_ix].wire_size_per_node;
+	}
+
+	// TODO: Compute the max cluster size using max storage per node in cluster
+	// and the min mtu.
+	int cluster_size = MAX(1,
+			(((hb_mtu() - UDP_HEADER_SIZE_MAX) * MSG_COMPRESSION_RATIO)
+					- fixed_sz) / per_node_sz);
+
+	return cluster_size;
+}
+
+/*
+ * ----------------------------------------------------------------------------
+ * Heartbeat main sub module.
+ * ----------------------------------------------------------------------------
+ */
+
+/**
+ * Is Main module initialized.
+ */
+static bool
+hb_is_initialized()
+{
+	HB_LOCK();
+	bool initialized = g_hb.status != AS_HB_STATUS_UNINITIALIZED;
+	HB_UNLOCK();
+	return initialized;
+}
+
+/**
+ * Is Main module running.
+ */
+static bool
+hb_is_running()
+{
+	HB_LOCK();
+	bool running = g_hb.status == AS_HB_STATUS_RUNNING;
+	HB_UNLOCK();
+	return running;
+}
+
+/**
+ * Is Main module stopped.
+ */
+static bool
+hb_is_stopped()
+{
+	HB_LOCK();
+	bool stopped = g_hb.status == AS_HB_STATUS_STOPPED;
+	HB_UNLOCK();
+	return stopped;
+}
+
+/**
+ * Initialize the mode specific data structures.
+ */
+static void
+hb_mode_init()
+{
+	if (!hb_is_mesh()) {
+		multicast_init();
+		return;
+	}
+
+	mesh_init();
+}
+
+/**
+ * Start mode specific threads.
+ */
+static void
+hb_mode_start()
+{
+	if (!hb_is_mesh()) {
+		multicast_start();
+		return;
+	}
+
+	mesh_start();
+}
+
+/**
+ * The MTU for underlying network.
+ *
+ * A non-zero configured override wins. Otherwise the minimum MTU recorded for
+ * the active mode is used, or DEFAULT_MIN_MTU if that value is not positive.
+ */
+static int
+hb_mtu()
+{
+	int override_mtu = config_override_mtu_get();
+	if (override_mtu) {
+		return override_mtu;
+	}
+
+	int mode_mtu;
+	if (hb_is_mesh()) {
+		mode_mtu = g_hb.mode_state.mesh_state.min_mtu;
+	}
+	else {
+		mode_mtu = g_hb.mode_state.multicast_state.min_mtu;
+	}
+
+	if (mode_mtu > 0) {
+		return mode_mtu;
+	}
+
+	return DEFAULT_MIN_MTU;
+}
+
+/**
+ * Initialize the template to be used for heartbeat messages.
+ */
+static void
+hb_msg_init()
+{
+	// Register fabric heartbeat msg type with no processing function:
+	// This permits getting / putting heartbeat msgs to be moderated via an idle
+	// msg queue.
+	as_fabric_register_msg_fn(M_TYPE_HEARTBEAT, g_hb_msg_template,
+			sizeof(g_hb_msg_template), AS_HB_MSG_SCRATCH_SIZE, 0, 0);
+}
+
+/**
+ * Get hold of current heartbeat protocol version
+ */
+static uint32_t
+hb_protocol_identifier_get()
+{
+	return HB_PROTOCOL_V3_IDENTIFIER;
+}
+
+/**
+ * Node depart event time estimate. Assumes node departed timeout milliseconds
+ * before the detection.
+ */
+static cf_clock
+hb_node_depart_time(cf_clock detect_time)
+{
+	return (detect_time - HB_NODE_TIMEOUT());
+}
+
+/**
+ * Indicates if mode is mesh.
+ */
+static bool
+hb_is_mesh()
+{
+	return (config_mode_get() == AS_HB_MODE_MESH);
+}
+
+/**
+ * Publish an event to subsystems listening to heart beat events.
+ *
+ * One external event per node is pushed onto the external events queue. The
+ * events are delivered later by hb_event_publish_pending().
+ *
+ * @param event_type the internal event type to translate and queue.
+ * @param nodes the node ids the event applies to.
+ * @param node_count number of entries in nodes.
+ */
+static void
+hb_event_queue(as_hb_internal_event_type event_type, const cf_node* nodes,
+		int node_count)
+{
+	// Lock-less because the queue is thread safe and we do not use heartbeat
+	// state here.
+	for (int i = 0; i < node_count; i++) {
+		as_hb_event_node event;
+		// Start from an all-zero event so that no indeterminate bytes are
+		// pushed onto the queue.
+		memset(&event, 0, sizeof(event));
+		event.nodeid = nodes[i];
+		event.event_detected_time = cf_getms();
+
+		switch (event_type) {
+		case AS_HB_INTERNAL_NODE_ARRIVE:
+			event.evt = AS_HB_NODE_ARRIVE;
+			event.event_time = event.event_detected_time;
+			break;
+		case AS_HB_INTERNAL_NODE_DEPART:
+			// The node is assumed to have gone away one node timeout before
+			// the departure was noticed.
+			event.evt = AS_HB_NODE_DEPART;
+			event.event_time = hb_node_depart_time(event.event_detected_time);
+			break;
+		case AS_HB_INTERNAL_NODE_EVICT:
+			// Evictions are published as departures that happen now.
+			event.evt = AS_HB_NODE_DEPART;
+			event.event_time = event.event_detected_time;
+			break;
+		case AS_HB_INTERNAL_NODE_ADJACENCY_CHANGED:
+			event.evt = AS_HB_NODE_ADJACENCY_CHANGED;
+			event.event_time = event.event_detected_time;
+			break;
+		default:
+			// No external event corresponds to this type. Skip it instead of
+			// publishing an event with unset fields.
+			WARNING("ignoring heartbeat event of unknown type %d for node %" PRIx64,
+					(int)event_type, event.nodeid);
+			continue;
+		}
+
+		DEBUG("queuing event of type %d for node %" PRIx64, event.evt,
+				event.nodeid);
+		cf_queue_push(&g_hb_event_listeners.external_events_queue, &event);
+	}
+}
+
+/**
+ * Publish all pending events. Should be invoked outside hb locks.
+ *
+ * At most AS_HB_CLUSTER_MAX_SIZE_SOFT events are popped and delivered per
+ * call; every registered listener receives the same batch.
+ */
+static void
+hb_event_publish_pending()
+{
+	EXTERNAL_EVENT_PUBLISH_LOCK();
+	int num_events = cf_queue_sz(&g_hb_event_listeners.external_events_queue);
+	if (num_events <= 0) {
+		// Events need not be published.
+		goto Exit;
+	}
+
+	as_hb_event_node events[AS_HB_CLUSTER_MAX_SIZE_SOFT];
+	int published_count = 0;
+	while (published_count < AS_HB_CLUSTER_MAX_SIZE_SOFT
+			&& cf_queue_pop(&g_hb_event_listeners.external_events_queue,
+					&events[published_count], 0) == CF_QUEUE_OK) {
+		published_count++;
+	}
+
+	if (published_count) {
+		// Assuming that event listeners are not registered after system init,
+		// no locks here.
+		DEBUG("publishing %d heartbeat events", published_count);
+		for (int i = 0; i < g_hb_event_listeners.event_listener_count; i++) {
+			(g_hb_event_listeners.event_listeners[i].event_callback)(
+					published_count, events,
+					g_hb_event_listeners.event_listeners[i].udata);
+		}
+	}
+
+Exit:
+	EXTERNAL_EVENT_PUBLISH_UNLOCK();
+}
+
+/**
+ * Delete the heap allocated data while iterating through the hash and deleting
+ * entries.
+ *
+ * @param key pointer to the cf_node id of the entry.
+ * @param data the as_hb_adjacent_node stored for the entry.
+ * @param udata unused.
+ * @return CF_SHASH_REDUCE_DELETE always.
+ */
+static int
+hb_adjacency_free_data_reduce(const void* key, void* data, void* udata)
+{
+	as_hb_adjacent_node* adjacent_node = (as_hb_adjacent_node*)data;
+
+	const cf_node* nodeid = (const cf_node*)key;
+
+	hb_adjacent_node_destroy(adjacent_node);
+
+	// Queue a depart event for this node.
+	hb_event_queue(AS_HB_INTERNAL_NODE_DEPART, nodeid, 1);
+
+	return CF_SHASH_REDUCE_DELETE;
+}
+
+/**
+ * Clear the heartbeat data structures.
+ *
+ * Only acts when the heartbeat module is stopped; otherwise logs a warning
+ * and returns.
+ */
+static void
+hb_clear()
+{
+	if (!hb_is_stopped()) {
+		WARNING("attempted to clear heartbeat module without stopping it");
+		return;
+	}
+
+	HB_LOCK();
+
+	// Free the plugin data and delete adjacent nodes.
+	cf_shash_reduce(g_hb.adjacency, hb_adjacency_free_data_reduce, NULL);
+	cf_shash_reduce(g_hb.on_probation, hb_adjacency_free_data_reduce, NULL);
+	hb_adjacent_node_destroy(&g_hb.self_node);
+	memset(&g_hb.self_node, 0, sizeof(g_hb.self_node));
+
+	HB_UNLOCK();
+
+	// Publish node departed events for the removed nodes.
+	hb_event_publish_pending();
+
+	// Clear the mode module.
+	if (hb_is_mesh()) {
+		mesh_clear();
+	}
+	else {
+		multicast_clear();
+	}
+
+	channel_clear();
+}
+
+/**
+ * Reduce function to get hold of current adjacency list.
+ *
+ * @param key pointer to the cf_node id of the entry.
+ * @param data unused.
+ * @param udata an as_hb_adjacency_reduce_udata; the node id is appended to its
+ * adj_list and adj_count is incremented. The caller must provide a list large
+ * enough for every entry of the hash being reduced.
+ * @return CF_SHASH_OK always.
+ */
+static int
+hb_adjacency_iterate_reduce(const void* key, void* data, void* udata)
+{
+	const cf_node* nodeid = (const cf_node*)key;
+	as_hb_adjacency_reduce_udata* adjacency_reduce_udata =
+			(as_hb_adjacency_reduce_udata*)udata;
+
+	adjacency_reduce_udata->adj_list[adjacency_reduce_udata->adj_count] =
+			*nodeid;
+	adjacency_reduce_udata->adj_count++;
+
+	return CF_SHASH_OK;
+}
+
+/**
+ * Plugin function to set heartbeat adjacency list into a pulse message.
+ *
+ * Also sets the cluster name on the message when one is configured. Crashes if
+ * the cluster name cannot be set.
+ *
+ * @param msg the outgoing pulse message.
+ */
+static void
+hb_plugin_set_fn(msg* msg)
+{
+	HB_LOCK();
+
+	// A variable length array must have at least one element, so reserve a
+	// single slot even when the adjacency hash is empty.
+	size_t adj_capacity = cf_shash_get_size(g_hb.adjacency);
+	cf_node adj_list[adj_capacity > 0 ? adj_capacity : 1];
+	as_hb_adjacency_reduce_udata adjacency_reduce_udata = { adj_list, 0 };
+
+	cf_shash_reduce(g_hb.adjacency, hb_adjacency_iterate_reduce,
+			&adjacency_reduce_udata);
+
+	HB_UNLOCK();
+
+	// Populate adjacency list.
+	msg_adjacency_set(msg, adj_list, adjacency_reduce_udata.adj_count);
+
+	// Set cluster name.
+	char cluster_name[AS_CLUSTER_NAME_SZ];
+	as_config_cluster_name_get(cluster_name);
+
+	if (cluster_name[0] != '\0'
+			&& msg_set_str(msg, AS_HB_MSG_CLUSTER_NAME, cluster_name,
+					MSG_SET_COPY) != 0) {
+		CRASH("error setting cluster name on msg");
+	}
+}
+
+/**
+ * Plugin function that parses adjacency list out of a heartbeat pulse message.
+ *
+ * The source node itself is filtered out of the stored list.
+ *
+ * @param msg the received pulse message.
+ * @param source the node id of the sender.
+ * @param plugin_data (output) receives the adjacency list; its buffer is grown
+ * when the list does not fit and data_size is set to the stored byte count.
+ */
+static void
+hb_plugin_parse_data_fn(msg* msg, cf_node source,
+		as_hb_plugin_node_data* plugin_data)
+{
+	size_t adj_length = 0;
+	cf_node* adj_list = NULL;
+
+	if (msg_adjacency_get(msg, &adj_list, &adj_length) != 0) {
+		// Store a zero length adjacency list. Should not have happened.
+		WARNING("received heartbeat without adjacency list %" PRIx64, source);
+		adj_length = 0;
+	}
+
+	// The guess can be larger for older protocols which also include self node
+	// in the adjacency list. Kept as a size_t so that the byte count is never
+	// narrowed before it is compared with the capacity.
+	size_t guessed_data_size = adj_length * sizeof(cf_node);
+
+	if (guessed_data_size > plugin_data->data_capacity) {
+		// Round up to nearest multiple of block size to prevent very frequent
+		// reallocation.
+		size_t data_capacity = ((guessed_data_size + HB_PLUGIN_DATA_BLOCK_SIZE
+				- 1) /
+			HB_PLUGIN_DATA_BLOCK_SIZE) *
+			HB_PLUGIN_DATA_BLOCK_SIZE;
+
+		// Reallocate since we have outgrown existing capacity.
+		plugin_data->data = cf_realloc(plugin_data->data, data_capacity);
+		plugin_data->data_capacity = data_capacity;
+	}
+
+	cf_node* dest_list = (cf_node*)(plugin_data->data);
+
+	size_t final_list_length = 0;
+	for (size_t i = 0; i < adj_length; i++) {
+		if (adj_list[i] == source) {
+			// Skip the source node.
+			continue;
+		}
+		dest_list[final_list_length++] = adj_list[i];
+	}
+
+	plugin_data->data_size = (final_list_length * sizeof(cf_node));
+}
+
+/**
+ * Get the msg buffer from a pool based on the protocol under use.
+ * @return the msg buff
+ */
+static msg*
+hb_msg_get()
+{
+	return as_fabric_msg_get(M_TYPE_HEARTBEAT);
+}
+
+/**
+ * Return the message buffer back to the pool.
+ */
+static void
+hb_msg_return(msg* msg)
+{
+	as_fabric_msg_put(msg);
+}
+
+/**
+ * Fill the outgoing pulse message with plugin specific data.
+ *
+ * Note: The set functions would be acquiring their locks. This function should
+ * never directly use nor have a call stack under HB_LOCK.
+ *
+ * @param msg the outgoing pulse message.
+ */
+static void
+hb_plugin_msg_fill(msg* msg)
+{
+	for (int i = 0; i < AS_HB_PLUGIN_SENTINEL; i++) {
+		// Plugins without a set function contribute nothing to the pulse.
+		if (g_hb.plugins[i].set_fn) {
+			(g_hb.plugins[i].set_fn)(msg);
+		}
+	}
+}
+
+/**
+ * Parse fields from the message into plugin specific data.
+ * @param msg the received pulse message.
+ * @param adjacent_node the node from which this message was received.
+ * @param plugin_data_changed (output) array whose ith entry is set to true if
+ * ith plugin's data changed, false otherwise. Should be large enough to hold
+ * flags for all plugins.
+ */
+static void
+hb_plugin_msg_parse(msg* msg, as_hb_adjacent_node* adjacent_node,
+		as_hb_plugin* plugins, bool plugin_data_changed[])
+{
+	cf_node source;
+
+	// Each plugin keeps two data slots per node. Advancing the cycler swaps
+	// which slot is treated as current and which as previous.
+	adjacent_node->plugin_data_cycler++;
+
+	msg_nodeid_get(msg, &source);
+
+	for (int plugin_ix = 0; plugin_ix < AS_HB_PLUGIN_SENTINEL; plugin_ix++) {
+		plugin_data_changed[plugin_ix] = false;
+
+		if (!plugins[plugin_ix].parse_fn) {
+			// This plugin does not parse anything out of the message.
+			continue;
+		}
+
+		as_hb_plugin_node_data* slots = adjacent_node->plugin_data[plugin_ix];
+		as_hb_plugin_node_data* curr_data =
+				&slots[adjacent_node->plugin_data_cycler % 2];
+		as_hb_plugin_node_data* prev_data =
+				&slots[(adjacent_node->plugin_data_cycler + 1) % 2];
+
+		// Ensure there is a preallocated data pointer.
+		if (curr_data->data == NULL) {
+			curr_data->data = cf_malloc(HB_PLUGIN_DATA_DEFAULT_SIZE);
+			curr_data->data_capacity = HB_PLUGIN_DATA_DEFAULT_SIZE;
+			curr_data->data_size = 0;
+		}
+
+		// Parse message data into current data.
+		(plugins[plugin_ix]).parse_fn(msg, source, curr_data);
+
+		if (!plugins[plugin_ix].change_listener) {
+			// No change listener configured. Skip detecting change.
+			continue;
+		}
+
+		// A slot holding zero bytes is treated as having no data at all.
+		size_t curr_size = curr_data->data_size;
+		void* curr_blob = curr_size ? curr_data->data : NULL;
+
+		size_t prev_size = prev_data->data_size;
+		void* prev_blob = prev_size ? prev_data->data : NULL;
+
+		if (prev_blob == curr_blob) {
+			// Old and new data both NULL or both point to the same memory
+			// location.
+			plugin_data_changed[plugin_ix] = false;
+		}
+		else if (prev_size != curr_size || prev_blob == NULL
+				|| curr_blob == NULL) {
+			// Plugin data definitely changed, as the data sizes differ or
+			// exactly one of old or new data pointers is NULL.
+			plugin_data_changed[plugin_ix] = true;
+		}
+		else {
+			// Same size and both present: compare the contents.
+			plugin_data_changed[plugin_ix] = memcmp(prev_blob, curr_blob,
+					curr_size) != 0;
+		}
+	}
+}
+
+/**
+ * Adjacency list for an adjacent node changed.
+ */
+static void
+hb_plugin_data_change_listener(cf_node changed_node_id)
+{
+	// Queue a single adjacency changed event for the node.
+	hb_event_queue(AS_HB_INTERNAL_NODE_ADJACENCY_CHANGED, &changed_node_id, 1);
+}
+
+/**
+ * Initialize the plugin specific data structures.
+ *
+ * Clears the plugin table and registers the heartbeat module itself as the
+ * plugin that carries the adjacency list.
+ */
+static void
+hb_plugin_init()
+{
+	memset(&g_hb.plugins, 0, sizeof(g_hb.plugins));
+
+	// Be cute. Register self as a plugin.
+	as_hb_plugin hb_plugin;
+	memset(&hb_plugin, 0, sizeof(hb_plugin));
+
+	hb_plugin.id = AS_HB_PLUGIN_HB;
+
+	// One node id per adjacent node on the wire, nothing fixed.
+	hb_plugin.wire_size_fixed = 0;
+	hb_plugin.wire_size_per_node = sizeof(cf_node);
+
+	hb_plugin.set_fn = hb_plugin_set_fn;
+	hb_plugin.parse_fn = hb_plugin_parse_data_fn;
+	hb_plugin.change_listener = hb_plugin_data_change_listener;
+
+	hb_plugin_register(&hb_plugin);
+}
+
+/**
+ * Transmits heartbeats at fixed intervals.
+ *
+ * Thread entry point. Loops for as long as the heartbeat module is running.
+ *
+ * @param arg unused.
+ * @return NULL always.
+ */
+void*
+hb_transmitter(void* arg)
+{
+	DETAIL("heartbeat transmitter started");
+
+	cf_clock last_sent = 0;
+
+	while (hb_is_running()) {
+		cf_clock now = cf_getms();
+
+		if ((now - last_sent) >= PULSE_TRANSMIT_INTERVAL()) {
+			last_sent = now;
+
+			// Construct the pulse message.
+			msg* pulse = hb_msg_get();
+
+			msg_src_fields_fill(pulse);
+			msg_type_set(pulse, AS_HB_MSG_TYPE_PULSE);
+
+			// Have plugins fill their data into the heartbeat pulse message.
+			hb_plugin_msg_fill(pulse);
+
+			// Broadcast the heartbeat to all known recipients.
+			channel_msg_broadcast(pulse);
+
+			// Return the msg back to the fabric.
+			hb_msg_return(pulse);
+
+			DETAIL("done sending pulse message");
+		}
+		else {
+			// Interval has not been reached for sending heartbeats. Sleep
+			// until it is due, but never longer than the minimum interval.
+			usleep(MIN(AS_HB_TX_INTERVAL_MS_MIN, (last_sent +
+					PULSE_TRANSMIT_INTERVAL()) - now) * 1000);
+		}
+	}
+
+	DETAIL("heartbeat transmitter stopped");
+	return NULL;
+}
+
+/**
+ * Get hold of adjacent node information given its nodeid.
+ * @param nodeid the nodeid.
+ * @param adjacent_node the output node information.
+ * @return 0 on success, -1 on failure.
+ */
+static int
+hb_adjacent_node_get(cf_node nodeid, as_hb_adjacent_node* adjacent_node)
+{
+	HB_LOCK();
+
+	// Look the node up in the adjacency hash.
+	int rv = cf_shash_get(g_hb.adjacency, &nodeid, adjacent_node)
+			== CF_SHASH_OK ? 0 : -1;
+
+	HB_UNLOCK();
+	return rv;
+}
+
+/**
+ * Get hold of an on-probation node information given its nodeid.
+ * @param nodeid the nodeid.
+ * @param adjacent_node the output node information.
+ * @return 0 on success, -1 on failure.
+ */
+static int
+hb_on_probation_node_get(cf_node nodeid, as_hb_adjacent_node* adjacent_node)
+{
+	HB_LOCK();
+
+	// Look the node up in the on-probation hash.
+	int rv = cf_shash_get(g_hb.on_probation, &nodeid, adjacent_node)
+			== CF_SHASH_OK ? 0 : -1;
+
+	HB_UNLOCK();
+	return rv;
+}
+
+/**
+ * Read the plugin data from an adjacent node.
+ * @param adjacent_node the adjacent node.
+ * @param plugin_id the plugin whose data is requested.
+ * @param plugin_data (output) will be null if this node has no plugin data.
+ * Else will point to the plugin data.
+ * @param plugin_data_size (output) the size of the plugin data.
+ */
+static void
+hb_adjacent_node_plugin_data_get(as_hb_adjacent_node* adjacent_node,
+		as_hb_plugin_id plugin_id, void** plugin_data, size_t* plugin_data_size)
+{
+	// The slot selected by the cycler holds the most recently parsed data.
+	as_hb_plugin_node_data* current =
+			&adjacent_node->plugin_data[plugin_id][adjacent_node->plugin_data_cycler
+					% 2];
+
+	*plugin_data_size = current->data_size;
+
+	if (*plugin_data_size) {
+		*plugin_data = (cf_node*)(current->data);
+	}
+	else {
+		*plugin_data = NULL;
+	}
+}
+
+/**
+ * Get adjacency list for an adjacent node.
+ * @param adjacent_node the adjacent node.
+ * @param adjacency_list (output) the node's adjacency list, NULL if it has
+ * none.
+ * @param adjacency_length (output) number of node ids in the list.
+ */
+static void
+hb_adjacent_node_adjacency_get(as_hb_adjacent_node* adjacent_node,
+		cf_node** adjacency_list, size_t* adjacency_length)
+{
+	hb_adjacent_node_plugin_data_get(adjacent_node, AS_HB_PLUGIN_HB,
+			(void**)adjacency_list, adjacency_length);
+
+	// The plugin data size is in bytes; convert it to a count of node ids.
+	*adjacency_length = *adjacency_length / sizeof(cf_node);
+}
+
+/**
+ * Indicates if a given node has expired and should be removed from the
+ * adjacency list.
+ */
+static bool
+hb_node_has_expired(cf_node nodeid, as_hb_adjacent_node* adjacent_node)
+{
+	// The self node never expires.
+	if (nodeid == config_self_nodeid_get()) {
+		return false;
+	}
+
+	HB_LOCK();
+
+	cf_clock current_ms = cf_getms();
+	cf_clock expires_at = adjacent_node->last_updated_monotonic_ts
+			+ HB_NODE_TIMEOUT();
+	bool has_expired = expires_at < current_ms;
+
+	HB_UNLOCK();
+	return has_expired;
+}
+
+/**
+ * Indicates if self node has duplicate ids.
+ */
+static bool
+hb_self_is_duplicate()
+{
+	HB_LOCK();
+	bool is_duplicate = g_hb.self_is_duplicate;
+	HB_UNLOCK();
+	return is_duplicate;
+}
+
+/**
+ * Updates the self is duplicate flag.
+ *
+ * The flag is cleared once no duplicate has been detected for the endpoint
+ * change tracking interval.
+ */
+static void
+hb_self_duplicate_update()
+{
+	cf_clock current_ms = cf_getms();
+
+	HB_LOCK();
+
+	if (g_hb.self_is_duplicate) {
+		uint32_t block_interval = config_endpoint_track_intervals_get()
+				* config_tx_interval_get();
+		bool block_elapsed = g_hb.self_duplicate_detected_ts + block_interval
+				<= current_ms;
+
+		if (block_elapsed) {
+			// We have not seen duplicates for the endpoint change tracking
+			// interval. Mark ourself as non-duplicate.
+			g_hb.self_is_duplicate = false;
+		}
+	}
+
+	HB_UNLOCK();
+}
+
+/**
+ * Free up space occupied by plugin data from adjacent node.
+ *
+ * Both data slots of every plugin are released and reset, and the node's
+ * endpoint list is freed.
+ */
+static void
+hb_adjacent_node_destroy(as_hb_adjacent_node* adjacent_node)
+{
+	HB_LOCK();
+
+	for (int plugin_ix = 0; plugin_ix < AS_HB_PLUGIN_SENTINEL; plugin_ix++) {
+		as_hb_plugin_node_data* slots = adjacent_node->plugin_data[plugin_ix];
+
+		for (int slot_ix = 0; slot_ix < 2; slot_ix++) {
+			as_hb_plugin_node_data* slot = &slots[slot_ix];
+
+			if (slot->data) {
+				cf_free(slot->data);
+				slot->data = NULL;
+			}
+
+			slot->data_capacity = 0;
+			slot->data_size = 0;
+		}
+	}
+
+	if (adjacent_node->endpoint_list) {
+		// Free the endpoint list.
+		cf_free(adjacent_node->endpoint_list);
+		adjacent_node->endpoint_list = NULL;
+	}
+
+	HB_UNLOCK();
+}
+
+/**
+ * Tend reduce function that removes expired nodes from adjacency list.
+ */
+static int
+hb_adjacency_tend_reduce(const void* key, void* data, void* udata)
+{
+	cf_node nodeid = *(const cf_node*)key;
+	as_hb_adjacent_node* adjacent_node = (as_hb_adjacent_node*)data;
+	as_hb_adjacency_tender_udata* adjacency_tender_udata =
+			(as_hb_adjacency_tender_udata*)udata;
+
+	int rv = CF_SHASH_OK;
+	bool cluster_name_mismatch = adjacent_node->cluster_name_mismatch_count
+			> CLUSTER_NAME_MISMATCH_MAX;
+	if (hb_node_has_expired(nodeid, adjacent_node) || cluster_name_mismatch) {
+		INFO("node expired %" PRIx64" %s", nodeid, cluster_name_mismatch ? "(cluster name mismatch)" : "");
+		// A cluster name mismatch is reported as an eviction, a plain timeout
+		// as a dead node. The caller sizes both output arrays.
+		if (cluster_name_mismatch) {
+			adjacency_tender_udata->evicted_nodes[adjacency_tender_udata->evicted_node_count++] =
+					nodeid;
+		}
+		else {
+			adjacency_tender_udata->dead_nodes[adjacency_tender_udata->dead_node_count++] =
+					nodeid;
+		}
+
+		// Free plugin data as well.
+		hb_adjacent_node_destroy(adjacent_node);
+
+		rv = CF_SHASH_REDUCE_DELETE;
+	}
+
+	return rv;
+}
+
+/**
+ * Tend reduce function that removes expired nodes from the probationary list.
+ *
+ * @param key pointer to the cf_node id of the entry.
+ * @param data the as_hb_adjacent_node stored for the entry.
+ * @param udata unused.
+ * @return CF_SHASH_REDUCE_DELETE if the node expired, CF_SHASH_OK otherwise.
+ */
+static int
+hb_on_probation_tend_reduce(const void* key, void* data, void* udata)
+{
+	cf_node nodeid = *(const cf_node*)key;
+	as_hb_adjacent_node* adjacent_node = (as_hb_adjacent_node*)data;
+
+	int rv = CF_SHASH_OK;
+	if (hb_node_has_expired(nodeid, adjacent_node)) {
+		DEBUG("on-probation node %" PRIx64 " expired", nodeid);
+		// Free plugin data as well.
+		hb_adjacent_node_destroy(adjacent_node);
+		rv = CF_SHASH_REDUCE_DELETE;
+	}
+	return rv;
+}
+
+/**
+ * Tends the adjacency list. Removes nodes that expire.
+ *
+ * Thread entry point. Loops for as long as the heartbeat module is running
+ * and publishes pending events on every pass.
+ *
+ * @param arg unused.
+ * @return NULL always.
+ */
+void*
+hb_adjacency_tender(void* arg)
+{
+	DETAIL("adjacency tender started");
+
+	cf_clock last_time = 0;
+	cf_clock last_depart_time = 0;
+
+	while (hb_is_running()) {
+		cf_clock curr_time = cf_getms();
+		uint32_t adjacency_tend_interval = ADJACENCY_TEND_INTERVAL;
+		// Interval after node depart where we tend faster to detect additional
+		// node departures.
+		uint32_t fast_check_interval = 2 * config_tx_interval_get();
+		if (last_depart_time + fast_check_interval > curr_time) {
+			adjacency_tend_interval = ADJACENCY_FAST_TEND_INTERVAL;
+		}
+
+		hb_self_duplicate_update();
+
+		if ((curr_time - last_time) < adjacency_tend_interval) {
+			// Publish any pending events.
+			hb_event_publish_pending();
+
+			// Interval has not been reached for tending the adjacency list.
+			usleep(
+					MIN(AS_HB_TX_INTERVAL_MS_MIN,
+							(last_time + adjacency_tend_interval) - curr_time)
+							* 1000);
+			continue;
+		}
+
+		last_time = curr_time;
+
+		DETAIL("tending adjacency list");
+
+		HB_LOCK();
+		// In the worst case every adjacent node is reported. A variable length
+		// array must have at least one element, so reserve a single slot even
+		// when the adjacency hash is empty.
+		size_t adjacency_size = cf_shash_get_size(g_hb.adjacency);
+		size_t node_capacity = adjacency_size > 0 ? adjacency_size : 1;
+		cf_node dead_nodes[node_capacity];
+		cf_node evicted_nodes[node_capacity];
+		as_hb_adjacency_tender_udata adjacency_tender_udata;
+		adjacency_tender_udata.dead_nodes = dead_nodes;
+		adjacency_tender_udata.dead_node_count = 0;
+		adjacency_tender_udata.evicted_nodes = evicted_nodes;
+		adjacency_tender_udata.evicted_node_count = 0;
+
+		cf_shash_reduce(g_hb.adjacency, hb_adjacency_tend_reduce,
+				&adjacency_tender_udata);
+
+		if (adjacency_tender_udata.dead_node_count > 0) {
+			last_depart_time = curr_time;
+			// Queue events for dead nodes.
+			hb_event_queue(AS_HB_INTERNAL_NODE_DEPART, dead_nodes,
+					adjacency_tender_udata.dead_node_count);
+		}
+
+		if (adjacency_tender_udata.evicted_node_count > 0) {
+			last_depart_time = curr_time;
+			// Queue events for evicted nodes.
+			hb_event_queue(AS_HB_INTERNAL_NODE_EVICT, evicted_nodes,
+					adjacency_tender_udata.evicted_node_count);
+		}
+
+		// Expire nodes from the on-probation list.
+		cf_shash_reduce(g_hb.on_probation, hb_on_probation_tend_reduce, NULL);
+		HB_UNLOCK();
+
+		// See if we have pending events to publish.
+		hb_event_publish_pending();
+
+		DETAIL("done tending adjacency list");
+	}
+
+	DETAIL("adjacency tender shut down");
+	return NULL;
+}
+
+/**
+ * Start the transmitter thread. Crashes if the thread cannot be created.
+ */
+static void
+hb_tx_start()
+{
+	// pthread_create() reports failure through its return value and does not
+	// set errno, so the returned error number is the one to describe.
+	int create_rv = pthread_create(&g_hb.transmitter_tid, 0, hb_transmitter,
+			&g_hb);
+	if (create_rv != 0) {
+		CRASH("could not create heartbeat transmitter thread: %s",
+				cf_strerror(create_rv));
+	}
+}
+
+/**
+ * Stop the transmitter thread.
+ */
+static void
+hb_tx_stop()
+{
+	DETAIL("waiting for the transmitter thread to stop");
+	// Wait for the transmitter thread to stop.
+	pthread_join(g_hb.transmitter_tid, NULL);
+}
+
+/**
+ * Start the adjacency tender thread. Crashes if the thread cannot be created.
+ */
+static void
+hb_adjacency_tender_start()
+{
+	// pthread_create() reports failure through its return value and does not
+	// set errno, so the returned error number is the one to describe.
+	int create_rv = pthread_create(&g_hb.adjacency_tender_tid, 0,
+			hb_adjacency_tender, &g_hb);
+	if (create_rv != 0) {
+		CRASH("could not create heartbeat adjacency tender thread: %s",
+				cf_strerror(create_rv));
+	}
+}
+
+/**
+ * Stop the adjacency tender thread.
+ */
+static void
+hb_adjacency_tender_stop()
+{
+	// Wait for the adjacency tender thread to stop.
+	pthread_join(g_hb.adjacency_tender_tid, NULL);
+}
+
+/**
+ * Initialize the heartbeat subsystem.
+ *
+ * Logs a warning and returns if the main module is already initialized. On
+ * return the module is in the stopped state.
+ */
+static void
+hb_init()
+{
+	if (hb_is_initialized()) {
+		WARNING("heartbeat main module is already initialized");
+		return;
+	}
+
+	// Operate under a lock. Let's be paranoid everywhere.
+	HB_LOCK();
+
+	// Initialize the heartbeat data structure.
+	memset(&g_hb, 0, sizeof(g_hb));
+
+	// Initialize the adjacency hash.
+	g_hb.adjacency = cf_shash_create(cf_nodeid_shash_fn, sizeof(cf_node),
+			sizeof(as_hb_adjacent_node), AS_HB_CLUSTER_MAX_SIZE_SOFT, 0);
+
+	// Initialize the on_probation hash.
+	g_hb.on_probation = cf_shash_create(cf_nodeid_shash_fn, sizeof(cf_node),
+			sizeof(as_hb_adjacent_node), AS_HB_CLUSTER_MAX_SIZE_SOFT, 0);
+
+	// Initialize the temporary hash to map nodeid to index.
+	g_hb.nodeid_to_index = cf_shash_create(cf_nodeid_shash_fn, sizeof(cf_node),
+			sizeof(int), AS_HB_CLUSTER_MAX_SIZE_SOFT, 0);
+
+	// Initialize unpublished event queue.
+	cf_queue_init(&g_hb_event_listeners.external_events_queue,
+			sizeof(as_hb_event_node),
+			AS_HB_CLUSTER_MAX_SIZE_SOFT, true);
+
+	// Initialize the mode specific state.
+	hb_mode_init();
+
+	// Initialize the plugin functions.
+	hb_plugin_init();
+
+	// Initialize IO channel subsystem.
+	channel_init();
+
+	g_hb.status = AS_HB_STATUS_STOPPED;
+
+	HB_UNLOCK();
+}
+
+/**
+ * Start the heartbeat subsystem.
+ *
+ * If the subsystem is already running it is stopped first and then started
+ * again.
+ */
+static void
+hb_start()
+{
+	// Operate under a lock. Let's be paranoid everywhere.
+	HB_LOCK();
+
+	// NOTE(review): hb_stop() joins threads whose loops call hb_is_running(),
+	// which takes HB_LOCK. Invoking it while HB_LOCK is held here looks like
+	// it could block forever if the subsystem really were running - confirm.
+	if (hb_is_running()) {
+		// Shutdown the heartbeat subsystem.
+		hb_stop();
+	}
+
+	g_hb.status = AS_HB_STATUS_RUNNING;
+
+	// Initialize the heartbeat message templates. Called from here because
+	// fabric needs to be initialized for this call to succeed. Fabric init
+	// happens after heartbeat init.
+	hb_msg_init();
+
+	// Initialize channel sub module.
+	channel_start();
+
+	// Start the mode sub module
+	hb_mode_start();
+
+	// Start heart beat transmitter.
+	hb_tx_start();
+
+	// Start heart beat adjacency tender.
+	hb_adjacency_tender_start();
+
+	HB_UNLOCK();
+}
+
+/**
+ * Shut down the heartbeat subsystem.
+ *
+ * Logs a warning and returns if the subsystem is not running.
+ */
+static void
+hb_stop()
+{
+	if (!hb_is_running()) {
+		WARNING("heartbeat is already stopped");
+		return;
+	}
+
+	// Flag shutdown; the worker loops stop once hb_is_running() turns false.
+	HB_LOCK();
+	g_hb.status = AS_HB_STATUS_SHUTTING_DOWN;
+	HB_UNLOCK();
+
+	// Publish pending events. Should not delay any events.
+	hb_event_publish_pending();
+
+	// Shutdown mode.
+	if (hb_is_mesh()) {
+		mesh_stop();
+	}
+	else {
+		multicast_stop();
+	}
+
+	// Wait for the threads to shut down.
+	hb_tx_stop();
+
+	hb_adjacency_tender_stop();
+
+	// Stop channels.
+	channel_stop();
+
+	// Update the status under the lock, like every other reader and writer of
+	// the status in this module.
+	HB_LOCK();
+	g_hb.status = AS_HB_STATUS_STOPPED;
+	HB_UNLOCK();
+}
+
+/**
+ * Register a plugin with the heart beat system.
+ *
+ * The plugin descriptor is copied into the plugin table slot selected by its
+ * id, replacing whatever was registered there.
+ *
+ * @param plugin the plugin descriptor to copy.
+ */
+static void
+hb_plugin_register(as_hb_plugin* plugin)
+{
+	HB_LOCK();
+	memcpy(&g_hb.plugins[plugin->id], plugin, sizeof(as_hb_plugin));
+	HB_UNLOCK();
+}
+
+/**
+ * Check if the heartbeat received is duplicate or stale.
/**
 * Indicates if endpoint changes for this node are normal.
 *
 * The tracker is a bit history maintained by
 * hb_endpoint_change_tracker_update(): it is shifted left by one on every
 * update and the least significant bit is set when the endpoints changed on
 * that update. Only the most recent configured number of intervals (capped at
 * 64, the width of the tracker) are considered.
 *
 * @param tracker the endpoint change history.
 * @return true if the number of changes recorded in the tracked window does
 * not exceed the configured allowance, false otherwise.
 */
static bool
hb_endpoint_change_tracker_is_normal(uint64_t tracker)
{
	if (tracker == 0) {
		// Normal and healthy case.
		return true;
	}

	uint32_t num_intervals_to_track = MIN(64,
			config_endpoint_track_intervals_get());

	// Shifting a 64 bit value by 64 or more bits is undefined behavior (and on
	// common hardware yields an empty mask), so select the full mask
	// explicitly when the whole history is tracked.
	uint64_t mask = num_intervals_to_track >= 64 ?
			~(uint64_t)0 : ~(~(uint64_t)0 << num_intervals_to_track);

	// Ignore older history.
	tracker &= mask;

	// Count the set bits a nibble at a time using a lookup table.
	int flip_count = 0;
	static const int nibblebits[] = { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 };
	for (; tracker != 0; tracker >>= 4) {
		flip_count += nibblebits[tracker & 0x0f];
	}

	return flip_count <= config_endpoint_changes_allowed_get();
}
+ as_endpoint_list* msg_endpoint_list; + if (msg_endpoint_list_get(msg, &msg_endpoint_list) == 0 + && !as_endpoint_lists_are_equal(adjacent_node->endpoint_list, + msg_endpoint_list)) { + // Update the endpoints. + endpoint_list_copy(&adjacent_node->endpoint_list, msg_endpoint_list); + } + + // Populate plugin data. + hb_plugin_msg_parse(msg, adjacent_node, g_hb.plugins, plugin_data_changed); + + // Update the last updated time. + adjacent_node->last_updated_monotonic_ts = cf_getms(); + memcpy(&adjacent_node->last_msg_hlc_ts, &msg_event->msg_hlc_ts, + sizeof(adjacent_node->last_msg_hlc_ts)); + + // Update the latency. + int64_t latency = as_hlc_timestamp_diff_ms(msg_event->msg_hlc_ts.send_ts, + msg_event->msg_hlc_ts.recv_ts); + latency = latency < 0 ? -latency : latency; + adjacent_node->avg_latency = ALPHA * latency + + (1 - ALPHA) * adjacent_node->avg_latency; + + // Reset the cluster-name mismatch counter to zero. + adjacent_node->cluster_name_mismatch_count = 0; + + // Check if fabric endpoints have changed. + as_hb_plugin_node_data* curr_data = + &adjacent_node->plugin_data[AS_HB_PLUGIN_FABRIC][adjacent_node->plugin_data_cycler + % 2]; + + as_hb_plugin_node_data* prev_data = + &adjacent_node->plugin_data[AS_HB_PLUGIN_FABRIC][(adjacent_node->plugin_data_cycler + + 1) % 2]; + + as_endpoint_list* curr_fabric_endpoints = + as_fabric_hb_plugin_get_endpoint_list(curr_data); + as_endpoint_list* prev_fabric_endpoints = + as_fabric_hb_plugin_get_endpoint_list(prev_data); + + // Endpoints changed if this is not the first update where there is no + // previous data or if the endpoint lists do not match. 
+ bool endpoints_changed = prev_fabric_endpoints != NULL + && !as_endpoint_lists_are_equal(curr_fabric_endpoints, + prev_fabric_endpoints); + + if (endpoints_changed) { + char curr_fabric_endpoints_str[ENDPOINT_LIST_STR_SIZE]; + char prev_fabric_endpoints_str[ENDPOINT_LIST_STR_SIZE]; + + as_endpoint_list_to_string(curr_fabric_endpoints, + curr_fabric_endpoints_str, sizeof(curr_fabric_endpoints_str)); + as_endpoint_list_to_string(prev_fabric_endpoints, + prev_fabric_endpoints_str, sizeof(prev_fabric_endpoints_str)); + + TICKER_WARNING("node: %"PRIx64" fabric endpoints changed from {%s} to {%s}", source, prev_fabric_endpoints_str, curr_fabric_endpoints_str); + } + + hb_endpoint_change_tracker_update(&adjacent_node->endpoint_change_tracker, + endpoints_changed); +} + +/** + * Indicates if a node can be considered adjacent, based on accumulated + * statistics. + */ +static bool +hb_node_can_consider_adjacent(as_hb_adjacent_node* adjacent_node) +{ + return hb_endpoint_change_tracker_is_normal( + adjacent_node->endpoint_change_tracker); +} + +/** + * Process a pulse from source having the out node-id. + */ +static void +hb_channel_on_self_pulse(as_hb_channel_event* msg_event) +{ + bool plugin_data_changed[AS_HB_PLUGIN_SENTINEL] = { 0 }; + + HB_LOCK(); + hb_adjacent_node_update(msg_event, &g_hb.self_node, plugin_data_changed); + + as_hb_plugin_node_data* curr_data = + &g_hb.self_node.plugin_data[AS_HB_PLUGIN_FABRIC][g_hb.self_node.plugin_data_cycler + % 2]; + as_endpoint_list* curr_fabric_endpoints = + as_fabric_hb_plugin_get_endpoint_list(curr_data); + + if (!as_fabric_is_published_endpoint_list(curr_fabric_endpoints)) { + // Mark self as having duplicate node-id. + g_hb.self_is_duplicate = true; + g_hb.self_duplicate_detected_ts = cf_getms(); + + // Found another node with duplicate node-id. 
/**
 * Process an incoming pulse message.
 *
 * Pulses carrying this node's own node-id are handed to
 * hb_channel_on_self_pulse(). For any other source the stored node state (from
 * the adjacency hash, else from the on_probation hash) is copied, updated from
 * the message and written back to whichever of the two hashes the node now
 * belongs to, while being removed from the other one. Node arrival / departure
 * events are queued under the lock and published after it is released, as are
 * the plugin change listener callbacks.
 *
 * @param msg_event the channel event carrying the pulse message.
 */
static void
hb_channel_on_pulse(as_hb_channel_event* msg_event)
{
	msg* msg = msg_event->msg;
	cf_node source;

	// Print cluster breach only once per second.
	static cf_clock last_cluster_breach_print = 0;

	// Channel has validated the source. Don't bother checking here.
	msg_nodeid_get(msg, &source);

	if (source == config_self_nodeid_get()) {
		hb_channel_on_self_pulse(msg_event);
		// Ignore self heartbeats.
		return;
	}

	HB_LOCK();

	// Local working copy of the node state. It stays zeroed if the node is in
	// neither hash.
	as_hb_adjacent_node adjacent_node = { 0 };

	bool plugin_data_changed[AS_HB_PLUGIN_SENTINEL] = { 0 };
	bool is_in_adjacency = (hb_adjacent_node_get(source, &adjacent_node) == 0);
	bool should_be_on_probation = false;

	if (!is_in_adjacency) {
		hb_on_probation_node_get(source, &adjacent_node);
	}

	// Update the local copy of the adjacent node with the contents of the
	// message.
	// NOTE(review): hb_adjacent_node_update() copies msg_event->msg_hlc_ts into
	// adjacent_node.last_msg_hlc_ts, so send_ts read below is this message's
	// own send timestamp rather than the previous one. The obsolete check
	// further down then looks like it can never fire - confirm whether the
	// previous send timestamp was meant to be captured before the update.
	hb_adjacent_node_update(msg_event, &adjacent_node, plugin_data_changed);
	as_hlc_timestamp send_ts = adjacent_node.last_msg_hlc_ts.send_ts;

	// Check if this node needs to be on probation.
	should_be_on_probation = !hb_node_can_consider_adjacent(&adjacent_node);

	if (hb_endpoint_change_tracker_has_changed(
			adjacent_node.endpoint_change_tracker)) {
		// Allow a little more slack for obsolete checking because the two nodes
		// might not have matching send timestamps.
		send_ts = as_hlc_timestamp_subtract_ms(send_ts, config_tx_interval_get());
	}

	if (hb_msg_is_obsolete(msg_event, send_ts)) {
		WARNING("ignoring delayed heartbeat - expected timestamp less than %" PRIu64" but was %" PRIu64 " from node: %" PRIx64,
				send_ts,
				msg_event->msg_hlc_ts.send_ts, source);
		// The updated local copy is dropped without being stored.
		goto Exit;
	}

	cf_atomic_int_incr(&g_stats.heartbeat_received_foreign);

	// A node is new if it was not adjacent before and does not need to go on
	// probation now.
	bool is_new = !should_be_on_probation && !is_in_adjacency;

	if (is_new) {
		int mcsize = config_mcsize();
		// Note: adjacency list does not contain self node hence
		// (mcsize - 1) in the check.
		if (cf_shash_get_size(g_hb.adjacency) >= (mcsize - 1)) {
			if (last_cluster_breach_print != (cf_getms() / 1000L)) {
				WARNING("ignoring node: %" PRIx64" - exceeding maximum supported cluster size %d",
						source, mcsize);
				last_cluster_breach_print = cf_getms() / 1000L;
			}
			goto Exit;
		}
	}

	// Update plugin data, update times, etc.
	cf_shash_put(should_be_on_probation ? g_hb.on_probation : g_hb.adjacency,
			&source, &adjacent_node);

	// Maintain mutual exclusion between adjacency and on_probation hashes.
	cf_shash_delete(should_be_on_probation ? g_hb.adjacency : g_hb.on_probation,
			&source);

	if (is_new) {
		// Publish event if this is a new node.
		INFO("node arrived %" PRIx64, source);
		hb_event_queue(AS_HB_INTERNAL_NODE_ARRIVE, &source, 1);
	}
	else if (should_be_on_probation && is_in_adjacency) {
		// This node needs to be on probation, most likely due to duplicate
		// node-ids.
		WARNING("node expired %" PRIx64" - potentially duplicate node-id", source);
		hb_event_queue(AS_HB_INTERNAL_NODE_DEPART, &source, 1);
	}

Exit:
	HB_UNLOCK();

	// Publish any pending node arrival events.
	hb_event_publish_pending();

	// Note: this is also reached through the Exit paths above, where the
	// updated node data was not stored.
	if (!should_be_on_probation) {
		// Call plugin change listeners outside of a lock to prevent deadlocks.
		for (int i = 0; i < AS_HB_PLUGIN_SENTINEL; i++) {
			if (plugin_data_changed[i] && g_hb.plugins[i].change_listener) {
				// Notify that data for this plugin for the source node has
				// changed.
				DETAIL("plugin data for node %" PRIx64" changed for plugin %d",
						source, i);
				(g_hb.plugins[i]).change_listener(source);
			}
		}
	}
}
Rather rely on the adjacency + // tender to expire nodes. + break; + } +} + +/** + * Dump hb mode state to logs. + * @param verbose enables / disables verbose logging. + */ +static void +hb_mode_dump(bool verbose) +{ + if (hb_is_mesh()) { + mesh_dump(verbose); + } + else { + multicast_dump(verbose); + } +} + +/** + * Reduce function to dump hb node info to log file. + */ +static int +hb_dump_reduce(const void* key, void* data, void* udata) +{ + const cf_node* nodeid = (const cf_node*)key; + as_hb_adjacent_node* adjacent_node = (as_hb_adjacent_node*)data; + + char endpoint_list_str[ENDPOINT_LIST_STR_SIZE]; + as_endpoint_list_to_string(adjacent_node->endpoint_list, endpoint_list_str, + sizeof(endpoint_list_str)); + + INFO("\tHB %s Node: node-id %" PRIx64" protocol %" PRIu32" endpoints {%s} last-updated %" PRIu64 " latency-ms %" PRIu64 , + (char*)udata, + *nodeid, adjacent_node->protocol_version, endpoint_list_str, + adjacent_node->last_updated_monotonic_ts, adjacent_node->avg_latency); + + return CF_SHASH_OK; +} + +/** + * Dump hb state to logs. + * @param verbose enables / disables verbose logging. + */ +static void +hb_dump(bool verbose) +{ + HB_LOCK(); + + INFO("HB Adjacency Size: %d", cf_shash_get_size(g_hb.adjacency)); + + if (verbose) { + cf_shash_reduce(g_hb.adjacency, hb_dump_reduce, "Adjacent"); + } + + if (cf_shash_get_size(g_hb.on_probation)) { + INFO("HB On-probation Size: %d", cf_shash_get_size(g_hb.on_probation)); + + if (verbose) { + cf_shash_reduce(g_hb.on_probation, hb_dump_reduce, "On-probation"); + } + } + + HB_UNLOCK(); +} + +/** + * Compute a complement / inverted adjacency graph for input nodes such that + * entry + * + * inverted_graph[i][j] = 0 iff node[i] and node[j] are in each others adjacency + * lists. That is they have a bidirectional network link active between them. + * + * else + * + * inverted_graph[i][j] > 0 iff there is no link or a unidirectional link + * between them. + * + * + * @param nodes the input vector of nodes. 
/**
 * Compute a complement / inverted adjacency graph for input nodes such that
 * entry
 *
 * inverted_graph[i][j] = 0 iff node[i] and node[j] are in each others adjacency
 * lists. That is they have a bidirectional network link active between them.
 *
 * else
 *
 * inverted_graph[i][j] > 0 iff there is no link or a unidirectional link
 * between them.
 *
 * Every entry starts at 2 and is decremented once for each direction in which
 * the link is observed, so a bidirectional link ends up at 0. Runs under the
 * heartbeat lock and uses g_hb.nodeid_to_index as a temporary nodeid to index
 * map, which is emptied again before returning.
 *
 * @param nodes the input vector of nodes.
 * @param inverted_graph (output) a (num_nodes x num_nodes ) 2D byte array.
 */
static void
hb_adjacency_graph_invert(cf_vector* nodes, uint8_t** inverted_graph)
{
	HB_LOCK();
	int num_nodes = cf_vector_size(nodes);

	// Start with every pair disconnected in both directions and record the
	// index of every input node.
	for (int i = 0; i < num_nodes; i++) {
		for (int j = 0; j < num_nodes; j++) {
			inverted_graph[i][j] = 2;
		}
		cf_node nodeid = 0;
		cf_vector_get(nodes, i, &nodeid);
		cf_shash_put(g_hb.nodeid_to_index, &nodeid, &i);
	}

	// Index of the self node in the input, stays -1 if self is not an input
	// node.
	cf_node self_nodeid = config_self_nodeid_get();
	int self_node_index = -1;
	cf_shash_get(g_hb.nodeid_to_index, &self_nodeid, &self_node_index);

	for (int i = 0; i < num_nodes; i++) {
		// Mark the node connected from itself, i.e, disconnected in the
		// inverted graph.
		inverted_graph[i][i] = 0;

		cf_node node = *(cf_node*)cf_vector_getp(nodes, i);
		// Only read if the lookup below succeeds.
		as_hb_adjacent_node node_info;

		if (hb_adjacent_node_get(node, &node_info) == 0) {
			if (self_node_index >= 0) {
				// Self node will not have plugin data. But the fact that this
				// node has an adjacent node entry indicates that it is in our
				// adjacency list. Adjust the graph.
				inverted_graph[i][self_node_index]--;
				inverted_graph[self_node_index][i]--;
			}

			cf_node* adjacency_list = NULL;
			size_t adjacency_length = 0;
			hb_adjacent_node_adjacency_get(&node_info, &adjacency_list, &adjacency_length);

			// Account for every link this node reports in its own adjacency
			// list.
			for (int j = 0; j < adjacency_length; j++) {
				int other_node_index = -1;
				cf_shash_get(g_hb.nodeid_to_index, &adjacency_list[j],
						&other_node_index);
				if (other_node_index < 0) {
					// This node is not in the input set of nodes.
					continue;
				}

				if (i != other_node_index) {
					inverted_graph[i][other_node_index]--;
					inverted_graph[other_node_index][i]--;
				}
			}
		}
	}

	// Cleanup the temporary hash.
	cf_shash_delete_all(g_hb.nodeid_to_index);

	HB_UNLOCK();
}
+ * + * The minimal vertex cover on this graph is the set of nodes that should be + * removed to result in a clique on the remaining nodes. This implementation is + * an approximation of the minimal vertex cover. The notion is to keep removing + * vertices having the highest degree until there are no more edges remaining. + * The heuristic gets rid of the more problematic nodes first. + * + * @param nodes input cf_node vector. + * @param nodes_to_evict output cf_node clique array, that is initialized. + */ +static void +hb_maximal_clique_evict(cf_vector* nodes, cf_vector* nodes_to_evict) +{ + int num_nodes = cf_vector_size(nodes); + + if (num_nodes == 0) { + // Nothing to do. + return; + } + + int graph_alloc_size = sizeof(uint8_t) * num_nodes * num_nodes; + void* graph_data = MSG_BUFF_ALLOC(graph_alloc_size); + + if (!graph_data) { + CRASH("error allocating space for clique finding data structure"); + } + + uint8_t* inverted_graph[num_nodes]; + inverted_graph[0] = graph_data; + for (int i = 1; i < num_nodes; i++) { + inverted_graph[i] = *inverted_graph + num_nodes * i; + } + + hb_adjacency_graph_invert(nodes, inverted_graph); + + // Count the number of edges in the inverted graph. These edges are the ones + // that need to be removed so that the remaining nodes form a clique in the + // adjacency graph. Also for performance get hold of the self node index in + // the nodes vector. + int edge_count = 0; + int self_node_index = -1; + for (int i = 0; i < num_nodes; i++) { + cf_node node = 0; + cf_vector_get(nodes, i, &node); + if (node == config_self_nodeid_get()) { + self_node_index = i; + } + + for (int j = 0; j < num_nodes; j++) { + if (inverted_graph[i][j]) { + edge_count++; + } + } + } + + cf_vector_delete_range(nodes_to_evict, 0, + cf_vector_size(nodes_to_evict) - 1); + + // Since we always decide to retain self node, first get rid of all nodes + // having missing links to self node. 
+ if (self_node_index >= 0) { + for (int i = 0; i < num_nodes; i++) { + if (inverted_graph[self_node_index][i] + || inverted_graph[i][self_node_index]) { + cf_node to_evict = 0; + cf_vector_get(nodes, i, &to_evict); + DEBUG("marking node %" PRIx64" for clique based eviction", + to_evict); + + cf_vector_append(nodes_to_evict, &to_evict); + + // Remove all edges attached to the removed node. + for (int j = 0; j < num_nodes; j++) { + if (inverted_graph[i][j]) { + inverted_graph[i][j] = 0; + edge_count--; + } + if (inverted_graph[j][i]) { + inverted_graph[j][i] = 0; + edge_count--; + } + } + } + } + } + + while (edge_count > 0) { + // Find vertex with highest degree. + cf_node max_degree_node = 0; + int max_degree_node_idx = -1; + int max_degree = 0; + + for (int i = 0; i < num_nodes; i++) { + cf_node to_evict = 0; + cf_vector_get(nodes, i, &to_evict); + + if (vector_find(nodes_to_evict, &to_evict) >= 0) { + // We have already decided to evict this node. + continue; + } + + if (to_evict == config_self_nodeid_get()) { + // Do not evict self. + continue; + } + + // Get the degree of this node. + int degree = 0; + for (int j = 0; j < num_nodes; j++) { + if (inverted_graph[i][j]) { + degree++; + } + } + + DETAIL("inverted degree for node %" PRIx64" is %d", + to_evict, degree); + + // See if this node has a higher degree. On ties choose the node + // with a smaller nodeid + if (degree > max_degree + || (degree == max_degree && max_degree_node > to_evict)) { + max_degree = degree; + max_degree_node = to_evict; + max_degree_node_idx = i; + } + } + + if (max_degree_node_idx < 0) { + // We are done no node to evict. + break; + } + + DEBUG("marking node %" PRIx64" with degree %d for clique based eviction", + max_degree_node, max_degree); + + cf_vector_append(nodes_to_evict, &max_degree_node); + + // Remove all edges attached to the removed node. 
+ for (int i = 0; i < num_nodes; i++) { + if (inverted_graph[max_degree_node_idx][i]) { + inverted_graph[max_degree_node_idx][i] = 0; + edge_count--; + } + if (inverted_graph[i][max_degree_node_idx]) { + inverted_graph[i][max_degree_node_idx] = 0; + edge_count--; + } + } + } + + MSG_BUFF_FREE(graph_data, graph_alloc_size); +} + +/** + * Reduce function to iterate over plugin data for all adjacent nodes. + */ +static int +hb_plugin_data_iterate_reduce(const void* key, void* data, void* udata) +{ + const cf_node* nodeid = (const cf_node*)key; + as_hb_adjacent_node* adjacent_node = (as_hb_adjacent_node*)data; + as_hb_adjacecny_iterate_reduce_udata* reduce_udata = + (as_hb_adjacecny_iterate_reduce_udata*)udata; + + size_t plugin_data_size = + adjacent_node->plugin_data[reduce_udata->pluginid][adjacent_node->plugin_data_cycler + % 2].data_size; + void* plugin_data = + plugin_data_size ? + adjacent_node->plugin_data[reduce_udata->pluginid][adjacent_node->plugin_data_cycler + % 2].data : NULL; + + reduce_udata->iterate_fn(*nodeid, plugin_data, plugin_data_size, + adjacent_node->last_updated_monotonic_ts, + &adjacent_node->last_msg_hlc_ts, reduce_udata->udata); + + return CF_SHASH_OK; +} + +/** + * Call the iterate method on all nodes in current adjacency list. Note plugin + * data can still be NULL if the plugin data failed to parse the plugin data. + * + * @param pluginid the plugin identifier. + * @param iterate_fn the iterate function invoked for plugin data forevery node. + * @param udata passed as is to the iterate function. Useful for getting results + * out of the iteration. NULL if there is no plugin data. + * @return the size of the plugin data. 0 if there is no plugin data. 
+ */ +static void +hb_plugin_data_iterate_all(as_hb_plugin_id pluginid, + as_hb_plugin_data_iterate_fn iterate_fn, void* udata) +{ + HB_LOCK(); + + as_hb_adjacecny_iterate_reduce_udata reduce_udata; + reduce_udata.pluginid = pluginid; + reduce_udata.iterate_fn = iterate_fn; + reduce_udata.udata = udata; + cf_shash_reduce(g_hb.adjacency, hb_plugin_data_iterate_reduce, + &reduce_udata); + + HB_UNLOCK(); +} diff --git a/as/src/fabric/hlc.c b/as/src/fabric/hlc.c new file mode 100644 index 00000000..5b7fc01c --- /dev/null +++ b/as/src/fabric/hlc.c @@ -0,0 +1,557 @@ +/* + * hlc.c + * + * Copyright (C) 2008-2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#include "fabric/hlc.h" + +#include +#include // For MAX() and MIN(). + +#include "citrusleaf/cf_clock.h" +#include "citrusleaf/cf_atomic.h" + +#include "fault.h" + +#include "base/cfg.h" + +/* + * Overview + * ======== + * Hybrid logical clock as described in + * "Logical Physical Clocks and Consistent Snapshots in Globally Distributed + * Databases" available at http://www.cse.buffalo.edu/tech-reports/2014-04.pdf. + * + * Relies on a global 64 bit variable that has the logical time. 
+ * The 48 MSBs include the physical component of the timestamp and the least + * significant 16 bits include the logical component. 48 bits for milliseconds + * since epoch gives us (8925 - years elapsed since epoch today) years before + * wrap around. + * + * The notion of HLC is to bound the skew between the logical clock and physical + * clock. This requires rejecting updates to the clock from nodes with large + * clock skews. We DO NOT do that yet and print a warning instead. The current + * envisioned usage is a global monotonically increasing timestamp. Should be + * fixed if we are to use it as a surrogate for wall clock. + * + * Guarantees + * ========== + * 1. Monotonically increasing. (Wraps around after ~8900 years). Service + * restarts might break the monotonicity, however the new clock will leapfrog + * the hlc value before the restart eventually. + * 2. as_hlc_timestamp_update call after every message receipt will ensure the + * message (send hlc ts) < (message receive hlc ts). + * 3. A fixed local timestamp will eventually be marked as happened before a + * remote message. This is an important requirement. For example, in paxos the + * local cluster change timestamp should have happened before some incoming + * heartbeat. The ordering system should not always return + * AS_HLC_ORDER_INDETERMINATE for a fixed local timestamp and a new message + * received. + * + * Not guaranteed (requires hlc persistence across service restarts) + * ============== + * 1. On service restart the HLC clock will not start where it left off, however + * it will eventually leapfrog the older value. Fixing this requires persistence + * which is not implemented. Eventually leapfrogging is alright for all current + * requirements. + * 2. If an as_hlc_msg_timestamp is persisted and compared with a current running + * value, the result may not be correct. 
+ * + * + * Requirements + * ============ + * Subsystems that reply on hlc should have their network messages timestamped + * with hlc timestamps and should invoke the as_hlc_timestamp_update on receipt + * of every message. This will ensure the hlc are in sync across the cluster and + * (send hlc ts) < (message receive hlc ts). + */ + +/** + * Global timestamp with current hlc value. + */ +static as_hlc_timestamp g_now; + +/** + * Previous value of the physical component. + */ +static cf_atomic64 g_prev_physical_component; + +/** + * Previous value of the wall clock, when the physical component changed. + */ +static cf_atomic64 g_prev_wall_clock; + +/* + * ---------------------------------------------------------------------------- + * Globals. + * ---------------------------------------------------------------------------- + */ +/** + * Mask for the physical component of a hlc timestamp. + */ +#define PHYSICAL_TS_MASK 0xffffffffffff0000 + +/** + * Mask for logical component of a hls timestamp. + */ +#define LOGICAL_TS_MASK 0x000000000000ffff + +/** + * Print the skew warning once every five seconds. + */ +#define SKEW_WARNING_INTERVAL_MS() (5000) + +/** + * Logging macros. + */ +#define CRASH(format, ...) cf_crash(AS_HLC, format, ##__VA_ARGS__) +#define WARNING(format, ...) cf_warning(AS_HLC, format, ##__VA_ARGS__) +#define INFO(format, ...) cf_info(AS_HLC, format, ##__VA_ARGS__) +#define DEBUG(format, ...) cf_debug(AS_HLC, format, ##__VA_ARGS__) +#define DETAIL(format, ...) cf_detail(AS_HLC, format, ##__VA_ARGS__) +#define ASSERT(expression, message, ...) \ +if (!(expression)) {WARNING(message, __VA_ARGS__);} + +/* + * ---------------------------------------------------------------------------- + * Forward declarations. 
+ * ---------------------------------------------------------------------------- + */ +static cf_clock +hlc_wall_clock_get(); +static as_hlc_timestamp +hlc_ts_get(); +static bool +hlc_ts_set(as_hlc_timestamp old_value, as_hlc_timestamp new_value); +static cf_clock +hlc_physical_ts_get(as_hlc_timestamp hlc_ts); +static uint16_t +hlc_logical_ts_get(as_hlc_timestamp hlc_ts); +static void +hlc_physical_ts_set(as_hlc_timestamp* hlc_ts, cf_clock physical_ts); +static void +hlc_physical_ts_on_set(cf_clock physical_ts, cf_clock wall_clock_now); +static void +hlc_logical_ts_set(as_hlc_timestamp* hlc_ts, uint16_t logical_ts); +static void +hlc_logical_ts_incr(uint16_t* logical_ts, cf_clock* physical_ts, + cf_clock wall_clock_now); + +/* + * ---------------------------------------------------------------------------- + * Public API. + * ---------------------------------------------------------------------------- + */ +/** + * Initialize hybrid logical clock. + */ +void +as_hlc_init() +{ + g_now = 0; + g_prev_physical_component = 0; + g_prev_wall_clock = 0; +} + +/** + * Return the physical component of a hlc timstamp + * @param hlc_ts the hybrid logical clock timestamp. + */ +cf_clock +as_hlc_physical_ts_get(as_hlc_timestamp hlc_ts) +{ + return hlc_physical_ts_get(hlc_ts); +} + +/** + * Return a hlc timestamp representing the hlc time "now". The notion is to make + * the minimum increment to the hlc timestamp necessary. + */ +as_hlc_timestamp +as_hlc_timestamp_now() +{ + // Keep trying till an atomic operation succeeds. Looks like a tight loop + // but even with reasonable contention should not take more then a few + // iterations to succeed. + while (true) { + as_hlc_timestamp current_hlc_ts = hlc_ts_get(); + + // Initialize the new physical and logical values to current values. 
+ cf_clock new_hlc_physical_ts = hlc_physical_ts_get(current_hlc_ts); + uint16_t new_hlc_logical_ts = hlc_logical_ts_get(current_hlc_ts); + + cf_clock wall_clock_physical_ts = hlc_wall_clock_get(); + + if (new_hlc_physical_ts >= wall_clock_physical_ts) { + // The HLC physical component is greater than the physical wall + // time. Advance the logical timestamp. + hlc_logical_ts_incr(&new_hlc_logical_ts, &new_hlc_physical_ts, + wall_clock_physical_ts); + } + else { + // The wall clock is greater, use this as the physical component and + // reset the logical timestamp. + new_hlc_physical_ts = wall_clock_physical_ts; + new_hlc_logical_ts = 0; + } + + as_hlc_timestamp new_hlc_ts = 0; + + hlc_physical_ts_set(&new_hlc_ts, new_hlc_physical_ts); + hlc_logical_ts_set(&new_hlc_ts, new_hlc_logical_ts); + + if (hlc_ts_set(current_hlc_ts, new_hlc_ts)) { + hlc_physical_ts_on_set(new_hlc_physical_ts, wall_clock_physical_ts); + DETAIL("changed HLC value from %" PRIu64 " to %" PRIu64, + current_hlc_ts, new_hlc_ts); + return new_hlc_ts; + } + } +} + +/** + * Update the HLC on receipt of a remote message. The notion is to adjust this + * node's hlc to ensure the receive hlc ts > the send hlc ts. + * + * @param source for debugging and tracking only. + * @param send_timestamp the hlc timestamp when this message was sent. + * @param recv_timestamp (output) the message receive timestamp which will be + * populated. Can be NULL in which case it will be ignored. + */ +void +as_hlc_timestamp_update(cf_node source, as_hlc_timestamp send_ts, + as_hlc_msg_timestamp* msg_ts) +{ + cf_clock send_ts_physical_ts = hlc_physical_ts_get(send_ts); + uint16_t send_ts_logical_ts = hlc_logical_ts_get(send_ts); + + // Keep trying till an atomic operation succeeds. Looks like a tight loop + // but even with reasonable contention should not take more then a few + // iterations to succeed. 
+ while (true) { + as_hlc_timestamp current_hlc_ts = hlc_ts_get(); + + cf_clock current_hlc_physical_ts = hlc_physical_ts_get(current_hlc_ts); + uint16_t current_hlc_logical_ts = hlc_logical_ts_get(current_hlc_ts); + + cf_clock wall_clock_physical_ts = hlc_wall_clock_get(); + + cf_clock new_hlc_physical_ts = MAX( + MAX(current_hlc_physical_ts, send_ts_physical_ts), + wall_clock_physical_ts); + uint16_t new_hlc_logical_ts = 0; + + if (new_hlc_physical_ts == current_hlc_physical_ts + && new_hlc_physical_ts == send_ts_physical_ts) { + // There is no change in the physical components of peer and local + // hlc clocks. Set logical component to max of the two values and + // increment. + new_hlc_logical_ts = MAX(current_hlc_logical_ts, + send_ts_logical_ts); + hlc_logical_ts_incr(&new_hlc_logical_ts, &new_hlc_physical_ts, + wall_clock_physical_ts); + } + else if (new_hlc_physical_ts == current_hlc_physical_ts) { + // The physical component of the send timestamp is smaller than our + // current physical component. We just need to increment the logical + // component. + new_hlc_logical_ts = current_hlc_ts; + hlc_logical_ts_incr(&new_hlc_logical_ts, &new_hlc_physical_ts, + wall_clock_physical_ts); + } + else if (new_hlc_physical_ts == send_ts_physical_ts) { + // Current physical component is lesser than the incoming physical + // component. We need to ensure that the updated logical component + // is greater than the send logical component. + new_hlc_logical_ts = send_ts_logical_ts; + hlc_logical_ts_incr(&new_hlc_logical_ts, &new_hlc_physical_ts, + wall_clock_physical_ts); + } + else { + // Our physical clock is greater than current physical component and + // the send physical component. We can reset the logical clock to + // zero and still maintain the send and receive ordering. 
+ new_hlc_logical_ts = 0; + } + + as_hlc_timestamp new_hlc_ts = 0; + + hlc_physical_ts_set(&new_hlc_ts, new_hlc_physical_ts); + hlc_logical_ts_set(&new_hlc_ts, new_hlc_logical_ts); + + if (hlc_ts_set(current_hlc_ts, new_hlc_ts)) { + hlc_physical_ts_on_set(new_hlc_physical_ts, wall_clock_physical_ts); + DETAIL("message received from node %" PRIx64 " with HLC %" PRIu64 " - changed HLC value from %" PRIu64 " to %" PRIu64, + source, send_ts, current_hlc_ts, new_hlc_ts); + if (msg_ts) { + msg_ts->send_ts = send_ts; + msg_ts->recv_ts = new_hlc_ts; + } + return; + } + } +} + +/** + * Return the difference in milliseconds between two hlc timestamps. Note this + * difference may be greater than or equal to (but never less than) + * the physical wall call difference, because HLC can have non linear jumps, + * whenever the clock is adjusted. The difference should be used as an estimate + * rather than an absolute difference. + * For e.g. use the difference to check that the real time difference is most + * some number of milliseconds. However do not use this for interval statistics + * or to check if the difference in time is at least some number of + * milliseconds. + * + * @param ts1 the first timestamp. + * @param ts2 the seconds timestamp. + * @return ts1 - ts2 in milliseconds. if ts1 < ts2 the result is negative, + * else it is positive or zero. + */ +int64_t +as_hlc_timestamp_diff_ms(as_hlc_timestamp ts1, as_hlc_timestamp ts2) +{ + int64_t diff = 0; + if (ts1 >= ts2) { + diff = hlc_physical_ts_get(ts1) - hlc_physical_ts_get(ts2); + } + else { + diff = -(hlc_physical_ts_get(ts2) - hlc_physical_ts_get(ts1)); + } + + return diff; +} + +/** + * Orders a local timestamp and remote message send timestamp. + * + * @param local_ts the local timestamp. + * @param msg_ts message receive timestamp containing the remote send and the + * local receive timestamp. + * @return the order between the local and the message timestamp. 
+ */ +as_hlc_timestamp_order +as_hlc_send_timestamp_order(as_hlc_timestamp local_ts, + as_hlc_msg_timestamp* msg_ts) +{ + if (local_ts > msg_ts->recv_ts) { + // The local event happened after the local message received timestamp + // and therefore after the remote send as well. + return AS_HLC_HAPPENS_AFTER; + } + + // Compute the unceratinty window around the local receive timestamp. + uint64_t offset = abs(msg_ts->send_ts - msg_ts->recv_ts); + + if (local_ts > (msg_ts->recv_ts - offset)) { + // Local timestamp is in the uncertainty window. We cannot tell the + // order. + return AS_HLC_ORDER_INDETERMINATE; + } + + cf_clock local_physical_ts = hlc_physical_ts_get(local_ts); + cf_clock recv_physical_ts = hlc_physical_ts_get(msg_ts->recv_ts); + + if ((recv_physical_ts - local_physical_ts) + < g_config.fabric_latency_max_ms) { + // Consider the max network delay worth of time to also be part of the + // uncertainty window. + return AS_HLC_ORDER_INDETERMINATE; + } + + return AS_HLC_HAPPENS_BEFORE; +} + +/** + * Orders two timestamp generated by the same node / process. + * + * @param ts1 the first timestamp. + * @param ts2 the second timestamp. + * @return AS_HLC_HAPPENS_BEFORE if ts1 happens before ts2 else + * AS_HLC_HAPPENS_AFTER if ts1 happens after ts2 else + * AS_HLC_ORDER_INDETERMINATE. + */ +as_hlc_timestamp_order +as_hlc_timestamp_order_get(as_hlc_timestamp ts1, as_hlc_timestamp ts2) +{ + if (ts1 < ts2) { + return AS_HLC_HAPPENS_BEFORE; + } + else if (ts1 > ts2) { + return AS_HLC_HAPPENS_AFTER; + } + + return AS_HLC_ORDER_INDETERMINATE; +} + +/** + * Subtract milliseconds worth of time from the timestamp. + * @param timestamp the input timestamp. + * @param ms the number of milliseconds to subtract. 
+ */ +as_hlc_timestamp +as_hlc_timestamp_subtract_ms(as_hlc_timestamp timestamp, int ms) +{ + cf_clock physical_ts = hlc_physical_ts_get(timestamp); + uint16_t logical_ts = hlc_logical_ts_get(timestamp); + physical_ts -= ms; + as_hlc_timestamp new_hlc_ts = 0; + + hlc_physical_ts_set(&new_hlc_ts, physical_ts); + hlc_logical_ts_set(&new_hlc_ts, logical_ts); + return new_hlc_ts; +} + +/** + * Dump some debugging information to the logs. + */ +void +as_hlc_dump(bool verbose) +{ + as_hlc_timestamp now = as_hlc_timestamp_now(); + cf_clock current_hlc_physical_ts = hlc_physical_ts_get(now); + uint16_t current_hlc_logical_ts = hlc_logical_ts_get(now); + + INFO("HLC Ts:%" PRIu64 " HLC Physical Ts:%" PRIu64 " HLC Logical Ts:%d Wall Clock:%" PRIu64, + now, current_hlc_physical_ts, current_hlc_logical_ts, + hlc_wall_clock_get()); +} + +/* + * ---------------------------------------------------------------------------- + * Private functions. + * ---------------------------------------------------------------------------- + */ + +/** + * Return this node's wall clock. + */ +static cf_clock +hlc_wall_clock_get() +{ + // Unix timestamps will be 48 bits for a reasonable future. We will use only + // 48 bits. + return cf_clock_getabsolute(); +} + +/** + * Return the physical component of a hlc timstamp + * @param hlc_ts the hybrid logical clock timestamp. + */ +static cf_clock +hlc_physical_ts_get(as_hlc_timestamp hlc_ts) +{ + return hlc_ts >> 16; +} + +/** + * Return the logical component of a hlc timstamp + * @param hlc_ts the hybrid logical clock timestamp. + */ +static uint16_t +hlc_logical_ts_get(as_hlc_timestamp hlc_ts) +{ + return (uint16_t)(hlc_ts & LOGICAL_TS_MASK); +} + +/** + * Set the physical component of a hlc timestamp. 16 LSBs of the input physical + * timestamp will be ignored. + * @param hlc_ts the timestamp + * @param physical_ts the physical timestamp whose value should be set into the + * hls timestamp. 
+ */ +static void +hlc_physical_ts_set(as_hlc_timestamp* hlc_ts, cf_clock physical_ts) +{ + *hlc_ts = (*hlc_ts & LOGICAL_TS_MASK) | (physical_ts << 16); +} + +/** + * Handle setting updating the physical component of the hlc timestamp. + */ +static void +hlc_physical_ts_on_set(cf_clock physical_ts, cf_clock wall_clock_now) +{ + if (g_prev_physical_component != physical_ts) { + g_prev_physical_component = physical_ts; + g_prev_wall_clock = wall_clock_now; + } +} + +/** + * Increment the logical timestamp and deal with a wrap around by incrementing + * the physical timestamp and ensure physical component moves at least at the + * rate of the wall clock to ensure hlc can be used as a crude measure of time + * intervals. + */ +static void +hlc_logical_ts_incr(uint16_t* logical_ts, cf_clock* physical_ts, + cf_clock wall_clock_now) +{ + (*logical_ts)++; + if (logical_ts == 0) { + (*physical_ts)++; + } + cf_clock physical_component_diff = *physical_ts - g_prev_physical_component; + cf_clock wall_clock_diff = + (wall_clock_now > g_prev_wall_clock) ? + wall_clock_now - g_prev_wall_clock : 0; + if (physical_component_diff < wall_clock_diff) { + *physical_ts += wall_clock_diff - physical_component_diff; + } +} + +/** + * Set the logical component of a hlc timestamp. + * @param hlc_ts the timestamp + * @param logical_ts the logical timestamp whose value should be set into the + * hls timestamp. + */ +static void +hlc_logical_ts_set(as_hlc_timestamp* hlc_ts, uint16_t logical_ts) +{ + *hlc_ts = (*hlc_ts & PHYSICAL_TS_MASK) | (((uint64_t)logical_ts)); +} + +/** + * Get current value for the global timestamp atomically. + * + * @param new_value the new value for the global timestamp. + * @return true on successful set, false on failure to do an atomic set. + */ +static as_hlc_timestamp +hlc_ts_get() +{ + return ck_pr_load_64(&g_now); +} + +/** + * Set a new value for the global timestamp atomically. + * + * @param new_value the new value for the global timestamp. 
+ * @return true on successful set, false on failure to do an atomic set. + */ +static bool +hlc_ts_set(as_hlc_timestamp old_value, as_hlc_timestamp new_value) +{ + // Default to ck atomic check and set. + return ck_pr_cas_64(&g_now, old_value, new_value); +} diff --git a/as/src/fabric/meta_batch_ce.c b/as/src/fabric/meta_batch_ce.c new file mode 100644 index 00000000..97e799cd --- /dev/null +++ b/as/src/fabric/meta_batch_ce.c @@ -0,0 +1,65 @@ +/* + * meta_batch.c + * + * Copyright (C) 2017 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +//========================================================== +// Includes. +// + +#include "fabric/meta_batch.h" + +#include + + +//========================================================== +// Public API. 
+// + +struct meta_in_q_s * +meta_in_q_create() +{ + return NULL; +} + + +void +meta_in_q_destroy(struct meta_in_q_s *iq) +{ +} + + +void +meta_in_q_rejected(struct meta_in_q_s *iq) +{ +} + + +struct meta_out_q_s * +meta_out_q_create() +{ + return NULL; +} + + +void +meta_out_q_destroy(struct meta_out_q_s *oq) +{ +} diff --git a/as/src/fabric/migrate.c b/as/src/fabric/migrate.c new file mode 100644 index 00000000..4382e41b --- /dev/null +++ b/as/src/fabric/migrate.c @@ -0,0 +1,1758 @@ +/* + * migrate.c + * + * Copyright (C) 2008-2017 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +// migrate.c +// Moves a partition from one machine to another using the fabric messaging +// system. + + +//========================================================== +// Includes. 
+// + +#include "fabric/migrate.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "citrusleaf/alloc.h" +#include "citrusleaf/cf_atomic.h" +#include "citrusleaf/cf_clock.h" +#include "citrusleaf/cf_digest.h" +#include "citrusleaf/cf_queue.h" +#include "citrusleaf/cf_rchash.h" + +#include "fault.h" +#include "msg.h" +#include "node.h" +#include "shash.h" + +#include "base/cfg.h" +#include "base/datamodel.h" +#include "base/index.h" +#include "base/rec_props.h" +#include "fabric/exchange.h" +#include "fabric/fabric.h" +#include "fabric/meta_batch.h" +#include "fabric/partition.h" +#include "fabric/partition_balance.h" +#include "storage/storage.h" + + +//========================================================== +// Typedefs & constants. +// + +const msg_template migrate_mt[] = { + { MIG_FIELD_OP, M_FT_UINT32 }, + { MIG_FIELD_UNUSED_1, M_FT_UINT32 }, + { MIG_FIELD_EMIG_ID, M_FT_UINT32 }, + { MIG_FIELD_NAMESPACE, M_FT_BUF }, + { MIG_FIELD_PARTITION, M_FT_UINT32 }, + { MIG_FIELD_DIGEST, M_FT_BUF }, + { MIG_FIELD_GENERATION, M_FT_UINT32 }, + { MIG_FIELD_RECORD, M_FT_BUF }, + { MIG_FIELD_CLUSTER_KEY, M_FT_UINT64 }, + { MIG_FIELD_UNUSED_9, M_FT_BUF }, + { MIG_FIELD_VOID_TIME, M_FT_UINT32 }, + { MIG_FIELD_UNUSED_11, M_FT_UINT32 }, + { MIG_FIELD_UNUSED_12, M_FT_BUF }, + { MIG_FIELD_INFO, M_FT_UINT32 }, + { MIG_FIELD_UNUSED_14, M_FT_UINT64 }, + { MIG_FIELD_UNUSED_15, M_FT_BUF }, + { MIG_FIELD_UNUSED_16, M_FT_BUF }, + { MIG_FIELD_UNUSED_17, M_FT_UINT32 }, + { MIG_FIELD_UNUSED_18, M_FT_UINT32 }, + { MIG_FIELD_LAST_UPDATE_TIME, M_FT_UINT64 }, + { MIG_FIELD_FEATURES, M_FT_UINT32 }, + { MIG_FIELD_UNUSED_21, M_FT_UINT32 }, + { MIG_FIELD_META_RECORDS, M_FT_BUF }, + { MIG_FIELD_META_SEQUENCE, M_FT_UINT32 }, + { MIG_FIELD_META_SEQUENCE_FINAL, M_FT_UINT32 }, + { MIG_FIELD_PARTITION_SIZE, M_FT_UINT64 }, + { MIG_FIELD_SET_NAME, M_FT_BUF }, + { MIG_FIELD_KEY, M_FT_BUF }, + { MIG_FIELD_UNUSED_28, M_FT_UINT32 }, + { MIG_FIELD_EMIG_INSERT_ID, 
M_FT_UINT64 } +}; + +COMPILER_ASSERT(sizeof(migrate_mt) / sizeof(msg_template) == NUM_MIG_FIELDS); + +#define MIG_MSG_SCRATCH_SIZE 192 + +#define EMIGRATION_SLOW_Q_WAIT_MS 1000 // 1 second +#define MIGRATE_RETRANSMIT_STARTDONE_MS 1000 // for now, not configurable +#define MIGRATE_RETRANSMIT_SIGNAL_MS 1000 // for now, not configurable +#define MAX_BYTES_EMIGRATING (16 * 1024 * 1024) + +#define IMMIGRATION_DEBOUNCE_MS (60 * 1000) // 1 minute + +typedef struct pickled_record_s { + cf_digest keyd; + uint32_t generation; + uint32_t void_time; + uint64_t last_update_time; + uint8_t *record_buf; // pickled! + size_t record_len; +} pickled_record; + +typedef enum { + EMIG_START_RESULT_OK, + EMIG_START_RESULT_ERROR, + EMIG_START_RESULT_EAGAIN +} emigration_start_result; + +typedef enum { + // Order matters - we use an atomic set-max that relies on it. + EMIG_STATE_ACTIVE, + EMIG_STATE_FINISHED, + EMIG_STATE_ABORTED +} emigration_state; + +typedef struct emigration_pop_info_s { + uint32_t order; + uint64_t dest_score; + uint64_t n_elements; + + uint64_t avoid_dest; +} emigration_pop_info; + +typedef struct emigration_reinsert_ctrl_s { + uint64_t xmit_ms; // time of last xmit - 0 when done + emigration *emig; + msg *m; +} emigration_reinsert_ctrl; + + +//========================================================== +// Globals. +// + +cf_rchash *g_emigration_hash = NULL; +cf_rchash *g_immigration_hash = NULL; + +static uint64_t g_avoid_dest = 0; +static cf_atomic32 g_emigration_id = 0; +static cf_queue g_emigration_q; +static cf_queue g_emigration_slow_q; + + +//========================================================== +// Forward declarations. +// + +// Various initializers and destructors. +void emigration_init(emigration *emig); +void emigration_destroy(void *parm); +int emigration_reinsert_destroy_reduce_fn(const void *key, void *data, void *udata); +void immigration_destroy(void *parm); +void pickled_record_destroy(pickled_record *pr); + +// Emigration. 
+void *run_emigration(void *arg); +void *run_emigration_slow(void *arg); +void emigration_pop(emigration **emigp); +int emigration_pop_reduce_fn(void *buf, void *udata); +void emigration_hash_insert(emigration *emig); +void emigration_hash_delete(emigration *emig); +bool emigrate_transfer(emigration *emig); +void emigrate_signal(emigration *emig); +emigration_start_result emigration_send_start(emigration *emig); +bool emigrate_tree(emigration *emig); +bool emigration_send_done(emigration *emig); +void *run_emigration_reinserter(void *arg); +void emigrate_tree_reduce_fn(as_index_ref *r_ref, void *udata); +int emigration_reinsert_reduce_fn(const void *key, void *data, void *udata); +void emigrate_record(emigration *emig, msg *m); + +// Immigration. +uint32_t immigration_hashfn(const void *value, uint32_t value_len); +void *run_immigration_reaper(void *arg); +int immigration_reaper_reduce_fn(const void *key, uint32_t keylen, void *object, void *udata); + +// Migrate fabric message handling. +int migrate_receive_msg_cb(cf_node src, msg *m, void *udata); +void immigration_handle_start_request(cf_node src, msg *m); +void immigration_ack_start_request(cf_node src, msg *m, uint32_t op); +void immigration_handle_insert_request(cf_node src, msg *m); +void immigration_handle_done_request(cf_node src, msg *m); +void immigration_handle_all_done_request(cf_node src, msg *m); +void emigration_handle_insert_ack(cf_node src, msg *m); +void emigration_handle_ctrl_ack(cf_node src, msg *m, uint32_t op); + +// Info API helpers. +int emigration_dump_reduce_fn(const void *key, uint32_t keylen, void *object, void *udata); +int immigration_dump_reduce_fn(const void *key, uint32_t keylen, void *object, void *udata); + + +//========================================================== +// Public API. 
+// + +void +as_migrate_init() +{ + g_avoid_dest = (uint64_t)g_config.self_node; + + cf_queue_init(&g_emigration_q, sizeof(emigration*), 4096, true); + cf_queue_init(&g_emigration_slow_q, sizeof(emigration*), 4096, true); + + cf_rchash_create(&g_emigration_hash, cf_rchash_fn_u32, emigration_destroy, + sizeof(uint32_t), 64, CF_RCHASH_MANY_LOCK); + + cf_rchash_create(&g_immigration_hash, immigration_hashfn, + immigration_destroy, sizeof(immigration_hkey), 64, + CF_RCHASH_BIG_LOCK); + + // Looks like an as_priority_thread_pool, but the reduce-pop is different. + + pthread_t thread; + pthread_attr_t attrs; + + pthread_attr_init(&attrs); + pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED); + + for (uint32_t i = 0; i < g_config.n_migrate_threads; i++) { + if (pthread_create(&thread, &attrs, run_emigration, NULL) != 0) { + cf_crash(AS_MIGRATE, "failed to create emigration thread"); + } + } + + if (pthread_create(&thread, &attrs, run_emigration_slow, NULL) != 0) { + cf_crash(AS_MIGRATE, "failed to create emigration slow thread"); + } + + if (pthread_create(&thread, &attrs, run_immigration_reaper, NULL) != 0) { + cf_crash(AS_MIGRATE, "failed to create immigration reaper thread"); + } + + as_fabric_register_msg_fn(M_TYPE_MIGRATE, migrate_mt, sizeof(migrate_mt), + MIG_MSG_SCRATCH_SIZE, migrate_receive_msg_cb, NULL); +} + + +// Kicks off an emigration. +void +as_migrate_emigrate(const pb_task *task) +{ + emigration *emig = cf_rc_alloc(sizeof(emigration)); + + emig->dest = task->dest; + emig->cluster_key = task->cluster_key; + emig->id = cf_atomic32_incr(&g_emigration_id); + emig->type = task->type; + emig->tx_flags = task->tx_flags; + emig->state = EMIG_STATE_ACTIVE; + emig->aborted = false; + + // Create these later only when we need them - we'll get lots at once. 
+ emig->bytes_emigrating = 0; + emig->reinsert_hash = NULL; + emig->insert_id = 0; + emig->ctrl_q = NULL; + emig->meta_q = NULL; + + as_partition_reserve(task->ns, task->pid, &emig->rsv); + + emig->from_replica = is_self_replica(emig->rsv.p); + + cf_atomic_int_incr(&emig->rsv.ns->migrate_tx_instance_count); + + cf_queue_push(&g_emigration_q, &emig); +} + + +// Called via info command. Caller has sanity-checked n_threads. +void +as_migrate_set_num_xmit_threads(uint32_t n_threads) +{ + if (g_config.n_migrate_threads > n_threads) { + // Decrease the number of migrate transmit threads to n_threads. + while (g_config.n_migrate_threads > n_threads) { + void *death_msg = NULL; + + // Send terminator (NULL message). + cf_queue_push(&g_emigration_q, &death_msg); + g_config.n_migrate_threads--; + } + } + else { + // Increase the number of migrate transmit threads to n_threads. + pthread_t thread; + pthread_attr_t attrs; + + pthread_attr_init(&attrs); + pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED); + + while (g_config.n_migrate_threads < n_threads) { + if (pthread_create(&thread, &attrs, run_emigration, NULL) != 0) { + cf_warning(AS_MIGRATE, "failed to create emigration thread"); + return; + } + + g_config.n_migrate_threads++; + } + } +} + + +// Called via info command - print information about migration to the log. 
+void +as_migrate_dump(bool verbose) +{ + cf_info(AS_MIGRATE, "migration info:"); + cf_info(AS_MIGRATE, "---------------"); + cf_info(AS_MIGRATE, "number of emigrations in g_emigration_hash: %d", + cf_rchash_get_size(g_emigration_hash)); + cf_info(AS_MIGRATE, "number of requested emigrations waiting in g_emigration_q : %d", + cf_queue_sz(&g_emigration_q)); + cf_info(AS_MIGRATE, "number of requested emigrations waiting in g_emigration_slow_q : %d", + cf_queue_sz(&g_emigration_slow_q)); + cf_info(AS_MIGRATE, "number of immigrations in g_immigration_hash: %d", + cf_rchash_get_size(g_immigration_hash)); + cf_info(AS_MIGRATE, "current emigration id: %d", g_emigration_id); + + if (verbose) { + int item_num = 0; + + if (cf_rchash_get_size(g_emigration_hash) > 0) { + cf_info(AS_MIGRATE, "contents of g_emigration_hash:"); + cf_info(AS_MIGRATE, "------------------------------"); + + cf_rchash_reduce(g_emigration_hash, emigration_dump_reduce_fn, + &item_num); + } + + if (cf_rchash_get_size(g_immigration_hash) > 0) { + item_num = 0; + + cf_info(AS_MIGRATE, "contents of g_immigration_hash:"); + cf_info(AS_MIGRATE, "-------------------------------"); + + cf_rchash_reduce(g_immigration_hash, immigration_dump_reduce_fn, + &item_num); + } + } +} + + +//========================================================== +// Local helpers - various initializers and destructors. +// + +void +emigration_init(emigration *emig) +{ + emig->reinsert_hash = cf_shash_create(cf_shash_fn_u32, sizeof(uint64_t), + sizeof(emigration_reinsert_ctrl), 16 * 1024, CF_SHASH_MANY_LOCK); + emig->ctrl_q = cf_queue_create(sizeof(int), true); + emig->meta_q = meta_in_q_create(); +} + + +// Destructor handed to rchash. 
+void +emigration_destroy(void *parm) +{ + emigration *emig = (emigration *)parm; + + if (emig->reinsert_hash) { + cf_shash_reduce(emig->reinsert_hash, + emigration_reinsert_destroy_reduce_fn, NULL); + cf_shash_destroy(emig->reinsert_hash); + } + + if (emig->ctrl_q) { + cf_queue_destroy(emig->ctrl_q); + } + + if (emig->meta_q) { + meta_in_q_destroy(emig->meta_q); + } + + as_partition_release(&emig->rsv); + + cf_atomic_int_decr(&emig->rsv.ns->migrate_tx_instance_count); +} + + +int +emigration_reinsert_destroy_reduce_fn(const void *key, void *data, void *udata) +{ + emigration_reinsert_ctrl *ri_ctrl = (emigration_reinsert_ctrl *)data; + + as_fabric_msg_put(ri_ctrl->m); + + return CF_SHASH_REDUCE_DELETE; +} + + +void +emigration_release(emigration *emig) +{ + if (cf_rc_release(emig) == 0) { + emigration_destroy((void *)emig); + cf_rc_free(emig); + } +} + + +// Destructor handed to rchash. +void +immigration_destroy(void *parm) +{ + immigration *immig = (immigration *)parm; + + if (immig->rsv.p) { + as_partition_release(&immig->rsv); + } + + if (immig->meta_q) { + meta_out_q_destroy(immig->meta_q); + } + + cf_atomic_int_decr(&immig->ns->migrate_rx_instance_count); +} + + +void +immigration_release(immigration *immig) +{ + if (cf_rc_release(immig) == 0) { + immigration_destroy((void *)immig); + cf_rc_free(immig); + } +} + + +void +pickled_record_destroy(pickled_record *pr) +{ + cf_free(pr->record_buf); +} + + +//========================================================== +// Local helpers - emigration. +// + +void * +run_emigration(void *arg) +{ + while (true) { + emigration *emig; + + emigration_pop(&emig); + + // This is the case for intentionally stopping the migrate thread. + if (! 
emig) { + break; // signal of death + } + + as_partition_balance_emigration_yield(); + + if (emig->cluster_key != as_exchange_cluster_key()) { + emigration_hash_delete(emig); + continue; + } + + as_namespace *ns = emig->rsv.ns; + bool requeued = false; + + // Add the emigration to the global hash so acks can find it. + emigration_hash_insert(emig); + + switch (emig->type) { + case PB_TASK_EMIG_TRANSFER: + cf_atomic_int_incr(&ns->migrate_tx_partitions_active); + requeued = emigrate_transfer(emig); + cf_atomic_int_decr(&ns->migrate_tx_partitions_active); + break; + case PB_TASK_EMIG_SIGNAL_ALL_DONE: + cf_atomic_int_incr(&ns->migrate_signals_active); + emigrate_signal(emig); + cf_atomic_int_decr(&ns->migrate_signals_active); + break; + default: + cf_crash(AS_MIGRATE, "bad emig type %u", emig->type); + break; + } + + if (! requeued) { + emigration_hash_delete(emig); + } + } + + return NULL; +} + + +void * +run_emigration_slow(void *arg) +{ + while (true) { + emigration *emig; + + if (cf_queue_pop(&g_emigration_slow_q, (void *)&emig, + CF_QUEUE_FOREVER) != CF_QUEUE_OK) { + cf_crash(AS_MIGRATE, "emigration slow queue pop failed"); + } + + uint64_t now_ms = cf_getms(); + + if (emig->wait_until_ms > now_ms) { + usleep(1000 * (emig->wait_until_ms - now_ms)); + } + + cf_queue_push(&g_emigration_q, &emig); + } + + return NULL; +} + + +void +emigration_pop(emigration **emigp) +{ + emigration_pop_info best; + + best.order = 0xFFFFffff; + best.dest_score = 0; + best.n_elements = 0xFFFFffffFFFFffff; + + best.avoid_dest = 0; + + if (cf_queue_reduce_pop(&g_emigration_q, (void *)emigp, CF_QUEUE_FOREVER, + emigration_pop_reduce_fn, &best) != CF_QUEUE_OK) { + cf_crash(AS_MIGRATE, "emigration queue reduce pop failed"); + } +} + + +int +emigration_pop_reduce_fn(void *buf, void *udata) +{ + emigration_pop_info *best = (emigration_pop_info *)udata; + emigration *emig = *(emigration **)buf; + + if (! 
emig || // null emig terminates thread + emig->cluster_key != as_exchange_cluster_key()) { + return -1; // process immediately + } + + if (emig->ctrl_q && cf_queue_sz(emig->ctrl_q) > 0) { + // This emig was requeued after its start command got an ACK_EAGAIN, + // likely because dest hit 'migrate-max-num-incoming'. A new ack has + // arrived - if it's ACK_OK, don't leave remote node hanging. + + return -1; // process immediately + } + + if (emig->type == PB_TASK_EMIG_SIGNAL_ALL_DONE) { + return -1; // process immediately + } + + if (best->avoid_dest == 0) { + best->avoid_dest = g_avoid_dest; + } + + uint32_t order = emig->rsv.ns->migrate_order; + uint64_t dest_score = (uint64_t)emig->dest - best->avoid_dest; + uint64_t n_elements = as_index_tree_size(emig->rsv.tree); + + if (order < best->order || + (order == best->order && + (dest_score > best->dest_score || + (dest_score == best->dest_score && + n_elements < best->n_elements)))) { + best->order = order; + best->dest_score = dest_score; + best->n_elements = n_elements; + + g_avoid_dest = (uint64_t)emig->dest; + + return -2; // candidate + } + + return 0; // not interested +} + + +void +emigration_hash_insert(emigration *emig) +{ + if (! emig->ctrl_q) { + emigration_init(emig); // creates emig->ctrl_q etc. + + cf_rchash_put(g_emigration_hash, (void *)&emig->id, sizeof(emig->id), + (void *)emig); + } +} + + +void +emigration_hash_delete(emigration *emig) +{ + if (emig->ctrl_q) { + cf_rchash_delete(g_emigration_hash, (void *)&emig->id, + sizeof(emig->id)); + } + else { + emigration_release(emig); + } +} + + +bool +emigrate_transfer(emigration *emig) +{ + //-------------------------------------------- + // Send START request. + // + + emigration_start_result result = emigration_send_start(emig); + + if (result == EMIG_START_RESULT_EAGAIN) { + // Remote node refused migration, requeue and fetch another. 
+ emig->wait_until_ms = cf_getms() + EMIGRATION_SLOW_Q_WAIT_MS; + + cf_queue_push(&g_emigration_slow_q, &emig); + + return true; // requeued + } + + if (result != EMIG_START_RESULT_OK) { + return false; // did not requeue + } + + //-------------------------------------------- + // Send whole tree - may block a while. + // + + if (! emigrate_tree(emig)) { + return false; // did not requeue + } + + //-------------------------------------------- + // Send DONE request. + // + + if (emigration_send_done(emig)) { + as_partition_emigrate_done(emig->rsv.ns, emig->rsv.p->id, + emig->cluster_key, emig->tx_flags); + } + + return false; // did not requeue +} + + +void +emigrate_signal(emigration *emig) +{ + as_namespace *ns = emig->rsv.ns; + msg *m = as_fabric_msg_get(M_TYPE_MIGRATE); + + switch (emig->type) { + case PB_TASK_EMIG_SIGNAL_ALL_DONE: + msg_set_uint32(m, MIG_FIELD_OP, OPERATION_ALL_DONE); + break; + default: + cf_crash(AS_MIGRATE, "signal: bad emig type %u", emig->type); + break; + } + + msg_set_uint32(m, MIG_FIELD_EMIG_ID, emig->id); + msg_set_uint64(m, MIG_FIELD_CLUSTER_KEY, emig->cluster_key); + msg_set_buf(m, MIG_FIELD_NAMESPACE, (const uint8_t *)ns->name, + strlen(ns->name), MSG_SET_COPY); + msg_set_uint32(m, MIG_FIELD_PARTITION, emig->rsv.p->id); + + uint64_t signal_xmit_ms = 0; + + while (true) { + if (emig->cluster_key != as_exchange_cluster_key()) { + as_fabric_msg_put(m); + return; + } + + uint64_t now = cf_getms(); + + if (signal_xmit_ms + MIGRATE_RETRANSMIT_SIGNAL_MS < now) { + msg_incr_ref(m); + + if (as_fabric_send(emig->dest, m, AS_FABRIC_CHANNEL_CTRL) != + AS_FABRIC_SUCCESS) { + as_fabric_msg_put(m); + } + + signal_xmit_ms = now; + } + + int op; + + if (cf_queue_pop(emig->ctrl_q, &op, MIGRATE_RETRANSMIT_SIGNAL_MS) == + CF_QUEUE_OK) { + switch (op) { + case OPERATION_ALL_DONE_ACK: + cf_atomic_int_decr(&ns->migrate_signals_remaining); + as_fabric_msg_put(m); + return; + default: + cf_warning(AS_MIGRATE, "signal: unexpected ctrl op %d", op); + break; 
+ } + } + } +} + + +emigration_start_result +emigration_send_start(emigration *emig) +{ + as_namespace *ns = emig->rsv.ns; + msg *m = as_fabric_msg_get(M_TYPE_MIGRATE); + + msg_set_uint32(m, MIG_FIELD_OP, OPERATION_START); + msg_set_uint32(m, MIG_FIELD_FEATURES, MY_MIG_FEATURES); + msg_set_uint64(m, MIG_FIELD_PARTITION_SIZE, + as_index_tree_size(emig->rsv.tree)); + msg_set_uint32(m, MIG_FIELD_EMIG_ID, emig->id); + msg_set_uint64(m, MIG_FIELD_CLUSTER_KEY, emig->cluster_key); + msg_set_buf(m, MIG_FIELD_NAMESPACE, (const uint8_t *)ns->name, + strlen(ns->name), MSG_SET_COPY); + msg_set_uint32(m, MIG_FIELD_PARTITION, emig->rsv.p->id); + + uint64_t start_xmit_ms = 0; + + while (true) { + if (emig->cluster_key != as_exchange_cluster_key()) { + as_fabric_msg_put(m); + return EMIG_START_RESULT_ERROR; + } + + uint64_t now = cf_getms(); + + if (cf_queue_sz(emig->ctrl_q) == 0 && + start_xmit_ms + MIGRATE_RETRANSMIT_STARTDONE_MS < now) { + msg_incr_ref(m); + + if (as_fabric_send(emig->dest, m, AS_FABRIC_CHANNEL_CTRL) != + AS_FABRIC_SUCCESS) { + as_fabric_msg_put(m); + } + + start_xmit_ms = now; + } + + int op; + + if (cf_queue_pop(emig->ctrl_q, &op, MIGRATE_RETRANSMIT_STARTDONE_MS) == + CF_QUEUE_OK) { + switch (op) { + case OPERATION_START_ACK_OK: + as_fabric_msg_put(m); + return EMIG_START_RESULT_OK; + case OPERATION_START_ACK_EAGAIN: + as_fabric_msg_put(m); + return EMIG_START_RESULT_EAGAIN; + case OPERATION_START_ACK_FAIL: + cf_warning(AS_MIGRATE, "imbalance: dest refused migrate with ACK_FAIL"); + cf_atomic_int_incr(&ns->migrate_tx_partitions_imbalance); + as_fabric_msg_put(m); + return EMIG_START_RESULT_ERROR; + default: + cf_warning(AS_MIGRATE, "unexpected ctrl op %d", op); + break; + } + } + } + + // Should never get here. 
+ cf_crash(AS_MIGRATE, "unexpected - exited infinite while loop"); + + return EMIG_START_RESULT_ERROR; +} + + +bool +emigrate_tree(emigration *emig) +{ + if (as_index_tree_size(emig->rsv.tree) == 0) { + return true; + } + + cf_atomic32_set(&emig->state, EMIG_STATE_ACTIVE); + + pthread_t thread; + + if (pthread_create(&thread, NULL, run_emigration_reinserter, emig) != 0) { + cf_crash(AS_MIGRATE, "could not start reinserter thread"); + } + + as_index_reduce(emig->rsv.tree, emigrate_tree_reduce_fn, emig); + + // Sets EMIG_STATE_FINISHED only if not already EMIG_STATE_ABORTED. + cf_atomic32_setmax(&emig->state, EMIG_STATE_FINISHED); + + pthread_join(thread, NULL); + + return emig->state != EMIG_STATE_ABORTED; +} + + +bool +emigration_send_done(emigration *emig) +{ + as_namespace *ns = emig->rsv.ns; + + if (! as_partition_pre_emigrate_done(ns, emig->rsv.p->id, emig->cluster_key, + emig->tx_flags)) { + return false; + } + + msg *m = as_fabric_msg_get(M_TYPE_MIGRATE); + + msg_set_uint32(m, MIG_FIELD_OP, OPERATION_DONE); + msg_set_uint32(m, MIG_FIELD_EMIG_ID, emig->id); + + uint64_t done_xmit_ms = 0; + + while (true) { + if (emig->cluster_key != as_exchange_cluster_key()) { + as_fabric_msg_put(m); + return false; + } + + uint64_t now = cf_getms(); + + if (done_xmit_ms + MIGRATE_RETRANSMIT_STARTDONE_MS < now) { + msg_incr_ref(m); + + if (as_fabric_send(emig->dest, m, AS_FABRIC_CHANNEL_CTRL) != + AS_FABRIC_SUCCESS) { + as_fabric_msg_put(m); + } + + done_xmit_ms = now; + } + + int op; + + if (cf_queue_pop(emig->ctrl_q, &op, MIGRATE_RETRANSMIT_STARTDONE_MS) == + CF_QUEUE_OK) { + if (op == OPERATION_DONE_ACK) { + as_fabric_msg_put(m); + return true; + } + } + } + + // Should never get here. + cf_crash(AS_MIGRATE, "unexpected - exited infinite while loop"); + + return false; +} + + +void * +run_emigration_reinserter(void *arg) +{ + emigration *emig = (emigration *)arg; + emigration_state emig_state; + + // Reduce over the reinsert hash until finished. 
+ while ((emig_state = cf_atomic32_get(emig->state)) != EMIG_STATE_ABORTED) { + if (emig->cluster_key != as_exchange_cluster_key()) { + cf_atomic32_set(&emig->state, EMIG_STATE_ABORTED); + return NULL; + } + + usleep(1000); + + if (cf_shash_get_size(emig->reinsert_hash) == 0) { + if (emig_state == EMIG_STATE_FINISHED) { + return NULL; + } + + continue; + } + + cf_shash_reduce(emig->reinsert_hash, emigration_reinsert_reduce_fn, + (void *)cf_getms()); + } + + return NULL; +} + + +void +emigrate_tree_reduce_fn(as_index_ref *r_ref, void *udata) +{ + emigration *emig = (emigration *)udata; + as_namespace *ns = emig->rsv.ns; + + if (emig->aborted) { + as_record_done(r_ref, ns); + return; // no point continuing to reduce this tree + } + + if (emig->cluster_key != as_exchange_cluster_key()) { + as_record_done(r_ref, ns); + emig->aborted = true; + cf_atomic32_set(&emig->state, EMIG_STATE_ABORTED); + return; // no point continuing to reduce this tree + } + + if (! should_emigrate_record(emig, r_ref)) { + as_record_done(r_ref, ns); + return; + } + + //-------------------------------------------- + // Read the record and pickle it. + // + + as_record *r = r_ref->r; + as_storage_rd rd; + + as_storage_record_open(ns, r, &rd); + + as_storage_rd_load_n_bins(&rd); // TODO - handle error returned + + as_bin stack_bins[ns->storage_data_in_memory ? 
0 : rd.n_bins]; + + as_storage_rd_load_bins(&rd, stack_bins); // TODO - handle error returned + + pickled_record pr; + + pr.keyd = r->keyd; + pr.generation = r->generation; + pr.void_time = r->void_time; + pr.last_update_time = r->last_update_time; + pr.record_buf = as_record_pickle(&rd, &pr.record_len); + + as_storage_record_get_key(&rd); + + const char *set_name = as_index_get_set_name(r, ns); + uint32_t key_size = rd.key_size; + uint8_t key[key_size]; + + if (key_size != 0) { + memcpy(key, rd.key, key_size); + } + + uint32_t info = emigration_pack_info(emig, r); + + as_storage_record_close(&rd); + as_record_done(r_ref, ns); + + //-------------------------------------------- + // Fill and send the fabric message. + // + + msg *m = as_fabric_msg_get(M_TYPE_MIGRATE); + + msg_set_uint32(m, MIG_FIELD_OP, OPERATION_INSERT); + msg_set_uint32(m, MIG_FIELD_EMIG_ID, emig->id); + msg_set_buf(m, MIG_FIELD_DIGEST, (const uint8_t *)&pr.keyd, + sizeof(cf_digest), MSG_SET_COPY); + msg_set_uint32(m, MIG_FIELD_GENERATION, pr.generation); + msg_set_uint64(m, MIG_FIELD_LAST_UPDATE_TIME, pr.last_update_time); + + if (pr.void_time != 0) { + msg_set_uint32(m, MIG_FIELD_VOID_TIME, pr.void_time); + } + + if (info != 0) { + msg_set_uint32(m, MIG_FIELD_INFO, info); + } + + // Note - after MSG_SET_HANDOFF_MALLOCs, no need to destroy pickled_record. + + if (set_name) { + msg_set_buf(m, MIG_FIELD_SET_NAME, (const uint8_t *)set_name, + strlen(set_name), MSG_SET_COPY); + } + + if (key_size != 0) { + msg_set_buf(m, MIG_FIELD_KEY, key, key_size, MSG_SET_COPY); + } + + msg_set_buf(m, MIG_FIELD_RECORD, pr.record_buf, pr.record_len, + MSG_SET_HANDOFF_MALLOC); + + // This might block if the queues are backed up. 
+ emigrate_record(emig, m); + + cf_atomic_int_incr(&ns->migrate_records_transmitted); + + if (ns->migrate_sleep != 0) { + usleep(ns->migrate_sleep); + } + + uint32_t waits = 0; + + while (cf_atomic32_get(emig->bytes_emigrating) > MAX_BYTES_EMIGRATING && + emig->cluster_key == as_exchange_cluster_key()) { + usleep(1000); + + // Temporary paranoia to inform us old nodes aren't acking properly. + if (++waits % (ns->migrate_retransmit_ms * 4) == 0) { + cf_warning(AS_MIGRATE, "missing acks from node %lx", emig->dest); + } + } +} + + +int +emigration_reinsert_reduce_fn(const void *key, void *data, void *udata) +{ + emigration_reinsert_ctrl *ri_ctrl = (emigration_reinsert_ctrl *)data; + as_namespace *ns = ri_ctrl->emig->rsv.ns; + uint64_t now = (uint64_t)udata; + + if (ri_ctrl->xmit_ms + ns->migrate_retransmit_ms < now) { + msg_incr_ref(ri_ctrl->m); + + if (as_fabric_send(ri_ctrl->emig->dest, ri_ctrl->m, + AS_FABRIC_CHANNEL_BULK) != AS_FABRIC_SUCCESS) { + as_fabric_msg_put(ri_ctrl->m); + return -1; // this will stop the reduce + } + + ri_ctrl->xmit_ms = now; + cf_atomic_int_incr(&ns->migrate_record_retransmits); + } + + return 0; +} + + +void +emigrate_record(emigration *emig, msg *m) +{ + uint64_t insert_id = emig->insert_id++; + + msg_set_uint64(m, MIG_FIELD_EMIG_INSERT_ID, insert_id); + + emigration_reinsert_ctrl ri_ctrl; + + msg_incr_ref(m); // the reference in the hash + ri_ctrl.m = m; + ri_ctrl.emig = emig; + ri_ctrl.xmit_ms = cf_getms(); + + cf_shash_put(emig->reinsert_hash, &insert_id, &ri_ctrl); + + cf_atomic32_add(&emig->bytes_emigrating, (int32_t)msg_get_wire_size(m)); + + if (as_fabric_send(emig->dest, m, AS_FABRIC_CHANNEL_BULK) != + AS_FABRIC_SUCCESS) { + as_fabric_msg_put(m); + } +} + + +//========================================================== +// Local helpers - immigration. 
+// + +uint32_t +immigration_hashfn(const void *value, uint32_t value_len) +{ + return ((const immigration_hkey *)value)->emig_id; +} + + +void * +run_immigration_reaper(void *arg) +{ + while (true) { + cf_rchash_reduce(g_immigration_hash, immigration_reaper_reduce_fn, + NULL); + sleep(1); + } + + return NULL; +} + + +int +immigration_reaper_reduce_fn(const void *key, uint32_t keylen, void *object, + void *udata) +{ + immigration *immig = (immigration *)object; + + if (immig->start_recv_ms == 0) { + // If the start time isn't set, immigration is still being processed. + return CF_RCHASH_OK; + } + + if (immig->cluster_key != as_exchange_cluster_key() || + (immig->done_recv_ms != 0 && cf_getms() > immig->done_recv_ms + + IMMIGRATION_DEBOUNCE_MS)) { + if (immig->start_result == AS_MIGRATE_OK && + // If we started ok, must be a cluster key change - make sure + // DONE handler doesn't also decrement active counter. + cf_atomic32_incr(&immig->done_recv) == 1) { + as_namespace *ns = immig->rsv.ns; + + if (cf_atomic_int_decr(&ns->migrate_rx_partitions_active) < 0) { + cf_warning(AS_MIGRATE, "migrate_rx_partitions_active < 0"); + cf_atomic_int_incr(&ns->migrate_rx_partitions_active); + } + } + + return CF_RCHASH_REDUCE_DELETE; + } + + return CF_RCHASH_OK; +} + + +//========================================================== +// Local helpers - migrate fabric message handling. 
+// + +int +migrate_receive_msg_cb(cf_node src, msg *m, void *udata) +{ + uint32_t op; + + if (msg_get_uint32(m, MIG_FIELD_OP, &op) != 0) { + cf_warning(AS_MIGRATE, "received message with no op"); + as_fabric_msg_put(m); + return 0; + } + + switch (op) { + //-------------------------------------------- + // Emigration - handle requests: + // + case OPERATION_MERGE_META: + emigration_handle_meta_batch_request(src, m); + break; + + //-------------------------------------------- + // Immigration - handle requests: + // + case OPERATION_START: + immigration_handle_start_request(src, m); + break; + case OPERATION_INSERT: + immigration_handle_insert_request(src, m); + break; + case OPERATION_DONE: + immigration_handle_done_request(src, m); + break; + case OPERATION_ALL_DONE: + immigration_handle_all_done_request(src, m); + break; + + //-------------------------------------------- + // Emigration - handle acknowledgments: + // + case OPERATION_INSERT_ACK: + emigration_handle_insert_ack(src, m); + break; + case OPERATION_START_ACK_OK: + case OPERATION_START_ACK_EAGAIN: + case OPERATION_START_ACK_FAIL: + case OPERATION_DONE_ACK: + case OPERATION_ALL_DONE_ACK: + emigration_handle_ctrl_ack(src, m, op); + break; + + //-------------------------------------------- + // Immigration - handle acknowledgments: + // + case OPERATION_MERGE_META_ACK: + immigration_handle_meta_batch_ack(src, m); + break; + + default: + cf_detail(AS_MIGRATE, "received unexpected message op %u", op); + as_fabric_msg_put(m); + break; + } + + return 0; +} + + +//---------------------------------------------------------- +// Immigration - request message handling. 
+// + +void +immigration_handle_start_request(cf_node src, msg *m) +{ + uint32_t emig_id; + + if (msg_get_uint32(m, MIG_FIELD_EMIG_ID, &emig_id) != 0) { + cf_warning(AS_MIGRATE, "handle start: msg get for emig id failed"); + as_fabric_msg_put(m); + return; + } + + uint64_t cluster_key; + + if (msg_get_uint64(m, MIG_FIELD_CLUSTER_KEY, &cluster_key) != 0) { + cf_warning(AS_MIGRATE, "handle start: msg get for cluster key failed"); + as_fabric_msg_put(m); + return; + } + + uint8_t *ns_name; + size_t ns_name_len; + + if (msg_get_buf(m, MIG_FIELD_NAMESPACE, &ns_name, &ns_name_len, + MSG_GET_DIRECT) != 0) { + cf_warning(AS_MIGRATE, "handle start: msg get for namespace failed"); + as_fabric_msg_put(m); + return; + } + + as_namespace *ns = as_namespace_get_bybuf(ns_name, ns_name_len); + + if (! ns) { + cf_warning(AS_MIGRATE, "handle start: bad namespace"); + as_fabric_msg_put(m); + return; + } + + uint32_t pid; + + if (msg_get_uint32(m, MIG_FIELD_PARTITION, &pid) != 0) { + cf_warning(AS_MIGRATE, "handle start: msg get for pid failed"); + as_fabric_msg_put(m); + return; + } + + uint32_t emig_features = 0; + + msg_get_uint32(m, MIG_FIELD_FEATURES, &emig_features); + + uint64_t emig_n_recs = 0; + + msg_get_uint64(m, MIG_FIELD_PARTITION_SIZE, &emig_n_recs); + + msg_preserve_fields(m, 1, MIG_FIELD_EMIG_ID); + + immigration *immig = cf_rc_alloc(sizeof(immigration)); + + cf_atomic_int_incr(&ns->migrate_rx_instance_count); + + immig->src = src; + immig->cluster_key = cluster_key; + immig->pid = pid; + immig->start_recv_ms = 0; + immig->done_recv = 0; + immig->done_recv_ms = 0; + immig->emig_id = emig_id; + immig->meta_q = meta_out_q_create(); + immig->features = MY_MIG_FEATURES; + immig->ns = ns; + immig->rsv.p = NULL; + + immigration_hkey hkey; + + hkey.src = src; + hkey.emig_id = emig_id; + + while (true) { + if (cf_rchash_put_unique(g_immigration_hash, (void *)&hkey, + sizeof(hkey), (void *)immig) == CF_RCHASH_OK) { + cf_rc_reserve(immig); // so either put or get yields 
ref-count 2 + + // First start request (not a retransmit) for this pid this round, + // or we had ack'd previous start request with 'EAGAIN'. + immig->start_result = as_partition_immigrate_start(ns, pid, + cluster_key, src); + break; + } + + immigration *immig0; + + if (cf_rchash_get(g_immigration_hash, (void *)&hkey, sizeof(hkey), + (void *)&immig0) == CF_RCHASH_OK) { + immigration_release(immig); // free just-alloc'd immig ... + + if (immig0->start_recv_ms == 0) { + immigration_release(immig0); + return; // allow previous thread to respond + } + + if (immig0->cluster_key != cluster_key) { + immigration_release(immig0); + return; // other node reused an immig_id, allow reaper to reap + } + + immig = immig0; // ... and use original + break; + } + } + + switch (immig->start_result) { + case AS_MIGRATE_OK: + break; + case AS_MIGRATE_FAIL: + immig->start_recv_ms = cf_getms(); // permits reaping + immig->done_recv_ms = immig->start_recv_ms; // permits reaping + immigration_release(immig); + immigration_ack_start_request(src, m, OPERATION_START_ACK_FAIL); + return; + case AS_MIGRATE_AGAIN: + // Remove from hash so that the immig can be tried again. + cf_rchash_delete(g_immigration_hash, (void *)&hkey, sizeof(hkey)); + immigration_release(immig); + immigration_ack_start_request(src, m, OPERATION_START_ACK_EAGAIN); + return; + default: + cf_crash(AS_MIGRATE, "unexpected as_partition_immigrate_start result"); + break; + } + + if (immig->start_recv_ms == 0) { + as_partition_reserve(ns, pid, &immig->rsv); + cf_atomic_int_incr(&immig->rsv.ns->migrate_rx_partitions_active); + + if (! 
immigration_start_meta_sender(immig, emig_features, + emig_n_recs)) { + immig->features &= ~MIG_FEATURE_MERGE; + } + + immig->start_recv_ms = cf_getms(); // permits reaping + } + + msg_set_uint32(m, MIG_FIELD_FEATURES, immig->features); + + immigration_release(immig); + immigration_ack_start_request(src, m, OPERATION_START_ACK_OK); +} + + +void +immigration_ack_start_request(cf_node src, msg *m, uint32_t op) +{ + msg_set_uint32(m, MIG_FIELD_OP, op); + + if (as_fabric_send(src, m, AS_FABRIC_CHANNEL_CTRL) != AS_FABRIC_SUCCESS) { + as_fabric_msg_put(m); + } +} + + +void +immigration_handle_insert_request(cf_node src, msg *m) +{ + uint32_t emig_id; + + if (msg_get_uint32(m, MIG_FIELD_EMIG_ID, &emig_id) != 0) { + cf_warning(AS_MIGRATE, "handle insert: msg get for emig id failed"); + as_fabric_msg_put(m); + return; + } + + immigration_hkey hkey; + + hkey.src = src; + hkey.emig_id = emig_id; + + immigration *immig; + + if (cf_rchash_get(g_immigration_hash, (void *)&hkey, sizeof(hkey), + (void **)&immig) != CF_RCHASH_OK) { + // The immig no longer exists, likely the cluster key advanced and this + // record immigration is from prior round. Do not ack this request. + as_fabric_msg_put(m); + return; + } + + if (immig->start_result != AS_MIGRATE_OK || immig->start_recv_ms == 0) { + // If this immigration didn't start and reserve a partition, it's + // likely in the hash on a retransmit and this insert is for the + // original - ignore, and let this immigration proceed. 
+ immigration_release(immig); + as_fabric_msg_put(m); + return; + } + + cf_atomic_int_incr(&immig->rsv.ns->migrate_record_receives); + + if (immig->cluster_key != as_exchange_cluster_key()) { + immigration_release(immig); + as_fabric_msg_put(m); + return; + } + + as_remote_record rr = { .src = src, .rsv = &immig->rsv }; + + if (msg_get_buf(m, MIG_FIELD_DIGEST, (uint8_t **)&rr.keyd, NULL, + MSG_GET_DIRECT) != 0) { + cf_warning(AS_MIGRATE, "handle insert: got no digest"); + as_fabric_msg_put(m); + return; + } + + if (msg_get_buf(m, MIG_FIELD_RECORD, (uint8_t **)&rr.record_buf, + &rr.record_buf_sz, MSG_GET_DIRECT) != 0 || rr.record_buf_sz < 2) { + cf_warning(AS_MIGRATE, "handle insert: got no or bad record"); + immigration_release(immig); + as_fabric_msg_put(m); + return; + } + + if (msg_get_uint32(m, MIG_FIELD_GENERATION, &rr.generation) != 0 || + rr.generation == 0) { + cf_warning(AS_MIGRATE, "handle insert: got no or bad generation"); + immigration_release(immig); + as_fabric_msg_put(m); + return; + } + + if (msg_get_uint64(m, MIG_FIELD_LAST_UPDATE_TIME, + &rr.last_update_time) != 0) { + cf_warning(AS_MIGRATE, "handle insert: got no last-update-time"); + immigration_release(immig); + as_fabric_msg_put(m); + return; + } + + msg_get_uint32(m, MIG_FIELD_VOID_TIME, &rr.void_time); + + msg_get_buf(m, MIG_FIELD_SET_NAME, (uint8_t **)&rr.set_name, + &rr.set_name_len, MSG_GET_DIRECT); + + msg_get_buf(m, MIG_FIELD_KEY, (uint8_t **)&rr.key, &rr.key_size, + MSG_GET_DIRECT); + + uint32_t info = 0; + + msg_get_uint32(m, MIG_FIELD_INFO, &info); + + if (immigration_ignore_pickle(rr.record_buf, info)) { + cf_warning_digest(AS_MIGRATE, rr.keyd, "handle insert: binless pickle "); + } + else { + immigration_init_repl_state(&rr, info); + + int rv = as_record_replace_if_better(&rr, false, false, false); + + // If replace failed, don't ack - it will be retransmitted. + if (! 
(rv == AS_PROTO_RESULT_OK || + // Migrations just treat these errors as successful no-ops: + rv == AS_PROTO_RESULT_FAIL_RECORD_EXISTS || + rv == AS_PROTO_RESULT_FAIL_GENERATION)) { + immigration_release(immig); + as_fabric_msg_put(m); + return; + } + } + + immigration_release(immig); + + msg_preserve_fields(m, 2, MIG_FIELD_EMIG_INSERT_ID, MIG_FIELD_EMIG_ID); + + msg_set_uint32(m, MIG_FIELD_OP, OPERATION_INSERT_ACK); + + if (as_fabric_send(src, m, AS_FABRIC_CHANNEL_BULK) != AS_FABRIC_SUCCESS) { + as_fabric_msg_put(m); + } +} + + +void +immigration_handle_done_request(cf_node src, msg *m) +{ + uint32_t emig_id; + + if (msg_get_uint32(m, MIG_FIELD_EMIG_ID, &emig_id) != 0) { + cf_warning(AS_MIGRATE, "handle done: msg get for emig id failed"); + as_fabric_msg_put(m); + return; + } + + msg_preserve_fields(m, 1, MIG_FIELD_EMIG_ID); + + // See if this migration already exists & has been notified. + immigration_hkey hkey; + + hkey.src = src; + hkey.emig_id = emig_id; + + immigration *immig; + + if (cf_rchash_get(g_immigration_hash, (void *)&hkey, sizeof(hkey), + (void **)&immig) == CF_RCHASH_OK) { + if (immig->start_result != AS_MIGRATE_OK || immig->start_recv_ms == 0) { + // If this immigration didn't start and reserve a partition, it's + // likely in the hash on a retransmit and this DONE is for the + // original - ignore, and let this immigration proceed. + immigration_release(immig); + as_fabric_msg_put(m); + return; + } + + if (cf_atomic32_incr(&immig->done_recv) == 1) { + // Record the time of the first DONE received. + immig->done_recv_ms = cf_getms(); + + as_namespace *ns = immig->rsv.ns; + + if (cf_atomic_int_decr(&ns->migrate_rx_partitions_active) < 0) { + cf_warning(AS_MIGRATE, "migrate_rx_partitions_active < 0"); + cf_atomic_int_incr(&ns->migrate_rx_partitions_active); + } + + as_partition_immigrate_done(ns, immig->rsv.p->id, + immig->cluster_key, immig->src); + } + // else - was likely a retransmitted done message. 
+ + immigration_release(immig); + } + // else - garbage, or super-stale retransmitted done message. + + msg_set_uint32(m, MIG_FIELD_OP, OPERATION_DONE_ACK); + + if (as_fabric_send(src, m, AS_FABRIC_CHANNEL_CTRL) != AS_FABRIC_SUCCESS) { + as_fabric_msg_put(m); + return; + } +} + + +void +immigration_handle_all_done_request(cf_node src, msg *m) +{ + uint32_t emig_id; + + if (msg_get_uint32(m, MIG_FIELD_EMIG_ID, &emig_id) != 0) { + cf_warning(AS_MIGRATE, "handle all done: msg get for emig id failed"); + as_fabric_msg_put(m); + return; + } + + uint64_t cluster_key; + + if (msg_get_uint64(m, MIG_FIELD_CLUSTER_KEY, &cluster_key) != 0) { + cf_warning(AS_MIGRATE, "handle all done: msg get for cluster key failed"); + as_fabric_msg_put(m); + return; + } + + uint8_t *ns_name; + size_t ns_name_len; + + if (msg_get_buf(m, MIG_FIELD_NAMESPACE, &ns_name, &ns_name_len, + MSG_GET_DIRECT) != 0) { + cf_warning(AS_MIGRATE, "handle all done: msg get for namespace failed"); + as_fabric_msg_put(m); + return; + } + + as_namespace *ns = as_namespace_get_bybuf(ns_name, ns_name_len); + + if (! ns) { + cf_warning(AS_MIGRATE, "handle all done: bad namespace"); + as_fabric_msg_put(m); + return; + } + + uint32_t pid; + + if (msg_get_uint32(m, MIG_FIELD_PARTITION, &pid) != 0) { + cf_warning(AS_MIGRATE, "handle all done: msg get for pid failed"); + as_fabric_msg_put(m); + return; + } + + msg_preserve_fields(m, 1, MIG_FIELD_EMIG_ID); + + // TODO - optionally, for replicas we might use this to remove immig objects + // from hash and deprecate timer... + + if (as_partition_migrations_all_done(ns, pid, cluster_key) != + AS_MIGRATE_OK) { + as_fabric_msg_put(m); + return; + } + + msg_set_uint32(m, MIG_FIELD_OP, OPERATION_ALL_DONE_ACK); + + if (as_fabric_send(src, m, AS_FABRIC_CHANNEL_CTRL) != AS_FABRIC_SUCCESS) { + as_fabric_msg_put(m); + return; + } +} + + +//---------------------------------------------------------- +// Emigration - acknowledgment message handling. 
+// + +void +emigration_handle_insert_ack(cf_node src, msg *m) +{ + uint32_t emig_id; + + if (msg_get_uint32(m, MIG_FIELD_EMIG_ID, &emig_id) != 0) { + cf_warning(AS_MIGRATE, "insert ack: msg get for emig id failed"); + as_fabric_msg_put(m); + return; + } + + emigration *emig; + + if (cf_rchash_get(g_emigration_hash, (void *)&emig_id, sizeof(emig_id), + (void **)&emig) != CF_RCHASH_OK) { + // Probably came from a migration prior to the latest rebalance. + as_fabric_msg_put(m); + return; + } + + uint64_t insert_id; + + if (msg_get_uint64(m, MIG_FIELD_EMIG_INSERT_ID, &insert_id) != 0) { + cf_warning(AS_MIGRATE, "insert ack: msg get for emig insert id failed"); + emigration_release(emig); + as_fabric_msg_put(m); + return; + } + + emigration_reinsert_ctrl *ri_ctrl = NULL; + pthread_mutex_t *vlock; + + if (cf_shash_get_vlock(emig->reinsert_hash, &insert_id, (void **)&ri_ctrl, + &vlock) == CF_SHASH_OK) { + if (src == emig->dest) { + if (cf_atomic32_sub(&emig->bytes_emigrating, + (int32_t)msg_get_wire_size(ri_ctrl->m)) < 0) { + cf_warning(AS_MIGRATE, "bytes_emigrating less than zero"); + } + + as_fabric_msg_put(ri_ctrl->m); + // At this point, the rt is *GONE*. 
+ cf_shash_delete_lockfree(emig->reinsert_hash, &insert_id); + ri_ctrl = NULL; + } + else { + cf_warning(AS_MIGRATE, "insert ack: unexpected source %lx", src); + } + + pthread_mutex_unlock(vlock); + } + + emigration_release(emig); + as_fabric_msg_put(m); +} + + +void +emigration_handle_ctrl_ack(cf_node src, msg *m, uint32_t op) +{ + uint32_t emig_id; + + if (msg_get_uint32(m, MIG_FIELD_EMIG_ID, &emig_id) != 0) { + cf_warning(AS_MIGRATE, "ctrl ack: msg get for emig id failed"); + as_fabric_msg_put(m); + return; + } + + uint32_t immig_features = 0; + + msg_get_uint32(m, MIG_FIELD_FEATURES, &immig_features); + + as_fabric_msg_put(m); + + emigration *emig; + + if (cf_rchash_get(g_emigration_hash, (void *)&emig_id, sizeof(emig_id), + (void **)&emig) == CF_RCHASH_OK) { + if (emig->dest == src) { + if ((immig_features & MIG_FEATURE_MERGE) == 0) { + // TODO - rethink where this should go after further refactor. + if (op == OPERATION_START_ACK_OK && emig->meta_q) { + meta_in_q_rejected(emig->meta_q); + } + } + + cf_queue_push(emig->ctrl_q, &op); + } + else { + cf_warning(AS_MIGRATE, "ctrl ack (%d): unexpected source %lx", op, + src); + } + + emigration_release(emig); + } + else { + cf_detail(AS_MIGRATE, "ctrl ack (%d): can't find emig id %u", op, + emig_id); + } +} + + +//========================================================== +// Local helpers - info API helpers. 
+// + +int +emigration_dump_reduce_fn(const void *key, uint32_t keylen, void *object, + void *udata) +{ + uint32_t emig_id = *(const uint32_t *)key; + emigration *emig = (emigration *)object; + int *item_num = (int *)udata; + + cf_info(AS_MIGRATE, "[%d]: mig_id %u : id %u ; ck %lx", *item_num, emig_id, + emig->id, emig->cluster_key); + + *item_num += 1; + + return 0; +} + + +int +immigration_dump_reduce_fn(const void *key, uint32_t keylen, void *object, + void *udata) +{ + const immigration_hkey *hkey = (const immigration_hkey *)key; + immigration *immig = (immigration *)object; + int *item_num = (int *)udata; + + cf_info(AS_MIGRATE, "[%d]: src %016lx ; id %u : src %016lx ; done recv %u ; start recv ms %lu ; done recv ms %lu ; ck %lx", + *item_num, hkey->src, hkey->emig_id, immig->src, immig->done_recv, + immig->start_recv_ms, immig->done_recv_ms, immig->cluster_key); + + *item_num += 1; + + return 0; +} diff --git a/as/src/fabric/migrate_ce.c b/as/src/fabric/migrate_ce.c new file mode 100644 index 00000000..a3e98919 --- /dev/null +++ b/as/src/fabric/migrate_ce.c @@ -0,0 +1,94 @@ +/* migrate_ce.c + * + * Copyright (C) 2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. 
If not, see http://www.gnu.org/licenses/ + */ + + +//========================================================== +// Includes. +// + +#include "fabric/migrate.h" + +#include +#include +#include + +#include "fault.h" +#include "msg.h" +#include "node.h" + +#include "base/datamodel.h" +#include "fabric/fabric.h" + + +//========================================================== +// Typedefs & constants. +// + +const uint32_t MY_MIG_FEATURES = 0; + + +//========================================================== +// Community Edition API. +// + +bool +should_emigrate_record(emigration *emig, as_index_ref *r_ref) +{ + return true; +} + +uint32_t +emigration_pack_info(const emigration *emig, const as_record *r) +{ + return 0; +} + +void +emigration_handle_meta_batch_request(cf_node src, msg *m) +{ + cf_warning(AS_MIGRATE, "CE node received meta-batch request - unexpected"); + as_fabric_msg_put(m); +} + +bool +immigration_ignore_pickle(const uint8_t *buf, uint32_t info) +{ + return as_record_pickle_is_binless(buf); +} + +void +immigration_init_repl_state(as_remote_record* rr, uint32_t info) +{ +} + +void +immigration_handle_meta_batch_ack(cf_node src, msg *m) +{ + cf_warning(AS_MIGRATE, "CE node received meta-batch ack - unexpected"); + as_fabric_msg_put(m); +} + +bool +immigration_start_meta_sender(immigration *immig, uint32_t emig_features, + uint64_t emig_partition_sz) +{ + return false; +} diff --git a/as/src/fabric/partition.c b/as/src/fabric/partition.c new file mode 100644 index 00000000..5b8f599f --- /dev/null +++ b/as/src/fabric/partition.c @@ -0,0 +1,809 @@ +/* + * partition.c + * + * Copyright (C) 2008-2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. 
+ * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +//========================================================== +// Includes. +// + +#include "fabric/partition.h" + +#include +#include +#include +#include +#include + +#include "citrusleaf/alloc.h" +#include "citrusleaf/cf_atomic.h" +#include "citrusleaf/cf_b64.h" + +#include "fault.h" +#include "node.h" + +#include "base/cfg.h" +#include "base/datamodel.h" +#include "base/index.h" +#include "base/proto.h" +#include "fabric/partition_balance.h" + + +//========================================================== +// Forward declarations. +// + +cf_node find_best_node(const as_partition* p, bool is_read); +void accumulate_replica_stats(const as_partition* p, uint64_t* p_n_objects, uint64_t* p_n_tombstones); +void partition_reserve_lockfree(as_partition* p, as_namespace* ns, as_partition_reservation* rsv); +cf_node partition_getreplica_prole(as_namespace* ns, uint32_t pid); +char partition_descriptor(const as_partition* p); +int partition_get_replica_self_lockfree(const as_namespace* ns, uint32_t pid); + + +//========================================================== +// Public API. +// + +void +as_partition_init(as_namespace* ns, uint32_t pid) +{ + as_partition* p = &ns->partitions[pid]; + + // Note - as_partition has been zeroed since it's a member of as_namespace. + // Set non-zero members. 
+ + pthread_mutex_init(&p->lock, NULL); + + p->id = pid; + + if (ns->cold_start) { + p->vp = as_index_tree_create(&ns->tree_shared, ns->arena); + } + else { + p->vp = as_index_tree_resume(&ns->tree_shared, ns->arena, + &ns->xmem_roots[pid * ns->tree_shared.n_sprigs]); + } +} + + +void +as_partition_shutdown(as_namespace* ns, uint32_t pid) +{ + as_partition* p = &ns->partitions[pid]; + + pthread_mutex_lock(&p->lock); + + as_index_tree_shutdown(p->vp, + &ns->xmem_roots[pid * ns->tree_shared.n_sprigs]); +} + + +void +as_partition_freeze(as_partition* p) +{ + // TODO - rearrange as_partition so we can call memset() here? + p->n_nodes = 0; + p->n_replicas = 0; + memset(p->replicas, 0, sizeof(p->replicas)); + + p->pending_emigrations = 0; + p->pending_immigrations = 0; + memset(p->immigrators, 0, sizeof(p->immigrators)); + + p->working_master = (cf_node)0; + + p->n_dupl = 0; + memset(p->dupls, 0, sizeof(p->dupls)); + + p->n_witnesses = 0; + memset(p->witnesses, 0, sizeof(p->witnesses)); +} + + +// Get a list of all nodes (excluding self) that are replicas for a specified +// partition: place the list in *nv and return the number of nodes found. +uint32_t +as_partition_get_other_replicas(as_partition* p, cf_node* nv) +{ + uint32_t n_other_replicas = 0; + + pthread_mutex_lock(&p->lock); + + for (uint32_t repl_ix = 0; repl_ix < p->n_replicas; repl_ix++) { + // Don't ever include yourself. + if (p->replicas[repl_ix] == g_config.self_node) { + continue; + } + + // Copy the node ID into the user-supplied vector. + nv[n_other_replicas++] = p->replicas[repl_ix]; + } + + pthread_mutex_unlock(&p->lock); + + return n_other_replicas; +} + + +cf_node +as_partition_writable_node(as_namespace* ns, uint32_t pid) +{ + as_partition* p = &ns->partitions[pid]; + + pthread_mutex_lock(&p->lock); + + if (p->n_replicas == 0) { + // This partition is unavailable. 
+ pthread_mutex_unlock(&p->lock); + return (cf_node)0; + } + + cf_node best_node = find_best_node(p, false); + + pthread_mutex_unlock(&p->lock); + + return best_node; +} + + +// If this node is an eventual master, return the acting master, else return 0. +cf_node +as_partition_proxyee_redirect(as_namespace* ns, uint32_t pid) +{ + as_partition* p = &ns->partitions[pid]; + + pthread_mutex_lock(&p->lock); + + cf_node node = (cf_node)0; + + if (g_config.self_node == p->replicas[0] && + g_config.self_node != p->working_master) { + node = p->working_master; + } + + pthread_mutex_unlock(&p->lock); + + return node; +} + + +// TODO - deprecate in "six months". +void +as_partition_get_replicas_prole_str(cf_dyn_buf* db) +{ + uint8_t prole_bitmap[CLIENT_BITMAP_BYTES]; + char b64_bitmap[CLIENT_B64MAP_BYTES]; + + size_t db_sz = db->used_sz; + + for (uint32_t ns_ix = 0; ns_ix < g_config.n_namespaces; ns_ix++) { + as_namespace* ns = g_config.namespaces[ns_ix]; + + memset(prole_bitmap, 0, sizeof(uint8_t) * CLIENT_BITMAP_BYTES); + cf_dyn_buf_append_string(db, ns->name); + cf_dyn_buf_append_char(db, ':'); + + for (uint32_t pid = 0; pid < AS_PARTITIONS; pid++) { + if (g_config.self_node == partition_getreplica_prole(ns, pid) ) { + prole_bitmap[pid >> 3] |= (0x80 >> (pid & 7)); + } + } + + cf_b64_encode(prole_bitmap, CLIENT_BITMAP_BYTES, b64_bitmap); + cf_dyn_buf_append_buf(db, (uint8_t*)b64_bitmap, CLIENT_B64MAP_BYTES); + cf_dyn_buf_append_char(db, ';'); + } + + if (db_sz != db->used_sz) { + cf_dyn_buf_chomp(db); + } +} + + +void +as_partition_get_replicas_master_str(cf_dyn_buf* db) +{ + size_t db_sz = db->used_sz; + + for (uint32_t ns_ix = 0; ns_ix < g_config.n_namespaces; ns_ix++) { + as_namespace* ns = g_config.namespaces[ns_ix]; + + cf_dyn_buf_append_string(db, ns->name); + cf_dyn_buf_append_char(db, ':'); + cf_dyn_buf_append_buf(db, (uint8_t*)ns->replica_maps[0].b64map, + sizeof(ns->replica_maps[0].b64map)); + cf_dyn_buf_append_char(db, ';'); + } + + if (db_sz != db->used_sz) { + 
cf_dyn_buf_chomp(db); + } +} + + +void +as_partition_get_replicas_all_str(cf_dyn_buf* db, bool include_regime) +{ + size_t db_sz = db->used_sz; + + for (uint32_t ns_ix = 0; ns_ix < g_config.n_namespaces; ns_ix++) { + as_namespace* ns = g_config.namespaces[ns_ix]; + + cf_dyn_buf_append_string(db, ns->name); + cf_dyn_buf_append_char(db, ':'); + + if (include_regime) { + cf_dyn_buf_append_uint32(db, ns->rebalance_regime); + cf_dyn_buf_append_char(db, ','); + } + + uint32_t repl_factor = ns->replication_factor; + + // If we haven't rebalanced yet, report 1 column with no ownership. + if (repl_factor == 0) { + repl_factor = 1; + } + + cf_dyn_buf_append_uint32(db, repl_factor); + + for (uint32_t repl_ix = 0; repl_ix < repl_factor; repl_ix++) { + cf_dyn_buf_append_char(db, ','); + cf_dyn_buf_append_buf(db, + (uint8_t*)&ns->replica_maps[repl_ix].b64map, + sizeof(ns->replica_maps[repl_ix].b64map)); + } + + cf_dyn_buf_append_char(db, ';'); + } + + if (db_sz != db->used_sz) { + cf_dyn_buf_chomp(db); + } +} + + +void +as_partition_get_replica_stats(as_namespace* ns, repl_stats* p_stats) +{ + memset(p_stats, 0, sizeof(repl_stats)); + + for (uint32_t pid = 0; pid < AS_PARTITIONS; pid++) { + as_partition* p = &ns->partitions[pid]; + + pthread_mutex_lock(&p->lock); + + int self_n = find_self_in_replicas(p); // -1 if not + + if (g_config.self_node == p->working_master) { + accumulate_replica_stats(p, + &p_stats->n_master_objects, + &p_stats->n_master_tombstones); + } + else if (self_n >= 0) { + accumulate_replica_stats(p, + &p_stats->n_prole_objects, + &p_stats->n_prole_tombstones); + } + else { + accumulate_replica_stats(p, + &p_stats->n_non_replica_objects, + &p_stats->n_non_replica_tombstones); + } + + pthread_mutex_unlock(&p->lock); + } +} + + +// TODO - what if partition is unavailable? 
+void +as_partition_reserve(as_namespace* ns, uint32_t pid, + as_partition_reservation* rsv) +{ + as_partition* p = &ns->partitions[pid]; + + pthread_mutex_lock(&p->lock); + + partition_reserve_lockfree(p, ns, rsv); + + pthread_mutex_unlock(&p->lock); +} + + +// TODO - what if partition is unavailable? +int +as_partition_reserve_timeout(as_namespace* ns, uint32_t pid, + as_partition_reservation* rsv, int timeout_ms) +{ + as_partition* p = &ns->partitions[pid]; + + struct timespec tp; + cf_set_wait_timespec(timeout_ms, &tp); + + if (pthread_mutex_timedlock(&p->lock, &tp) != 0) { + return -1; + } + + partition_reserve_lockfree(p, ns, rsv); + + pthread_mutex_unlock(&p->lock); + + return 0; +} + + +int +as_partition_reserve_replica(as_namespace* ns, uint32_t pid, + as_partition_reservation* rsv) +{ + as_partition* p = &ns->partitions[pid]; + + pthread_mutex_lock(&p->lock); + + if (! is_self_replica(p)) { + pthread_mutex_unlock(&p->lock); + return AS_PROTO_RESULT_FAIL_CLUSTER_KEY_MISMATCH; + } + + partition_reserve_lockfree(p, ns, rsv); + + pthread_mutex_unlock(&p->lock); + + return AS_PROTO_RESULT_OK; +} + + +// Returns: +// 0 - reserved - node parameter returns self node +// -1 - not reserved - node parameter returns other "better" node +// -2 - not reserved - node parameter not filled - partition is unavailable +int +as_partition_reserve_write(as_namespace* ns, uint32_t pid, + as_partition_reservation* rsv, cf_node* node) +{ + as_partition* p = &ns->partitions[pid]; + + pthread_mutex_lock(&p->lock); + + // If this partition is frozen, return. + if (p->n_replicas == 0) { + if (node) { + *node = (cf_node)0; + } + + pthread_mutex_unlock(&p->lock); + return -2; + } + + cf_node best_node = find_best_node(p, false); + + if (node) { + *node = best_node; + } + + // If this node is not the appropriate one, return. 
+ if (best_node != g_config.self_node) { + pthread_mutex_unlock(&p->lock); + return -1; + } + + partition_reserve_lockfree(p, ns, rsv); + + pthread_mutex_unlock(&p->lock); + + return 0; +} + + +// Returns: +// 0 - reserved - node parameter returns self node +// -1 - not reserved - node parameter returns other "better" node +// -2 - not reserved - node parameter not filled - partition is unavailable +int +as_partition_reserve_read(as_namespace* ns, uint32_t pid, + as_partition_reservation* rsv, bool would_dup_res, cf_node* node) +{ + as_partition* p = &ns->partitions[pid]; + + pthread_mutex_lock(&p->lock); + + // If this partition is unavailable, return. + if (p->n_replicas == 0) { + if (node) { + *node = (cf_node)0; + } + + pthread_mutex_unlock(&p->lock); + return -2; + } + + cf_node best_node = find_best_node(p, + ! partition_reserve_promote(ns, p, would_dup_res)); + + if (node) { + *node = best_node; + } + + // If this node is not the appropriate one, return. + if (best_node != g_config.self_node) { + pthread_mutex_unlock(&p->lock); + return -1; + } + + partition_reserve_lockfree(p, ns, rsv); + + pthread_mutex_unlock(&p->lock); + + return 0; +} + + +// Reserves all query-able partitions. +// Returns the number of partitions reserved. +int +as_partition_prereserve_query(as_namespace* ns, bool can_partition_query[], + as_partition_reservation rsv[]) +{ + int reserved = 0; + + for (uint32_t pid = 0; pid < AS_PARTITIONS; pid++) { + if (as_partition_reserve_query(ns, pid, &rsv[pid])) { + can_partition_query[pid] = false; + } + else { + can_partition_query[pid] = true; + reserved++; + } + } + + return reserved; +} + + +// Reserve a partition for query. +// Return value 0 means the reservation was taken, -1 means not. +int +as_partition_reserve_query(as_namespace* ns, uint32_t pid, + as_partition_reservation* rsv) +{ + return as_partition_reserve_write(ns, pid, rsv, NULL); +} + + +// Obtain a partition reservation for XDR reads. 
Succeeds, if we are sync or +// zombie for the partition. +// TODO - what if partition is unavailable? +int +as_partition_reserve_xdr_read(as_namespace* ns, uint32_t pid, + as_partition_reservation* rsv) +{ + as_partition* p = &ns->partitions[pid]; + + pthread_mutex_lock(&p->lock); + + int res = -1; + + if (as_partition_version_has_data(&p->version)) { + partition_reserve_lockfree(p, ns, rsv); + res = 0; + } + + pthread_mutex_unlock(&p->lock); + + return res; +} + + +void +as_partition_reservation_copy(as_partition_reservation* dst, + as_partition_reservation* src) +{ + dst->ns = src->ns; + dst->p = src->p; + dst->tree = src->tree; + dst->regime = src->regime; + dst->n_dupl = src->n_dupl; + + if (dst->n_dupl != 0) { + memcpy(dst->dupl_nodes, src->dupl_nodes, sizeof(cf_node) * dst->n_dupl); + } +} + + +void +as_partition_release(as_partition_reservation* rsv) +{ + as_index_tree_release(rsv->tree); +} + + +void +as_partition_getinfo_str(cf_dyn_buf* db) +{ + size_t db_sz = db->used_sz; + + cf_dyn_buf_append_string(db, "namespace:partition:state:n_replicas:replica:" + "n_dupl:working_master:emigrates:immigrates:records:tombstones:" + "regime:version:final_version;"); + + for (uint32_t ns_ix = 0; ns_ix < g_config.n_namespaces; ns_ix++) { + as_namespace* ns = g_config.namespaces[ns_ix]; + + for (uint32_t pid = 0; pid < AS_PARTITIONS; pid++) { + as_partition* p = &ns->partitions[pid]; + + pthread_mutex_lock(&p->lock); + + cf_dyn_buf_append_string(db, ns->name); + cf_dyn_buf_append_char(db, ':'); + cf_dyn_buf_append_uint32(db, pid); + cf_dyn_buf_append_char(db, ':'); + cf_dyn_buf_append_char(db, partition_descriptor(p)); + cf_dyn_buf_append_char(db, ':'); + cf_dyn_buf_append_uint32(db, p->n_replicas); + cf_dyn_buf_append_char(db, ':'); + cf_dyn_buf_append_int(db, find_self_in_replicas(p)); + cf_dyn_buf_append_char(db, ':'); + cf_dyn_buf_append_uint32(db, p->n_dupl); + cf_dyn_buf_append_char(db, ':'); + cf_dyn_buf_append_uint64_x(db, p->working_master); + 
cf_dyn_buf_append_char(db, ':'); + cf_dyn_buf_append_int(db, p->pending_emigrations); + cf_dyn_buf_append_char(db, ':'); + cf_dyn_buf_append_int(db, p->pending_immigrations); + cf_dyn_buf_append_char(db, ':'); + cf_dyn_buf_append_uint32(db, as_index_tree_size(p->vp)); + cf_dyn_buf_append_char(db, ':'); + cf_dyn_buf_append_uint64(db, p->n_tombstones); + cf_dyn_buf_append_char(db, ':'); + cf_dyn_buf_append_uint32(db, p->regime); + cf_dyn_buf_append_char(db, ':'); + cf_dyn_buf_append_string(db, VERSION_AS_STRING(&p->version)); + cf_dyn_buf_append_char(db, ':'); + cf_dyn_buf_append_string(db, VERSION_AS_STRING(&p->final_version)); + + cf_dyn_buf_append_char(db, ';'); + + pthread_mutex_unlock(&p->lock); + } + } + + if (db_sz != db->used_sz) { + cf_dyn_buf_chomp(db); // take back the final ';' + } +} + + +//========================================================== +// Public API - client view replica maps. +// + +void +client_replica_maps_create(as_namespace* ns) +{ + uint32_t size = sizeof(client_replica_map) * ns->cfg_replication_factor; + + ns->replica_maps = cf_malloc(size); + memset(ns->replica_maps, 0, size); + + for (uint32_t repl_ix = 0; repl_ix < ns->cfg_replication_factor; + repl_ix++) { + client_replica_map* repl_map = &ns->replica_maps[repl_ix]; + + pthread_mutex_init(&repl_map->write_lock, NULL); + + cf_b64_encode((uint8_t*)repl_map->bitmap, + (uint32_t)sizeof(repl_map->bitmap), (char*)repl_map->b64map); + } +} + + +void +client_replica_maps_clear(as_namespace* ns) +{ + memset(ns->replica_maps, 0, + sizeof(client_replica_map) * ns->cfg_replication_factor); + + for (uint32_t repl_ix = 0; repl_ix < ns->cfg_replication_factor; + repl_ix++) { + client_replica_map* repl_map = &ns->replica_maps[repl_ix]; + + cf_b64_encode((uint8_t*)repl_map->bitmap, + (uint32_t)sizeof(repl_map->bitmap), (char*)repl_map->b64map); + } +} + + +bool +client_replica_maps_update(as_namespace* ns, uint32_t pid) +{ + uint32_t byte_i = pid >> 3; + uint32_t byte_chunk = (byte_i / 3); + 
uint32_t chunk_bitmap_offset = byte_chunk * 3; + uint32_t chunk_b64map_offset = byte_chunk << 2; + + uint32_t bytes_from_end = CLIENT_BITMAP_BYTES - chunk_bitmap_offset; + uint32_t input_size = bytes_from_end > 3 ? 3 : bytes_from_end; + + int replica = partition_get_replica_self_lockfree(ns, pid); // -1 if not + uint8_t set_mask = 0x80 >> (pid & 0x7); + bool changed = false; + + for (int repl_ix = 0; repl_ix < (int)ns->cfg_replication_factor; + repl_ix++) { + client_replica_map* repl_map = &ns->replica_maps[repl_ix]; + + volatile uint8_t* mbyte = repl_map->bitmap + byte_i; + bool owned = replica == repl_ix; + bool is_set = (*mbyte & set_mask) != 0; + bool needs_update = (owned && ! is_set) || (! owned && is_set); + + if (! needs_update) { + continue; + } + + volatile uint8_t* bitmap_chunk = repl_map->bitmap + chunk_bitmap_offset; + volatile char* b64map_chunk = repl_map->b64map + chunk_b64map_offset; + + pthread_mutex_lock(&repl_map->write_lock); + + *mbyte ^= set_mask; + cf_b64_encode((uint8_t*)bitmap_chunk, input_size, (char*)b64map_chunk); + + pthread_mutex_unlock(&repl_map->write_lock); + + changed = true; + } + + return changed; +} + + +bool +client_replica_maps_is_partition_queryable(const as_namespace* ns, uint32_t pid) +{ + uint32_t byte_i = pid >> 3; + + const client_replica_map* repl_map = ns->replica_maps; + const volatile uint8_t* mbyte = repl_map->bitmap + byte_i; + + uint8_t set_mask = 0x80 >> (pid & 0x7); + + return (*mbyte & set_mask) != 0; +} + + +//========================================================== +// Local helpers. +// + +// Find best node to handle read/write. Called within partition lock. +cf_node +find_best_node(const as_partition* p, bool is_read) +{ + // Working master (final or acting) returns self, eventual master returns + // acting master. Others don't have p->working_master set. 
+ if (p->working_master != (cf_node)0) { + return p->working_master; + } + + if (is_read && p->pending_immigrations == 0 && + find_self_in_replicas(p) > 0) { + return g_config.self_node; // may read from prole that's got everything + } + + return p->replicas[0]; // final master as a last resort +} + + +void +accumulate_replica_stats(const as_partition* p, uint64_t* p_n_objects, + uint64_t* p_n_tombstones) +{ + int64_t n_tombstones = (int64_t)p->n_tombstones; + int64_t n_objects = (int64_t)as_index_tree_size(p->vp) - n_tombstones; + + *p_n_objects += n_objects > 0 ? (uint64_t)n_objects : 0; + *p_n_tombstones += (uint64_t)n_tombstones; +} + + +void +partition_reserve_lockfree(as_partition* p, as_namespace* ns, + as_partition_reservation* rsv) +{ + cf_rc_reserve(p->vp); + + rsv->ns = ns; + rsv->p = p; + rsv->tree = p->vp; + rsv->regime = p->regime; + rsv->n_dupl = p->n_dupl; + + if (rsv->n_dupl != 0) { + memcpy(rsv->dupl_nodes, p->dupls, sizeof(cf_node) * rsv->n_dupl); + } +} + + +// TODO - deprecate in "six months". +cf_node +partition_getreplica_prole(as_namespace* ns, uint32_t pid) +{ + as_partition* p = &ns->partitions[pid]; + + pthread_mutex_lock(&p->lock); + + // Check is this is a master node. + cf_node best_node = find_best_node(p, false); + + if (best_node == g_config.self_node) { + // It's a master, return 0. + best_node = (cf_node)0; + } + else { + // Not a master, see if it's a prole. + best_node = find_best_node(p, true); + } + + pthread_mutex_unlock(&p->lock); + + return best_node; +} + + +char +partition_descriptor(const as_partition* p) +{ + int self_n = find_self_in_replicas(p); // -1 if not + + if (self_n >= 0) { + return p->pending_immigrations == 0 ? 'S' : 'D'; + } + + if (as_partition_version_is_null(&p->version)) { + return 'A'; + } + + return as_partition_version_has_data(&p->version) ? 
'Z' : 'X'; +} + + +int +partition_get_replica_self_lockfree(const as_namespace* ns, uint32_t pid) +{ + const as_partition* p = &ns->partitions[pid]; + + int self_n = find_self_in_replicas(p); // -1 if not + + if (g_config.self_node == p->working_master) { + return 0; + } + + if (self_n > 0 && p->pending_immigrations == 0 && + // Check self_n < n_repl only because n_repl could be out-of-sync + // with (less than) partition's replica list count. + self_n < (int)ns->replication_factor) { + return self_n; + } + + return -1; // not a replica +} diff --git a/as/src/fabric/partition_balance.c b/as/src/fabric/partition_balance.c new file mode 100644 index 00000000..1c6d1634 --- /dev/null +++ b/as/src/fabric/partition_balance.c @@ -0,0 +1,1456 @@ +/* + * partition_balance.c + * + * Copyright (C) 2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +//========================================================== +// Includes. 
+// + +#include "fabric/partition_balance.h" + +#include +#include +#include +#include +#include +#include + +#include "citrusleaf/alloc.h" +#include "citrusleaf/cf_atomic.h" +#include "citrusleaf/cf_hash_math.h" +#include "citrusleaf/cf_queue.h" + +#include "compare.h" +#include "fault.h" +#include "node.h" + +#include "base/cfg.h" +#include "base/datamodel.h" +#include "base/index.h" +#include "fabric/exchange.h" +#include "fabric/hb.h" +#include "fabric/migrate.h" +#include "fabric/partition.h" +#include "storage/storage.h" + + +//========================================================== +// Typedefs & constants. +// + +const as_partition_version ZERO_VERSION = { 0 }; + + +//========================================================== +// Globals. +// + +cf_atomic32 g_partition_generation = (uint32_t)-1; + +// Using int for 4-byte size, but maintaining bool semantics. +// TODO - ok as non-volatile, but should selectively load/store in the future. +static int g_init_balance_done = false; + +static cf_atomic32 g_migrate_num_incoming = 0; + +// Using int for 4-byte size, but maintaining bool semantics. +volatile int g_allow_migrations = false; + +uint64_t g_hashed_pids[AS_PARTITIONS]; + +// Shortcuts to values set by as_exchange, for use in partition balance only. +uint32_t g_cluster_size = 0; +cf_node* g_succession = NULL; + +cf_node g_full_node_seq_table[AS_CLUSTER_SZ * AS_PARTITIONS]; +sl_ix_t g_full_sl_ix_table[AS_CLUSTER_SZ * AS_PARTITIONS]; + + +//========================================================== +// Forward declarations. +// + +// Only partition_balance hooks into exchange. +extern cf_node* as_exchange_succession_unsafe(); + +// Helpers - balance partitions. 
+void fill_global_tables(); +void apply_single_replica_limit_ap(as_namespace* ns); +uint32_t rack_count(const as_namespace* ns); +int find_working_master_ap(const as_partition* p, const sl_ix_t* ns_sl_ix, const as_namespace* ns); +uint32_t find_duplicates_ap(const as_partition* p, const cf_node* ns_node_seq, const sl_ix_t* ns_sl_ix, const struct as_namespace_s* ns, uint32_t working_master_n, cf_node dupls[]); +void advance_version_ap(as_partition* p, const sl_ix_t* ns_sl_ix, as_namespace* ns, uint32_t self_n, uint32_t working_master_n, uint32_t n_dupl, const cf_node dupls[]); +uint32_t fill_family_versions(const as_partition* p, const sl_ix_t* ns_sl_ix, const as_namespace* ns, uint32_t working_master_n, uint32_t n_dupl, const cf_node dupls[], as_partition_version family_versions[]); +bool has_replica_parent(const as_partition* p, const sl_ix_t* ns_sl_ix, const as_namespace* ns, const as_partition_version* subset_version, uint32_t subset_n); +uint32_t find_family(const as_partition_version* self_version, uint32_t n_families, const as_partition_version family_versions[]); + +// Helpers - migration-related. +bool partition_immigration_is_valid(const as_partition* p, cf_node source_node, const as_namespace* ns, const char* tag); + + +//========================================================== +// Inlines & macros. +// + +static inline bool +is_self_final_master(const as_partition* p) +{ + return p->replicas[0] == g_config.self_node; +} + + +//========================================================== +// Public API - regulate migrations. 
+// + +void +as_partition_balance_disallow_migrations() +{ + cf_detail(AS_PARTITION, "disallow migrations"); + + g_allow_migrations = false; +} + + +bool +as_partition_balance_are_migrations_allowed() +{ + return g_allow_migrations; +} + + +void +as_partition_balance_synchronize_migrations() +{ + // Acquire and release each partition lock to ensure threads acquiring a + // partition lock after this will be forced to check the latest cluster key. + for (uint32_t ns_ix = 0; ns_ix < g_config.n_namespaces; ns_ix++) { + as_namespace* ns = g_config.namespaces[ns_ix]; + + for (uint32_t pid = 0; pid < AS_PARTITIONS; pid++) { + as_partition* p = &ns->partitions[pid]; + + pthread_mutex_lock(&p->lock); + pthread_mutex_unlock(&p->lock); + } + } + + // Prior-round migrations won't decrement g_migrate_num_incoming due to + // cluster key check. + cf_atomic32_set(&g_migrate_num_incoming, 0); +} + + +//========================================================== +// Public API - balance partitions. +// + +void +as_partition_balance_init() +{ + // Cache hashed pids for all future rebalances. + for (uint32_t pid = 0; pid < AS_PARTITIONS; pid++) { + g_hashed_pids[pid] = cf_hash_fnv64((const uint8_t*)&pid, + sizeof(uint32_t)); + } + + for (uint32_t ns_ix = 0; ns_ix < g_config.n_namespaces; ns_ix++) { + as_namespace* ns = g_config.namespaces[ns_ix]; + + uint32_t n_stored = 0; + + for (uint32_t pid = 0; pid < AS_PARTITIONS; pid++) { + as_partition* p = &ns->partitions[pid]; + + as_storage_info_get(ns, p); + + if (as_partition_version_has_data(&p->version)) { + as_partition_isolate_version(ns, p); + n_stored++; + } + } + + cf_info(AS_PARTITION, "{%s} %u partitions: found %u absent, %u stored", + ns->name, AS_PARTITIONS, AS_PARTITIONS - n_stored, n_stored); + } + + partition_balance_init(); +} + + +// Has the node resolved as operating either in a multi-node cluster or as a +// single-node cluster? 
+bool +as_partition_balance_is_init_resolved() +{ + return g_init_balance_done; +} + + +void +as_partition_balance_revert_to_orphan() +{ + g_init_balance_done = false; + g_allow_migrations = false; + + for (uint32_t ns_ix = 0; ns_ix < g_config.n_namespaces; ns_ix++) { + as_namespace* ns = g_config.namespaces[ns_ix]; + + client_replica_maps_clear(ns); + + for (uint32_t pid = 0; pid < AS_PARTITIONS; pid++) { + as_partition* p = &ns->partitions[pid]; + + pthread_mutex_lock(&p->lock); + + as_partition_freeze(p); + as_partition_isolate_version(ns, p); + + pthread_mutex_unlock(&p->lock); + } + + ns->n_unavailable_partitions = AS_PARTITIONS; + } + + cf_atomic32_incr(&g_partition_generation); +} + + +void +as_partition_balance() +{ + // Temporary paranoia. + static uint64_t last_cluster_key = 0; + + if (last_cluster_key == as_exchange_cluster_key()) { + cf_warning(AS_PARTITION, "as_partition_balance: cluster key %lx same as last time", + last_cluster_key); + return; + } + + last_cluster_key = as_exchange_cluster_key(); + // End - temporary paranoia. + + // These shortcuts must only be used within the scope of this function. + g_cluster_size = as_exchange_cluster_size(); + g_succession = as_exchange_succession_unsafe(); + + // Each partition separately shuffles the node succession list to generate + // its own node sequence. + fill_global_tables(); + + cf_queue mq; + + cf_queue_init(&mq, sizeof(pb_task), g_config.n_namespaces * AS_PARTITIONS, + false); + + for (uint32_t ns_ix = 0; ns_ix < g_config.n_namespaces; ns_ix++) { + balance_namespace(g_config.namespaces[ns_ix], &mq); + } + + prepare_for_appeals(); + + // All partitions now have replicas assigned, ok to allow transactions. 
+ g_init_balance_done = true; + cf_atomic32_incr(&g_partition_generation); + + g_allow_migrations = true; + cf_detail(AS_PARTITION, "allow migrations"); + + process_pb_tasks(&mq); + cf_queue_destroy(&mq); +} + + +uint64_t +as_partition_balance_remaining_migrations() +{ + uint64_t remaining_migrations = 0; + + for (uint32_t ns_ix = 0; ns_ix < g_config.n_namespaces; ns_ix++) { + as_namespace* ns = g_config.namespaces[ns_ix]; + + remaining_migrations += ns->migrate_tx_partitions_remaining; + remaining_migrations += ns->migrate_rx_partitions_remaining; + } + + return remaining_migrations; +} + + +//========================================================== +// Public API - migration-related as_partition methods. +// + +// Currently used only for enterprise build. +bool +as_partition_pending_migrations(as_partition* p) +{ + pthread_mutex_lock(&p->lock); + + bool pending = p->pending_immigrations + p->pending_emigrations > 0; + + pthread_mutex_unlock(&p->lock); + + return pending; +} + + +void +as_partition_emigrate_done(as_namespace* ns, uint32_t pid, + uint64_t orig_cluster_key, uint32_t tx_flags) +{ + as_partition* p = &ns->partitions[pid]; + + pthread_mutex_lock(&p->lock); + + if (! g_allow_migrations || orig_cluster_key != as_exchange_cluster_key()) { + cf_debug(AS_PARTITION, "{%s:%u} emigrate_done - cluster key mismatch", + ns->name, pid); + pthread_mutex_unlock(&p->lock); + return; + } + + if (p->pending_emigrations == 0) { + cf_warning(AS_PARTITION, "{%s:%u} emigrate_done - no pending emigrations", + ns->name, pid); + pthread_mutex_unlock(&p->lock); + return; + } + + p->pending_emigrations--; + + int64_t migrates_tx_remaining = + cf_atomic_int_decr(&ns->migrate_tx_partitions_remaining); + + if (migrates_tx_remaining < 0){ + cf_warning(AS_PARTITION, "{%s:%u} (%d,%ld) emigrate_done - counter went negative", + ns->name, pid, p->pending_emigrations, migrates_tx_remaining); + } + + if (! 
is_self_final_master(p)) { + emigrate_done_advance_non_master_version(ns, p, tx_flags); + } + + if (client_replica_maps_update(ns, pid)) { + cf_atomic32_incr(&g_partition_generation); + } + + cf_queue mq; + pb_task task; + int w_ix = -1; + + if (is_self_final_master(p) && + p->pending_emigrations == 0 && p->pending_immigrations == 0) { + cf_queue_init(&mq, sizeof(pb_task), p->n_witnesses, false); + + for (w_ix = 0; w_ix < (int)p->n_witnesses; w_ix++) { + pb_task_init(&task, p->witnesses[w_ix], ns, pid, orig_cluster_key, + PB_TASK_EMIG_SIGNAL_ALL_DONE, TX_FLAGS_NONE); + cf_queue_push(&mq, &task); + } + } + + pthread_mutex_unlock(&p->lock); + + if (w_ix >= 0) { + while (cf_queue_pop(&mq, &task, CF_QUEUE_NOWAIT) == CF_QUEUE_OK) { + as_migrate_emigrate(&task); + } + + cf_queue_destroy(&mq); + } +} + + +as_migrate_result +as_partition_immigrate_start(as_namespace* ns, uint32_t pid, + uint64_t orig_cluster_key, cf_node source_node) +{ + as_partition* p = &ns->partitions[pid]; + + pthread_mutex_lock(&p->lock); + + if (! g_allow_migrations || orig_cluster_key != as_exchange_cluster_key() || + immigrate_yield()) { + cf_debug(AS_PARTITION, "{%s:%u} immigrate_start - cluster key mismatch", + ns->name, pid); + pthread_mutex_unlock(&p->lock); + return AS_MIGRATE_AGAIN; + } + + uint32_t num_incoming = (uint32_t)cf_atomic32_incr(&g_migrate_num_incoming); + + if (num_incoming > g_config.migrate_max_num_incoming) { + cf_debug(AS_PARTITION, "{%s:%u} immigrate_start - exceeded max_num_incoming", + ns->name, pid); + cf_atomic32_decr(&g_migrate_num_incoming); + pthread_mutex_unlock(&p->lock); + return AS_MIGRATE_AGAIN; + } + + if (! partition_immigration_is_valid(p, source_node, ns, "start")) { + cf_atomic32_decr(&g_migrate_num_incoming); + pthread_mutex_unlock(&p->lock); + return AS_MIGRATE_FAIL; + } + + if (! 
is_self_final_master(p)) { + immigrate_start_advance_non_master_version(ns, p); + as_storage_info_set(ns, p, true); + } + + pthread_mutex_unlock(&p->lock); + + return AS_MIGRATE_OK; +} + + +as_migrate_result +as_partition_immigrate_done(as_namespace* ns, uint32_t pid, + uint64_t orig_cluster_key, cf_node source_node) +{ + as_partition* p = &ns->partitions[pid]; + + pthread_mutex_lock(&p->lock); + + if (! g_allow_migrations || orig_cluster_key != as_exchange_cluster_key()) { + cf_debug(AS_PARTITION, "{%s:%u} immigrate_done - cluster key mismatch", + ns->name, pid); + pthread_mutex_unlock(&p->lock); + return AS_MIGRATE_FAIL; + } + + cf_atomic32_decr(&g_migrate_num_incoming); + + if (! partition_immigration_is_valid(p, source_node, ns, "done")) { + pthread_mutex_unlock(&p->lock); + return AS_MIGRATE_FAIL; + } + + p->pending_immigrations--; + + int64_t migrates_rx_remaining = + cf_atomic_int_decr(&ns->migrate_rx_partitions_remaining); + + // Sanity-check only. + if (migrates_rx_remaining < 0) { + cf_warning(AS_PARTITION, "{%s:%u} (%d,%ld) immigrate_done - counter went negative", + ns->name, pid, p->pending_immigrations, migrates_rx_remaining); + } + + if (p->pending_immigrations == 0 && + ! as_partition_version_same(&p->version, &p->final_version)) { + p->version = p->final_version; + as_storage_info_set(ns, p, true); + } + + if (! is_self_final_master(p)) { + if (client_replica_maps_update(ns, pid)) { + cf_atomic32_incr(&g_partition_generation); + } + + pthread_mutex_unlock(&p->lock); + return AS_MIGRATE_OK; + } + + // Final master finished an immigration, adjust duplicates. 
+ + if (source_node == p->working_master) { + p->working_master = g_config.self_node; + + immigrate_done_advance_final_master_version(ns, p); + } + else { + p->n_dupl = remove_node(p->dupls, p->n_dupl, source_node); + } + + if (client_replica_maps_update(ns, pid)) { + cf_atomic32_incr(&g_partition_generation); + } + + if (p->pending_immigrations != 0) { + pthread_mutex_unlock(&p->lock); + return AS_MIGRATE_OK; + } + + // Final master finished all immigration. + + cf_queue mq; + pb_task task; + + if (p->pending_emigrations != 0) { + cf_queue_init(&mq, sizeof(pb_task), p->n_replicas - 1, false); + + for (uint32_t repl_ix = 1; repl_ix < p->n_replicas; repl_ix++) { + if (p->immigrators[repl_ix]) { + pb_task_init(&task, p->replicas[repl_ix], ns, pid, + orig_cluster_key, PB_TASK_EMIG_TRANSFER, TX_FLAGS_NONE); + cf_queue_push(&mq, &task); + } + } + } + else { + cf_queue_init(&mq, sizeof(pb_task), p->n_witnesses, false); + + for (uint32_t w_ix = 0; w_ix < p->n_witnesses; w_ix++) { + pb_task_init(&task, p->witnesses[w_ix], ns, pid, orig_cluster_key, + PB_TASK_EMIG_SIGNAL_ALL_DONE, TX_FLAGS_NONE); + cf_queue_push(&mq, &task); + } + } + + pthread_mutex_unlock(&p->lock); + + while (cf_queue_pop(&mq, &task, 0) == CF_QUEUE_OK) { + as_migrate_emigrate(&task); + } + + cf_queue_destroy(&mq); + + return AS_MIGRATE_OK; +} + + +as_migrate_result +as_partition_migrations_all_done(as_namespace* ns, uint32_t pid, + uint64_t orig_cluster_key) +{ + as_partition* p = &ns->partitions[pid]; + + pthread_mutex_lock(&p->lock); + + if (! g_allow_migrations || orig_cluster_key != as_exchange_cluster_key()) { + cf_debug(AS_PARTITION, "{%s:%u} all_done - cluster key mismatch", + ns->name, pid); + pthread_mutex_unlock(&p->lock); + return AS_MIGRATE_FAIL; + } + + if (p->pending_emigrations != 0) { + cf_debug(AS_PARTITION, "{%s:%u} all_done - eagain", + ns->name, pid); + pthread_mutex_unlock(&p->lock); + return AS_MIGRATE_AGAIN; + } + + // Not a replica - drop partition. + if (! 
is_self_replica(p)) { + p->version = ZERO_VERSION; + as_storage_info_set(ns, p, true); + drop_trees(p, ns); + } + + pthread_mutex_unlock(&p->lock); + + return AS_MIGRATE_OK; +} + + +//========================================================== +// Local helpers - generic. +// + +void +pb_task_init(pb_task* task, cf_node dest, as_namespace* ns, + uint32_t pid, uint64_t cluster_key, pb_task_type type, + uint32_t tx_flags) +{ + task->dest = dest; + task->ns = ns; + task->pid = pid; + task->type = type; + task->tx_flags = tx_flags; + task->cluster_key = cluster_key; +} + + +void +drop_trees(as_partition* p, as_namespace* ns) +{ + as_index_tree* temp = p->vp; + + p->vp = as_index_tree_create(&ns->tree_shared, ns->arena); + as_index_tree_release(temp); + + // TODO - consider p->n_tombstones? + cf_atomic64_set(&p->max_void_time, 0); +} + + +//========================================================== +// Local helpers - balance partitions. +// + +// fill_global_tables() +// +// Succession list - all nodes in cluster +// +---------------+ +// | A | B | C | D | +// +---------------+ +// +// Succession list index (sl_ix) - used as version table and rack-id index +// +---------------+ +// | 0 | 1 | 2 | 3 | +// +---------------+ +// +// Every partition shuffles the succession list independently, e.g. 
for pid 0: +// Hash the node names with the pid: +// H(A,0) = Y, H(B,0) = X, H(C,0) = W, H(D,0) = Z +// Store sl_ix in last byte of hash results so it doesn't affect sort: +// +-----------------------+ +// | Y_0 | X_1 | W_2 | Z_3 | +// +-----------------------+ +// This sorts to: +// +-----------------------+ +// | W_2 | X_1 | Y_0 | Z_3 | +// +-----------------------+ +// Replace original node names, and keep sl_ix order, resulting in: +// +---------------+ +---------------+ +// | C | B | A | D | | 2 | 1 | 0 | 3 | +// +---------------+ +---------------+ +// +// Node sequence table Succession list index table +// pid pid +// +===+---------------+ +===+---------------+ +// | 0 | C | B | A | D | | 0 | 2 | 1 | 0 | 3 | +// +===+---------------+ +===+---------------+ +// | 1 | A | D | C | B | | 1 | 0 | 3 | 2 | 1 | +// +===+---------------+ +===+---------------+ +// | 2 | D | C | B | A | | 2 | 3 | 2 | 1 | 0 | +// +===+---------------+ +===+---------------+ +// | 3 | B | A | D | C | | 3 | 1 | 0 | 3 | 2 | +// +===+---------------+ +===+---------------+ +// | 4 | D | B | C | A | | 4 | 3 | 1 | 2 | 0 | +// +===+---------------+ +===+---------------+ +// ... to pid 4095. +// +// We keep the succession list index table so we can refer back to namespaces' +// partition version tables and rack-id lists, where nodes are in the original +// succession list order. +void +fill_global_tables() +{ + uint64_t hashed_nodes[g_cluster_size]; + + for (uint32_t n = 0; n < g_cluster_size; n++) { + hashed_nodes[n] = cf_hash_fnv64((const uint8_t*)&g_succession[n], + sizeof(cf_node)); + } + + // Build the node sequence table. + for (uint32_t pid = 0; pid < AS_PARTITIONS; pid++) { + inter_hash h; + + h.hashed_pid = g_hashed_pids[pid]; + + for (uint32_t n = 0; n < g_cluster_size; n++) { + h.hashed_node = hashed_nodes[n]; + + cf_node* node_p = &FULL_NODE_SEQ(pid, n); + + *node_p = cf_hash_jen64((const uint8_t*)&h, sizeof(h)); + + // Overlay index onto last byte. 
+ *node_p &= AS_CLUSTER_SZ_MASKP; + *node_p += n; + } + + // Sort the hashed node values. + qsort(&FULL_NODE_SEQ(pid, 0), g_cluster_size, sizeof(cf_node), + cf_node_compare_desc); + + // Overwrite the sorted hash values with the original node IDs. + for (uint32_t n = 0; n < g_cluster_size; n++) { + cf_node* node_p = &FULL_NODE_SEQ(pid, n); + sl_ix_t sl_ix = (sl_ix_t)(*node_p & AS_CLUSTER_SZ_MASKN); + + *node_p = g_succession[sl_ix]; + + // Saved to refer back to partition version table and rack-id list. + FULL_SL_IX(pid, n) = sl_ix; + } + } +} + + +void +balance_namespace_ap(as_namespace* ns, cf_queue* mq) +{ + bool ns_less_than_global = ns->cluster_size != g_cluster_size; + + if (ns_less_than_global) { + cf_info(AS_PARTITION, "{%s} is on %u of %u nodes", ns->name, + ns->cluster_size, g_cluster_size); + } + + // Figure out effective replication factor in the face of node failures. + apply_single_replica_limit_ap(ns); + + uint32_t n_racks = rack_count(ns); + + // If a namespace is not on all nodes or is rack aware, it can't use the + // global node sequence and index tables. + bool ns_not_equal_global = ns_less_than_global || n_racks != 1; + + // The translation array is used to convert global table rows to namespace + // rows, if necessary. + int translation[ns_less_than_global ? g_cluster_size : 0]; + + if (ns_less_than_global) { + fill_translation(translation, ns); + } + + uint32_t ns_pending_immigrations = 0; + uint32_t ns_pending_emigrations = 0; + uint32_t ns_pending_signals = 0; + + uint32_t ns_fresh_partitions = 0; + + for (uint32_t pid = 0; pid < AS_PARTITIONS; pid++) { + as_partition* p = &ns->partitions[pid]; + + cf_node* full_node_seq = &FULL_NODE_SEQ(pid, 0); + sl_ix_t* full_sl_ix = &FULL_SL_IX(pid, 0); + + // Usually a namespace can simply use the global tables... + cf_node* ns_node_seq = full_node_seq; + sl_ix_t* ns_sl_ix = full_sl_ix; + + cf_node stack_node_seq[ns_not_equal_global ? ns->cluster_size : 0]; + sl_ix_t stack_sl_ix[ns_not_equal_global ? 
ns->cluster_size : 0]; + + // ... but sometimes a namespace is different. + if (ns_not_equal_global) { + ns_node_seq = stack_node_seq; + ns_sl_ix = stack_sl_ix; + + fill_namespace_rows(full_node_seq, full_sl_ix, ns_node_seq, + ns_sl_ix, ns, translation); + + if (n_racks != 1) { + rack_aware_adjust_row(ns_node_seq, ns_sl_ix, + ns->replication_factor, ns->rack_ids, ns->cluster_size, + n_racks, 1); + } + } + + pthread_mutex_lock(&p->lock); + + p->n_replicas = ns->replication_factor; + memset(p->replicas, 0, sizeof(p->replicas)); + memcpy(p->replicas, ns_node_seq, p->n_replicas * sizeof(cf_node)); + + p->pending_emigrations = 0; + p->pending_immigrations = 0; + memset(p->immigrators, 0, sizeof(p->immigrators)); + + p->working_master = (cf_node)0; + + p->n_dupl = 0; + memset(p->dupls, 0, sizeof(p->dupls)); + + p->n_witnesses = 0; + memset(p->witnesses, 0, sizeof(p->witnesses)); + + uint32_t self_n = find_self(ns_node_seq, ns); + + as_partition_version final_version = { + .ckey = as_exchange_cluster_key() + }; + + p->final_version = final_version; + p->final_version.master = self_n == 0 ? 1 : 0; + + int working_master_n = find_working_master_ap(p, ns_sl_ix, ns); + + uint32_t n_dupl = 0; + cf_node dupls[ns->cluster_size]; + + memset(dupls, 0, sizeof(dupls)); + + // TEMPORARY debugging. + uint32_t debug_n_immigrators = 0; + as_partition_version debug_orig = ZERO_VERSION; + + if (working_master_n == -1) { + // No existing versions - assign fresh version to replicas. + working_master_n = 0; + + if (self_n < p->n_replicas) { + p->version = p->final_version; + } + + ns_fresh_partitions++; + } + else { + n_dupl = find_duplicates_ap(p, ns_node_seq, ns_sl_ix, ns, + (uint32_t)working_master_n, dupls); + + uint32_t n_immigrators = fill_immigrators(p, ns_sl_ix, ns, + (uint32_t)working_master_n, n_dupl); + + // TEMPORARY debugging. 
+ debug_n_immigrators = n_immigrators; + debug_orig = p->version; + + if (n_immigrators != 0) { + // Migrations required - advance versions for next rebalance, + // queue migrations for this rebalance. + + advance_version_ap(p, ns_sl_ix, ns, self_n, + (uint32_t)working_master_n, n_dupl, dupls); + + queue_namespace_migrations(p, ns, self_n, + ns_node_seq[working_master_n], n_dupl, dupls, mq); + + if (self_n == 0) { + fill_witnesses(p, ns_node_seq, ns_sl_ix, ns); + ns_pending_signals += p->n_witnesses; + } + } + else if (self_n < p->n_replicas) { + // No migrations required - refresh replicas' versions (only + // truly necessary if replication factor decreased). + p->version = p->final_version; + } + else { + // No migrations required - drop superfluous non-replica + // partitions immediately. + p->version = ZERO_VERSION; + as_storage_info_set(ns, p, false); + drop_trees(p, ns); + } + } + + if (self_n == 0 || self_n == working_master_n) { + p->working_master = ns_node_seq[working_master_n]; + } + + if (! as_partition_version_is_null(&p->version)) { + as_storage_info_set(ns, p, false); + } + + ns_pending_immigrations += (uint32_t)p->pending_immigrations; + ns_pending_emigrations += (uint32_t)p->pending_emigrations; + + // TEMPORARY debugging. + if (pid < 20) { + cf_debug(AS_PARTITION, "ck%012lX %02u (%d %d) %s -> %s - self_n %u wm_n %d repls %u dupls %u immigrators %u", + as_exchange_cluster_key(), pid, p->pending_emigrations, + p->pending_immigrations, VERSION_AS_STRING(&debug_orig), + VERSION_AS_STRING(&p->version), self_n, working_master_n, + p->n_replicas, n_dupl, debug_n_immigrators); + } + + client_replica_maps_update(ns, pid); + + pthread_mutex_unlock(&p->lock); + } + + // Commit partition versions to device. + // TODO - always flush each partition's version on storage format change. 
+ as_storage_info_flush(ns); + + cf_info(AS_PARTITION, "{%s} rebalanced: expected-migrations (%u,%u) expected-signals %u fresh-partitions %u", + ns->name, ns_pending_emigrations, ns_pending_immigrations, + ns_pending_signals, ns_fresh_partitions); + + ns->n_unavailable_partitions = 0; + + ns->migrate_tx_partitions_initial = ns_pending_emigrations; + ns->migrate_tx_partitions_remaining = ns_pending_emigrations; + + ns->migrate_rx_partitions_initial = ns_pending_immigrations; + ns->migrate_rx_partitions_remaining = ns_pending_immigrations; + + ns->migrate_signals_remaining = ns_pending_signals; +} + + +void +apply_single_replica_limit_ap(as_namespace* ns) +{ + // Sets ns->replication_factor and logs it. Replication factor can't be bigger than observed cluster. + uint32_t repl_factor = ns->cluster_size < ns->cfg_replication_factor ? + ns->cluster_size : ns->cfg_replication_factor; + + // Reduce the replication factor to 1 if the cluster size is less than or + // equal to the specified limit. + ns->replication_factor = + ns->cluster_size <= g_config.paxos_single_replica_limit ? + 1 : repl_factor; + + cf_info(AS_PARTITION, "{%s} replication factor is %u", ns->name, + ns->replication_factor); +} + + +uint32_t +rack_count(const as_namespace* ns) +{ // Count the distinct rack ids among the namespace's nodes. + uint32_t ids[ns->cluster_size]; // stack copy, sorted below so that equal ids are adjacent + + memcpy(ids, ns->rack_ids, sizeof(ids)); + qsort(ids, ns->cluster_size, sizeof(uint32_t), cf_compare_uint32_desc); + + if (ids[0] == ids[ns->cluster_size - 1]) { // NOTE(review): reads ids[0] - presumably cluster_size is never 0 here; confirm + return 1; // common path - not rack-aware + } + + uint32_t n_racks = 1; + uint32_t cur_id = ids[0]; + + for (uint32_t i = 1; i < ns->cluster_size; i++) { + if (ids[i] != cur_id) { // each change of value in the sorted copy is a new rack + cur_id = ids[i]; + n_racks++; + } + } + + return n_racks; +} + + +void +fill_translation(int translation[], const as_namespace* ns) +{ // translation[i] becomes the namespace succession index of global node i, or -1 if the namespace is not on that node. + int ns_n = 0; // next namespace succession index to match + + for (uint32_t full_n = 0; full_n < g_cluster_size; full_n++) { + translation[full_n] = ns_n < ns->cluster_size && + g_succession[full_n] == ns->succession[ns_n] ?
ns_n++ : -1; + } +} + + +void +fill_namespace_rows(const cf_node* full_node_seq, const sl_ix_t* full_sl_ix, + cf_node* ns_node_seq, sl_ix_t* ns_sl_ix, const as_namespace* ns, + const int translation[]) +{ + if (ns->cluster_size == g_cluster_size) { + // Rack-aware but namespace is on all nodes - just copy. Rack-aware will + // rearrange the copies - we can't rearrange the global originals. + memcpy(ns_node_seq, full_node_seq, g_cluster_size * sizeof(cf_node)); + memcpy(ns_sl_ix, full_sl_ix, g_cluster_size * sizeof(sl_ix_t)); + + return; + } + + // Fill namespace sequences from global table rows using translation array. + uint32_t n = 0; // next slot to fill in the namespace rows + + for (uint32_t full_n = 0; full_n < g_cluster_size; full_n++) { + int ns_n = translation[full_sl_ix[full_n]]; + + if (ns_n != -1) { // -1 marks a global node the namespace is not on - skip it + ns_node_seq[n] = ns->succession[ns_n]; + ns_sl_ix[n] = (sl_ix_t)ns_n; + n++; + } + } +} + + +uint32_t +find_self(const cf_node* ns_node_seq, const as_namespace* ns) +{ // Return this node's index in ns_node_seq - asserts that g_config.self_node is present. + int n = index_of_node(ns_node_seq, ns->cluster_size, g_config.self_node); + + cf_assert(n != -1, AS_PARTITION, "{%s} self node not in succession list", + ns->name); + + return (uint32_t)n; +} + + +// Preference: Vm > V > Ve > Vs > Vse > absent. Returns the chosen node's index in the namespace node sequence, or -1 if no version has data. +int +find_working_master_ap(const as_partition* p, const sl_ix_t* ns_sl_ix, + const as_namespace* ns) +{ + int best_n = -1; + int best_score = -1; + + for (int n = 0; n < (int)ns->cluster_size; n++) { + const as_partition_version* version = INPUT_VERSION(n); + + // Skip versions with no data. + if (! as_partition_version_has_data(version)) { + continue; + } + + // If previous working master exists, use it. (There can be more than + // one after split brains. Also, the flag is only to prevent superfluous + // master swaps on rebalance when rack-aware.) + if (version->master == 1) { + return n; + } + // else - keep going but remember the best so far. + + // V = 3 > Ve = 2 > Vs = 1 > Vse = 0. On equal scores the earliest node in the sequence wins. + int score = (version->evade == 1 ? 0 : 1) + + (version->subset == 1 ?
0 : 2); + + if (score > best_score) { + best_score = score; + best_n = n; + } + } + + return best_n; +} + + +uint32_t +find_duplicates_ap(const as_partition* p, const cf_node* ns_node_seq, + const sl_ix_t* ns_sl_ix, const as_namespace* ns, + uint32_t working_master_n, cf_node dupls[]) +{ /* Fill dupls[] with one node per distinct version (unique versions, one parent per family, subsets with no collected parent), then remove the working master and return the remaining count. */ + uint32_t n_dupl = 0; + as_partition_version parent_dupl_versions[ns->cluster_size]; /* parallel to dupls[] - stays zeroed for entries that are not parents */ + + memset(parent_dupl_versions, 0, sizeof(parent_dupl_versions)); + + for (uint32_t n = 0; n < ns->cluster_size; n++) { + const as_partition_version* version = INPUT_VERSION(n); + + // Skip versions without data, and postpone subsets to next pass. + if (! as_partition_version_has_data(version) || version->subset == 1) { + continue; + } + + // Every unique version is a duplicate. + if (version->family == VERSION_FAMILY_UNIQUE) { + dupls[n_dupl++] = ns_node_seq[n]; + continue; + } + + // Add parent versions as duplicates, unless they are already in. + + uint32_t d; + + for (d = 0; d < n_dupl; d++) { + if (is_family_same(&parent_dupl_versions[d], version)) { + break; + } + } + + if (d == n_dupl) { + // Not in dupls. + parent_dupl_versions[n_dupl] = *version; + dupls[n_dupl++] = ns_node_seq[n]; + } + } + + // Second pass to deal with subsets. NOTE(review): unlike the first pass this loop does not check as_partition_version_has_data() - presumably a set subset flag implies data; confirm. + for (uint32_t n = 0; n < ns->cluster_size; n++) { + const as_partition_version* version = INPUT_VERSION(n); + + if (version->subset == 0) { + continue; + } + + uint32_t d; + + for (d = 0; d < n_dupl; d++) { + if (is_family_same(&parent_dupl_versions[d], version)) { + break; + } + } + + if (d == n_dupl) { + // Not in dupls. + // Leave 0 in parent_dupl_versions array. + dupls[n_dupl++] = ns_node_seq[n]; + } + } + + // Remove working master from 'variants' to leave duplicates.
+ return remove_node(dupls, n_dupl, ns_node_seq[working_master_n]); +} + + +uint32_t +fill_immigrators(as_partition* p, const sl_ix_t* ns_sl_ix, as_namespace* ns, + uint32_t working_master_n, uint32_t n_dupl) +{ /* Flag in p->immigrators[] the replicas that must receive data, and return how many were flagged. */ + uint32_t n_immigrators = 0; + + for (uint32_t repl_ix = 0; repl_ix < p->n_replicas; repl_ix++) { + const as_partition_version* version = INPUT_VERSION(repl_ix); + + if (n_dupl != 0 || (repl_ix != working_master_n && /* with duplicates, every replica immigrates */ + (! as_partition_version_has_data(version) || + version->subset == 1))) { + p->immigrators[repl_ix] = true; + n_immigrators++; + } + } + + return n_immigrators; +} + + +void +advance_version_ap(as_partition* p, const sl_ix_t* ns_sl_ix, as_namespace* ns, + uint32_t self_n, uint32_t working_master_n, uint32_t n_dupl, + const cf_node dupls[]) +{ /* Advance this node's p->version for a rebalance that requires migrations - the branch taken depends on this node's role. */ + // Advance working master. + if (self_n == working_master_n) { + p->version.ckey = p->final_version.ckey; + p->version.family = (self_n == 0 || n_dupl == 0) ? 0 : 1; + p->version.master = 1; + p->version.subset = 0; + p->version.evade = 0; + + return; + } + + p->version.master = 0; + + bool self_is_versionless = ! as_partition_version_has_data(&p->version); + + // Advance eventual master. + if (self_n == 0) { + bool was_subset = p->version.subset == 1; + + p->version.ckey = p->final_version.ckey; + p->version.family = 0; + p->version.subset = n_dupl == 0 ? 1 : 0; + + if (self_is_versionless || (was_subset && p->version.subset == 0)) { + p->version.evade = 1; + } + // else - don't change evade flag. + + return; + } + + // Advance version-less proles and non-replicas (common case). + if (self_is_versionless) { + if (self_n < p->n_replicas) { + p->version.ckey = p->final_version.ckey; + p->version.family = 0; + p->version.subset = 1; + p->version.evade = 1; + } + // else - non-replicas remain version-less. + + return; + } + + // Fill family versions.
+ + uint32_t max_n_families = p->n_replicas + 1; + + if (max_n_families > AS_PARTITION_N_FAMILIES) { + max_n_families = AS_PARTITION_N_FAMILIES; + } + + as_partition_version family_versions[max_n_families]; + uint32_t n_families = fill_family_versions(p, ns_sl_ix, ns, + working_master_n, n_dupl, dupls, family_versions); + + uint32_t family = find_family(&p->version, n_families, family_versions); + + // Advance non-masters with prior versions ... + + // ... proles ... + if (self_n < p->n_replicas) { + p->version.ckey = p->final_version.ckey; + p->version.family = family; + + if (n_dupl != 0 && p->version.family == 0) { + p->version.subset = 1; + } + // else - don't change either subset or evade flag. + + return; + } + + // ... or non-replicas. + if (family != VERSION_FAMILY_UNIQUE && + family_versions[family].subset == 0) { + p->version.ckey = p->final_version.ckey; + p->version.family = family; + p->version.subset = 1; + } + // else - leave version as-is. +} + + +uint32_t +fill_family_versions(const as_partition* p, const sl_ix_t* ns_sl_ix, + const as_namespace* ns, uint32_t working_master_n, uint32_t n_dupl, + const cf_node dupls[], as_partition_version family_versions[]) +{ // Fill family_versions[] (index = family number) and return the number of entries filled. + uint32_t n_families = 1; + const as_partition_version* final_master_version = INPUT_VERSION(0); + + family_versions[0] = *final_master_version; + + if (working_master_n != 0) { // working master is not node 0 - its version takes slot 0, or slot 1 if there are duplicates + const as_partition_version* working_master_version = + INPUT_VERSION(working_master_n); + + if (n_dupl == 0) { + family_versions[0] = *working_master_version; + } + else { + family_versions[0] = p->final_version; // not matchable + family_versions[1] = *working_master_version; + n_families = 2; + } + } + + for (uint32_t repl_ix = 1; + repl_ix < p->n_replicas && n_families < AS_PARTITION_N_FAMILIES; + repl_ix++) { + if (repl_ix == working_master_n) { + continue; + } + + const as_partition_version* version = INPUT_VERSION(repl_ix); + + if (contains_node(dupls, n_dupl, p->replicas[repl_ix])) {
family_versions[n_families++] = *version; + } + else if (version->subset == 1 && + ! has_replica_parent(p, ns_sl_ix, ns, version, repl_ix)) { + family_versions[n_families++] = *version; + } + } + + return n_families; +} + + +bool +has_replica_parent(const as_partition* p, const sl_ix_t* ns_sl_ix, + const as_namespace* ns, const as_partition_version* subset_version, + uint32_t subset_n) +{ /* True if a replica other than replica 0 and subset_n holds a non-subset version in the same family as subset_version. */ + for (uint32_t repl_ix = 1; repl_ix < p->n_replicas; repl_ix++) { + if (repl_ix == subset_n) { + continue; + } + + const as_partition_version* version = INPUT_VERSION(repl_ix); + + if (version->subset == 0 && is_family_same(version, subset_version)) { + return true; + } + } + + return false; +} + + +uint32_t +find_family(const as_partition_version* self_version, uint32_t n_families, + const as_partition_version family_versions[]) +{ /* Index of the first family_versions[] entry in the same family as self_version, or VERSION_FAMILY_UNIQUE if none matches. */ + for (uint32_t n = 0; n < n_families; n++) { + if (is_family_same(self_version, &family_versions[n])) { + return n; + } + } + + return VERSION_FAMILY_UNIQUE; +} + + +void +queue_namespace_migrations(as_partition* p, as_namespace* ns, uint32_t self_n, + cf_node working_master, uint32_t n_dupl, cf_node dupls[], cf_queue* mq) +{ /* Set p's pending immigration and emigration counts (and its dupls on a master) for this rebalance, pushing onto mq the emigration tasks that can start right away. */ + pb_task task; + + if (self_n == 0) { + // <><><><><><> Final Master <><><><><><> + + if (g_config.self_node == working_master) { + p->pending_immigrations = (int)n_dupl; + } + else { + // Remove self from duplicates. The extra 1 below is the immigration from the working master. + n_dupl = remove_node(dupls, n_dupl, g_config.self_node); + + p->pending_immigrations = (int)n_dupl + 1; + } + + if (n_dupl != 0) { + p->n_dupl = n_dupl; + memcpy(p->dupls, dupls, n_dupl * sizeof(cf_node)); + } + + if (p->pending_immigrations != 0) { + for (uint32_t repl_ix = 1; repl_ix < p->n_replicas; repl_ix++) { + if (p->immigrators[repl_ix]) { + p->pending_emigrations++; + } + } + + // Emigrate later, after all immigration is complete. + return; + } + + // Emigrate now, no immigrations to wait for.
+ for (uint32_t repl_ix = 1; repl_ix < p->n_replicas; repl_ix++) { + if (p->immigrators[repl_ix]) { + p->pending_emigrations++; + pb_task_init(&task, p->replicas[repl_ix], ns, p->id, + as_exchange_cluster_key(), PB_TASK_EMIG_TRANSFER, + TX_FLAGS_NONE); + cf_queue_push(mq, &task); + } + } + + return; + } + // else - <><><><><><> Not Final Master <><><><><><> + + if (g_config.self_node == working_master) { + if (n_dupl != 0) { + p->n_dupl = n_dupl; + memcpy(p->dupls, dupls, n_dupl * sizeof(cf_node)); + } + + p->pending_emigrations = 1; + pb_task_init(&task, p->replicas[0], ns, p->id, + as_exchange_cluster_key(), PB_TASK_EMIG_TRANSFER, + TX_FLAGS_ACTING_MASTER); + cf_queue_push(mq, &task); + } + else if (contains_self(dupls, n_dupl)) { + p->pending_emigrations = 1; + pb_task_init(&task, p->replicas[0], ns, p->id, + as_exchange_cluster_key(), PB_TASK_EMIG_TRANSFER, + TX_FLAGS_NONE); + cf_queue_push(mq, &task); + } + + if (self_n < p->n_replicas && p->immigrators[self_n]) { + p->pending_immigrations = 1; + } +} + + +void +fill_witnesses(as_partition* p, const cf_node* ns_node_seq, + const sl_ix_t* ns_sl_ix, as_namespace* ns) +{ // Append to p->witnesses every node after node 0 that is a replica or holds a non-null version. + for (uint32_t n = 1; n < ns->cluster_size; n++) { + const as_partition_version* version = INPUT_VERSION(n); + + // Note - 0e versions (CP) are witnesses. + if (n < p->n_replicas || ! as_partition_version_is_null(version)) { + p->witnesses[p->n_witnesses++] = ns_node_seq[n]; + } + } +} + + +//========================================================== +// Local helpers - migration-related as_partition methods. +// + +// Sanity checks for immigration commands - returns false, after logging a warning, if source_node is not a node this partition expects to immigrate from. +bool +partition_immigration_is_valid(const as_partition* p, cf_node source_node, + const as_namespace* ns, const char* tag) +{ + const char* failure_reason = NULL; // only ever points at string literals, hence const + + if (p->pending_immigrations == 0) { + failure_reason = "no immigrations expected"; + } + else if (is_self_final_master(p)) { + if (source_node != p->working_master && + !
contains_node(p->dupls, p->n_dupl, source_node)) { + failure_reason = "final master's source not acting master or duplicate"; + } + } + else if (source_node != p->replicas[0]) { + failure_reason = "prole's source not final working master"; + } + + if (failure_reason) { + cf_warning(AS_PARTITION, "{%s:%u} immigrate_%s - source %lx working-master %lx pending-immigrations %d - %s", + ns->name, p->id, tag, source_node, p->working_master, + p->pending_immigrations, failure_reason); + + return false; + } + + return true; +} + + +void +emigrate_done_advance_non_master_version_ap(as_namespace* ns, as_partition* p, + uint32_t tx_flags) +{ /* Advance a non-master's version once its emigration is done, then call as_storage_info_set(). */ + if ((tx_flags & TX_FLAGS_ACTING_MASTER) != 0) { /* this node emigrated as acting master - clear that state */ + p->working_master = (cf_node)0; + p->n_dupl = 0; + p->version.master = 0; + } + + p->version.ckey = p->final_version.ckey; + p->version.family = 0; + + if (p->pending_immigrations != 0 || ! is_self_replica(p)) { + p->version.subset = 1; + } + // else - must already be a parent. + + as_storage_info_set(ns, p, true); +} + + +void +immigrate_start_advance_non_master_version_ap(as_partition* p) +{ + // Become subset of final version if not already such. + if (! (p->version.ckey == p->final_version.ckey && + p->version.family == 0 && p->version.subset == 1)) { + p->version.ckey = p->final_version.ckey; + p->version.family = 0; + p->version.master = 0; // racing emigrate done if we were acting master + p->version.subset = 1; + // Leave evade flag as-is. + } +} + + +void +immigrate_done_advance_final_master_version_ap(as_namespace* ns, + as_partition* p) +{ // Adopt the final version; as_storage_info_set() is called only if the version actually changes. + if (! as_partition_version_same(&p->version, &p->final_version)) { + p->version = p->final_version; + as_storage_info_set(ns, p, true); + } +} diff --git a/as/src/fabric/partition_balance_ce.c b/as/src/fabric/partition_balance_ce.c new file mode 100644 index 00000000..0f59de87 --- /dev/null +++ b/as/src/fabric/partition_balance_ce.c @@ -0,0 +1,126 @@ +/* + * partition_balance_ce.c + * + * Copyright (C) 2017-2018 Aerospike, Inc.
+ * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +//========================================================== +// Includes. NOTE(review): the two bare #include lines below have lost their header names in this copy - presumably stdbool.h and stdint.h, since bool, uint32_t and uint64_t are used; restore from the repository. +// + +#include "fabric/partition_balance.h" + +#include +#include + +#include "citrusleaf/cf_queue.h" + +#include "node.h" + +#include "base/datamodel.h" +#include "fabric/partition.h" +#include "fabric/migrate.h" + + +//========================================================== +// Public API. +// + +void +as_partition_balance_emigration_yield() +{ // no-op in the CE build +} + +bool +as_partition_balance_revive(as_namespace* ns) +{ + cf_warning(AS_PARTITION, "revive is an enterprise feature"); + return true; // NOTE(review): reports success although nothing is revived, whereas as_roster_set_nodes_cmd() returns false for its enterprise-only feature - confirm this is intended +} + +bool +as_partition_pre_emigrate_done(as_namespace* ns, uint32_t pid, + uint64_t orig_cluster_key, uint32_t tx_flags) +{ + return true; // always true in the CE build - the arguments are not used +} + + +//========================================================== +// Private API - for enterprise separation only.
+// + +void +partition_balance_init() +{ // nothing to initialize in the CE build +} + +void +balance_namespace(as_namespace* ns, cf_queue* mq) +{ // CE build - delegate directly to the _ap implementation + balance_namespace_ap(ns, mq); +} + +void +prepare_for_appeals() +{ // no-op in the CE build +} + +void +process_pb_tasks(cf_queue* tq) +{ // Drain tq without blocking, passing each queued task to as_migrate_emigrate(). + pb_task task; + + while (cf_queue_pop(tq, &task, CF_QUEUE_NOWAIT) == CF_QUEUE_OK) { + as_migrate_emigrate(&task); + } +} + +void +rack_aware_adjust_row(cf_node* ns_node_seq, sl_ix_t* ns_sl_ix, + uint32_t replication_factor, const uint32_t* rack_ids, uint32_t n_ids, + uint32_t n_racks, uint32_t start_n) +{ // Crashes if called. NOTE(review): balance_namespace_ap() calls this whenever n_racks != 1, so presumably a CE node never sees more than one distinct rack id - confirm. + cf_crash(AS_PARTITION, "CE code called rack_aware_adjust_row()"); +} + +void +emigrate_done_advance_non_master_version(as_namespace* ns, as_partition* p, + uint32_t tx_flags) +{ // CE build - delegate directly to the _ap implementation + emigrate_done_advance_non_master_version_ap(ns, p, tx_flags); +} + +void +immigrate_start_advance_non_master_version(as_namespace* ns, as_partition* p) +{ // CE build - delegate to the _ap implementation, which does not take ns + immigrate_start_advance_non_master_version_ap(p); +} + +void +immigrate_done_advance_final_master_version(as_namespace* ns, as_partition* p) +{ // CE build - delegate directly to the _ap implementation + immigrate_done_advance_final_master_version_ap(ns, p); +} + +bool +immigrate_yield() +{ + return false; // always false in the CE build +} diff --git a/as/src/fabric/partition_ce.c b/as/src/fabric/partition_ce.c new file mode 100644 index 00000000..86520f73 --- /dev/null +++ b/as/src/fabric/partition_ce.c @@ -0,0 +1,67 @@ +/* + * partition_ce.c + * + * Copyright (C) 2017 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE.
See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +//========================================================== +// Includes. NOTE(review): the bare #include line below has lost its header name in this copy - presumably stdbool.h, since bool is used; restore from the repository. +// + +#include "fabric/partition.h" + +#include + +#include "node.h" + +#include "base/datamodel.h" +#include "base/proto.h" + + +//========================================================== +// Public API. +// + +void +as_partition_isolate_version(const as_namespace* ns, as_partition* p) +{ // If the partition's version has data, clear its master flag and mark it a subset. ns is not used. + if (as_partition_version_has_data(&p->version)) { + p->version.master = 0; + p->version.subset = 1; + } +} + +int +as_partition_check_source(const as_namespace* ns, as_partition* p, cf_node src, + bool* from_replica) +{ + return AS_PROTO_RESULT_OK; // always OK in this build - no argument is read and *from_replica is not written +} + + +//========================================================== +// Private API - for enterprise separation only. +// + +bool +partition_reserve_promote(const as_namespace* ns, const as_partition* p, + bool would_dup_res) +{ // True only when the partition has duplicates and would_dup_res is set. ns is not used. + return p->n_dupl != 0 && would_dup_res; +} diff --git a/as/src/fabric/roster_ce.c b/as/src/fabric/roster_ce.c new file mode 100644 index 00000000..f86be963 --- /dev/null +++ b/as/src/fabric/roster_ce.c @@ -0,0 +1,50 @@ +/* + * roster_ce.c + * + * Copyright (C) 2017 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details.
+ * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +//========================================================== +// Includes. NOTE(review): the bare #include line below has lost its header name in this copy - presumably stdbool.h, since bool is used; restore from the repository. +// + +#include "fabric/roster.h" + +#include + +#include "fault.h" + + +//========================================================== +// Public API. +// + +void +as_roster_init_smd() +{ + // CE code doesn't invoke the roster SMD module. + // TODO - how to handle with future static SMD module initialization? +} + +bool +as_roster_set_nodes_cmd(const char* ns_name, const char* nodes) +{ + cf_warning(AS_ROSTER, "roster is an enterprise feature"); + return false; // not supported in this build - ns_name and nodes are not used +} diff --git a/as/src/fabric/skew_monitor.c b/as/src/fabric/skew_monitor.c new file mode 100644 index 00000000..44ea0339 --- /dev/null +++ b/as/src/fabric/skew_monitor.c @@ -0,0 +1,611 @@ +/* + * skew_monitor.c + * + * Copyright (C) 2012-2017 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program.
If not, see http://www.gnu.org/licenses/ + */ + +#include "fabric/skew_monitor.h" + +#include +#include +#include +#include +#include + +#include "citrusleaf/alloc.h" + +#include "msg.h" + +#include "base/cfg.h" +#include "base/datamodel.h" +#include "fabric/clustering.h" +#include "fabric/exchange.h" +#include "fabric/hb.h" + +/* + * Overview + * ======== + * Monitors skew across nodes in a cluster to allow other modules to handle skew + * beyond tolerances. For example CP namespaces block transactions on skew beyond + * tolerable limits. + * + * Principle of skew monitoring + * ============================ + * The hlc clock forms a pretty close upper bound on the physical clocks for + * adjacent nodes within the bounds of network trip time. + * + * Let's call the difference between a node's physical component of hlc time and + * physical time at the same instant its hlc_delta. + * The premise is that the difference between the min hlc_delta and max + * hlc_delta observed for adjacent nodes closely follows the maximum clock skew + * in the cluster. + * + * The clock skew monitor adds a physical timestamp field to each heartbeat + * pulse message. + * For a peer node on receipt of a heartbeat pulse, hlc_delta is computed as + * hlc_delta = physical-component(pulse-hlc) - pulse-timestamp + * + * We maintain an exponential moving average of the hlc_delta to buffer against + * small fluctuations + * avg_hlc_delta = (ALPHA)(hlc_delta) + (1-ALPHA)(avg_hlc_delta) + * + * where ALPHA is set to weigh current values more over older values. + * + * Cluster wide clock skew is updated at periodic intervals. A low water mark + * breach of the skew generates warnings and a high water mark breach causes + * (TODO: ????). + * + * Design + * ======= + * The monitor ticks on heartbeat message sends without requiring an + * additional thread. This is alright as heartbeat pulse messages are the + * vehicle used for skew detection.
The amount of computation amortized across + * sent heartbeat pulse messages is minimal and should be kept so. + */ + +/* + * ---------------------------------------------------------------------------- + * Constants + * ---------------------------------------------------------------------------- + */ + +/** + * Weight of the current clock delta relative to the current moving average. For now weigh + * recent values heavily over older values. + */ +#define ALPHA (0.65) + +/* + * ---------------------------------------------------------------------------- + * Logging + * ---------------------------------------------------------------------------- + */ +#define CRASH(format, ...) cf_crash(AS_SKEW, format, ##__VA_ARGS__) +#define WARNING(format, ...) cf_warning(AS_SKEW, format, ##__VA_ARGS__) +#define INFO(format, ...) cf_info(AS_SKEW, format, ##__VA_ARGS__) +#define DEBUG(format, ...) cf_debug(AS_SKEW, format, ##__VA_ARGS__) +#define DETAIL(format, ...) cf_detail(AS_SKEW, format, ##__VA_ARGS__) + +/* + * ---------------------------------------------------------------------------- + * Skew monitor data structures + * ---------------------------------------------------------------------------- + */ + +/** + * A struct to hold a node's id and its skew related information. + */ +typedef struct as_skew_monitor_node_skew_data_s +{ + cf_node nodeid; + int64_t delta; /* the node's hlc delta, as collected by the plugin data iteration */ +} as_skew_monitor_node_skew_data; + +/** + * HB plugin data iteration udata, used to collect node hlc deltas.
+ */ +typedef struct as_skew_monitor_hlc_delta_udata_s +{ + int num_nodes; /* number of skew_data[] entries filled so far */ + as_skew_monitor_node_skew_data skew_data[AS_CLUSTER_SZ]; +} as_skew_monitor_hlc_delta_udata; + +/* + * ---------------------------------------------------------------------------- + * External protected API for skew monitor + * ---------------------------------------------------------------------------- + */ +extern int +as_hb_msg_send_hlc_ts_get(msg* msg, as_hlc_timestamp* send_ts); + +/* + * ---------------------------------------------------------------------------- + * Globals + * ---------------------------------------------------------------------------- + */ + +/** + * Last time skew was checked - a cf_getms() timestamp. + */ +cf_atomic64 g_last_skew_check_time = 0; + +/** + * Current value of clock skew - a moving average of the computed cluster skew. + */ +cf_atomic64 g_skew = 0; + +/** + * Moving average of the hlc delta (hlc physical time minus physical clock) for self node. + */ +volatile int64_t g_self_skew_avg = 0; + +/* + * ---------------------------------------------------------------------------- + * Skew intervals and limits + * ---------------------------------------------------------------------------- + */ + +/** + * Interval at which skew checks should be made - half the clustering quantum interval, capped at 2000. NOTE(review): presumably milliseconds, since it is compared against cf_getms() - confirm. + */ +static uint64_t +skew_check_interval() +{ + return MIN(2000, as_clustering_quantum_interval() / 2); +} + +/** + * Threshold for outlier detection. Skew values less than this threshold will + * not invoke outlier detection. + */ +static uint64_t +skew_monitor_outlier_detection_threshold() +{ + return as_clustering_quantum_interval(); +} + +/* + * ---------------------------------------------------------------------------- + * HLC delta related + * ---------------------------------------------------------------------------- + */ + +/** + * Iterate callback - record one node's hlc delta in the udata. (The min and max are computed by skew_monitor_compute_skew().)
+ */ +static void +skew_monitor_delta_collect_iterate(cf_node nodeid, void* plugin_data, + size_t plugin_data_size, cf_clock recv_monotonic_ts, + as_hlc_msg_timestamp* msg_hlc_ts, void* udata) +{ + int64_t delta = 0; + as_skew_monitor_hlc_delta_udata* deltas = + (as_skew_monitor_hlc_delta_udata*)udata; + + if (!plugin_data || plugin_data_size < sizeof(uint64_t)) { + // Assume missing nodes share the same delta as self. + // Note: self node will not be in adjacency list and hence will also + // follow same code path. + delta = g_self_skew_avg; + } + else { + delta = *(int64_t*)plugin_data; + } + + int index = deltas->num_nodes; // NOTE(review): not checked against AS_CLUSTER_SZ - presumably the iteration never visits more nodes than the succession list holds; confirm + deltas->skew_data[index].delta = delta; + deltas->skew_data[index].nodeid = nodeid; + deltas->num_nodes++; +} + +/** + * Compute the skew across the cluster - the spread (max - min) of the collected hlc deltas, or 0 if the succession list has at most one node. + */ +static uint64_t +skew_monitor_compute_skew() +{ + uint64_t skew = 0; + uint8_t buffer[AS_CLUSTER_SZ * sizeof(cf_node)]; + cf_vector succession = { 0 }; + + cf_vector_init_smalloc(&succession, sizeof(cf_node), buffer, sizeof(buffer), + VECTOR_FLAG_INITZERO); + as_exchange_succession(&succession); + + if (cf_vector_size(&succession) <= 1) { + // Self node is an orphan or single node cluster. No cluster wide skew. + skew = 0; + goto Cleanup; + } + + as_skew_monitor_hlc_delta_udata udata = { 0 }; + as_hb_plugin_data_iterate(&succession, AS_HB_PLUGIN_SKEW_MONITOR, + skew_monitor_delta_collect_iterate, &udata); + + int64_t min = INT64_MAX; + int64_t max = INT64_MIN; + + for (int i = 0; i < udata.num_nodes; i++) { + int64_t delta = udata.skew_data[i].delta; + if (delta < min) { + min = delta; + } + + if (delta > max) { + max = delta; + } + } + skew = max - min; // NOTE(review): if no delta was collected this is INT64_MIN - INT64_MAX (signed overflow) - presumably the callback always fires for every succession node; confirm + +Cleanup: + cf_vector_destroy(&succession); + return skew; +} + +/** + * Update clock skew and fire skew events.
+ */ +static void +skew_monitor_update() +{ + cf_clock now = cf_getms(); + cf_atomic64_set(&g_last_skew_check_time, now); + + uint64_t skew = skew_monitor_compute_skew(); + uint64_t avg_skew = cf_atomic64_get(g_skew); + avg_skew = ALPHA * skew + (1 - ALPHA) * avg_skew; /* moving average, converted back to an integer on assignment */ + cf_atomic64_set(&g_skew, avg_skew); + + for (int i = 0; i < g_config.n_namespaces; i++) { + as_namespace* ns = g_config.namespaces[i]; + handle_clock_skew(ns, avg_skew); + } +} + +/* + * ---------------------------------------------------------------------------- + * Outlier detection + * ---------------------------------------------------------------------------- + */ + +/** + * qsort() comparator for as_skew_monitor_node_skew_data elements - orders by delta, ascending. + */ +static int +skew_monitor_hlc_delta_compare(const void* o1, const void* o2) +{ + int64_t delta1 = ((as_skew_monitor_node_skew_data*)o1)->delta; + int64_t delta2 = ((as_skew_monitor_node_skew_data*)o2)->delta; + + return delta1 > delta2 ? 1 : (delta1 == delta2 ? 0 : -1); +} + +/** + * Compute the index of the median of an index range. + * The range is expected to index values that are already sorted. + * @param from the start index (inclusive) + * @param to the end index (inclusive) + * @return the index of the median element - (from + to) / 2, or from if the element count is negative + */ +static int +skew_monitor_median_index(int from, int to) +{ + int numElements = to - from + 1; + if (numElements < 0) { + return from; + } + return (to + from) / 2; +} + +/** + * Find the outliers in the collected skew data and return how many there are. Sorts udata->skew_data in place. + * Outliers, if not NULL, should have space to hold at least AS_CLUSTER_SZ nodes. + */ +static uint32_t +skew_monitor_outliers_from_skew_data(cf_vector* outliers, + as_skew_monitor_hlc_delta_udata* udata) +{ + // Use inter-quartile distance to detect outliers. + // Sort the deltas in ascending order.
+ qsort(udata->skew_data, udata->num_nodes, + sizeof(as_skew_monitor_node_skew_data), + skew_monitor_hlc_delta_compare); + int q2_index = skew_monitor_median_index(0, udata->num_nodes - 1); + int q3_index = skew_monitor_median_index(q2_index, udata->num_nodes - 1); + int q1_index = skew_monitor_median_index(0, q2_index); + int64_t q3 = udata->skew_data[q3_index].delta; + int64_t q1 = udata->skew_data[q1_index].delta; + + // Compute the inter quartile range. Lower bound iqr to network latency to + // allow some fuzziness with tight clock grouping. + int64_t iqr = MAX(q3 - q1, g_config.fabric_latency_max_ms); + double lower_bound = q1 - 1.5 * iqr; + double upper_bound = q3 + 1.5 * iqr; + + uint32_t num_outliers = 0; + + // Isolate outliers - deltas more than 1.5 * iqr below q1 or above q3. + for (int i = 0; i < udata->num_nodes; i++) { + if (udata->skew_data[i].delta < lower_bound + || udata->skew_data[i].delta > upper_bound) { + if (outliers) { + cf_vector_append(outliers, &udata->skew_data[i].nodeid); + } + + num_outliers++; + } + } + + return num_outliers; +} + +/** + * Return the number of currently estimated outliers in our cluster, or 0 if the current skew is below the detection threshold. + * Outliers should have space to hold at least AS_CLUSTER_SZ nodes. + */ +static uint32_t +skew_monitor_outliers(cf_vector* outliers) +{ + if (as_skew_monitor_skew() < skew_monitor_outlier_detection_threshold()) { + // Skew is not significant. Skip outlier detection. + return 0; + } + + uint8_t buffer[AS_CLUSTER_SZ * sizeof(cf_node)]; + cf_vector succession; // NOTE(review): not zero-initialized, unlike in skew_monitor_compute_skew() - presumably cf_vector_init_smalloc() sets every field; confirm + cf_vector_init_smalloc(&succession, sizeof(cf_node), buffer, sizeof(buffer), + VECTOR_FLAG_INITZERO); + as_exchange_succession(&succession); + + uint32_t num_outliers = 0; + + uint32_t cluster_size = cf_vector_size(&succession); + if (cluster_size <= 1) { + // Self node is an orphan or single node cluster. No cluster wide skew.
+ goto Cleanup; + } + + as_skew_monitor_hlc_delta_udata udata = { 0 }; + as_hb_plugin_data_iterate(&succession, AS_HB_PLUGIN_SKEW_MONITOR, + skew_monitor_delta_collect_iterate, &udata); + + num_outliers = skew_monitor_outliers_from_skew_data(outliers, &udata); + +Cleanup: + cf_vector_destroy(&succession); + + return num_outliers; +} + +/* + * ---------------------------------------------------------------------------- + * HB plugin functions + * ---------------------------------------------------------------------------- + */ + +/** + * Push current timestamp for self node into the heartbeat pulse message. + * Also folds the current HLC to physical clock delta into the self node's + * moving average, and runs a skew update if the skew check interval has + * elapsed since the last check. + */ +static void +skew_monitor_hb_plugin_set_fn(msg* msg) +{ + cf_clock send_ts = cf_clock_getabsolute(); + if (msg_set_uint64(msg, AS_HB_MSG_SKEW_MONITOR_DATA, send_ts) != 0) { + CRASH("error setting current timestamp on msg"); + } + + // Update self skew. + as_hlc_timestamp send_hlc_ts = as_hlc_timestamp_now(); + int64_t clock_delta = as_hlc_physical_ts_get(send_hlc_ts) - send_ts; + + // Update the average delta for self. + g_self_skew_avg = clock_delta * ALPHA + (1 - ALPHA) * (g_self_skew_avg); + + cf_clock now = cf_getms(); + if (cf_atomic64_get(g_last_skew_check_time) + skew_check_interval() < now) { + skew_monitor_update(); + } +} + +/** + * Compare the HLC timestamp and the physical clock and store the difference as + * plugin data for the source node to enable skew detection. + * The stored plugin data is a single int64_t holding the moving average of + * this difference. + */ +static void +skew_monitor_hb_plugin_parse_data_fn(msg* msg, cf_node source, + as_hb_plugin_node_data* plugin_data) +{ + cf_clock send_ts = 0; + as_hlc_timestamp send_hlc_ts = 0; + if (msg_get_uint64(msg, AS_HB_MSG_SKEW_MONITOR_DATA, &send_ts) != 0 + || as_hb_msg_send_hlc_ts_get(msg, &send_hlc_ts) != 0) { + // Pre CP mode node. For now assume it shares the same delta with hlc + // as us.
+ send_hlc_ts = as_hlc_timestamp_now(); + send_ts = cf_clock_getabsolute(); + } + + size_t required_capacity = sizeof(int64_t); + if (required_capacity > plugin_data->data_capacity) { + plugin_data->data = cf_realloc(plugin_data->data, required_capacity); + + if (plugin_data->data == NULL) { + CRASH( + "error allocating space for storing succession list for node %"PRIx64, + source); + } + plugin_data->data_capacity = required_capacity; + memset(plugin_data->data, 0, required_capacity); + } + + int64_t clock_delta = as_hlc_physical_ts_get(send_hlc_ts) - send_ts; + int64_t* average_clock_delta = (int64_t*)plugin_data->data; + + if (plugin_data->data_size == 0) { + // This is the first data point. + *average_clock_delta = clock_delta; + } + + plugin_data->data_size = required_capacity; + + // update the average + *average_clock_delta = clock_delta * ALPHA + + (1 - ALPHA) * (*average_clock_delta); + + DETAIL("node %"PRIx64" hlc:%lu clock:%lu delta:%ld moving-average:%ld", source, send_hlc_ts, send_ts, clock_delta, *average_clock_delta); +} + +/* + * ---------------------------------------------------------------------------- + * Protceted API only mean for clustering. + * ---------------------------------------------------------------------------- + */ + +/** + * Update clock skew and fire skew events. + */ +void +as_skew_monitor_update() +{ + skew_monitor_update(); +} + +/* + * ---------------------------------------------------------------------------- + * Public API + * ---------------------------------------------------------------------------- + */ + +/** + * Initialize skew monitor. + */ +void +as_skew_monitor_init() +{ + as_hb_plugin skew_monitor_plugin = { 0 }; + + skew_monitor_plugin.id = AS_HB_PLUGIN_SKEW_MONITOR; + skew_monitor_plugin.wire_size_fixed = sizeof(int64_t); + // Size of the node in succession list. 
+ skew_monitor_plugin.wire_size_per_node = 0; + skew_monitor_plugin.set_fn = skew_monitor_hb_plugin_set_fn; + skew_monitor_plugin.parse_fn = skew_monitor_hb_plugin_parse_data_fn; + as_hb_plugin_register(&skew_monitor_plugin); + + DETAIL("skew monitor initialized"); +} + +/** + * Return the current estimate of the clock skew in the cluster. + */ +uint64_t +as_skew_monitor_skew() +{ + return cf_atomic64_get(g_skew); +} + +/** + * Return the currently estimated outliers from our cluster. + * Outliers should have space to hold at least AS_CLUSTER_SZ nodes. + */ +uint32_t +as_skew_monitor_outliers(cf_vector* outliers) +{ + return skew_monitor_outliers(outliers); +} + +/** + * Print skew outliers to a dynamic buffer, as a comma separated list of node + * ids. Nothing is appended if there are no outliers. + * @return the number of outliers. + */ +uint32_t +as_skew_monitor_outliers_append(cf_dyn_buf* db) +{ + uint8_t buffer[AS_CLUSTER_SZ * sizeof(cf_node)]; + cf_vector outliers; + cf_vector_init_smalloc(&outliers, sizeof(cf_node), buffer, sizeof(buffer), + VECTOR_FLAG_INITZERO); + uint32_t num_outliers = skew_monitor_outliers(&outliers); + + for (uint32_t i = 0; i < num_outliers; i++) { + cf_node outlier_id; + cf_vector_get(&outliers, i, &outlier_id); + cf_dyn_buf_append_uint64_x(db, outlier_id); + cf_dyn_buf_append_char(db, ','); + } + + if (num_outliers) { + cf_dyn_buf_chomp(db); + } + + cf_vector_destroy(&outliers); + + return num_outliers; +} + +/** + * Print skew monitor info to a dynamic buffer. + * Appends "cluster_clock_skew_outliers=" followed by the outlier list, or + * "null" if there are no outliers, terminated by ';'. + */ +void +as_skew_monitor_info(cf_dyn_buf* db) +{ + cf_dyn_buf_append_string(db, "cluster_clock_skew_outliers="); + uint32_t num_outliers = as_skew_monitor_outliers_append(db); + if (num_outliers == 0) { + cf_dyn_buf_append_string(db, "null"); + } + cf_dyn_buf_append_char(db, ';'); +} + +/** + * Dump some debugging information to the logs.
+ */ +void +as_skew_monitor_dump() +{ + uint8_t buffer[AS_CLUSTER_SZ * sizeof(cf_node)]; + cf_vector node_vector; + cf_vector_init_smalloc(&node_vector, sizeof(cf_node), buffer, + sizeof(buffer), VECTOR_FLAG_INITZERO); + as_exchange_succession(&node_vector); + + // The skew is a uint64_t - log it with the matching unsigned format. + INFO("CSM: cluster-clock-skew:%"PRIu64, as_skew_monitor_skew()); + if (cf_vector_size(&node_vector) <= 1) { + // Self node is an orphan or single node cluster. No cluster wide skew. + goto Cleanup; + } + + as_skew_monitor_hlc_delta_udata udata = { 0 }; + as_hb_plugin_data_iterate(&node_vector, AS_HB_PLUGIN_SKEW_MONITOR, + skew_monitor_delta_collect_iterate, &udata); + + for (int i = 0; i < udata.num_nodes; i++) { + INFO("CSM: node:%"PRIx64" hlc-delta:%ld", udata.skew_data[i].nodeid, udata.skew_data[i].delta); + } + + // Log the outliers. Reuse the node vector to hold them. + cf_vector_clear(&node_vector); + skew_monitor_outliers(&node_vector); + if (cf_vector_size(&node_vector)) { + as_clustering_log_cf_node_vector(AS_INFO, AS_SKEW, + "CSM: Estimated clock outliers", &node_vector); + } + +Cleanup: + cf_vector_destroy(&node_vector); +} diff --git a/as/src/geospatial/geojson.cc b/as/src/geospatial/geojson.cc new file mode 100644 index 00000000..2c5cc384 --- /dev/null +++ b/as/src/geospatial/geojson.cc @@ -0,0 +1,344 @@ +/* + * Copyright 2015 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more + * contributor license agreements. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you + * may not use this file except in compliance with the License. You + * may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. See the License for the specific language governing + * permissions and limitations under the License.
+ */ + +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include + +#include "geospatial/scoped.h" +#include "geospatial/throwstream.h" +#include "geospatial/geojson.h" + +using namespace std; + +namespace { + +// Convert a GeoJSON [longitude, latitude] pair (in degrees) to an S2Point. +// Throws runtime_error if the pair is missing, is not a 2 element array, has a +// non-numeric member or is not a valid latitude-longitude. +S2Point +traverse_point(json_t * coord) +{ + if (! coord) { + throwstream(runtime_error, "missing coordinates"); + } + + if (! json_is_array(coord)) { + throwstream(runtime_error, "coordinates are not array"); + } + + if (json_array_size(coord) != 2) { + throwstream(runtime_error, "expected 2 coordinates, saw " + << json_array_size(coord)); + } + + double lngval; + json_t * lng = json_array_get(coord, 0); + if (json_is_real(lng)) { + lngval = json_real_value(lng); + } + else if (json_is_integer(lng)) { + lngval = double(json_integer_value(lng)); + } + else { + throwstream(runtime_error, "longitude not numeric value"); + } + + double latval; + json_t * lat = json_array_get(coord, 1); + if (json_is_real(lat)) { + latval = json_real_value(lat); + } + else if (json_is_integer(lat)) { + latval = double(json_integer_value(lat)); + } + else { + throwstream(runtime_error, "latitude not numeric value"); + } + + // cout << setprecision(15) << latval << ", " << lngval << endl; + + S2LatLng latlng = S2LatLng::FromDegrees(latval, lngval).Normalized(); + if (! latlng.is_valid()) { + throwstream(runtime_error, "invalid latitude-longitude"); + } + return latlng.ToPoint(); +} + +// Convert a GeoJSON array of vertices to a normalized S2Loop. The caller owns +// the returned loop. Throws runtime_error on malformed input. +S2Loop * +traverse_loop(json_t * vertices) +{ + if (! vertices) { + throwstream(runtime_error, "missing vertices"); + } + + if (! json_is_array(vertices)) { + throwstream(runtime_error, "vertices are not array"); + } + + vector<S2Point> points; + + for (size_t ii = 0; ii < json_array_size(vertices); ++ii) { + points.push_back(traverse_point(json_array_get(vertices, ii))); + } + + // Remove duplicate points.
+ for (size_t ii = 1; ii < points.size(); ++ii) { + if (points[ii - 1] == points[ii]) { + points.erase(points.begin() + ii); + --ii; + } + } + + if (points.size() < 4) { + throwstream(runtime_error, "loop contains less than 4 points"); + } + if (points[0] != points[points.size()-1]) { + throwstream(runtime_error, "loop not closed"); + } + // Drop the closing point - it repeats the first one. + points.pop_back(); + + auto_ptr<S2Loop> loop(new S2Loop(points)); + loop->Normalize(); + return loop.release(); +} + +// Convert a GeoJSON array of loops to an S2Polygon. The caller owns the +// returned polygon. Loops built so far are deleted if anything throws. +S2Polygon * +traverse_polygon(json_t * loops) +{ + if (! loops) { + throwstream(runtime_error, "missing polygon body"); + } + + if (! json_is_array(loops)) { + throwstream(runtime_error, "polygon body is not array"); + } + + vector<S2Loop *> loopv; + try + { + for (size_t ii = 0; ii < json_array_size(loops); ++ii) { + loopv.push_back(traverse_loop(json_array_get(loops, ii))); + } + + return new S2Polygon(&loopv); + } + catch (...) + { + for (size_t ii = 0; ii < loopv.size(); ++ii) { + delete loopv[ii]; + } + throw; + } +} + +void process_point(GeoJSON::GeometryHandler & geohand, json_t * coord) +{ + geohand.handle_point(S2CellId::FromPoint(traverse_point(coord))); +} + +// Parse polygon coordinates and pass the polygon to the handler. The polygon +// is deleted here unless the handler takes ownership (returns false). +void +process_polygon(GeoJSON::GeometryHandler & geohand, json_t * coord) +{ + if (! coord) { + throwstream(runtime_error, "missing coordinates"); + } + + if (! json_is_array(coord)) { + throwstream(runtime_error, "coordinates are not array"); + } + + // Hold the polygon in an auto_ptr so it is freed if the handler throws. + auto_ptr<S2Polygon> polyp(traverse_polygon(coord)); + + if (! geohand.handle_region(polyp.get())) { + // Handler took ownership. + polyp.release(); + } +} + +void +process_multipolygon(GeoJSON::GeometryHandler & geohand, json_t * coord) +{ + if (! coord) { + throwstream(runtime_error, "missing coordinates"); + } + + if (! json_is_array(coord)) { + throwstream(runtime_error, "coordinates are not array"); + } + + auto_ptr<S2RegionUnion> regionsp(new S2RegionUnion); + + for (size_t ii = 0; ii < json_array_size(coord); ++ii) { + regionsp->Add(traverse_polygon(json_array_get(coord, ii))); + } + + if (! geohand.handle_region(regionsp.get())) { + // Handler took ownership.
+ regionsp.release(); + } +} + +// Parse "AeroCircle" coordinates - a center point and a radius - and pass the +// resulting cap to the handler. The cap is deleted here unless the handler +// takes ownership (returns false). +void +process_circle(GeoJSON::GeometryHandler & geohand, json_t * coord) +{ + // { + // "type": "AeroCircle", + // "coordinates": [[-122.097837, 37.421363], 1000.0] + // } + + if (! coord) { + throwstream(runtime_error, "missing coordinates"); + } + + if (! json_is_array(coord)) { + throwstream(runtime_error, "coordinates are not array"); + } + + if (json_array_size(coord) != 2) { + throwstream(runtime_error, "malformed circle coordinate array"); + } + + S2Point center = traverse_point(json_array_get(coord, 0)); + + double radius; + json_t * radiusobj = json_array_get(coord, 1); + if (json_is_real(radiusobj)) { + radius = json_real_value(radiusobj); + } + else if (json_is_integer(radiusobj)) { + radius = double(json_integer_value(radiusobj)); + } + else { + throwstream(runtime_error, "radius not numeric value"); + } + + // Convert the radius to an angle using the handler's earth radius. + S1Angle angle = S1Angle::Radians(radius / geohand.earth_radius_meters()); + + auto_ptr<S2Cap> capp(S2Cap::FromAxisAngle(center, angle).Clone()); + + if (! geohand.handle_region(capp.get())) { + // Handler took ownership. + capp.release(); + } +} + +void traverse_geometry(GeoJSON::GeometryHandler & geohand, json_t * geom) +{ + if (! geom) { + throwstream(runtime_error, "missing geometry element"); + } + + if (! json_is_object(geom)) { + throwstream(runtime_error, "geometry is not object"); + } + + json_t * type = json_object_get(geom, "type"); + if (! type) { + throwstream(runtime_error, "missing geometry type"); + } + + if (!
json_is_string(type)) { + throwstream(runtime_error, "geometry type is not string"); + } + + // Dispatch on the geometry type. + string typestr(json_string_value(type)); + if (typestr == "Point") { + process_point(geohand, json_object_get(geom, "coordinates")); + } + else if (typestr == "Polygon") { + process_polygon(geohand, json_object_get(geom, "coordinates")); + } + else if (typestr == "MultiPolygon") { + process_multipolygon(geohand, json_object_get(geom, "coordinates")); + } + else if (typestr == "AeroCircle") { + process_circle(geohand, json_object_get(geom, "coordinates")); + } + else { + throwstream(runtime_error, "unknown geometry type: " << typestr); + } +} + +} // end namespace + +namespace GeoJSON { + +void GeometryHandler::handle_point(S2CellId const & i_cellid) +{ + // nothing by default +} + +bool GeometryHandler::handle_region(S2Region * i_regionp) +{ + // By default, caller should delete the region. + return true; +} + +// Parse a GeoJSON string and feed the resulting point or region to the handler. +// Throws runtime_error if the string is not valid GeoJSON of a supported type. +void parse(GeometryHandler & geohand, string const & geostr) +{ + json_error_t err; + Scoped<json_t *> geojson(json_loadb(geostr.data(), geostr.size(), 0, &err), + NULL, json_decref); + if (! geojson) { + throwstream(runtime_error, "failed to parse geojson: " + << err.line << ": " << err.text); + } + + geohand.set_json(geojson); + + if (! json_is_object(geojson)) { + throwstream(runtime_error, "top level geojson element not object"); + } + + json_t * type = json_object_get(geojson, "type"); + if (! type) { + throwstream(runtime_error, "missing top-level type in geojson element"); + } + + if (!
json_is_string(type)) { + throwstream(runtime_error, "top-level type is not string"); + } + + /* A "Feature" wraps a geometry object - bare geometries are handled directly. */ + string typestr(json_string_value(type)); + if (typestr == "Feature") { + traverse_geometry(geohand, json_object_get(geojson, "geometry")); + } + else if (typestr == "Point") { + process_point(geohand, json_object_get(geojson, "coordinates")); + } + else if (typestr == "Polygon") { + process_polygon(geohand, json_object_get(geojson, "coordinates")); + } + else if (typestr == "MultiPolygon") { + process_multipolygon(geohand, json_object_get(geojson, "coordinates")); + } + else if (typestr == "AeroCircle") { + process_circle(geohand, json_object_get(geojson, "coordinates")); + } + else { + throwstream(runtime_error, "unknown top-level type: " << typestr); + } +} + +} // end namespace GeoJSON diff --git a/as/src/geospatial/geospatial.cc b/as/src/geospatial/geospatial.cc new file mode 100644 index 00000000..17825c23 --- /dev/null +++ b/as/src/geospatial/geospatial.cc @@ -0,0 +1,228 @@ +/* + * geospatial.cc + * + * Copyright (C) 2015 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program.
If not, see http://www.gnu.org/licenses/ + */ + +#include +#include +#include + +#include + +#include + +extern "C" { +#include "fault.h" +#include "base/datamodel.h" +} // end extern "C" + +#include "geospatial/geospatial.h" +#include "geospatial/geojson.h" + +using namespace std; + +/* + * Geometry handler that records the parsed point's cell id and keeps the parsed + * region (taking ownership of it) so that geo_parse() can return them. The + * earth radius comes from the namespace if one is given, else it is 6371000. + */ +class PointRegionHandler: public GeoJSON::GeometryHandler +{ +public: + PointRegionHandler(as_namespace * ns) + : m_cellid(0) + , m_regionp(NULL) + { + m_earth_radius_meters = + ns ? double(ns->geo2dsphere_within_earth_radius_meters) : 6371000; + } + + virtual void handle_point(S2CellId const & cellid) { + m_cellid = cellid; + } + + virtual bool handle_region(S2Region * regionp) { + m_regionp = regionp; + return false; // Don't delete this region, please. + } + + virtual double earth_radius_meters() { + return m_earth_radius_meters; + } + + double m_earth_radius_meters; + S2CellId m_cellid; + S2Region * m_regionp; +}; + +// Parse a GeoJSON buffer. On success stores the parsed cell id in *cellidp and +// the parsed region (NULL if none was parsed) in *regionp, and returns true. +// Returns false, after logging a warning, if parsing throws. +bool +geo_parse(as_namespace * ns, + const char * buf, + size_t bufsz, + uint64_t * cellidp, + geo_region_t * regionp) +{ + try + { + PointRegionHandler prhandler(ns); + GeoJSON::parse(prhandler, string(buf, bufsz)); + *cellidp = prhandler.m_cellid.id(); + *regionp = (geo_region_t) prhandler.m_regionp; + return true; + } + catch (exception const & ex) + { + cf_warning(AS_GEO, (char *) "failed to parse point: %s", ex.what()); + return false; + } +} + +// Cover a region with cells. For each covering cell, writes its id, range min +// id and range max id to whichever of the output arrays are not NULL, and +// stores the cell count in *numcellsp. Returns false if the covering does not +// fit in maxnumcells, is larger than expected, or an exception is thrown. +bool +geo_region_cover(as_namespace * ns, + geo_region_t region, + int maxnumcells, + uint64_t * cellctrp, + uint64_t * cellminp, + uint64_t * cellmaxp, + int * numcellsp) +{ + try + { + S2Region * regionp = (S2Region *) region; + + S2RegionCoverer coverer; + if (ns) { + coverer.set_min_level(ns->geo2dsphere_within_min_level); + coverer.set_max_level(ns->geo2dsphere_within_max_level); + coverer.set_max_cells(ns->geo2dsphere_within_max_cells); + coverer.set_level_mod(ns->geo2dsphere_within_level_mod); + } + else { + // FIXME - we really don't want to hardcode these values, but + // some callers can't provide the namespace
context ... + coverer.set_min_level(1); + coverer.set_max_level(30); + coverer.set_max_cells(12); + coverer.set_level_mod(1); + } + vector<S2CellId> covering; + coverer.GetCovering(*regionp, &covering); + + // The coverer can always return 6 cells, even when max cells is + // less (regions which intersect all cube faces). If we get more + // than we asked for and it's greater than 6 something is wrong. + if (covering.size() > max(size_t(6), size_t(coverer.max_cells()))) { + return false; + } + + for (size_t ii = 0; ii < covering.size(); ++ii) + { + // Make sure we don't overwrite the output arrays. + if (ii == (size_t) maxnumcells) + { + cf_warning(AS_GEO, (char *) "region covered with %zu cells, " + "only %d allowed", covering.size(), maxnumcells); + return false; + } + + if (cellctrp) { + cellctrp[ii] = covering[ii].id(); + } + if (cellminp) { + cellminp[ii] = covering[ii].range_min().id(); + } + if (cellmaxp) { + cellmaxp[ii] = covering[ii].range_max().id(); + } + + if (cellctrp) { + cf_detail(AS_GEO, (char *) "cell[%zu]: 0x%lx", + ii, cellctrp[ii]); + } + + if (cellminp && cellmaxp) { + cf_detail(AS_GEO, (char *) "cell[%zu]: [0x%lx, 0x%lx]", + ii, cellminp[ii], cellmaxp[ii]); + } + } + + *numcellsp = covering.size(); + return true; + } + catch (exception const & ex) + { + cf_warning(AS_GEO, (char *) "geo_region_cover failed: %s", ex.what()); + return false; + } +} + +// Collect the ids of a cell and its ancestors, from the given cell up to (but +// not including) level 0, writing at most maxnumcenters ids to center and the +// count to *numcentersp. +bool +geo_point_centers(as_namespace * ns, + uint64_t cellidval, + int maxnumcenters, + uint64_t * center, + int * numcentersp) +{ + try + { + S2CellId incellid(cellidval); + + *numcentersp = 0; + + for (S2CellId cellid = incellid; + cellid.level() > 0; + cellid = cellid.parent()) + { + // Make sure we don't overwrite the output array.
+ if (*numcentersp == maxnumcenters) { + break; + } + center[*numcentersp] = cellid.id(); + *numcentersp += 1; + } + return true; + } + catch (exception const & ex) + { + cf_warning(AS_GEO, (char *) "geo_point_centers failed: %s", ex.what()); + return false; + } +} + +/* + * Return true if the point of the given cell id is contained in the region. + * Returns false, after logging a warning, if an exception is thrown. + */ +bool +geo_point_within(uint64_t cellidval, geo_region_t region) +{ + try + { + S2Region * regionp = (S2Region *) region; + S2CellId cellid(cellidval); + bool iswithin = regionp->VirtualContainsPoint(cellid.ToPoint()); + return iswithin; + } + catch (exception const & ex) + { + cf_warning(AS_GEO, (char *) "exception in geo_point_within: %s", + ex.what()); + return false; + } +} + +/* + * Delete a region. A NULL region is ignored. + */ +void +geo_region_destroy(geo_region_t region) +{ + S2Region * regionp = (S2Region *) region; + if (regionp) { + delete regionp; + } +} diff --git a/as/src/storage/drv_memory.c b/as/src/storage/drv_memory.c new file mode 100644 index 00000000..913033ed --- /dev/null +++ b/as/src/storage/drv_memory.c @@ -0,0 +1,78 @@ +/* + * drv_memory.c + * + * Copyright (C) 2009-2014 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program.
If not, see http://www.gnu.org/licenses/ + */ + +/* + * in-memory storage engine driver + * + */ + +#include +#include + +#include "citrusleaf/cf_queue.h" + +#include "base/datamodel.h" +#include "base/truncate.h" +#include "storage/storage.h" + + +/* SYNOPSIS + * In-memory storage driver + * + * This code almost entirely performs no-ops, because all the in-memory state + * is correct already. + * Note that this code is mostly for the NON-PERSISTENT main memory namespace. + * The File-backed (persistent) main memory namespace is NOT type 1 (MM) for + * some calls, but is instead treated as type 2 (SSD); hence in some cases + * the SSD functions, like as_storage_bin_can_fit(), are applied with an SSD + * context rather than a transient main memory context. (tjl) + */ + +/* + * Initialize an in-memory namespace. There is nothing to load, so this marks + * truncate startup as done and pushes a NULL item onto complete_q right away. + * The udata argument is unused. Always returns 0. + */ +int +as_storage_namespace_init_memory(as_namespace *ns, cf_queue *complete_q, void *udata) +{ + as_truncate_done_startup(ns); + + void *_t = NULL; + + cf_queue_push(complete_q, &_t); + + return 0; +} + +/* + * Nothing to destroy for an in-memory namespace. Always returns 0. + */ +int +as_storage_namespace_destroy_memory(as_namespace *ns) +{ + return(0); +} + +/* + * Report storage stats - always 100 percent available and 0 used disk bytes. + * Either output pointer may be NULL. Always returns 0. + */ +int +as_storage_stats_memory(as_namespace *ns, int *available_pct, uint64_t *used_disk_bytes) +{ + if (available_pct) { + *available_pct = 100; + } + if (used_disk_bytes) { + *used_disk_bytes = 0; + } + return(0); +} diff --git a/as/src/storage/drv_memory_ce.c b/as/src/storage/drv_memory_ce.c new file mode 100644 index 00000000..26121924 --- /dev/null +++ b/as/src/storage/drv_memory_ce.c @@ -0,0 +1,44 @@ +/* + * drv_memory_ce.c + * + * Copyright (C) 2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version.
+ * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#include "base/datamodel.h" +#include "fabric/partition.h" +#include "storage/storage.h" + + +void +as_storage_start_tomb_raider_memory(as_namespace* ns) +{ + // Tomb raider is for enterprise version only. +} + + +// No-op stub - always returns 0. +int +as_storage_record_write_memory(as_storage_rd* rd) +{ + return 0; +} + +// No-op stub. +void +as_storage_info_get_memory(as_namespace *ns, as_partition *p) +{ +} diff --git a/as/src/storage/drv_ssd.c b/as/src/storage/drv_ssd.c new file mode 100644 index 00000000..4d3c99c6 --- /dev/null +++ b/as/src/storage/drv_ssd.c @@ -0,0 +1,4312 @@ +/* + * drv_ssd.c + * + * Copyright (C) 2009-2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +/* SYNOPSIS + * "file" based storage driver, which applies to both SSD namespaces and, in + * some cases, to file-backed main-memory namespaces.
+ */ + +#include "storage/drv_ssd.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include // for BLKGETSIZE64 +#include +#include // for MAX() + +#include "citrusleaf/alloc.h" +#include "citrusleaf/cf_atomic.h" +#include "citrusleaf/cf_clock.h" +#include "citrusleaf/cf_digest.h" +#include "citrusleaf/cf_queue.h" +#include "citrusleaf/cf_random.h" + +#include "cf_mutex.h" +#include "fault.h" +#include "hist.h" +#include "vmapx.h" + +#include "base/cfg.h" +#include "base/datamodel.h" +#include "base/index.h" +#include "base/proto.h" +#include "base/rec_props.h" +#include "base/secondary_index.h" +#include "base/truncate.h" +#include "fabric/partition.h" +#include "storage/storage.h" +#include "transaction/rw_utils.h" + + +//========================================================== +// Forward declarations. +// + +// Defined in thr_nsup.c, for historical reasons. +extern bool as_cold_start_evict_if_needed(as_namespace* ns); + + +//========================================================== +// Constants. +// + +#define DEFRAG_STARTUP_RESERVE 4 +#define DEFRAG_RUNTIME_RESERVE 4 + + +//========================================================== +// Miscellaneous utility functions. +// + +// Get an open file descriptor from the pool, or a fresh one if necessary. 
+int +ssd_fd_get(drv_ssd *ssd) +{ + int fd = -1; + int rv = cf_queue_pop(ssd->fd_q, (void*)&fd, CF_QUEUE_NOWAIT); + + if (rv != CF_QUEUE_OK) { + fd = open(ssd->name, ssd->open_flag, S_IRUSR | S_IWUSR); + + if (-1 == fd) { + cf_crash(AS_DRV_SSD, "%s: DEVICE FAILED open: errno %d (%s)", + ssd->name, errno, cf_strerror(errno)); + } + } + + return fd; +} + + +/* Same as ssd_fd_get(), but for the shadow device and its own fd pool. */ +int +ssd_shadow_fd_get(drv_ssd *ssd) +{ + int fd = -1; + int rv = cf_queue_pop(ssd->shadow_fd_q, (void*)&fd, CF_QUEUE_NOWAIT); + + if (rv != CF_QUEUE_OK) { + fd = open(ssd->shadow_name, ssd->open_flag, S_IRUSR | S_IWUSR); + + if (-1 == fd) { + cf_crash(AS_DRV_SSD, "%s: DEVICE FAILED open: errno %d (%s)", + ssd->shadow_name, errno, cf_strerror(errno)); + } + } + + return fd; +} + + +// Save an open file descriptor in the pool. +static inline void +ssd_fd_put(drv_ssd *ssd, int fd) +{ + cf_queue_push(ssd->fd_q, (void*)&fd); +} + + +// Save an open shadow device file descriptor in the shadow pool. +static inline void +ssd_shadow_fd_put(drv_ssd *ssd, int fd) +{ + cf_queue_push(ssd->shadow_fd_q, (void*)&fd); +} + + +// Decide which device a record belongs on - reads a uint32_t from the digest at +// DIGEST_STORAGE_BASE_BYTE and takes it modulo the number of devices. +static inline uint32_t +ssd_get_file_id(drv_ssds *ssds, cf_digest *keyd) +{ + return *(uint32_t*)&keyd->digest[DIGEST_STORAGE_BASE_BYTE] % ssds->n_ssds; +} + + +// Put a wblock on the free queue for reuse - at the head of the queue if +// free_to is FREE_TO_HEAD, otherwise at the tail. +void +push_wblock_to_free_q(drv_ssd *ssd, uint32_t wblock_id, e_free_to free_to) +{ + if (! ssd->free_wblock_q) { // null until devices are loaded at startup + return; + } + + // temp debugging: + if (wblock_id >= ssd->alloc_table->n_wblocks) { + cf_warning(AS_DRV_SSD, "pushing invalid wblock_id %d to free_wblock_q", + (int32_t)wblock_id); + return; + } + + if (free_to == FREE_TO_HEAD) { + cf_queue_push_head(ssd->free_wblock_q, &wblock_id); + } + else { + cf_queue_push(ssd->free_wblock_q, &wblock_id); + } +} + + +// Put a wblock on the defrag queue.
+static inline void +push_wblock_to_defrag_q(drv_ssd *ssd, uint32_t wblock_id) +{ + if (ssd->defrag_wblock_q) { // null until devices are loaded at startup + ssd->alloc_table->wblock_state[wblock_id].state = WBLOCK_STATE_DEFRAG; + cf_queue_push(ssd->defrag_wblock_q, &wblock_id); + cf_atomic64_incr(&ssd->n_defrag_wblock_reads); + } +} + + +// Available contiguous size - the number of free wblocks times the write block +// size. +static inline uint64_t +available_size(drv_ssd *ssd) +{ + return ssd->free_wblock_q ? // null until devices are loaded at startup + (uint64_t)cf_queue_sz(ssd->free_wblock_q) * ssd->write_block_size : + ssd->file_size; + + // Note - returns 100% available during cold start, to make it irrelevant in + // cold start eviction threshold check. +} + + +// Since UDF writes can't yet unwind on failure, we ensure that they'll succeed +// by checking before writing on all threads that there's at least one wblock +// per thread. TODO - deprecate this methodology when everything can unwind. +static inline int +min_free_wblocks(as_namespace *ns) +{ + // Data-in-memory namespaces process transactions in service threads. + int n_service_threads = ns->storage_data_in_memory ? + (int)g_config.n_service_threads : 0; + + int n_transaction_threads = (int) + (g_config.n_transaction_queues * g_config.n_transaction_threads_per_queue); + + return n_service_threads + // client writes + n_transaction_threads + // client writes + g_config.n_fabric_channel_recv_threads[AS_FABRIC_CHANNEL_RW] + // prole writes + g_config.n_fabric_channel_recv_threads[AS_FABRIC_CHANNEL_BULK] + // migration writes + 1 + // always 1 defrag thread + DEFRAG_RUNTIME_RESERVE + // reserve for defrag at runtime + DEFRAG_STARTUP_RESERVE; // reserve for defrag at startup +} + + +// Drop one vacation destination from a wblock that is being defragged. When the +// last one is dropped, the wblock's state is reset and it is freed if empty. +void +ssd_release_vacated_wblock(drv_ssd *ssd, uint32_t wblock_id, + ssd_wblock_state* p_wblock_state) +{ + // Sanity checks. + cf_assert(!
p_wblock_state->swb, AS_DRV_SSD, + "device %s: wblock-id %u swb not null while defragging", + ssd->name, wblock_id); + cf_assert(p_wblock_state->state == WBLOCK_STATE_DEFRAG, AS_DRV_SSD, + "device %s: wblock-id %u state not DEFRAG while defragging", + ssd->name, wblock_id); + + /* Only the release that brings the count to zero proceeds past here. */ + int32_t n_vac_dests = cf_atomic32_decr(&p_wblock_state->n_vac_dests); + + if (n_vac_dests > 0) { + return; + } + // else - all wblocks we defragged into have been flushed. + + cf_assert(n_vac_dests == 0, AS_DRV_SSD, + "device %s: wblock-id %u vacation destinations underflow", + ssd->name, wblock_id); + + cf_mutex_lock(&p_wblock_state->LOCK); + + p_wblock_state->state = WBLOCK_STATE_NONE; + + // Free the wblock if it's empty - no bytes in use and no swb attached. + if (cf_atomic32_get(p_wblock_state->inuse_sz) == 0 && + // TODO - given assertions above, this condition is superfluous: + ! p_wblock_state->swb) { + push_wblock_to_free_q(ssd, wblock_id, FREE_TO_HEAD); + } + + cf_mutex_unlock(&p_wblock_state->LOCK); +} + + +//------------------------------------------------ +// ssd_write_buf "swb" methods.
+// + +#define VACATED_CAPACITY_STEP 128 // allocate in 1K chunks + +// Allocate a new swb with its write block sized buffer and an initial vacated +// wblock list. Other fields are left for the caller to initialize. +static inline ssd_write_buf* +swb_create(drv_ssd *ssd) +{ + ssd_write_buf *swb = (ssd_write_buf*)cf_malloc(sizeof(ssd_write_buf)); + + swb->buf = cf_valloc(ssd->write_block_size); + + swb->n_vacated = 0; + swb->vacated_capacity = VACATED_CAPACITY_STEP; + swb->vacated_wblocks = + cf_malloc(sizeof(vacated_wblock) * swb->vacated_capacity); + + return swb; +} + +// Free a swb along with its buffer and vacated wblock list. +static inline void +swb_destroy(ssd_write_buf *swb) +{ + cf_free(swb->vacated_wblocks); + cf_free(swb->buf); + cf_free(swb); +} + +// Reset a swb so it can be reused for another wblock. +static inline void +swb_reset(ssd_write_buf *swb) +{ + swb->skip_post_write_q = false; + swb->wblock_id = STORAGE_INVALID_WBLOCK; + swb->pos = 0; +} + +#define swb_reserve(_swb) cf_atomic32_incr(&(_swb)->rc) + +// If the wblock has a swb attached, reserve it under the wblock lock and return +// it via p_swb. Otherwise p_swb is left untouched. +static inline void +swb_check_and_reserve(ssd_wblock_state *wblock_state, ssd_write_buf **p_swb) +{ + cf_mutex_lock(&wblock_state->LOCK); + + if (wblock_state->swb) { + *p_swb = wblock_state->swb; + swb_reserve(*p_swb); + } + + cf_mutex_unlock(&wblock_state->LOCK); +} + +// Drop a reference to a swb. Dropping the last reference resets the swb and +// returns it to the free queue. +static inline void +swb_release(ssd_write_buf *swb) +{ + if (0 == cf_atomic32_decr(&swb->rc)) { + swb_reset(swb); + + // Put the swb back on the free queue for reuse. + cf_queue_push(swb->ssd->swb_free_q, &swb); + } +} + +// Detach the swb from its wblock and release it, then free the wblock or queue +// it for defrag depending on how much of it is in use. +static inline void +swb_dereference_and_release(drv_ssd *ssd, uint32_t wblock_id, + ssd_write_buf *swb) +{ + ssd_wblock_state *wblock_state = &ssd->alloc_table->wblock_state[wblock_id]; + + cf_mutex_lock(&wblock_state->LOCK); + + if (swb != wblock_state->swb) { + cf_warning(AS_DRV_SSD, "releasing wrong swb! %p (%d) != %p (%d), thread %lu", + swb, (int32_t)swb->wblock_id, wblock_state->swb, + (int32_t)wblock_state->swb->wblock_id, pthread_self()); + } + + swb_release(wblock_state->swb); + wblock_state->swb = 0; + + if (wblock_state->state != WBLOCK_STATE_DEFRAG) { + uint32_t inuse_sz = cf_atomic32_get(wblock_state->inuse_sz); + + // Free wblock if all three gating conditions hold.
+ if (inuse_sz == 0) { + push_wblock_to_free_q(ssd, wblock_id, FREE_TO_HEAD); + } + // Queue wblock for defrag if applicable. + else if (inuse_sz < ssd->ns->defrag_lwm_size) { + push_wblock_to_defrag_q(ssd, wblock_id); + } + } + else { + cf_warning(AS_DRV_SSD, "device %s: wblock-id %u state is DEFRAG on swb release", + ssd->name, wblock_id); + } + + cf_mutex_unlock(&wblock_state->LOCK); +} + +ssd_write_buf * +swb_get(drv_ssd *ssd) +{ + ssd_write_buf *swb; + + if (CF_QUEUE_OK != cf_queue_pop(ssd->swb_free_q, &swb, CF_QUEUE_NOWAIT)) { + swb = swb_create(ssd); + swb->rc = 0; + swb->n_writers = 0; + swb->skip_post_write_q = false; + swb->ssd = ssd; + swb->wblock_id = STORAGE_INVALID_WBLOCK; + swb->pos = 0; + } + + // Find a device block to write to. + if (CF_QUEUE_OK != cf_queue_pop(ssd->free_wblock_q, &swb->wblock_id, + CF_QUEUE_NOWAIT)) { + cf_queue_push(ssd->swb_free_q, &swb); + return NULL; + } + + ssd_wblock_state* p_wblock_state = + &ssd->alloc_table->wblock_state[swb->wblock_id]; + + // Sanity checks. 
+ if (cf_atomic32_get(p_wblock_state->inuse_sz) != 0) { + cf_warning(AS_DRV_SSD, "device %s: wblock-id %u inuse-size %u off free-q", + ssd->name, swb->wblock_id, + cf_atomic32_get(p_wblock_state->inuse_sz)); + } + if (p_wblock_state->swb) { + cf_warning(AS_DRV_SSD, "device %s: wblock-id %u swb not null off free-q", + ssd->name, swb->wblock_id); + } + if (p_wblock_state->state != WBLOCK_STATE_NONE) { + cf_warning(AS_DRV_SSD, "device %s: wblock-id %u state not NONE off free-q", + ssd->name, swb->wblock_id); + } + + cf_mutex_lock(&p_wblock_state->LOCK); + + swb_reserve(swb); + p_wblock_state->swb = swb; + + cf_mutex_unlock(&p_wblock_state->LOCK); + + return swb; +} + +bool +swb_add_unique_vacated_wblock(ssd_write_buf* swb, uint32_t src_file_id, + uint32_t src_wblock_id) +{ + for (uint32_t i = 0; i < swb->n_vacated; i++) { + vacated_wblock *vw = &swb->vacated_wblocks[i]; + + if (vw->wblock_id == src_wblock_id && vw->file_id == src_file_id) { + return false; // already present + } + } + + if (swb->n_vacated == swb->vacated_capacity) { + swb->vacated_capacity += VACATED_CAPACITY_STEP; + swb->vacated_wblocks = cf_realloc(swb->vacated_wblocks, + sizeof(vacated_wblock) * swb->vacated_capacity); + } + + swb->vacated_wblocks[swb->n_vacated].file_id = src_file_id; + swb->vacated_wblocks[swb->n_vacated].wblock_id = src_wblock_id; + swb->n_vacated++; + + return true; // added to list +} + +void +swb_release_all_vacated_wblocks(ssd_write_buf* swb) +{ + drv_ssds *ssds = (drv_ssds *)swb->ssd->ns->storage_private; + + for (uint32_t i = 0; i < swb->n_vacated; i++) { + vacated_wblock *vw = &swb->vacated_wblocks[i]; + + drv_ssd *src_ssd = &ssds->ssds[vw->file_id]; + ssd_alloc_table* at = src_ssd->alloc_table; + ssd_wblock_state* p_wblock_state = &at->wblock_state[vw->wblock_id]; + + ssd_release_vacated_wblock(src_ssd, vw->wblock_id, p_wblock_state); + } + + swb->n_vacated = 0; +} + +// +// END - ssd_write_buf "swb" methods. 
+//------------------------------------------------ + + +// Reduce wblock's used size, if result is 0 put it in the "free" pool, if it's +// below the defrag threshold put it in the defrag queue. +void +ssd_block_free(drv_ssd *ssd, uint64_t rblock_id, uint64_t n_rblocks, char *msg) +{ + if (n_rblocks == 0) { + cf_warning(AS_DRV_SSD, "%s: %s: freeing 0 rblocks, rblock_id %lu", + ssd->name, msg, rblock_id); + return; + } + + // Determine which wblock we're reducing used size in. + uint64_t start_byte = RBLOCKS_TO_BYTES(rblock_id); + uint64_t size = RBLOCKS_TO_BYTES(n_rblocks); + uint32_t wblock_id = BYTES_TO_WBLOCK_ID(ssd, start_byte); + uint32_t end_wblock_id = BYTES_TO_WBLOCK_ID(ssd, start_byte + size - 1); + ssd_alloc_table *at = ssd->alloc_table; + + // Sanity-checks. + if (! (start_byte >= SSD_HEADER_SIZE && wblock_id < at->n_wblocks && + wblock_id == end_wblock_id)) { + cf_warning(AS_DRV_SSD, "%s: %s: invalid range to free, rblock_id %lu, n_rblocks %lu", + ssd->name, msg, rblock_id, n_rblocks); + return; + } + + cf_atomic64_sub(&ssd->inuse_size, size); + + ssd_wblock_state *p_wblock_state = &at->wblock_state[wblock_id]; + + cf_mutex_lock(&p_wblock_state->LOCK); + + int64_t resulting_inuse_sz = cf_atomic32_sub(&p_wblock_state->inuse_sz, + (int32_t)size); + + if (resulting_inuse_sz < 0 || + resulting_inuse_sz >= (int64_t)ssd->write_block_size) { + cf_warning(AS_DRV_SSD, "%s: %s: wblock %d %s, subtracted %d now %ld", + ssd->name, msg, wblock_id, + resulting_inuse_sz < 0 ? "over-freed" : "has crazy inuse_sz", + (int32_t)size, resulting_inuse_sz); + + // TODO - really? + cf_atomic32_set(&p_wblock_state->inuse_sz, ssd->write_block_size); + } + else if (! p_wblock_state->swb && + p_wblock_state->state != WBLOCK_STATE_DEFRAG) { + // Free wblock if all three gating conditions hold. + if (resulting_inuse_sz == 0) { + push_wblock_to_free_q(ssd, wblock_id, FREE_TO_HEAD); + } + // Queue wblock for defrag if appropriate. 
+ else if (resulting_inuse_sz < ssd->ns->defrag_lwm_size) { + push_wblock_to_defrag_q(ssd, wblock_id); + } + } + + cf_mutex_unlock(&p_wblock_state->LOCK); +} + + +static void +log_bad_record(const char* ns_name, uint32_t n_bins, uint32_t block_bins, + const drv_ssd_bin* ssd_bin, const char* tag) +{ + cf_info(AS_DRV_SSD, "untrustworthy data from disk [%s]", tag); + cf_info(AS_DRV_SSD, " ns->name = %s", ns_name); + cf_info(AS_DRV_SSD, " bin %u [of %u]", (block_bins - n_bins) + 1, block_bins); + + if (ssd_bin) { + cf_info(AS_DRV_SSD, " ssd_bin->offset = %u", ssd_bin->offset); + cf_info(AS_DRV_SSD, " ssd_bin->len = %u", ssd_bin->len); + cf_info(AS_DRV_SSD, " ssd_bin->next = %u", ssd_bin->next); + } +} + + +// TODO - sanity-check rec-props? +bool +is_valid_record(const drv_ssd_block* block, const char* ns_name) +{ + uint8_t* block_head = (uint8_t*)block; + uint64_t size = (uint64_t)(block->length + LENGTH_BASE); + drv_ssd_bin* ssd_bin_end = (drv_ssd_bin*)(block_head + size - sizeof(drv_ssd_bin)); + drv_ssd_bin* ssd_bin = (drv_ssd_bin*)(block->data + block->bins_offset); + uint32_t n_bins = block->n_bins; + + if (! 
ssd_cold_start_is_valid_n_bins(n_bins)) { + log_bad_record(ns_name, n_bins, n_bins, NULL, "bins"); + return false; + } + + while (n_bins > 0) { + if (ssd_bin > ssd_bin_end) { + log_bad_record(ns_name, n_bins, block->n_bins, NULL, "bin ptr"); + return false; + } + + uint64_t data_offset = (uint64_t)((uint8_t*)(ssd_bin + 1) - block_head); + + if ((uint64_t)ssd_bin->offset != data_offset) { + log_bad_record(ns_name, n_bins, block->n_bins, ssd_bin, "offset"); + return false; + } + + uint64_t bin_end_offset = data_offset + (uint64_t)ssd_bin->len; + + if (bin_end_offset > size) { + log_bad_record(ns_name, n_bins, block->n_bins, ssd_bin, "length"); + return false; + } + + if (n_bins > 1) { + if ((uint64_t)ssd_bin->next != bin_end_offset) { + log_bad_record(ns_name, n_bins, block->n_bins, ssd_bin, "next ptr"); + return false; + } + + ssd_bin = (drv_ssd_bin*)(block_head + ssd_bin->next); + } + + n_bins--; + } + + return true; +} + + +void +defrag_move_record(drv_ssd *src_ssd, uint32_t src_wblock_id, + drv_ssd_block *block, as_index *r) +{ + uint64_t old_rblock_id = r->rblock_id; + uint16_t old_n_rblocks = r->n_rblocks; + + drv_ssds *ssds = (drv_ssds*)src_ssd->ns->storage_private; + + // Figure out which device to write to. When replacing an old record, it's + // possible this is different from the old device (e.g. if we've added a + // fresh device), so derive it from the digest each time. + drv_ssd *ssd = &ssds->ssds[ssd_get_file_id(ssds, &block->keyd)]; + + if (! ssd) { + cf_warning(AS_DRV_SSD, "{%s} defrag_move_record: no drv_ssd for file_id %u", + ssds->ns->name, ssd->file_id); + return; + } + + uint32_t write_size = block->length + LENGTH_BASE; + + pthread_mutex_lock(&ssd->defrag_lock); + + ssd_write_buf *swb = ssd->defrag_swb; + + if (! swb) { + swb = swb_get(ssd); + ssd->defrag_swb = swb; + + if (! 
swb) { + cf_warning(AS_DRV_SSD, "defrag_move_record: couldn't get swb"); + pthread_mutex_unlock(&ssd->defrag_lock); + return; + } + } + + // Check if there's enough space in defrag buffer - if not, free and zero + // any remaining unused space, enqueue it to be flushed to device, and grab + // a new buffer. + if (write_size > ssd->write_block_size - swb->pos) { + if (ssd->write_block_size != swb->pos) { + // Clean the end of the buffer before pushing to write queue. + memset(swb->buf + swb->pos, 0, ssd->write_block_size - swb->pos); + } + + // Enqueue the buffer, to be flushed to device. + swb->skip_post_write_q = true; + cf_queue_push(ssd->swb_write_q, &swb); + cf_atomic64_incr(&ssd->n_defrag_wblock_writes); + + // Get the new buffer. + swb = swb_get(ssd); + ssd->defrag_swb = swb; + + if (! swb) { + cf_warning(AS_DRV_SSD, "defrag_move_record: couldn't get swb"); + pthread_mutex_unlock(&ssd->defrag_lock); + return; + } + } + + memcpy(swb->buf + swb->pos, (const uint8_t*)block, write_size); + + uint64_t write_offset = WBLOCK_ID_TO_BYTES(ssd, swb->wblock_id) + swb->pos; + + ssd_encrypt(ssd, write_offset, (drv_ssd_block *)(swb->buf + swb->pos)); + + r->file_id = ssd->file_id; + r->rblock_id = BYTES_TO_RBLOCKS(write_offset); + r->n_rblocks = BYTES_TO_RBLOCKS(write_size); + + swb->pos += write_size; + + cf_atomic64_add(&ssd->inuse_size, (int64_t)write_size); + cf_atomic32_add(&ssd->alloc_table->wblock_state[swb->wblock_id].inuse_sz, (int32_t)write_size); + + // If we just defragged into a new destination swb, count it. 
+ if (swb_add_unique_vacated_wblock(swb, src_ssd->file_id, src_wblock_id)) { + ssd_wblock_state* p_wblock_state = + &src_ssd->alloc_table->wblock_state[src_wblock_id]; + + cf_atomic32_incr(&p_wblock_state->n_vac_dests); + } + + pthread_mutex_unlock(&ssd->defrag_lock); + + ssd_block_free(src_ssd, old_rblock_id, old_n_rblocks, "defrag-write"); +} + + +int +ssd_record_defrag(drv_ssd *ssd, uint32_t wblock_id, drv_ssd_block *block, + uint64_t rblock_id, uint32_t n_rblocks) +{ + as_namespace *ns = ssd->ns; + as_partition_reservation rsv; + uint32_t pid = as_partition_getid(&block->keyd); + + as_partition_reserve(ns, pid, &rsv); + + int rv; + as_index_ref r_ref; + r_ref.skip_lock = false; + + bool found = 0 == as_record_get(rsv.tree, &block->keyd, &r_ref); + + if (found) { + as_index *r = r_ref.r; + + if (r->file_id == ssd->file_id && r->rblock_id == rblock_id) { + if (r->generation != block->generation) { + cf_warning_digest(AS_DRV_SSD, &r->keyd, "device %s defrag: rblock_id %lu generation mismatch (%u:%u) ", + ssd->name, rblock_id, r->generation, block->generation); + } + + if (r->n_rblocks != n_rblocks) { + cf_warning_digest(AS_DRV_SSD, &r->keyd, "device %s defrag: rblock_id %lu n_blocks mismatch (%u:%u) ", + ssd->name, rblock_id, r->n_rblocks, n_rblocks); + } + + defrag_move_record(ssd, wblock_id, block, r); + + rv = 0; // record was in index tree and current - moved it + } + else { + rv = -1; // record was in index tree - presumably was overwritten + } + + as_record_done(&r_ref, ns); + } + else { + rv = -2; // record was not in index tree - presumably was deleted + } + + as_partition_release(&rsv); + + return rv; +} + + +bool +ssd_is_full(drv_ssd *ssd, uint32_t wblock_id) +{ + if (cf_queue_sz(ssd->free_wblock_q) > DEFRAG_STARTUP_RESERVE) { + return false; + } + + ssd_wblock_state* p_wblock_state = &ssd->alloc_table->wblock_state[wblock_id]; + + cf_mutex_lock(&p_wblock_state->LOCK); + + if (cf_atomic32_get(p_wblock_state->inuse_sz) == 0) { + // Lucky - wblock is 
empty, let ssd_defrag_wblock() free it. + cf_mutex_unlock(&p_wblock_state->LOCK); + + return false; + } + + cf_warning(AS_DRV_SSD, "{%s}: defrag: drive %s totally full, re-queuing wblock %u", + ssd->ns->name, ssd->name, wblock_id); + + // Not using push_wblock_to_defrag_q() - state is already DEFRAG, we + // definitely have a queue, and it's better to push back to head. + cf_queue_push_head(ssd->defrag_wblock_q, &wblock_id); + + cf_mutex_unlock(&p_wblock_state->LOCK); + + // If we got here, we used all our runtime reserve wblocks, but the wblocks + // we defragged must still have non-zero inuse_sz. Must wait for those to + // become free. Sleep prevents retries from overwhelming the log. + sleep(1); + + return true; +} + + +int +ssd_defrag_wblock(drv_ssd *ssd, uint32_t wblock_id, uint8_t *read_buf) +{ + if (ssd_is_full(ssd, wblock_id)) { + return 0; + } + + int record_count = 0; + int num_old_records = 0; + int num_deleted_records = 0; + + ssd_wblock_state* p_wblock_state = &ssd->alloc_table->wblock_state[wblock_id]; + + cf_assert(p_wblock_state->n_vac_dests == 0, AS_DRV_SSD, + "n-vacations not 0 beginning defrag wblock"); + + // Make sure this can't decrement to 0 while defragging this wblock. + cf_atomic32_set(&p_wblock_state->n_vac_dests, 1); + + if (cf_atomic32_get(p_wblock_state->inuse_sz) == 0) { + goto Finished; + } + + int fd = ssd_fd_get(ssd); + uint64_t file_offset = WBLOCK_ID_TO_BYTES(ssd, wblock_id); + + uint64_t start_ns = ssd->ns->storage_benchmarks_enabled ? 
cf_getns() : 0; + + if (lseek(fd, (off_t)file_offset, SEEK_SET) != (off_t)file_offset) { + cf_warning(AS_DRV_SSD, "%s: seek failed: offset %lu: errno %d (%s)", + ssd->name, file_offset, errno, cf_strerror(errno)); + close(fd); + fd = -1; + goto Finished; + } + + ssize_t rlen = read(fd, read_buf, ssd->write_block_size); + + if (rlen != (ssize_t)ssd->write_block_size) { + cf_warning(AS_DRV_SSD, "%s: read failed (%ld): errno %d (%s)", + ssd->name, rlen, errno, cf_strerror(errno)); + close(fd); + fd = -1; + goto Finished; + } + + if (start_ns != 0) { + histogram_insert_data_point(ssd->hist_large_block_read, start_ns); + } + + ssd_fd_put(ssd, fd); + + size_t wblock_offset = 0; // current offset within the wblock, in bytes + + while (wblock_offset < ssd->write_block_size && + cf_atomic32_get(p_wblock_state->inuse_sz) != 0) { + drv_ssd_block *block = (drv_ssd_block*)&read_buf[wblock_offset]; + + ssd_decrypt(ssd, file_offset + wblock_offset, block); + + if (block->magic != SSD_BLOCK_MAGIC) { + // First block must have magic. + if (wblock_offset == 0) { + cf_warning(AS_DRV_SSD, "BLOCK CORRUPTED: device %s has bad data on wblock %d", + ssd->name, wblock_id); + break; + } + + // Later blocks may have no magic, just skip to next block. + wblock_offset += RBLOCK_SIZE; + continue; + } + + // Note - if block->length is sane, we don't need to round up to a + // multiple of RBLOCK_SIZE, but let's do it anyway just to be safe. + size_t next_wblock_offset = wblock_offset + + BYTES_TO_RBLOCK_BYTES(block->length + LENGTH_BASE); + + if (next_wblock_offset > ssd->write_block_size) { + cf_warning(AS_DRV_SSD, "error: block extends over read size: foff %lu boff %lu blen %lu", + file_offset, wblock_offset, (uint64_t)block->length); + break; + } + + // Found a good record, move it if it's current. 
+ int rv = ssd_record_defrag(ssd, wblock_id, block, + BYTES_TO_RBLOCKS(file_offset + wblock_offset), + (uint32_t)BYTES_TO_RBLOCKS(next_wblock_offset - wblock_offset)); + + if (rv == 0) { + record_count++; + } + else if (rv == -1) { + num_old_records++; + } + else if (rv == -2) { + num_deleted_records++; + } + + wblock_offset = next_wblock_offset; + } + +Finished: + + // Note - usually wblock's inuse_sz is 0 here, but may legitimately be non-0 + // e.g. if a dropped partition's tree is not done purging. In this case, we + // may have found deleted records in the wblock whose used-size contribution + // has not yet been subtracted. + + cf_detail(AS_DRV_SSD, "device %s: wblock-id %u defragged, final in-use-sz %d records (%d:%d:%d)", + ssd->name, wblock_id, cf_atomic32_get(p_wblock_state->inuse_sz), + record_count, num_old_records, num_deleted_records); + + ssd_release_vacated_wblock(ssd, wblock_id, p_wblock_state); + + return record_count; +} + + +// Thread "run" function to service a device's defrag queue. +void* +run_defrag(void *pv_data) +{ + drv_ssd *ssd = (drv_ssd*)pv_data; + uint32_t wblock_id; + uint8_t *read_buf = cf_valloc(ssd->write_block_size); + + while (true) { + uint32_t q_min = ssd->ns->storage_defrag_queue_min; + + if (q_min != 0) { + if (cf_queue_sz(ssd->defrag_wblock_q) > q_min) { + if (CF_QUEUE_OK != + cf_queue_pop(ssd->defrag_wblock_q, &wblock_id, + CF_QUEUE_NOWAIT)) { + // Should never get here! + break; + } + } + else { + usleep(1000 * 50); + continue; + } + } + else { + if (CF_QUEUE_OK != + cf_queue_pop(ssd->defrag_wblock_q, &wblock_id, + CF_QUEUE_FOREVER)) { + // Should never get here! + break; + } + } + + ssd_defrag_wblock(ssd, wblock_id, read_buf); + + uint32_t sleep_us = ssd->ns->storage_defrag_sleep; + + if (sleep_us != 0) { + usleep(sleep_us); + } + } + + // Although we ever expect to get here... 
+ cf_free(read_buf); + cf_warning(AS_DRV_SSD, "device %s: quit defrag - queue error", ssd->name); + + return NULL; +} + + +void +ssd_start_defrag_threads(drv_ssds *ssds) +{ + cf_info(AS_DRV_SSD, "{%s} starting defrag threads", ssds->ns->name); + + for (int i = 0; i < ssds->n_ssds; i++) { + drv_ssd *ssd = &ssds->ssds[i]; + + if (pthread_create(&ssd->defrag_thread, NULL, run_defrag, + (void*)ssd) != 0) { + cf_crash(AS_DRV_SSD, "%s defrag thread failed", ssd->name); + } + } +} + + +//------------------------------------------------ +// defrag_pen class. +// + +#define DEFRAG_PEN_INIT_CAPACITY (8 * 1024) + +typedef struct defrag_pen_s { + uint32_t n_ids; + uint32_t capacity; + uint32_t *ids; + uint32_t stack_ids[DEFRAG_PEN_INIT_CAPACITY]; +} defrag_pen; + +static void +defrag_pen_init(defrag_pen *pen) +{ + pen->n_ids = 0; + pen->capacity = DEFRAG_PEN_INIT_CAPACITY; + pen->ids = pen->stack_ids; +} + +static void +defrag_pen_destroy(defrag_pen *pen) +{ + if (pen->ids != pen->stack_ids) { + cf_free(pen->ids); + } +} + +static void +defrag_pen_add(defrag_pen *pen, uint32_t wblock_id) +{ + if (pen->n_ids == pen->capacity) { + if (pen->capacity == DEFRAG_PEN_INIT_CAPACITY) { + pen->capacity <<= 2; + pen->ids = cf_malloc(pen->capacity * sizeof(uint32_t)); + memcpy(pen->ids, pen->stack_ids, sizeof(pen->stack_ids)); + } + else { + pen->capacity <<= 1; + pen->ids = cf_realloc(pen->ids, pen->capacity * sizeof(uint32_t)); + } + } + + pen->ids[pen->n_ids++] = wblock_id; +} + +static void +defrag_pen_transfer(defrag_pen *pen, drv_ssd *ssd) +{ + // For speed, "customize" instead of using push_wblock_to_defrag_q()... 
+ for (uint32_t i = 0; i < pen->n_ids; i++) { + uint32_t wblock_id = pen->ids[i]; + + ssd->alloc_table->wblock_state[wblock_id].state = WBLOCK_STATE_DEFRAG; + cf_queue_push(ssd->defrag_wblock_q, &wblock_id); + } +} + +static void +defrag_pens_dump(defrag_pen pens[], uint32_t n_pens, const char* ssd_name) +{ + char buf[2048]; + uint32_t n = 0; + int pos = sprintf(buf, "%u", pens[n++].n_ids); + + while (n < n_pens) { + pos += sprintf(buf + pos, ",%u", pens[n++].n_ids); + } + + cf_info(AS_DRV_SSD, "%s init defrag profile: %s", ssd_name, buf); +} + +// +// END - defrag_pen class. +//------------------------------------------------ + + +// Thread "run" function to create and load a device's (wblock) free & defrag +// queues at startup. Sorts defrag-eligible wblocks so the most depleted ones +// are at the head of the defrag queue. +void* +run_load_queues(void *pv_data) +{ + drv_ssd *ssd = (drv_ssd*)pv_data; + + // TODO - would be nice to have a queue create of specified capacity. + ssd->free_wblock_q = cf_queue_create(sizeof(uint32_t), true); + ssd->defrag_wblock_q = cf_queue_create(sizeof(uint32_t), true); + + as_namespace *ns = ssd->ns; + uint32_t lwm_pct = ns->storage_defrag_lwm_pct; + uint32_t lwm_size = ns->defrag_lwm_size; + defrag_pen pens[lwm_pct]; + + for (uint32_t n = 0; n < lwm_pct; n++) { + defrag_pen_init(&pens[n]); + } + + ssd_alloc_table* at = ssd->alloc_table; + uint32_t first_id = BYTES_TO_WBLOCK_ID(ssd, SSD_HEADER_SIZE); + uint32_t last_id = at->n_wblocks; + + for (uint32_t wblock_id = first_id; wblock_id < last_id; wblock_id++) { + uint32_t inuse_sz = at->wblock_state[wblock_id].inuse_sz; + + if (inuse_sz == 0) { + // Faster than using push_wblock_to_free_q() here... 
+ cf_queue_push(ssd->free_wblock_q, &wblock_id); + } + else if (inuse_sz < lwm_size) { + defrag_pen_add(&pens[(inuse_sz * lwm_pct) / lwm_size], wblock_id); + } + } + + defrag_pens_dump(pens, lwm_pct, ssd->name); + + for (uint32_t n = 0; n < lwm_pct; n++) { + defrag_pen_transfer(&pens[n], ssd); + defrag_pen_destroy(&pens[n]); + } + + ssd->n_defrag_wblock_reads = (uint64_t)cf_queue_sz(ssd->defrag_wblock_q); + + return NULL; +} + + +void +ssd_load_wblock_queues(drv_ssds *ssds) +{ + cf_info(AS_DRV_SSD, "{%s} loading free & defrag queues", ssds->ns->name); + + // Split this task across multiple threads. + pthread_t q_load_threads[ssds->n_ssds]; + + for (int i = 0; i < ssds->n_ssds; i++) { + drv_ssd *ssd = &ssds->ssds[i]; + + if (pthread_create(&q_load_threads[i], NULL, run_load_queues, + (void*)ssd) != 0) { + cf_crash(AS_DRV_SSD, "%s load queues thread failed", ssd->name); + } + } + + for (int i = 0; i < ssds->n_ssds; i++) { + pthread_join(q_load_threads[i], NULL); + } + // Now we're single-threaded again. + + for (int i = 0; i < ssds->n_ssds; i++) { + drv_ssd *ssd = &ssds->ssds[i]; + + cf_info(AS_DRV_SSD, "%s init wblock free-q %d, defrag-q %d", ssd->name, + cf_queue_sz(ssd->free_wblock_q), + cf_queue_sz(ssd->defrag_wblock_q)); + } +} + + +void +ssd_wblock_init(drv_ssd *ssd) +{ + uint32_t n_wblocks = (uint32_t)(ssd->file_size / ssd->write_block_size); + + cf_info(AS_DRV_SSD, "%s has %u wblocks of size %u", ssd->name, n_wblocks, + ssd->write_block_size); + + ssd_alloc_table *at = cf_malloc(sizeof(ssd_alloc_table) + (n_wblocks * sizeof(ssd_wblock_state))); + + at->n_wblocks = n_wblocks; + + // Device header wblocks' inuse_sz will (also) be 0 but that doesn't matter. 
+ for (uint32_t i = 0; i < n_wblocks; i++) { + ssd_wblock_state * p_wblock_state = &at->wblock_state[i]; + + cf_atomic32_set(&p_wblock_state->inuse_sz, 0); + cf_mutex_init(&p_wblock_state->LOCK); + p_wblock_state->swb = NULL; + p_wblock_state->state = WBLOCK_STATE_NONE; + p_wblock_state->n_vac_dests = 0; + } + + ssd->alloc_table = at; +} + + +//========================================================== +// Record reading utilities. +// + +int +ssd_read_record(as_storage_rd *rd) +{ + as_namespace *ns = rd->ns; + as_record *r = rd->r; + + if (STORAGE_RBLOCK_IS_INVALID(r->rblock_id)) { + cf_warning_digest(AS_DRV_SSD, &r->keyd, "{%s} read_ssd: invalid rblock_id ", + ns->name); + return -1; + } + + uint64_t record_offset = RBLOCKS_TO_BYTES(r->rblock_id); + uint64_t record_size = RBLOCKS_TO_BYTES(r->n_rblocks); + + uint8_t *read_buf = NULL; + drv_ssd_block *block = NULL; + + drv_ssd *ssd = rd->ssd; + ssd_write_buf *swb = 0; + uint32_t wblock = RBLOCK_ID_TO_WBLOCK_ID(ssd, r->rblock_id); + + swb_check_and_reserve(&ssd->alloc_table->wblock_state[wblock], &swb); + + if (swb) { + // Data is in write buffer, so read it from there. + cf_atomic32_incr(&ns->n_reads_from_cache); + + read_buf = cf_malloc(record_size); + block = (drv_ssd_block*)read_buf; + + int swb_offset = record_offset - WBLOCK_ID_TO_BYTES(ssd, wblock); + memcpy(read_buf, swb->buf + swb_offset, record_size); + swb_release(swb); + + ssd_decrypt(ssd, record_offset, block); + } + else { + // Normal case - data is read from device. + cf_atomic32_incr(&ns->n_reads_from_device); + + uint64_t record_end_offset = record_offset + record_size; + uint64_t read_offset = BYTES_DOWN_TO_IO_MIN(ssd, record_offset); + uint64_t read_end_offset = BYTES_UP_TO_IO_MIN(ssd, record_end_offset); + size_t read_size = read_end_offset - read_offset; + uint64_t record_buf_indent = record_offset - read_offset; + + read_buf = cf_valloc(read_size); + + int fd = ssd_fd_get(ssd); + + uint64_t start_ns = ns->storage_benchmarks_enabled ? 
cf_getns() : 0; + + if (lseek(fd, (off_t)read_offset, SEEK_SET) != (off_t)read_offset) { + cf_warning(AS_DRV_SSD, "%s: seek failed: offset %lu: errno %d (%s)", + ssd->name, read_offset, errno, cf_strerror(errno)); + cf_free(read_buf); + close(fd); + return -1; + } + + ssize_t rv = read(fd, read_buf, read_size); + + if (rv != (ssize_t)read_size) { + cf_warning(AS_DRV_SSD, "%s: read failed (%ld): size %lu: errno %d (%s)", + ssd->name, rv, read_size, errno, cf_strerror(errno)); + cf_free(read_buf); + close(fd); + return -1; + } + + if (start_ns != 0) { + histogram_insert_data_point(ssd->hist_read, start_ns); + } + + ssd_fd_put(ssd, fd); + + block = (drv_ssd_block*)(read_buf + record_buf_indent); + ssd_decrypt(ssd, record_offset, block); + + // Sanity checks. + + if (block->magic != SSD_BLOCK_MAGIC) { + cf_warning(AS_DRV_SSD, "read: bad block magic offset %lu", + read_offset); + cf_free(read_buf); + return -1; + } + + if (block->length + LENGTH_BASE > read_size) { + cf_warning(AS_DRV_SSD, "read: bad block length %u", block->length); + cf_free(read_buf); + return -1; + } + + if (0 != cf_digest_compare(&block->keyd, &r->keyd)) { + cf_warning(AS_DRV_SSD, "read: read wrong key: expecting %lx got %lx", + *(uint64_t*)&r->keyd, *(uint64_t*)&block->keyd); + cf_free(read_buf); + return -1; + } + + if (block->n_bins > BIN_NAMES_QUOTA) { + cf_warning(AS_DRV_SSD, "read: bad block n_bins %u", block->n_bins); + cf_free(read_buf); + return -1; + } + + if (block->bins_offset + offsetof(drv_ssd_block, data) > read_size) { + cf_warning(AS_DRV_SSD, "read: bad block bins_offset %u", block->bins_offset); + cf_free(read_buf); + return -1; + } + + if (ns->storage_benchmarks_enabled) { + histogram_insert_raw(ns->device_read_size_hist, read_size); + } + } + + rd->block = block; + rd->must_free_block = read_buf; + + return 0; +} + + +//========================================================== +// Storage API implementation: reading records. 
+// + +int +as_storage_record_load_n_bins_ssd(as_storage_rd *rd) +{ + if (! as_record_is_live(rd->r)) { + rd->n_bins = 0; + return 0; // no need to read device + } + + // If the record hasn't been read, read it. + if (! rd->block && ssd_read_record(rd) != 0) { + cf_warning(AS_DRV_SSD, "load_n_bins: failed ssd_read_record()"); + return -1; + } + + rd->n_bins = rd->block->n_bins; + return 0; +} + + +int +as_storage_record_load_bins_ssd(as_storage_rd *rd) +{ + if (! as_record_is_live(rd->r)) { + return 0; // no need to read device + } + + // If the record hasn't been read, read it. + if (! rd->block && ssd_read_record(rd) != 0) { + cf_warning(AS_DRV_SSD, "load_bins: failed ssd_read_record()"); + return -1; + } + + drv_ssd_block *block = rd->block; + uint8_t *block_head = (uint8_t*)rd->block; + + drv_ssd_bin *ssd_bin = (drv_ssd_bin*)(block->data + block->bins_offset); + + for (uint16_t i = 0; i < block->n_bins; i++) { + as_bin_set_id_from_name(rd->ns, &rd->bins[i], ssd_bin->name); + + int rv = as_bin_particle_cast_from_flat(&rd->bins[i], + block_head + ssd_bin->offset, ssd_bin->len); + + if (0 != rv) { + return rv; + } + + ssd_bin = (drv_ssd_bin*)(block_head + ssd_bin->next); + } + + return 0; +} + + +bool +as_storage_record_get_key_ssd(as_storage_rd *rd) +{ + // If the record hasn't been read, read it. + if (! rd->block && ssd_read_record(rd) != 0) { + cf_warning(AS_DRV_SSD, "get_key: failed ssd_read_record()"); + return false; + } + + drv_ssd_block *block = rd->block; + as_rec_props props; + + props.size = block->bins_offset; + + if (props.size == 0) { + return false; + } + + props.p_data = block->data; + + return as_rec_props_get_value(&props, CL_REC_PROPS_FIELD_KEY, + &rd->key_size, &rd->key) == 0; +} + + +//========================================================== +// Record writing utilities. +// + +void +ssd_flush_swb(drv_ssd *ssd, ssd_write_buf *swb) +{ + // Wait for all writers to finish. 
+ while (cf_atomic32_get(swb->n_writers) != 0) { + ; + } + + int fd = ssd_fd_get(ssd); + off_t write_offset = (off_t)WBLOCK_ID_TO_BYTES(ssd, swb->wblock_id); + + uint64_t start_ns = ssd->ns->storage_benchmarks_enabled ? cf_getns() : 0; + + if (lseek(fd, write_offset, SEEK_SET) != write_offset) { + cf_crash(AS_DRV_SSD, "%s: DEVICE FAILED seek: offset %ld: errno %d (%s)", + ssd->name, write_offset, errno, cf_strerror(errno)); + } + + ssize_t rv_s = write(fd, swb->buf, ssd->write_block_size); + + if (rv_s != (ssize_t)ssd->write_block_size) { + cf_crash(AS_DRV_SSD, "%s: DEVICE FAILED write: errno %d (%s)", + ssd->name, errno, cf_strerror(errno)); + } + + if (start_ns != 0) { + histogram_insert_data_point(ssd->hist_write, start_ns); + } + + ssd_fd_put(ssd, fd); +} + + +void +ssd_shadow_flush_swb(drv_ssd *ssd, ssd_write_buf *swb) +{ + int fd = ssd_shadow_fd_get(ssd); + off_t write_offset = (off_t)WBLOCK_ID_TO_BYTES(ssd, swb->wblock_id); + + uint64_t start_ns = ssd->ns->storage_benchmarks_enabled ? 
cf_getns() : 0; + + if (lseek(fd, write_offset, SEEK_SET) != write_offset) { + cf_crash(AS_DRV_SSD, "%s: DEVICE FAILED seek: offset %ld: errno %d (%s)", + ssd->shadow_name, write_offset, errno, cf_strerror(errno)); + } + + ssize_t rv_s = write(fd, swb->buf, ssd->write_block_size); + + if (rv_s != (ssize_t)ssd->write_block_size) { + cf_crash(AS_DRV_SSD, "%s: DEVICE FAILED write: errno %d (%s)", + ssd->shadow_name, errno, cf_strerror(errno)); + } + + if (start_ns != 0) { + histogram_insert_data_point(ssd->hist_shadow_write, start_ns); + } + + ssd_shadow_fd_put(ssd, fd); +} + + +void +ssd_write_sanity_checks(drv_ssd *ssd, ssd_write_buf *swb) +{ + ssd_wblock_state* p_wblock_state = + &ssd->alloc_table->wblock_state[swb->wblock_id]; + + if (p_wblock_state->swb != swb) { + cf_warning(AS_DRV_SSD, "device %s: wblock-id %u swb not consistent while writing", + ssd->name, swb->wblock_id); + } + + if (p_wblock_state->state != WBLOCK_STATE_NONE) { + cf_warning(AS_DRV_SSD, "device %s: wblock-id %u state not NONE while writing", + ssd->name, swb->wblock_id); + } +} + + +void +ssd_post_write(drv_ssd *ssd, ssd_write_buf *swb) +{ + if (cf_atomic32_get(ssd->ns->storage_post_write_queue) == 0 || + swb->skip_post_write_q) { + swb_dereference_and_release(ssd, swb->wblock_id, swb); + } + else { + // Transfer swb to post-write queue. + cf_queue_push(ssd->post_write_q, &swb); + } + + if (ssd->post_write_q) { + // Release post-write queue swbs if we're over the limit. + while ((uint32_t)cf_queue_sz(ssd->post_write_q) > + cf_atomic32_get(ssd->ns->storage_post_write_queue)) { + ssd_write_buf* cached_swb; + + if (CF_QUEUE_OK != cf_queue_pop(ssd->post_write_q, &cached_swb, + CF_QUEUE_NOWAIT)) { + // Should never happen. + cf_warning(AS_DRV_SSD, "device %s: post-write queue pop failed", + ssd->name); + break; + } + + swb_dereference_and_release(ssd, cached_swb->wblock_id, + cached_swb); + } + } +} + + +// Thread "run" function that flushes write buffers to device. 
+void * +ssd_write_worker(void *arg) +{ + drv_ssd *ssd = (drv_ssd*)arg; + + while (ssd->running) { + ssd_write_buf *swb; + + if (CF_QUEUE_OK != cf_queue_pop(ssd->swb_write_q, &swb, 100)) { + continue; + } + + // Sanity checks (optional). + ssd_write_sanity_checks(ssd, swb); + + // Flush to the device. + ssd_flush_swb(ssd, swb); + + if (ssd->shadow_name) { + // Queue for shadow device write. + cf_queue_push(ssd->swb_shadow_q, &swb); + } + else { + // If this swb was a defrag destination, release the sources. + swb_release_all_vacated_wblocks(swb); + + // Transfer to post-write queue, or release swb, as appropriate. + ssd_post_write(ssd, swb); + } + } // infinite event loop waiting for block to write + + return NULL; +} + + +// Thread "run" function that flushes write buffers to shadow device. +void * +ssd_shadow_worker(void *arg) +{ + drv_ssd *ssd = (drv_ssd*)arg; + + while (ssd->running) { + ssd_write_buf *swb; + + if (CF_QUEUE_OK != cf_queue_pop(ssd->swb_shadow_q, &swb, 100)) { + continue; + } + + // Sanity checks (optional). + ssd_write_sanity_checks(ssd, swb); + + // Flush to the shadow device. + ssd_shadow_flush_swb(ssd, swb); + + // If this swb was a defrag destination, release the sources. + swb_release_all_vacated_wblocks(swb); + + // Transfer to post-write queue, or release swb, as appropriate. 
+ ssd_post_write(ssd, swb); + } + + return NULL; +} + + +void +ssd_start_write_worker_threads(drv_ssds *ssds) +{ + if (ssds->ns->storage_write_threads > MAX_SSD_THREADS) { + cf_warning(AS_DRV_SSD, "configured number of write threads %u greater than max, using %d instead", + ssds->ns->storage_write_threads, MAX_SSD_THREADS); + ssds->ns->storage_write_threads = MAX_SSD_THREADS; + } + + cf_info(AS_DRV_SSD, "{%s} starting write worker threads", ssds->ns->name); + + for (int i = 0; i < ssds->n_ssds; i++) { + drv_ssd *ssd = &ssds->ssds[i]; + + for (uint32_t j = 0; j < ssds->ns->storage_write_threads; j++) { + pthread_create(&ssd->write_worker_thread[j], 0, ssd_write_worker, + (void*)ssd); + } + + if (ssd->shadow_name) { + pthread_create(&ssd->shadow_worker_thread, 0, ssd_shadow_worker, + (void*)ssd); + } + } +} + + +static inline uint32_t +ssd_record_overhead_size(as_storage_rd *rd) +{ + // Start with size of record header struct. + size_t size = sizeof(drv_ssd_block); + + // Add size of any record properties. + if (rd->rec_props.p_data) { + size += rd->rec_props.size; + } + + return (uint32_t)size; +} + + +uint32_t +ssd_record_size(as_storage_rd *rd) +{ + // Start with the record storage overhead, including vinfo and rec-props. + uint32_t write_size = ssd_record_overhead_size(rd); + + // Add the bins' sizes, including bin overhead. + for (uint16_t i = 0; i < rd->n_bins; i++) { + as_bin *bin = &rd->bins[i]; + + if (! as_bin_inuse(bin)) { + break; + } + + // TODO: could factor out sizeof(drv_ssd_bin) and multiply by i, but + // for now let's favor the low bin-count case and leave it this way. + write_size += sizeof(drv_ssd_bin) + as_bin_particle_flat_size(bin); + } + + return write_size; +} + + +int +ssd_buffer_bins(as_storage_rd *rd) +{ + as_namespace *ns = rd->ns; + as_record *r = rd->r; + drv_ssd *ssd = rd->ssd; + + // Note - this is the only place where rounding size (up to a multiple of + // RBLOCK_SIZE) is really necessary. 
+ uint32_t write_size = BYTES_TO_RBLOCK_BYTES(ssd_record_size(rd)); + + if (write_size > ssd->write_block_size) { + cf_detail_digest(AS_DRV_SSD, &r->keyd, "write: size %u - rejecting ", + write_size); + return -AS_PROTO_RESULT_FAIL_RECORD_TOO_BIG; + } + + // Reserve the portion of the current swb where this record will be written. + pthread_mutex_lock(&ssd->write_lock); + + ssd_write_buf *swb = ssd->current_swb; + + if (! swb) { + swb = swb_get(ssd); + ssd->current_swb = swb; + + if (! swb) { + cf_warning(AS_DRV_SSD, "write bins: couldn't get swb"); + pthread_mutex_unlock(&ssd->write_lock); + return -AS_PROTO_RESULT_FAIL_OUT_OF_SPACE; + } + } + + // Check if there's enough space in current buffer - if not, free and zero + // any remaining unused space, enqueue it to be flushed to device, and grab + // a new buffer. + if (write_size > ssd->write_block_size - swb->pos) { + if (ssd->write_block_size != swb->pos) { + // Clean the end of the buffer before pushing to write queue. + memset(&swb->buf[swb->pos], 0, ssd->write_block_size - swb->pos); + } + + // Enqueue the buffer, to be flushed to device. + cf_queue_push(ssd->swb_write_q, &swb); + cf_atomic64_incr(&ssd->n_wblock_writes); + + // Get the new buffer. + swb = swb_get(ssd); + ssd->current_swb = swb; + + if (! swb) { + cf_warning(AS_DRV_SSD, "write bins: couldn't get swb"); + pthread_mutex_unlock(&ssd->write_lock); + return -AS_PROTO_RESULT_FAIL_OUT_OF_SPACE; + } + } + + // There's enough space - save the position where this record will be + // written, and advance swb->pos for the next writer. + uint32_t swb_pos = swb->pos; + + swb->pos += write_size; + cf_atomic32_incr(&swb->n_writers); + + pthread_mutex_unlock(&ssd->write_lock); + // May now write this record concurrently with others in this swb. + + // Flatten data into the block. 
+ + uint8_t *buf = &swb->buf[swb_pos]; + uint8_t *buf_start = buf; + + drv_ssd_block *block = (drv_ssd_block*)buf; + + buf += sizeof(drv_ssd_block); + + // Properties list goes just before bins. + if (rd->rec_props.p_data) { + memcpy(buf, rd->rec_props.p_data, rd->rec_props.size); + buf += rd->rec_props.size; + } + + uint16_t n_bins_written; + + for (n_bins_written = 0; n_bins_written < rd->n_bins; n_bins_written++) { + as_bin *bin = &rd->bins[n_bins_written]; + + if (! as_bin_inuse(bin)) { + break; + } + + drv_ssd_bin *ssd_bin = (drv_ssd_bin*)buf; + + buf += sizeof(drv_ssd_bin); + + ssd_bin->version = 0; + + if (ns->single_bin) { + ssd_bin->name[0] = 0; + } + else { + strcpy(ssd_bin->name, as_bin_get_name_from_id(ns, bin->id)); + } + + ssd_bin->offset = buf - buf_start; + + uint32_t particle_flat_size = as_bin_particle_to_flat(bin, buf); + + buf += particle_flat_size; + ssd_bin->len = particle_flat_size; + ssd_bin->next = buf - buf_start; + } + + block->sig = 0; // deprecated + block->length = write_size - LENGTH_BASE; + block->magic = SSD_BLOCK_MAGIC; + block->keyd = r->keyd; + block->generation = r->generation; + block->void_time = r->void_time; + block->bins_offset = rd->rec_props.p_data ? rd->rec_props.size : 0; + block->n_bins = n_bins_written; + block->last_update_time = r->last_update_time; + + uint64_t write_offset = WBLOCK_ID_TO_BYTES(ssd, swb->wblock_id) + swb_pos; + + ssd_encrypt(ssd, write_offset, block); + + r->file_id = ssd->file_id; + r->rblock_id = BYTES_TO_RBLOCKS(write_offset); + r->n_rblocks = BYTES_TO_RBLOCKS(write_size); + + cf_atomic64_add(&ssd->inuse_size, (int64_t)write_size); + cf_atomic32_add(&ssd->alloc_table->wblock_state[swb->wblock_id].inuse_sz, (int32_t)write_size); + + // We are finished writing to the buffer. 
+ cf_atomic32_decr(&swb->n_writers); + + if (ns->storage_benchmarks_enabled) { + histogram_insert_raw(ns->device_write_size_hist, write_size); + } + + return 0; +} + + +int +ssd_write(as_storage_rd *rd) +{ + as_record *r = rd->r; + + drv_ssd *old_ssd = NULL; + uint64_t old_rblock_id = 0; + uint16_t old_n_rblocks = 0; + + if (STORAGE_RBLOCK_IS_VALID(r->rblock_id)) { + // Replacing an old record. + old_ssd = rd->ssd; + old_rblock_id = r->rblock_id; + old_n_rblocks = r->n_rblocks; + } + + drv_ssds *ssds = (drv_ssds*)rd->ns->storage_private; + + // Figure out which device to write to. When replacing an old record, it's + // possible this is different from the old device (e.g. if we've added a + // fresh device), so derive it from the digest each time. + rd->ssd = &ssds->ssds[ssd_get_file_id(ssds, &r->keyd)]; + + drv_ssd *ssd = rd->ssd; + + if (! ssd) { + cf_warning(AS_DRV_SSD, "{%s} ssd_write: no drv_ssd for file_id %u", + rd->ns->name, ssd_get_file_id(ssds, &r->keyd)); + return -AS_PROTO_RESULT_FAIL_UNKNOWN; + } + + int rv = ssd_write_bins(rd); + + if (rv == 0 && old_ssd) { + ssd_block_free(old_ssd, old_rblock_id, old_n_rblocks, "ssd-write"); + } + + return rv; +} + + +//========================================================== +// Storage statistics utilities. 
+// + +void +as_storage_show_wblock_stats(as_namespace *ns) +{ + if (AS_STORAGE_ENGINE_SSD != ns->storage_type) { + cf_info(AS_DRV_SSD, "Storage engine type must be SSD (%d), not %d.", + AS_STORAGE_ENGINE_SSD, ns->storage_type); + return; + } + + if (ns->storage_private) { + drv_ssds *ssds = ns->storage_private; + + for (int d = 0; d < ssds->n_ssds; d++) { + int num_free_blocks = 0; + int num_full_blocks = 0; + int num_full_swb = 0; + int num_above_wm = 0; + int num_defraggable = 0; + + drv_ssd *ssd = &ssds->ssds[d]; + ssd_alloc_table *at = ssd->alloc_table; + uint32_t lwm_size = ns->defrag_lwm_size; + + for (uint32_t i = 0; i < at->n_wblocks; i++) { + ssd_wblock_state *wblock_state = &at->wblock_state[i]; + uint32_t inuse_sz = cf_atomic32_get(wblock_state->inuse_sz); + + if (inuse_sz == 0) { + num_free_blocks++; + } + else if (inuse_sz == ssd->write_block_size) { + if (wblock_state->swb) { + num_full_swb++; + } + else { + num_full_blocks++; + } + } + else { + if (inuse_sz > ssd->write_block_size || inuse_sz < lwm_size) { + cf_info(AS_DRV_SSD, "dev %d, wblock %u, inuse_sz %u, %s swb", + d, i, inuse_sz, wblock_state->swb ? "has" : "no"); + + num_defraggable++; + } + else { + num_above_wm++; + } + } + } + + cf_info(AS_DRV_SSD, "device %s free %d full %d fullswb %d pfull %d defrag %d freeq %d", + ssd->name, num_free_blocks, num_full_blocks, num_full_swb, + num_above_wm, num_defraggable, cf_queue_sz(ssd->free_wblock_q)); + } + } + else { + cf_info(AS_DRV_SSD, "no devices"); + } +} + + +void +as_storage_summarize_wblock_stats(as_namespace *ns) +{ + if (AS_STORAGE_ENGINE_SSD != ns->storage_type) { + cf_info(AS_DRV_SSD, "Storage engine type must be SSD (%d), not %d.", + AS_STORAGE_ENGINE_SSD, ns->storage_type); + return; + } + + if (! 
ns->storage_private) { + cf_info(AS_DRV_SSD, "no devices"); + return; + } + + drv_ssds *ssds = ns->storage_private; + uint32_t total_num_defraggable = 0; + uint32_t total_num_above_wm = 0; + uint64_t defraggable_sz = 0; + uint64_t non_defraggable_sz = 0; + + // Note: This is a sparse array that could be more efficiently stored. + // (In addition, ranges of block sizes could be binned together to + // compress the histogram, rather than using one bin per block size.) + uint32_t wb_hist[MAX_WRITE_BLOCK_SIZE] = { 0 }; + + for (uint32_t d = 0; d < ssds->n_ssds; d++) { + drv_ssd *ssd = &ssds->ssds[d]; + ssd_alloc_table *at = ssd->alloc_table; + uint32_t num_free_blocks = 0; + uint32_t num_full_swb = 0; + uint32_t num_full_blocks = 0; + uint32_t lwm_size = ns->defrag_lwm_size; + uint32_t num_defraggable = 0; + uint32_t num_above_wm = 0; + + for (uint32_t i = 0; i < at->n_wblocks; i++) { + ssd_wblock_state *wblock_state = &at->wblock_state[i]; + uint32_t inuse_sz = cf_atomic32_get(wblock_state->inuse_sz); + + if (inuse_sz > ssd->write_block_size) { + cf_warning(AS_DRV_SSD, "wblock size (%d > %d) too large ~~ not counting in histogram", + inuse_sz, ssd->write_block_size); + } + else { + wb_hist[inuse_sz]++; + } + + if (inuse_sz == 0) { + num_free_blocks++; + } + else if (inuse_sz == ssd->write_block_size) { + if (wblock_state->swb) { + num_full_swb++; + } + else { + num_full_blocks++; + } + } + else if (inuse_sz < lwm_size) { + defraggable_sz += inuse_sz; + num_defraggable++; + } + else { + non_defraggable_sz += inuse_sz; + num_above_wm++; + } + } + + total_num_defraggable += num_defraggable; + total_num_above_wm += num_above_wm; + + cf_info(AS_DRV_SSD, "device %s free %u full %u fullswb %u pfull %u defrag %u freeq %u", + ssd->name, num_free_blocks, num_full_blocks, num_full_swb, + num_above_wm, num_defraggable, cf_queue_sz(ssd->free_wblock_q)); + } + + cf_info(AS_DRV_SSD, "WBH: Storage histogram for namespace \"%s\":", + ns->name); + cf_info(AS_DRV_SSD, "WBH: Average 
wblock size of: defraggable blocks: %lu bytes; nondefraggable blocks: %lu bytes; all blocks: %lu bytes", + defraggable_sz / MAX(1, total_num_defraggable), + non_defraggable_sz / MAX(1, total_num_above_wm), + (defraggable_sz + non_defraggable_sz) / + MAX(1, (total_num_defraggable + total_num_above_wm))); + + for (uint32_t i = 0; i < MAX_WRITE_BLOCK_SIZE; i++) { + if (wb_hist[i] > 0) { + cf_info(AS_DRV_SSD, "WBH: %u block%s of size %u bytes", + wb_hist[i], (wb_hist[i] != 1 ? "s" : ""), i); + } + } +} + + +// TODO - do something more useful with this info command. +int +as_storage_analyze_wblock(as_namespace* ns, int device_index, + uint32_t wblock_id) +{ + if (AS_STORAGE_ENGINE_SSD != ns->storage_type) { + cf_info(AS_DRV_SSD, "Storage engine type must be SSD (%d), not %d.", + AS_STORAGE_ENGINE_SSD, ns->storage_type); + return -1; + } + + cf_info(AS_DRV_SSD, "analyze wblock: {%s}, device-index %d, wblock-id %u", + ns->name, device_index, wblock_id); + + drv_ssds* ssds = (drv_ssds*)ns->storage_private; + + if (! 
ssds) { + cf_warning(AS_DRV_SSD, "analyze wblock ERROR: no devices"); + return -1; + } + + if (device_index < 0 || device_index >= ssds->n_ssds) { + cf_warning(AS_DRV_SSD, "analyze wblock ERROR: bad device-index"); + return -1; + } + + drv_ssd* ssd = &ssds->ssds[device_index]; + uint8_t* read_buf = cf_valloc(ssd->write_block_size); + + int fd = ssd_fd_get(ssd); + uint64_t file_offset = WBLOCK_ID_TO_BYTES(ssd, wblock_id); + + if (lseek(fd, (off_t)file_offset, SEEK_SET) != (off_t)file_offset) { + cf_warning(AS_DRV_SSD, "%s: seek failed: offset %lu: errno %d (%s)", + ssd->name, file_offset, errno, cf_strerror(errno)); + cf_free(read_buf); + close(fd); + return -1; + } + + ssize_t rlen = read(fd, read_buf, ssd->write_block_size); + + if (rlen != (ssize_t)ssd->write_block_size) { + cf_warning(AS_DRV_SSD, "%s: read failed (%ld): errno %d (%s)", + ssd->name, rlen, errno, cf_strerror(errno)); + cf_free(read_buf); + close(fd); + return -1; + } + + ssd_fd_put(ssd, fd); + + uint32_t living_populations[AS_PARTITIONS]; + uint32_t zombie_populations[AS_PARTITIONS]; + + memset(living_populations, 0, sizeof(living_populations)); + memset(zombie_populations, 0, sizeof(zombie_populations)); + + uint32_t inuse_sz_start = + cf_atomic32_get(ssd->alloc_table->wblock_state[wblock_id].inuse_sz); + uint32_t offset = 0; + + while (offset < ssd->write_block_size) { + drv_ssd_block* p_block = (drv_ssd_block*)&read_buf[offset]; + + ssd_decrypt(ssd, file_offset + offset, p_block); + + if (p_block->magic != SSD_BLOCK_MAGIC) { + if (offset == 0) { + // First block must have magic. + cf_warning(AS_DRV_SSD, "analyze wblock ERROR: 1st block has no magic"); + cf_free(read_buf); + return -1; + } + + // Later blocks may have no magic, just skip to next block. + offset += RBLOCK_SIZE; + continue; + } + + // Note - if block->length is sane, we don't need to round up to a + // multiple of RBLOCK_SIZE, but let's do it anyway just to be safe. 
+ uint32_t next_offset = offset + + BYTES_TO_RBLOCK_BYTES(p_block->length + LENGTH_BASE); + + if (next_offset > ssd->write_block_size) { + cf_warning(AS_DRV_SSD, "analyze wblock ERROR: record overflows wblock"); + cf_free(read_buf); + return -1; + } + + uint64_t rblock_id = BYTES_TO_RBLOCKS(file_offset + offset); + uint32_t n_rblocks = (uint32_t)BYTES_TO_RBLOCKS(next_offset - offset); + + bool living = false; + uint32_t pid = as_partition_getid(&p_block->keyd); + as_partition_reservation rsv; + + as_partition_reserve(ns, pid, &rsv); + + as_index_ref r_ref; + r_ref.skip_lock = false; + + if (0 == as_record_get(rsv.tree, &p_block->keyd, &r_ref)) { + as_index* r = r_ref.r; + + if (r->rblock_id == rblock_id && r->n_rblocks == n_rblocks) { + living = true; + } + + as_record_done(&r_ref, ns); + } + // else it was deleted (?) so call it a zombie... + + as_partition_release(&rsv); + + if (living) { + living_populations[pid]++; + } + else { + zombie_populations[pid]++; + } + + offset = next_offset; + } + + cf_free(read_buf); + + uint32_t inuse_sz_end = + cf_atomic32_get(ssd->alloc_table->wblock_state[wblock_id].inuse_sz); + + cf_info(AS_DRV_SSD, "analyze wblock: inuse_sz %u (before) -> %u (after)", + inuse_sz_start, inuse_sz_end); + + for (int i = 0; i < AS_PARTITIONS; i++) { + if (living_populations[i] > 0 || zombie_populations[i] > 0) { + cf_info(AS_DRV_SSD, "analyze wblock: pid %4d - live %u, dead %u", + i, living_populations[i], zombie_populations[i]); + } + } + + return 0; +} + + +//========================================================== +// Per-device background jobs. 
+// + +#define LOG_STATS_INTERVAL_sec 20 + +void +ssd_log_stats(drv_ssd *ssd, uint64_t *p_prev_n_total_writes, + uint64_t *p_prev_n_defrag_reads, uint64_t *p_prev_n_defrag_writes, + uint64_t *p_prev_n_tomb_raider_reads) +{ + uint64_t n_defrag_reads = cf_atomic64_get(ssd->n_defrag_wblock_reads); + uint64_t n_defrag_writes = cf_atomic64_get(ssd->n_defrag_wblock_writes); + uint64_t n_total_writes = cf_atomic64_get(ssd->n_wblock_writes) + + n_defrag_writes; + + float total_write_rate = (float)(n_total_writes - *p_prev_n_total_writes) / + (float)LOG_STATS_INTERVAL_sec; + float defrag_read_rate = (float)(n_defrag_reads - *p_prev_n_defrag_reads) / + (float)LOG_STATS_INTERVAL_sec; + float defrag_write_rate = (float)(n_defrag_writes - *p_prev_n_defrag_writes) / + (float)LOG_STATS_INTERVAL_sec; + + uint64_t n_tomb_raider_reads = ssd->n_tomb_raider_reads; + char tomb_raider_str[64]; + + *tomb_raider_str = 0; + + if (n_tomb_raider_reads != 0) { + if (*p_prev_n_tomb_raider_reads > n_tomb_raider_reads) { + *p_prev_n_tomb_raider_reads = 0; + } + + float tomb_raider_read_rate = + (float)(n_tomb_raider_reads - *p_prev_n_tomb_raider_reads) / + (float)LOG_STATS_INTERVAL_sec; + + sprintf(tomb_raider_str, " tomb-raider-read (%lu,%.1f)", + n_tomb_raider_reads, tomb_raider_read_rate); + } + + char shadow_str[64]; + + *shadow_str = 0; + + if (ssd->shadow_name) { + sprintf(shadow_str, " shadow-write-q %d", + cf_queue_sz(ssd->swb_shadow_q)); + } + + cf_info(AS_DRV_SSD, "{%s} %s: used-bytes %lu free-wblocks %d write-q %d write (%lu,%.1f) defrag-q %d defrag-read (%lu,%.1f) defrag-write (%lu,%.1f)%s%s", + ssd->ns->name, ssd->name, + ssd->inuse_size, cf_queue_sz(ssd->free_wblock_q), + cf_queue_sz(ssd->swb_write_q), + n_total_writes, total_write_rate, + cf_queue_sz(ssd->defrag_wblock_q), n_defrag_reads, defrag_read_rate, + n_defrag_writes, defrag_write_rate, + shadow_str, tomb_raider_str); + + *p_prev_n_total_writes = n_total_writes; + *p_prev_n_defrag_reads = n_defrag_reads; + 
*p_prev_n_defrag_writes = n_defrag_writes; + *p_prev_n_tomb_raider_reads = n_tomb_raider_reads; + + if (cf_queue_sz(ssd->free_wblock_q) == 0) { + cf_warning(AS_DRV_SSD, "device %s: out of storage space", ssd->name); + } +} + + +void +ssd_free_swbs(drv_ssd *ssd) +{ + // Try to recover swbs, 16 at a time, down to 16. + for (int i = 0; i < 16 && cf_queue_sz(ssd->swb_free_q) > 16; i++) { + ssd_write_buf* swb; + + if (CF_QUEUE_OK != + cf_queue_pop(ssd->swb_free_q, &swb, CF_QUEUE_NOWAIT)) { + break; + } + + swb_destroy(swb); + } +} + + +void +ssd_flush_current_swb(drv_ssd *ssd, uint64_t *p_prev_n_writes, + uint32_t *p_prev_size) +{ + uint64_t n_writes = cf_atomic64_get(ssd->n_wblock_writes); + + // If there's an active write load, we don't need to flush. + if (n_writes != *p_prev_n_writes) { + *p_prev_n_writes = n_writes; + *p_prev_size = 0; + return; + } + + pthread_mutex_lock(&ssd->write_lock); + + n_writes = cf_atomic64_get(ssd->n_wblock_writes); + + // Must check under the lock, could be racing a current swb just queued. + if (n_writes != *p_prev_n_writes) { + + pthread_mutex_unlock(&ssd->write_lock); + + *p_prev_n_writes = n_writes; + *p_prev_size = 0; + return; + } + + // Flush the current swb if it isn't empty, and has been written to since + // last flushed. + + ssd_write_buf *swb = ssd->current_swb; + + if (swb && swb->pos != *p_prev_size) { + *p_prev_size = swb->pos; + + // Clean the end of the buffer before flushing. + if (ssd->write_block_size != swb->pos) { + memset(&swb->buf[swb->pos], 0, ssd->write_block_size - swb->pos); + } + + // Flush it. + ssd_flush_swb(ssd, swb); + + if (ssd->shadow_name) { + ssd_shadow_flush_swb(ssd, swb); + } + } + + pthread_mutex_unlock(&ssd->write_lock); +} + + +void +ssd_fsync(drv_ssd *ssd) +{ + int fd = ssd_fd_get(ssd); + + uint64_t start_ns = ssd->ns->storage_benchmarks_enabled ? 
cf_getns() : 0; + + fsync(fd); + + if (start_ns != 0) { + histogram_insert_data_point(ssd->hist_fsync, start_ns); + } + + ssd_fd_put(ssd, fd); +} + + +// Check all wblocks to load a device's defrag queue at runtime. Triggered only +// when defrag-lwm-pct is increased by manual intervention. +void +ssd_defrag_sweep(drv_ssd *ssd) +{ + ssd_alloc_table* at = ssd->alloc_table; + uint32_t first_id = BYTES_TO_WBLOCK_ID(ssd, SSD_HEADER_SIZE); + uint32_t last_id = at->n_wblocks; + uint32_t n_queued = 0; + + for (uint32_t wblock_id = first_id; wblock_id < last_id; wblock_id++) { + ssd_wblock_state *p_wblock_state = &at->wblock_state[wblock_id]; + + cf_mutex_lock(&p_wblock_state->LOCK); + + uint32_t inuse_sz = cf_atomic32_get(p_wblock_state->inuse_sz); + + if (! p_wblock_state->swb && + p_wblock_state->state != WBLOCK_STATE_DEFRAG && + inuse_sz != 0 && + inuse_sz < ssd->ns->defrag_lwm_size) { + push_wblock_to_defrag_q(ssd, wblock_id); + n_queued++; + } + + cf_mutex_unlock(&p_wblock_state->LOCK); + } + + cf_info(AS_DRV_SSD, "... %s sweep queued %u wblocks for defrag", ssd->name, + n_queued); +} + + +static inline uint64_t +next_time(uint64_t now, uint64_t job_interval, uint64_t next) +{ + uint64_t next_job = now + job_interval; + + return next_job < next ? next_job : next; +} + + +// All in microseconds since we're using usleep(). +#define MAX_INTERVAL (1000 * 1000) +#define LOG_STATS_INTERVAL (1000 * 1000 * LOG_STATS_INTERVAL_sec) +#define FREE_SWBS_INTERVAL (1000 * 1000 * 20) + +// Thread "run" function to perform various background jobs per device. 
+void * +run_ssd_maintenance(void *udata) +{ + drv_ssd *ssd = (drv_ssd*)udata; + as_namespace *ns = ssd->ns; + + uint64_t prev_n_total_writes = 0; + uint64_t prev_n_defrag_reads = 0; + uint64_t prev_n_defrag_writes = 0; + uint64_t prev_n_tomb_raider_reads = 0; + + uint64_t prev_n_writes_flush = 0; + uint32_t prev_size_flush = 0; + + uint64_t now = cf_getus(); + uint64_t next = now + MAX_INTERVAL; + + uint64_t prev_log_stats = now; + uint64_t prev_free_swbs = now; + uint64_t prev_flush = now; + uint64_t prev_fsync = now; + + // If any job's (initial) interval is less than MAX_INTERVAL and we want it + // done on its interval the first time through, add a next_time() call for + // that job here to adjust 'next'. (No such jobs for now.) + + uint64_t sleep_us = next - now; + + while (true) { + usleep((uint32_t)sleep_us); + + now = cf_getus(); + next = now + MAX_INTERVAL; + + if (now >= prev_log_stats + LOG_STATS_INTERVAL) { + ssd_log_stats(ssd, &prev_n_total_writes, &prev_n_defrag_reads, + &prev_n_defrag_writes, &prev_n_tomb_raider_reads); + prev_log_stats = now; + next = next_time(now, LOG_STATS_INTERVAL, next); + } + + if (now >= prev_free_swbs + FREE_SWBS_INTERVAL) { + ssd_free_swbs(ssd); + prev_free_swbs = now; + next = next_time(now, FREE_SWBS_INTERVAL, next); + } + + uint64_t flush_max_us = ssd_flush_max_us(ns); + + if (flush_max_us != 0 && now >= prev_flush + flush_max_us) { + ssd_flush_current_swb(ssd, &prev_n_writes_flush, &prev_size_flush); + prev_flush = now; + next = next_time(now, flush_max_us, next); + } + + uint64_t fsync_max_us = ns->storage_fsync_max_us; + + if (fsync_max_us != 0 && now >= prev_fsync + fsync_max_us) { + ssd_fsync(ssd); + prev_fsync = now; + next = next_time(now, fsync_max_us, next); + } + + if (cf_atomic32_get(ssd->defrag_sweep) != 0) { + // May take long enough to mess up other jobs' schedules, but it's a + // very rare manually-triggered intervention. 
+ ssd_defrag_sweep(ssd); + cf_atomic32_decr(&ssd->defrag_sweep); + } + + now = cf_getus(); // refresh in case jobs took significant time + sleep_us = next > now ? next - now : 1; + } + + return NULL; +} + + +void +ssd_start_maintenance_threads(drv_ssds *ssds) +{ + cf_info(AS_DRV_SSD, "{%s} starting device maintenance threads", + ssds->ns->name); + + for (int i = 0; i < ssds->n_ssds; i++) { + drv_ssd* ssd = &ssds->ssds[i]; + + pthread_create(&ssd->maintenance_thread, 0, run_ssd_maintenance, ssd); + } +} + + +//========================================================== +// Device header utilities. +// + +// -1 means unrecoverable error +// -2 means not formatted, please overwrite me +int +ssd_read_header(drv_ssd *ssd, as_namespace *ns, ssd_device_header **header_r) +{ + *header_r = 0; + + int rv = -1; + + bool use_shadow = ns->cold_start && ssd->shadow_name; + const char *ssd_name = use_shadow ? ssd->shadow_name : ssd->name; + int fd = use_shadow ? ssd_shadow_fd_get(ssd) : ssd_fd_get(ssd); + + size_t peek_size = BYTES_UP_TO_IO_MIN(ssd, sizeof(ssd_device_header)); + ssd_device_header *header = cf_valloc(peek_size); + + if (lseek(fd, 0, SEEK_SET) != 0) { + cf_warning(AS_DRV_SSD, "%s: seek failed: errno %d (%s)", ssd_name, + errno, cf_strerror(errno)); + close(fd); + fd = -1; + goto Fail; + } + + ssize_t sz = read(fd, (void*)header, peek_size); + + if (sz != (ssize_t)peek_size) { + cf_warning(AS_DRV_SSD, "%s: read failed (%ld): errno %d (%s)", + ssd_name, sz, errno, cf_strerror(errno)); + close(fd); + fd = -1; + goto Fail; + } + + // Make sure all following checks that return -1 or -2 are also done in + // peek_devices() in the enterprise repo. 
+ + if (header->magic != SSD_HEADER_MAGIC) { // normal path for a fresh drive + cf_detail(AS_DRV_SSD, "read_header: device %s no magic, not a Citrusleaf drive", + ssd_name); + rv = -2; + goto Fail; + } + + if (header->version != SSD_VERSION) { + if (can_convert_storage_version(header->version)) { + cf_info(AS_DRV_SSD, "read_header: device %s converting storage version %u to %u", + ssd_name, header->version, SSD_VERSION); + } + else { + cf_warning(AS_DRV_SSD, "read_header: device %s bad version %u, not a current Citrusleaf drive", + ssd_name, header->version); + goto Fail; + } + } + + if (header->write_block_size != 0 && + ns->storage_write_block_size % header->write_block_size != 0) { + cf_warning(AS_DRV_SSD, "read header: device %s can't change write-block-size from %u to %u", + ssd_name, header->write_block_size, + ns->storage_write_block_size); + goto Fail; + } + + if (header->devices_n > AS_STORAGE_MAX_DEVICES) { + cf_warning(AS_DRV_SSD, "read header: device %s bad number of devices %u", + ssd_name, header->devices_n); + goto Fail; + } + + if (header->header_length != SSD_HEADER_SIZE) { + cf_warning(AS_DRV_SSD, "read header: device %s incompatible header size %u", + ssd_name, header->header_length); + goto Fail; + } + + if (strcmp(header->namespace, ns->name) != 0) { + cf_warning(AS_DRV_SSD, "read header: device %s previous namespace %s now %s, check config or erase device", + ssd_name, header->namespace, ns->name); + goto Fail; + } + + size_t h_len = header->header_length; + + cf_free(header); + + header = cf_valloc(h_len); + + if (lseek(fd, 0, SEEK_SET) != 0) { + cf_warning(AS_DRV_SSD, "%s: seek failed: errno %d (%s)", ssd_name, + errno, cf_strerror(errno)); + close(fd); + fd = -1; + goto Fail; + } + + sz = read(fd, (void*)header, h_len); + + if (sz != (ssize_t)header->header_length) { + cf_warning(AS_DRV_SSD, "%s: read failed (%ld): errno %d (%s)", + ssd_name, sz, errno, cf_strerror(errno)); + close(fd); + fd = -1; + goto Fail; + } + + cf_detail(AS_DRV_SSD, 
"device %s: header read success: version %d devices %d random %lu", + ssd_name, header->version, header->devices_n, header->random); + + if (! ssd_header_is_valid_cfg(ns, header)) { + goto Fail; + } + + // In case we're bumping the version - ensure the new version gets written. + header->version = SSD_VERSION; + + // In case we're increasing write-block-size - ensure new value is recorded. + header->write_block_size = ns->storage_write_block_size; + + *header_r = header; + + use_shadow ? ssd_shadow_fd_put(ssd, fd) : ssd_fd_put(ssd, fd); + + return 0; + +Fail: + + if (header) { + cf_free(header); + } + + if (fd != -1) { + use_shadow ? ssd_shadow_fd_put(ssd, fd) : ssd_fd_put(ssd, fd); + } + + return rv; +} + + +ssd_device_header * +ssd_init_header(as_namespace *ns) +{ + ssd_device_header *h = cf_valloc(SSD_HEADER_SIZE); + + memset(h, 0, SSD_HEADER_SIZE); + + h->magic = SSD_HEADER_MAGIC; + h->random = 0; + h->write_block_size = ns->storage_write_block_size; + h->last_evict_void_time = 0; + h->version = SSD_VERSION; + h->flags = 0; + h->devices_n = 0; + h->header_length = SSD_HEADER_SIZE; + memset(h->namespace, 0, sizeof(h->namespace)); + strcpy(h->namespace, ns->name); + h->info_n = AS_PARTITIONS; + h->info_stride = SSD_HEADER_INFO_STRIDE; + + ssd_header_init_cfg(ns, h); + + return h; +} + + +bool +ssd_empty_header(int fd, const char* device_name) +{ + void *h = cf_valloc(SSD_HEADER_SIZE); + + memset(h, 0, SSD_HEADER_SIZE); + + if (0 != lseek(fd, 0, SEEK_SET)) { + cf_warning(AS_DRV_SSD, "device %s: empty header: seek error: %s", + device_name, cf_strerror(errno)); + cf_free(h); + return false; + } + + if (SSD_HEADER_SIZE != write(fd, h, SSD_HEADER_SIZE)) { + cf_warning(AS_DRV_SSD, "device %s: empty header: write error: %s", + device_name, cf_strerror(errno)); + cf_free(h); + return false; + } + + cf_free(h); + fsync(fd); + + return true; +} + + +void +ssd_write_header(drv_ssd *ssd, ssd_device_header *header, off_t offset, + size_t size) +{ + int fd = ssd_fd_get(ssd); 
+ + if (lseek(fd, offset, SEEK_SET) != offset) { + cf_crash(AS_DRV_SSD, "%s: DEVICE FAILED seek: errno %d (%s)", + ssd->name, errno, cf_strerror(errno)); + } + + uint8_t *from = (uint8_t*)header + offset; + + ssize_t sz = write(fd, (void*)from, size); + + if (sz != (ssize_t)size) { + cf_crash(AS_DRV_SSD, "%s: DEVICE FAILED write: errno %d (%s)", + ssd->name, errno, cf_strerror(errno)); + } + + ssd_fd_put(ssd, fd); + + if (! ssd->shadow_name) { + return; + } + + fd = ssd_shadow_fd_get(ssd); + + if (lseek(fd, offset, SEEK_SET) != offset) { + cf_crash(AS_DRV_SSD, "%s: DEVICE FAILED seek: errno %d (%s)", + ssd->shadow_name, errno, cf_strerror(errno)); + } + + sz = write(fd, (void*)from, size); + + if (sz != (ssize_t)size) { + cf_crash(AS_DRV_SSD, "%s: DEVICE FAILED write: errno %d (%s)", + ssd->shadow_name, errno, cf_strerror(errno)); + } + + ssd_shadow_fd_put(ssd, fd); +} + + +//========================================================== +// Cold start utilities. +// + +bool +prefer_existing_record(drv_ssd* ssd, drv_ssd_block* block, as_index* r) +{ + int result = as_record_resolve_conflict(ssd_cold_start_policy(ssd->ns), + r->generation, r->last_update_time, + block->generation, block->last_update_time); + + if (result != 0) { + return result == -1; // -1 means block record < existing record + } + + // Finally, compare void-times. Note that defragged records will generate + // identical copies on drive, so they'll get here and return true. + return r->void_time == 0 || + (block->void_time != 0 && block->void_time <= r->void_time); +} + + +bool +is_set_evictable(as_namespace* ns, const as_rec_props* p_props) +{ + if (p_props->size == 0) { + return true; + } + + const char* set_name; + + if (as_rec_props_get_value(p_props, CL_REC_PROPS_FIELD_SET_NAME, NULL, + (uint8_t**)&set_name) != 0) { + return true; + } + + as_set *p_set; + + if (cf_vmapx_get_by_name(ns->p_sets_vmap, set_name, (void**)&p_set) != + CF_VMAPX_OK) { + return true; + } + + return ! 
IS_SET_EVICTION_DISABLED(p_set); +} + + +bool +is_record_expired(as_namespace* ns, const drv_ssd_block* block, + const as_rec_props* p_props) +{ + if (block->void_time == 0 || + block->void_time > ns->cold_start_threshold_void_time) { + return false; + } + + // If set is not evictable, may have expired but wasn't evicted. + return block->void_time < as_record_void_time_get() || + is_set_evictable(ns, p_props); +} + + +void +apply_rec_props(as_record* r, as_namespace* ns, const as_rec_props* p_props) +{ + // Set record's set-id. (If it already has one, assume they're the same.) + if (! as_index_has_set(r) && p_props->size != 0) { + const char* set_name; + + if (as_rec_props_get_value(p_props, CL_REC_PROPS_FIELD_SET_NAME, NULL, + (uint8_t**)&set_name) == 0) { + as_index_set_set(r, ns, set_name, false); + } + } + + uint32_t key_size; + uint8_t* key; + bool got_key = p_props->size != 0 && + as_rec_props_get_value(p_props, CL_REC_PROPS_FIELD_KEY, &key_size, + &key) == 0; + + // If a key wasn't stored, and we got one, accommodate it. + if (r->key_stored == 0) { + if (got_key) { + if (ns->storage_data_in_memory) { + as_record_allocate_key(r, key, key_size); + } + + r->key_stored = 1; + } + } + // If a key was stored, but we didn't get one, remove the key. + else if (! got_key) { + if (ns->storage_data_in_memory) { + as_record_remove_key(r); + } + + r->key_stored = 0; + } +} + + +// Add a record just read from drive to the index, if all is well. +void +ssd_cold_start_add_record(drv_ssds* ssds, drv_ssd* ssd, drv_ssd_block* block, + uint64_t rblock_id, uint32_t n_rblocks) +{ + uint32_t pid = as_partition_getid(&block->keyd); + + // If this isn't a partition we're interested in, skip this record. + if (! ssds->get_state_from_storage[pid]) { + return; + } + + as_namespace* ns = ssds->ns; + + // If eviction is necessary, evict previously added records closest to + // expiration. (If evicting, this call will block for a long time.) 
This + // call may also update the cold start threshold void-time. + if (! as_cold_start_evict_if_needed(ns)) { + cf_crash(AS_DRV_SSD, "hit stop-writes limit before drive scan completed"); + } + + // Sanity-check the record. + if (! is_valid_record(block, ns->name)) { + cf_warning_digest(AS_DRV_SSD, &block->keyd, "invalid data on device - ignoring record "); + return; // caller will continue and try next record + } + + // Don't bother with reservations - partition trees aren't going anywhere. + as_partition* p_partition = &ns->partitions[pid]; + + // Get or create the record. + as_index_ref r_ref; + r_ref.skip_lock = false; + + // Prepare to read rec-props. + as_rec_props props = { .p_data = block->data, .size = block->bins_offset }; + + if (ssd_cold_start_is_record_truncated(ns, block, &props)) { + return; + } + + // Get/create the record from/in the appropriate index tree. + int rv = as_record_get_create(p_partition->vp, &block->keyd, &r_ref, ns); + + if (rv < 0) { + cf_warning_digest(AS_DRV_SSD, &block->keyd, "record-add as_record_get_create() failed "); + return; + } + + bool is_create = rv == 1; + + // Fix 0 generations coming off device. + if (block->generation == 0) { + block->generation = 1; + cf_warning_digest(AS_DRV_SSD, &block->keyd, "record-add found generation 0 - changed to 1 "); + } + + as_index* r = r_ref.r; + uint32_t wblock_id = RBLOCK_ID_TO_WBLOCK_ID(ssd, rblock_id); + // TODO - pass in wblock_id when we do boundary check in sweep. + + if (! is_create) { + // Record already existed. Ignore this one if existing record is newer. + if (prefer_existing_record(ssd, block, r)) { + ssd_cold_start_adjust_cenotaph(ns, block, r); + as_record_done(&r_ref, ns); + ssd->record_add_older_counter++; + return; + } + } + // The record we're now reading is the latest version (so far) ... + + // Skip records that have expired. 
+ if (is_record_expired(ns, block, &props)) { + as_index_delete(p_partition->vp, &block->keyd); + as_record_done(&r_ref, ns); + ssd->record_add_expired_counter++; + return; + } + + // We'll keep the record we're now reading ... + + ssd_cold_start_init_repl_state(ns, r); + + // Set/reset the record's last-update-time and generation. + r->last_update_time = block->last_update_time; + r->generation = block->generation; + + // Set/reset the record's void-time, truncating it if beyond max-ttl. + if (block->void_time > ns->cold_start_max_void_time) { + cf_detail(AS_DRV_SSD, "record-add truncating void-time %lu > max %u", + block->void_time, ns->cold_start_max_void_time); + + r->void_time = ns->cold_start_max_void_time; + ssd->record_add_max_ttl_counter++; + } + else { + r->void_time = block->void_time; + } + + // Update maximum void-time. + cf_atomic64_setmax(&p_partition->max_void_time, r->void_time); + + // If data is in memory, load bins and particles, adjust secondary index. + if (ns->storage_data_in_memory) { + uint8_t* block_head = (uint8_t*)block; + drv_ssd_bin* ssd_bin = (drv_ssd_bin*)(block->data + block->bins_offset); + as_storage_rd rd; + + if (is_create) { + as_storage_record_create(ns, r, &rd); + } + else { + as_storage_record_open(ns, r, &rd); + } + + as_storage_rd_load_n_bins(&rd); + as_storage_rd_load_bins(&rd, NULL); + + uint64_t bytes_memory = as_storage_record_get_n_bytes_memory(&rd); + + // Do this early since set-id is needed for the secondary index update. 
+ apply_rec_props(r, ns, &props); + + uint16_t old_n_bins = rd.n_bins; + + bool has_sindex = record_has_sindex(r, ns); + int sbins_populated = 0; + + if (has_sindex) { + SINDEX_GRLOCK(); + } + + SINDEX_BINS_SETUP(sbins, 2 * ns->sindex_cnt); + as_sindex* si_arr[2 * ns->sindex_cnt]; + int si_arr_index = 0; + const char* set_name = as_index_get_set_name(r, ns); + + if (has_sindex) { + for (uint16_t i = 0; i < old_n_bins; i++) { + si_arr_index += as_sindex_arr_lookup_by_set_binid_lockfree(ns, set_name, rd.bins[i].id, &si_arr[si_arr_index]); + } + } + + int32_t delta_bins = (int32_t)block->n_bins - (int32_t)old_n_bins; + + if (rd.ns->single_bin) { + if (delta_bins < 0) { + as_record_destroy_bins(&rd); + } + } + else if (delta_bins != 0) { + if (has_sindex && delta_bins < 0) { + sbins_populated += as_sindex_sbins_from_rd(&rd, (uint16_t)block->n_bins, old_n_bins, sbins, AS_SINDEX_OP_DELETE); + } + + as_bin_allocate_bin_space(&rd, delta_bins); + } + + for (uint16_t i = 0; i < block->n_bins; i++) { + as_bin* b; + + if (i < old_n_bins) { + b = &rd.bins[i]; + + if (has_sindex) { + sbins_populated += as_sindex_sbins_from_bin(ns, set_name, b, &sbins[sbins_populated], AS_SINDEX_OP_DELETE); + } + + as_bin_set_id_from_name(ns, b, ssd_bin->name); + } + else { + // TODO - what if this fails? + b = as_bin_create(&rd, ssd_bin->name); + } + + // TODO - what if this fails? 
+ as_bin_particle_replace_from_flat(b, block_head + ssd_bin->offset, + ssd_bin->len); + + ssd_bin = (drv_ssd_bin*)(block_head + ssd_bin->next); + + if (has_sindex) { + si_arr_index += as_sindex_arr_lookup_by_set_binid_lockfree(ns, set_name, b->id, &si_arr[si_arr_index]); + sbins_populated += as_sindex_sbins_from_bin(ns, set_name, b, &sbins[sbins_populated], AS_SINDEX_OP_INSERT); + } + } + + if (has_sindex) { + SINDEX_GRUNLOCK(); + + if (sbins_populated > 0) { + as_sindex_update_by_sbin(ns, as_index_get_set_name(r, ns), sbins, sbins_populated, &r->keyd); + as_sindex_sbin_freeall(sbins, sbins_populated); + } + + as_sindex_release_arr(si_arr, si_arr_index); + } + + as_storage_record_adjust_mem_stats(&rd, bytes_memory); + as_storage_record_close(&rd); + } + else { + apply_rec_props(r, ns, &props); + } + + if (is_create) { + ssd->record_add_unique_counter++; + } + else if (STORAGE_RBLOCK_IS_VALID(r->rblock_id)) { + // Replacing an existing record, undo its previous storage accounting. + ssd_block_free(&ssds->ssds[r->file_id], r->rblock_id, r->n_rblocks, + "record-add"); + ssd->record_add_replace_counter++; + } + else { + cf_warning(AS_DRV_SSD, "replacing record with invalid rblock-id"); + } + + ssd_cold_start_transition_record(ns, block, r, is_create); + + // Update storage accounting to include this record. + // TODO - pass in size instead of n_rblocks. + uint32_t size = (uint32_t)RBLOCKS_TO_BYTES(n_rblocks); + + ssd->inuse_size += size; + ssd->alloc_table->wblock_state[wblock_id].inuse_sz += size; + + // Set/reset the record's storage information. + r->file_id = ssd->file_id; + r->rblock_id = rblock_id; + r->n_rblocks = n_rblocks; + + as_record_done(&r_ref, ns); +} + + +// Sweep through a storage device to rebuild the index. +void +ssd_cold_start_sweep(drv_ssds *ssds, drv_ssd *ssd) +{ + size_t wblock_size = ssd->write_block_size; + + uint8_t *buf = cf_valloc(wblock_size); + + bool read_shadow = ssd->shadow_name; + char *read_ssd_name = read_shadow ? 
ssd->shadow_name : ssd->name; + int fd = read_shadow ? ssd_shadow_fd_get(ssd) : ssd_fd_get(ssd); + int write_fd = read_shadow ? ssd_fd_get(ssd) : -1; + + // Seek past the header. + + if (lseek(fd, SSD_HEADER_SIZE, SEEK_SET) != SSD_HEADER_SIZE) { + cf_crash(AS_DRV_SSD, "%s: seek failed: errno %d (%s)", read_ssd_name, + errno, cf_strerror(errno)); + } + + if (read_shadow && + lseek(write_fd, SSD_HEADER_SIZE, SEEK_SET) != SSD_HEADER_SIZE) { + cf_crash(AS_DRV_SSD, "%s: seek failed: errno %d (%s)", ssd->name, + errno, cf_strerror(errno)); + } + + // Loop over all wblocks, unless we encounter 10 contiguous unused wblocks. + + ssd->sweep_wblock_id = SSD_HEADER_SIZE / (uint32_t)wblock_size; + + uint64_t file_offset = SSD_HEADER_SIZE; + uint32_t n_unused_wblocks = 0; + + while (file_offset < ssd->file_size && n_unused_wblocks < 10) { + if (read(fd, buf, wblock_size) != wblock_size) { + cf_crash(AS_DRV_SSD, "%s: read failed: errno %d (%s)", + read_ssd_name, errno, cf_strerror(errno)); + } + + if (read_shadow && + write(write_fd, (void*)buf, wblock_size) != wblock_size) { + cf_crash(AS_DRV_SSD, "%s: write failed: errno %d (%s)", ssd->name, + errno, cf_strerror(errno)); + } + + size_t indent = 0; // current offset within wblock, in bytes + + while (indent < wblock_size) { + drv_ssd_block *block = (drv_ssd_block*)&buf[indent]; + + ssd_decrypt(ssd, file_offset + indent, block); + + // Look for record magic. + if (block->magic != SSD_BLOCK_MAGIC) { + // Should always find a record at beginning of used wblock. if + // not, we've likely encountered the unused part of the device. + if (indent == 0) { + n_unused_wblocks++; + break; // try next wblock + } + // else - nothing more in this wblock, but keep looking for + // magic - necessary if we want to be able to increase + // write-block-size across restarts. 
+ + indent += RBLOCK_SIZE; + continue; // try next rblock + } + + if (n_unused_wblocks != 0) { + cf_warning(AS_DRV_SSD, "%s: found used wblock after skipping %u unused", + ssd->name, n_unused_wblocks); + + n_unused_wblocks = 0; // restart contiguous count + } + + // Note - if block->length is sane, we don't need to round up to a + // multiple of RBLOCK_SIZE, but let's do it anyway just to be safe. + size_t next_indent = indent + + BYTES_TO_RBLOCK_BYTES(block->length + LENGTH_BASE); + + // Sanity-check for wblock overruns. + if (next_indent > wblock_size) { + cf_warning(AS_DRV_SSD, "%s: record crosses wblock boundary: block-length %u", + ssd->name, block->length); + break; // skip this record, try next wblock + } + + // Found a record - try to add it to the index. + ssd_cold_start_add_record(ssds, ssd, block, + BYTES_TO_RBLOCKS(file_offset + indent), + (uint32_t)BYTES_TO_RBLOCKS(next_indent - indent)); + + indent = next_indent; + } + + file_offset += wblock_size; + ssd->sweep_wblock_id++; + } + + ssd->sweep_wblock_id = (uint32_t)(ssd->file_size / wblock_size); + + if (fd != -1) { + read_shadow ? ssd_shadow_fd_put(ssd, fd) : ssd_fd_put(ssd, fd); + } + + if (write_fd != -1) { + ssd_fd_put(ssd, write_fd); + } + + cf_free(buf); +} + + +// Thread "run" function to read a storage device and rebuild the index. 
+void * +run_ssd_cold_start(void *udata) +{ + ssd_load_records_info *lri = (ssd_load_records_info*)udata; + drv_ssd *ssd = lri->ssd; + drv_ssds *ssds = lri->ssds; + cf_queue *complete_q = lri->complete_q; + void *complete_udata = lri->complete_udata; + void *complete_rc = lri->complete_rc; + + cf_free(lri); + + as_namespace* ns = ssds->ns; + + cf_info(AS_DRV_SSD, "device %s: reading device to load index", ssd->name); + + CF_ALLOC_SET_NS_ARENA(ns); + + ssd_cold_start_sweep(ssds, ssd); + + cf_info(AS_DRV_SSD, "device %s: read complete: UNIQUE %lu (REPLACED %lu) (OLDER %lu) (EXPIRED %lu) (MAX-TTL %lu) records", + ssd->name, ssd->record_add_unique_counter, + ssd->record_add_replace_counter, ssd->record_add_older_counter, + ssd->record_add_expired_counter, ssd->record_add_max_ttl_counter); + + if (cf_rc_release(complete_rc) == 0) { + // All drives are done reading. + + ns->loading_records = false; + ssd_cold_start_drop_cenotaphs(ns); + ssd_load_wblock_queues(ssds); + + pthread_mutex_destroy(&ns->cold_start_evict_lock); + + cf_queue_push(complete_q, &complete_udata); + cf_rc_free(complete_rc); + + as_truncate_list_cenotaphs(ns); + as_truncate_done_startup(ns); // set truncate last-update-times in sets' vmap + + ssd_start_maintenance_threads(ssds); + ssd_start_write_worker_threads(ssds); + ssd_start_defrag_threads(ssds); + } + + return NULL; +} + + +void +start_loading_records(drv_ssds *ssds, cf_queue *complete_q, void *udata) +{ + as_namespace *ns = ssds->ns; + + ns->loading_records = true; + + void *p = cf_rc_alloc(1); + + for (int i = 1; i < ssds->n_ssds; i++) { + cf_rc_reserve(p); + } + + pthread_t thread; + pthread_attr_t attrs; + + pthread_attr_init(&attrs); + pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED); + + for (int i = 0; i < ssds->n_ssds; i++) { + drv_ssd *ssd = &ssds->ssds[i]; + ssd_load_records_info *lri = cf_malloc(sizeof(ssd_load_records_info)); + + lri->ssds = ssds; + lri->ssd = ssd; + lri->complete_q = complete_q; + lri->complete_udata = 
udata; + lri->complete_rc = p; + + pthread_create(&thread, &attrs, + ns->cold_start ? run_ssd_cold_start : run_ssd_cool_start, lri); + } +} + + +//========================================================== +// Generic startup utilities. +// + +static int +first_used_device(ssd_device_header *headers[], int n_ssds) +{ + for (int i = 0; i < n_ssds; i++) { + if (headers[i]->random != 0) { + return i; + } + } + + return -1; +} + + +static bool +stored_version_has_data(drv_ssds *ssds, uint32_t pid) +{ + info_buf *b = (info_buf*) + (ssds->header->info_data + (SSD_HEADER_INFO_STRIDE * pid)); + + return as_partition_version_has_data(&b->version); +} + + +bool +ssd_load_records(drv_ssds *ssds, cf_queue *complete_q, void *udata) +{ + uint64_t random = cf_get_rand64(); + + int n_ssds = ssds->n_ssds; + as_namespace *ns = ssds->ns; + + ssd_device_header *headers[n_ssds]; + + // Check all the headers. Pick one as the representative. + for (int i = 0; i < n_ssds; i++) { + drv_ssd *ssd = &ssds->ssds[i]; + int rvh = ssd_read_header(ssd, ns, &headers[i]); + + if (rvh == -1) { + cf_crash(AS_DRV_SSD, "unable to read disk header %s", ssd->name); + } + + if (rvh == -2) { + headers[i] = ssd_init_header(ns); + } + } + + int first_used = first_used_device(headers, n_ssds); + + if (first_used == -1) { + // Shouldn't find all fresh headers here during warm or cool restart. + if (! ns->cold_start) { + // There's no going back to cold start now - do so the harsh way. 
+ cf_crash(AS_DRV_SSD, "{%s}: found all %d devices fresh during %s restart", + ns->name, n_ssds, as_namespace_start_mode_str(ns)); + } + + cf_info(AS_DRV_SSD, "namespace %s: found all %d devices fresh, initializing to random %lu", + ns->name, n_ssds, random); + + ssds->header = headers[0]; + + for (int i = 1; i < n_ssds; i++) { + cf_free(headers[i]); + } + + ssd_init_trusted(ns); + + ssds->header->random = random; + ssds->header->devices_n = n_ssds; + + ssd_adjust_versions(ns, ssds->header); + + as_storage_info_flush_ssd(ns); + + as_truncate_list_cenotaphs(ns); // all will show as cenotaph + as_truncate_done_startup(ns); + + return true; + } + + // At least one device is not fresh. Check that all non-fresh devices match. + + bool fresh_drive = false; + bool untrusted_drive = false; + + for (int i = 0; i < n_ssds; i++) { + drv_ssd *ssd = &ssds->ssds[i]; + + // Skip fresh devices. + if (headers[i]->random == 0) { + ssd->started_fresh = true; // warm or cool restart needs to know + fresh_drive = true; + continue; + } + + if (headers[first_used]->random != headers[i]->random) { + cf_crash(AS_DRV_SSD, "namespace %s: drive set with unmatched headers - devices %s & %s have different signatures", + ns->name, ssds->ssds[first_used].name, ssd->name); + } + + if (headers[first_used]->devices_n != headers[i]->devices_n) { + cf_crash(AS_DRV_SSD, "namespace %s: drive set with unmatched headers - devices %s & %s have different device counts", + ns->name, ssds->ssds[first_used].name, ssd->name); + } + + if (headers[first_used]->last_evict_void_time != + headers[i]->last_evict_void_time) { + cf_warning(AS_DRV_SSD, "namespace %s: devices have inconsistent evict-void-times - ignoring", + ns->name); + headers[first_used]->last_evict_void_time = 0; + } + + untrusted_drive = ssd_is_untrusted(ns, headers[i]->flags); + } + + // Drive set OK - fix up header set. 
+ ssds->header = headers[first_used]; + headers[first_used] = 0; + + for (int i = 0; i < n_ssds; i++) { + if (headers[i]) { + cf_free(headers[i]); + headers[i] = 0; + } + } + + ssd_init_trusted(ns); + + ssds->header->random = random; + ssds->header->devices_n = n_ssds; // may have added fresh drives + + if (fresh_drive || untrusted_drive) { + ssd_adjust_versions(ns, ssds->header); + } + + as_storage_info_flush_ssd(ns); + + // Cache booleans indicating whether partitions are owned or not. + for (uint32_t pid = 0; pid < AS_PARTITIONS; pid++) { + ssds->get_state_from_storage[pid] = + stored_version_has_data(ssds, pid); + } + + // Warm or cool restart. + if (! ns->cold_start) { + as_truncate_done_startup(ns); // set truncate last-update-times in sets' vmap + ssd_resume_devices(ssds); + + if (as_namespace_cool_restarts(ns)) { + // Cool restart - fire off threads to load record data - will signal + // completion when threads are all done. + start_loading_records(ssds, complete_q, udata); + + // Make sure caller doesn't signal completion. + return false; + } + + return true; // warm restart (done) + } + + // Initialize the cold start eviction machinery. + + if (0 != pthread_mutex_init(&ns->cold_start_evict_lock, NULL)) { + cf_crash(AS_DRV_SSD, "failed cold start eviction mutex init"); + } + + uint32_t now = as_record_void_time_get(); + + if (ns->cold_start_evict_ttl == 0xFFFFffff) { + // Config file did NOT specify cold-start-evict-ttl. + ns->cold_start_threshold_void_time = ssds->header->last_evict_void_time; + + // Check that it's not already in the past. (Note - includes 0.) + if (ns->cold_start_threshold_void_time < now) { + ns->cold_start_threshold_void_time = now; + } + else { + cf_info(AS_DRV_SSD, "namespace %s: using saved cold start evict-ttl %u", + ns->name, ns->cold_start_threshold_void_time - now); + } + } + else { + // Config file specified cold-start-evict-ttl. (0 is a valid value.) 
+ ns->cold_start_threshold_void_time = now + ns->cold_start_evict_ttl; + + cf_info(AS_DRV_SSD, "namespace %s: using config-specified cold start evict-ttl %u", + ns->name, ns->cold_start_evict_ttl); + } + + ns->cold_start_max_void_time = now + (uint32_t)ns->max_ttl; + + // Fire off threads to load record data - will signal completion when + // threads are all done. + start_loading_records(ssds, complete_q, udata); + + // Make sure caller doesn't signal completion. + return false; +} + + +// Set a device's system block scheduler mode. +static int +ssd_set_scheduler_mode(const char* device_name, const char* mode) +{ + if (strncmp(device_name, "/dev/", 5)) { + cf_warning(AS_DRV_SSD, "storage: invalid device name %s, did not set scheduler mode", + device_name); + return -1; + } + + char device_tag[(strlen(device_name) - 5) + 1]; + + strcpy(device_tag, device_name + 5); + + // Replace any slashes in the device tag with '!' - this is the naming + // convention in /sys/block. + char* p_char = device_tag; + + while (*p_char) { + if (*p_char == '/') { + *p_char = '!'; + } + + p_char++; + } + + char scheduler_file_name[17 + strlen(device_tag) + 3 + 16 + 1]; + + strcpy(scheduler_file_name, "/sys/class/block/"); + strcat(scheduler_file_name, device_tag); + + // Determine if this device is a partition. + char partition_file_name[strlen(scheduler_file_name) + 10 + 1]; + + strcpy(partition_file_name, scheduler_file_name); + strcat(partition_file_name, "/partition"); + + FILE* partition_file = fopen(partition_file_name, "r"); + + if (partition_file) { + fclose(partition_file); + + // This device is a partition, get parent device. + strcat(scheduler_file_name, "/.."); + } + + strcat(scheduler_file_name, "/queue/scheduler"); + + FILE* scheduler_file = fopen(scheduler_file_name, "w"); + + if (! 
scheduler_file) { + cf_warning(AS_DRV_SSD, "storage: couldn't open %s, did not set scheduler mode: %s", + scheduler_file_name, cf_strerror(errno)); + return -1; + } + + if (fwrite(mode, strlen(mode), 1, scheduler_file) != 1) { + fclose(scheduler_file); + + cf_warning(AS_DRV_SSD, "storage: couldn't write %s to %s, did not set scheduler mode", + mode, scheduler_file_name); + return -1; + } + + fclose(scheduler_file); + + cf_info(AS_DRV_SSD, "storage: set device %s scheduler mode to %s", + device_name, mode); + + return 0; +} + + +static uint64_t +check_file_size(as_namespace *ns, uint64_t file_size, const char *tag) +{ + cf_assert(sizeof(off_t) > 4, AS_DRV_SSD, "this OS supports only 32-bit (4g) files - compile with 64 bit offsets"); + + if (file_size > SSD_HEADER_SIZE) { + off_t unusable_size = + (file_size - SSD_HEADER_SIZE) % ns->storage_write_block_size; + + if (unusable_size != 0) { + cf_info(AS_DRV_SSD, "%s size must be header size %u + multiple of %u, rounding down", + tag, SSD_HEADER_SIZE, ns->storage_write_block_size); + file_size -= unusable_size; + } + + if (file_size > AS_STORAGE_MAX_DEVICE_SIZE) { + cf_warning(AS_DRV_SSD, "%s size must be <= %ld, trimming original size %ld", + tag, AS_STORAGE_MAX_DEVICE_SIZE, file_size); + file_size = AS_STORAGE_MAX_DEVICE_SIZE; + } + } + + if (file_size <= SSD_HEADER_SIZE) { + cf_crash(AS_DRV_SSD, "%s size %ld must be greater than header size %d", + tag, file_size, SSD_HEADER_SIZE); + } + + return file_size; +} + + +static uint64_t +find_io_min_size(int fd, const char *ssd_name) +{ + off_t off = lseek(fd, 0, SEEK_SET); + + if (off != 0) { + cf_crash(AS_DRV_SSD, "%s: seek error %s", ssd_name, cf_strerror(errno)); + } + + uint8_t *buf = cf_valloc(HI_IO_MIN_SIZE); + size_t read_sz = LO_IO_MIN_SIZE; + + while (read_sz <= HI_IO_MIN_SIZE) { + if (read(fd, (void*)buf, read_sz) == (ssize_t)read_sz) { + cf_free(buf); + return read_sz; + } + + read_sz <<= 1; // LO_IO_MIN_SIZE and HI_IO_MIN_SIZE are powers of 2 + } + + 
cf_crash(AS_DRV_SSD, "%s: read failed at all sizes from %u to %u bytes", + ssd_name, LO_IO_MIN_SIZE, HI_IO_MIN_SIZE); + + return 0; +} + + +int +ssd_init_devices(as_namespace *ns, drv_ssds **ssds_p) +{ + int n_ssds; + + for (n_ssds = 0; n_ssds < AS_STORAGE_MAX_DEVICES; n_ssds++) { + if (! ns->storage_devices[n_ssds]) { + break; + } + } + + size_t ssds_size = sizeof(drv_ssds) + (n_ssds * sizeof(drv_ssd)); + drv_ssds *ssds = cf_malloc(ssds_size); + + memset(ssds, 0, ssds_size); + ssds->n_ssds = n_ssds; + ssds->ns = ns; + + // Raw device-specific initialization of drv_ssd structures. + for (int i = 0; i < n_ssds; i++) { + drv_ssd *ssd = &ssds->ssds[i]; + + ssd->name = ns->storage_devices[i]; + + ssd->open_flag = O_RDWR | + (ns->storage_disable_odirect ? 0 : O_DIRECT) | + (ns->storage_enable_osync ? O_SYNC : 0); + + int fd = open(ssd->name, ssd->open_flag, S_IRUSR | S_IWUSR); + + if (-1 == fd) { + cf_warning(AS_DRV_SSD, "unable to open device %s: %s", ssd->name, + cf_strerror(errno)); + return -1; + } + + uint64_t size = 0; + + ioctl(fd, BLKGETSIZE64, &size); // gets the number of bytes + + ssd->file_size = check_file_size(ns, size, "usable device"); + ssd->io_min_size = find_io_min_size(fd, ssd->name); + + if (ns->cold_start && ns->storage_cold_start_empty) { + if (! ssd_empty_header(fd, ssd->name)) { + close(fd); + return -1; + } + + cf_info(AS_DRV_SSD, "cold-start-empty - erased header of %s", + ssd->name); + } + + close(fd); + + ns->ssd_size += ssd->file_size; // increment total storage size + + cf_info(AS_DRV_SSD, "opened device %s: usable size %lu, io-min-size %lu", + ssd->name, ssd->file_size, ssd->io_min_size); + + if (ns->storage_scheduler_mode) { + // Set scheduler mode specified in config file. 
+ ssd_set_scheduler_mode(ssd->name, ns->storage_scheduler_mode); + } + } + + *ssds_p = ssds; + + return 0; +} + + +int +ssd_init_shadows(as_namespace *ns, drv_ssds *ssds) +{ + int n_shadows = 0; + + for (int n = 0; n < ssds->n_ssds; n++) { + if (ns->storage_shadows[n]) { + n_shadows++; + } + } + + if (n_shadows == 0) { + // No shadows - a normal deployment. + return 0; + } + + if (n_shadows != ssds->n_ssds) { + cf_warning(AS_DRV_SSD, "configured %d devices but only %d shadows", + ssds->n_ssds, n_shadows); + return -1; + } + + // Check shadow devices. + for (int i = 0; i < n_shadows; i++) { + drv_ssd *ssd = &ssds->ssds[i]; + + ssd->shadow_name = ns->storage_shadows[i]; + + int fd = open(ssd->shadow_name, ssd->open_flag, S_IRUSR | S_IWUSR); + + if (-1 == fd) { + cf_warning(AS_DRV_SSD, "unable to open shadow device %s: %s", + ssd->shadow_name, cf_strerror(errno)); + return -1; + } + + uint64_t size = 0; + + ioctl(fd, BLKGETSIZE64, &size); // gets the number of bytes + + if (size < ssd->file_size) { + cf_warning(AS_DRV_SSD, "shadow device %s is smaller than main device - %lu < %lu", + ssd->shadow_name, size, ssd->file_size); + close(fd); + return -1; + } + + if (ns->cold_start && ns->storage_cold_start_empty) { + if (! ssd_empty_header(fd, ssd->shadow_name)) { + close(fd); + return -1; + } + + cf_info(AS_DRV_SSD, "cold-start-empty - erased header of %s", + ssd->shadow_name); + } + + close(fd); + + cf_info(AS_DRV_SSD, "shadow device %s is compatible with main device", + ssd->shadow_name); + + if (ns->storage_scheduler_mode) { + // Set scheduler mode specified in config file. + ssd_set_scheduler_mode(ssd->shadow_name, + ns->storage_scheduler_mode); + } + } + + return 0; +} + + +int +ssd_init_files(as_namespace *ns, drv_ssds **ssds_p) +{ + int n_ssds; + + for (n_ssds = 0; n_ssds < AS_STORAGE_MAX_FILES; n_ssds++) { + if (! 
ns->storage_files[n_ssds]) { + break; + } + } + + size_t ssds_size = sizeof(drv_ssds) + (n_ssds * sizeof(drv_ssd)); + drv_ssds *ssds = cf_malloc(ssds_size); + + memset(ssds, 0, ssds_size); + ssds->n_ssds = n_ssds; + ssds->ns = ns; + + // File-specific initialization of drv_ssd structures. + for (int i = 0; i < n_ssds; i++) { + drv_ssd *ssd = &ssds->ssds[i]; + + ssd->name = ns->storage_files[i]; + + if (ns->cold_start && ns->storage_cold_start_empty) { + if (0 == remove(ssd->name)) { + cf_info(AS_DRV_SSD, "cold-start-empty - removed %s", ssd->name); + } + else if (errno == ENOENT) { + cf_info(AS_DRV_SSD, "cold-start-empty - no file %s", ssd->name); + } + else { + cf_warning(AS_DRV_SSD, "failed remove: errno %d", errno); + return -1; + } + } + + ssd->open_flag = O_RDWR; + + // Validate that file can be opened, create it if it doesn't exist. + int fd = open(ssd->name, ssd->open_flag | O_CREAT, S_IRUSR | S_IWUSR); + + if (-1 == fd) { + cf_warning(AS_DRV_SSD, "unable to open file %s: %s", ssd->name, + cf_strerror(errno)); + return -1; + } + + ssd->file_size = check_file_size(ns, ns->storage_filesize, "file"); + ssd->io_min_size = LO_IO_MIN_SIZE; + + // Truncate will grow or shrink the file to the correct size. + if (0 != ftruncate(fd, (off_t)ssd->file_size)) { + cf_warning(AS_DRV_SSD, "unable to truncate file: errno %d", errno); + close(fd); + return -1; + } + + close(fd); + + ns->ssd_size += ssd->file_size; // increment total storage size + + cf_info(AS_DRV_SSD, "opened file %s: usable size %lu", ssd->name, + ssd->file_size); + } + + *ssds_p = ssds; + + return 0; +} + + +//========================================================== +// Storage API implementation: startup, shutdown, etc. 
+// + +int +as_storage_namespace_init_ssd(as_namespace *ns, cf_queue *complete_q, + void *udata) +{ + drv_ssds *ssds; + + if (ns->storage_devices[0]) { + if (0 != ssd_init_devices(ns, &ssds)) { + cf_warning(AS_DRV_SSD, "{%s} can't initialize devices", ns->name); + return -1; + } + + if (0 != ssd_init_shadows(ns, ssds)) { + cf_warning(AS_DRV_SSD, "{%s} can't initialize shadows", ns->name); + return -1; + } + } + else if (ns->storage_files[0]) { + if (0 != ssd_init_files(ns, &ssds)) { + cf_warning(AS_DRV_SSD, "{%s} can't initialize files", ns->name); + return -1; + } + } + else { + cf_warning(AS_DRV_SSD, "{%s} has no devices or files", ns->name); + return -1; + } + + // Allow defrag to go full speed during startup - restore the configured + // settings when startup is done. + ns->saved_defrag_sleep = ns->storage_defrag_sleep; + ns->storage_defrag_sleep = 0; + + // The queue limit is more efficient to work with. + ns->storage_max_write_q = (int) + (ns->storage_max_write_cache / ns->storage_write_block_size); + + // Minimize how often we recalculate this. + ns->defrag_lwm_size = + (ns->storage_write_block_size * ns->storage_defrag_lwm_pct) / 100; + + ns->storage_private = (void*)ssds; + + char histname[HISTOGRAM_NAME_SIZE]; + + snprintf(histname, sizeof(histname), "{%s}-device-read-size", ns->name); + ns->device_read_size_hist = histogram_create(histname, HIST_SIZE); + + snprintf(histname, sizeof(histname), "{%s}-device-write-size", ns->name); + ns->device_write_size_hist = histogram_create(histname, HIST_SIZE); + + // Finish initializing drv_ssd structures (non-zero-value members). 
+ for (int i = 0; i < ssds->n_ssds; i++) { + drv_ssd *ssd = &ssds->ssds[i]; + + ssd->ns = ns; + ssd->file_id = i; + + pthread_mutex_init(&ssd->write_lock, 0); + pthread_mutex_init(&ssd->defrag_lock, 0); + + ssd->running = true; + + ssd->data_in_memory = ns->storage_data_in_memory; + ssd->write_block_size = ns->storage_write_block_size; + + ssd_wblock_init(ssd); + + // Note: free_wblock_q, defrag_wblock_q created after loading devices. + + ssd->fd_q = cf_queue_create(sizeof(int), true); + + if (ssd->shadow_name) { + ssd->shadow_fd_q = cf_queue_create(sizeof(int), true); + } + + ssd->swb_write_q = cf_queue_create(sizeof(void*), true); + + if (ssd->shadow_name) { + ssd->swb_shadow_q = cf_queue_create(sizeof(void*), true); + } + + ssd->swb_free_q = cf_queue_create(sizeof(void*), true); + + if (! ns->storage_data_in_memory) { + ssd->post_write_q = cf_queue_create(sizeof(void*), false); + } + + snprintf(histname, sizeof(histname), "{%s}-%s-read", ns->name, ssd->name); + ssd->hist_read = histogram_create(histname, HIST_MILLISECONDS); + + snprintf(histname, sizeof(histname), "{%s}-%s-large-block-read", ns->name, ssd->name); + ssd->hist_large_block_read = histogram_create(histname, HIST_MILLISECONDS); + + snprintf(histname, sizeof(histname), "{%s}-%s-write", ns->name, ssd->name); + ssd->hist_write = histogram_create(histname, HIST_MILLISECONDS); + + if (ssd->shadow_name) { + snprintf(histname, sizeof(histname), "{%s}-%s-shadow-write", ns->name, ssd->name); + ssd->hist_shadow_write = histogram_create(histname, HIST_MILLISECONDS); + } + + snprintf(histname, sizeof(histname), "{%s}-%s-fsync", ns->name, ssd->name); + ssd->hist_fsync = histogram_create(histname, HIST_MILLISECONDS); + + ssd_init_commit(ssd); + } + + // Attempt to load the data. + // + // Return value 'false' means it's going to take a long time and will later + // asynchronously signal completion via the complete_q, 'true' means it's + // finished, signal here. 
+ + if (ssd_load_records(ssds, complete_q, udata)) { + ssd_load_wblock_queues(ssds); + + cf_queue_push(complete_q, &udata); + + ssd_start_maintenance_threads(ssds); + ssd_start_write_worker_threads(ssds); + ssd_start_defrag_threads(ssds); + } + + return 0; +} + + +void +as_storage_loading_records_ticker_ssd() +{ + for (uint32_t i = 0; i < g_config.n_namespaces; i++) { + as_namespace *ns = g_config.namespaces[i]; + + if (ns->loading_records) { + char buf[2048]; + int pos = 0; + drv_ssds *ssds = (drv_ssds*)ns->storage_private; + + for (int j = 0; j < ssds->n_ssds; j++) { + drv_ssd *ssd = &ssds->ssds[j]; + uint32_t pct = (uint32_t)((ssd->sweep_wblock_id * 100UL) / + (ssd->file_size / ssd->write_block_size)); + + pos += sprintf(buf + pos, ", %s %u%%", ssd->name, pct); + } + + // TODO - conform with new log standard? + if (ns->n_tombstones == 0) { + cf_info(AS_DRV_SSD, "{%s} loaded %lu objects%s", ns->name, + ns->n_objects, buf); + } + else { + cf_info(AS_DRV_SSD, "{%s} loaded %lu objects, %lu tombstones%s", + ns->name, ns->n_objects, ns->n_tombstones, buf); + } + } + } +} + + +int +as_storage_namespace_destroy_ssd(as_namespace *ns) +{ + // This is not called - for now we don't bother unwinding. + return 0; +} + + +// Note that this is *NOT* the counterpart to as_storage_record_create_ssd()! +// That would be as_storage_record_close_ssd(). This is what gets called when a +// record is destroyed, to dereference storage. +int +as_storage_record_destroy_ssd(as_namespace *ns, as_record *r) +{ + if (STORAGE_RBLOCK_IS_VALID(r->rblock_id) && r->n_rblocks != 0) { + drv_ssds *ssds = (drv_ssds*)ns->storage_private; + drv_ssd *ssd = &ssds->ssds[r->file_id]; + + ssd_block_free(ssd, r->rblock_id, r->n_rblocks, "destroy"); + + r->rblock_id = 0; + r->n_rblocks = 0; + } + + return 0; +} + + +//========================================================== +// Storage API implementation: as_storage_rd cycle. 
+// + +int +as_storage_record_create_ssd(as_storage_rd *rd) +{ + rd->block = NULL; + rd->must_free_block = NULL; + rd->ssd = NULL; + + cf_assert(rd->r->rblock_id == 0, AS_DRV_SSD, "unexpected - uninitialized rblock-id"); + + return 0; +} + + +int +as_storage_record_open_ssd(as_storage_rd *rd) +{ + drv_ssds *ssds = (drv_ssds*)rd->ns->storage_private; + + rd->block = NULL; + rd->must_free_block = NULL; + rd->ssd = &ssds->ssds[rd->r->file_id]; + + return 0; +} + + +int +as_storage_record_close_ssd(as_storage_rd *rd) +{ + if (rd->must_free_block) { + cf_free(rd->must_free_block); + rd->must_free_block = NULL; + rd->block = NULL; + } + + return 0; +} + + +// These are near the top of this file: +// as_storage_record_get_n_bins_ssd() +// as_storage_record_read_ssd() +// as_storage_particle_read_all_ssd() +// as_storage_particle_read_and_size_all_ssd() + + +bool +as_storage_record_size_and_check_ssd(as_storage_rd *rd) +{ + return rd->ns->storage_write_block_size >= ssd_record_size(rd); +} + + +//========================================================== +// Storage API implementation: storage capacity monitoring. +// + +void +as_storage_wait_for_defrag_ssd(as_namespace *ns) +{ + if (ns->storage_defrag_startup_minimum > 0) { + while (true) { + int avail_pct; + + if (0 != as_storage_stats_ssd(ns, &avail_pct, 0)) { + cf_crash(AS_DRV_SSD, "namespace %s storage stats failed", + ns->name); + } + + if (avail_pct >= ns->storage_defrag_startup_minimum) { + break; + } + + cf_info(AS_DRV_SSD, "namespace %s waiting for defrag: %d pct available, waiting for %d ...", + ns->name, avail_pct, ns->storage_defrag_startup_minimum); + + sleep(2); + } + } + + // Restore configured defrag throttling values. + ns->storage_defrag_sleep = ns->saved_defrag_sleep; +} + + +bool +as_storage_overloaded_ssd(as_namespace *ns) +{ + drv_ssds *ssds = (drv_ssds*)ns->storage_private; + int max_write_q = ns->storage_max_write_q; + + // TODO - would be nice to not do this loop every single write transaction! 
+ for (int i = 0; i < ssds->n_ssds; i++) { + drv_ssd *ssd = &ssds->ssds[i]; + int qsz = cf_queue_sz(ssd->swb_write_q); + + if (qsz > max_write_q) { + cf_ticker_warning(AS_DRV_SSD, "{%s} write fail: queue too deep: exceeds max %d", + ns->name, max_write_q); + return true; + } + + if (ssd->shadow_name) { + qsz = cf_queue_sz(ssd->swb_shadow_q); + + if (qsz > max_write_q) { + cf_ticker_warning(AS_DRV_SSD, "{%s} write fail: shadow queue too deep: exceeds max %d", + ns->name, max_write_q); + return true; + } + } + } + + return false; +} + + +bool +as_storage_has_space_ssd(as_namespace *ns) +{ + // Shortcut - assume we can't go from 5% to 0% in 1 ticker interval. + if (ns->storage_last_avail_pct > 5) { + return true; + } + // else - running low on available percent, check rigorously... + + drv_ssds* ssds = (drv_ssds*)ns->storage_private; + + for (int i = 0; i < ssds->n_ssds; i++) { + if (cf_queue_sz(ssds->ssds[i].free_wblock_q) < min_free_wblocks(ns)) { + return false; + } + } + + return true; +} + + +void +as_storage_defrag_sweep_ssd(as_namespace *ns) +{ + cf_info(AS_DRV_SSD, "{%s} sweeping all devices for wblocks to defrag ...", ns->name); + + drv_ssds* ssds = (drv_ssds*)ns->storage_private; + + for (int i = 0; i < ssds->n_ssds; i++) { + cf_atomic32_incr(&ssds->ssds[i].defrag_sweep); + } +} + + +//========================================================== +// Storage API implementation: data in device headers. +// + +void +as_storage_info_set_ssd(as_namespace *ns, const as_partition *p, bool flush) +{ + drv_ssds *ssds = (drv_ssds*)ns->storage_private; + info_buf *b = (info_buf*) + (ssds->header->info_data + (SSD_HEADER_INFO_STRIDE * p->id)); + + // TODO - until future storage format change, we'll use partition 0 to save + // and restore ns->eventual_regime. + b->regime = p->id == 0 ? 
ns->eventual_regime : 0; + + b->version = p->version; + + if (flush) { + // TODO - in future storage format change, arrange for each stride to + // never cross an io-min-size boundary, so we can do less math here. + + uint64_t offset = (uint8_t*)b - (uint8_t*)ssds->header; + uint64_t end_offset = offset + SSD_HEADER_INFO_STRIDE; + + for (int i = 0; i < ssds->n_ssds; i++) { + drv_ssd *ssd = &ssds->ssds[i]; + + uint64_t flush_offset = BYTES_DOWN_TO_IO_MIN(ssd, offset); + uint64_t flush_end_offset = BYTES_UP_TO_IO_MIN(ssd, end_offset); + + ssd_write_header(ssd, ssds->header, + flush_offset, flush_end_offset - flush_offset); + } + } +} + + +void +as_storage_info_get_ssd(as_namespace *ns, as_partition *p) +{ + drv_ssds *ssds = (drv_ssds*)ns->storage_private; + info_buf *b = (info_buf*) + (ssds->header->info_data + (SSD_HEADER_INFO_STRIDE * p->id)); + + if (p->id == 0) { + ns->eventual_regime = b->regime; + ns->rebalance_regime = b->regime; + } + + p->version = b->version; +} + + +int +as_storage_info_flush_ssd(as_namespace *ns) +{ + drv_ssds *ssds = (drv_ssds*)ns->storage_private; + + for (int i = 0; i < ssds->n_ssds; i++) { + drv_ssd *ssd = &ssds->ssds[i]; + + ssd_write_header(ssd, ssds->header, 0, SSD_HEADER_SIZE); + } + + return 0; +} + + +void +as_storage_save_evict_void_time_ssd(as_namespace *ns, uint32_t evict_void_time) +{ + drv_ssds* ssds = (drv_ssds*)ns->storage_private; + + ssds->header->last_evict_void_time = evict_void_time; + + // Customized write instead of using as_storage_info_flush_ssd() so we can + // write 512-4096b instead of 1Mb (and not interfere with potentially + // concurrent writes for partition info). + + for (int i = 0; i < ssds->n_ssds; i++) { + drv_ssd* ssd = &ssds->ssds[i]; + size_t peek_size = BYTES_UP_TO_IO_MIN(ssd, sizeof(ssd_device_header)); + + ssd_write_header(ssd, ssds->header, 0, peek_size); + } +} + + +//========================================================== +// Storage API implementation: statistics. 
+// + +int +as_storage_stats_ssd(as_namespace *ns, int *available_pct, + uint64_t *used_disk_bytes) +{ + drv_ssds *ssds = (drv_ssds*)ns->storage_private; + + if (available_pct) { + *available_pct = 100; + + // Find the device with the lowest available percent. + for (int i = 0; i < ssds->n_ssds; i++) { + drv_ssd *ssd = &ssds->ssds[i]; + uint64_t pct = (available_size(ssd) * 100) / ssd->file_size; + + if (pct < (uint64_t)*available_pct) { + *available_pct = pct; + } + } + + // Used for shortcut in as_storage_has_space_ssd(), which is done on a + // per-transaction basis: + ns->storage_last_avail_pct = *available_pct; + } + + if (used_disk_bytes) { + uint64_t sz = 0; + + for (int i = 0; i < ssds->n_ssds; i++) { + sz += ssds->ssds[i].inuse_size; + } + + *used_disk_bytes = sz; + } + + return 0; +} + + +int +as_storage_ticker_stats_ssd(as_namespace *ns) +{ + histogram_dump(ns->device_read_size_hist); + histogram_dump(ns->device_write_size_hist); + + drv_ssds *ssds = (drv_ssds*)ns->storage_private; + + for (int i = 0; i < ssds->n_ssds; i++) { + drv_ssd *ssd = &ssds->ssds[i]; + + histogram_dump(ssd->hist_read); + histogram_dump(ssd->hist_large_block_read); + histogram_dump(ssd->hist_write); + + if (ssd->hist_shadow_write) { + histogram_dump(ssd->hist_shadow_write); + } + + histogram_dump(ssd->hist_fsync); + } + + return 0; +} + + +int +as_storage_histogram_clear_ssd(as_namespace *ns) +{ + drv_ssds *ssds = (drv_ssds*)ns->storage_private; + + for (int i = 0; i < ssds->n_ssds; i++) { + drv_ssd *ssd = &ssds->ssds[i]; + + histogram_clear(ssd->hist_read); + histogram_clear(ssd->hist_large_block_read); + histogram_clear(ssd->hist_write); + + if (ssd->hist_shadow_write) { + histogram_clear(ssd->hist_shadow_write); + } + + histogram_clear(ssd->hist_fsync); + } + + return 0; +} + + +//========================================================== +// Shutdown. 
+// + +void +as_storage_shutdown_ssd(as_namespace *ns) +{ + drv_ssds *ssds = (drv_ssds*)ns->storage_private; + + for (int i = 0; i < ssds->n_ssds; i++) { + drv_ssd *ssd = &ssds->ssds[i]; + + // Stop the maintenance thread from (also) flushing the swbs. + pthread_mutex_lock(&ssd->write_lock); + pthread_mutex_lock(&ssd->defrag_lock); + + // Flush current swb by pushing it to write-q. + if (ssd->current_swb) { + // Clean the end of the buffer before pushing to write-q. + if (ssd->write_block_size > ssd->current_swb->pos) { + memset(&ssd->current_swb->buf[ssd->current_swb->pos], 0, + ssd->write_block_size - ssd->current_swb->pos); + } + + cf_queue_push(ssd->swb_write_q, &ssd->current_swb); + ssd->current_swb = NULL; + } + + // Flush defrag swb by pushing it to write-q. + if (ssd->defrag_swb) { + // Clean the end of the buffer before pushing to write-q. + if (ssd->write_block_size > ssd->defrag_swb->pos) { + memset(&ssd->defrag_swb->buf[ssd->defrag_swb->pos], 0, + ssd->write_block_size - ssd->defrag_swb->pos); + } + + cf_queue_push(ssd->swb_write_q, &ssd->defrag_swb); + ssd->defrag_swb = NULL; + } + } + + for (int i = 0; i < ssds->n_ssds; i++) { + drv_ssd *ssd = &ssds->ssds[i]; + + while (cf_queue_sz(ssd->swb_write_q)) { + usleep(1000); + } + + if (ssd->shadow_name) { + while (cf_queue_sz(ssd->swb_shadow_q)) { + usleep(1000); + } + } + + ssd->running = false; + } + + for (int i = 0; i < ssds->n_ssds; i++) { + drv_ssd *ssd = &ssds->ssds[i]; + void *p_void; + + for (uint32_t j = 0; j < ssds->ns->storage_write_threads; j++) { + pthread_join(ssd->write_worker_thread[j], &p_void); + } + + if (ssd->shadow_name) { + pthread_join(ssd->shadow_worker_thread, &p_void); + } + } + + ssd_set_trusted(ns); +} diff --git a/as/src/storage/drv_ssd_ce.c b/as/src/storage/drv_ssd_ce.c new file mode 100644 index 00000000..9b52934c --- /dev/null +++ b/as/src/storage/drv_ssd_ce.c @@ -0,0 +1,181 @@ +/* + * drv_ssd_cold.c + * + * Copyright (C) 2014 Aerospike, Inc. 
+ * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#include "storage/drv_ssd.h" +#include +#include +#include +#include "fault.h" +#include "base/datamodel.h" +#include "base/rec_props.h" +#include "storage/storage.h" + + +void +ssd_resume_devices(drv_ssds* ssds) +{ + // Should not get here - for enterprise version only. + cf_crash(AS_DRV_SSD, "cold start called ssd_resume_devices()"); +} + +void* +run_ssd_cool_start(void* udata) +{ + // Should not get here - for enterprise version only. + cf_crash(AS_DRV_SSD, "community edition called run_ssd_cool_start()"); + + return NULL; +} + +void +ssd_header_init_cfg(const as_namespace* ns, ssd_device_header* header) +{ +} + +bool +ssd_header_is_valid_cfg(const as_namespace* ns, const ssd_device_header* header) +{ + return true; +} + +bool +ssd_cold_start_is_valid_n_bins(uint32_t n_bins) +{ + // FIXME - what should we do here? 
+ cf_assert(n_bins != 0, AS_DRV_SSD, + "community edition found tombstone - erase drive and restart"); + + return n_bins <= BIN_NAMES_QUOTA; +} + +bool +ssd_cold_start_is_record_truncated(as_namespace* ns, const drv_ssd_block* block, + const as_rec_props* p_props) +{ + return false; +} + +void +ssd_cold_start_adjust_cenotaph(as_namespace* ns, const drv_ssd_block* block, + as_record* r) +{ + // Nothing to do - relevant for enterprise version only. +} + +void +ssd_cold_start_transition_record(as_namespace* ns, const drv_ssd_block* block, + as_record* r, bool is_create) +{ + // Nothing to do - relevant for enterprise version only. +} + +void +ssd_cold_start_drop_cenotaphs(as_namespace* ns) +{ + // Nothing to do - relevant for enterprise version only. +} + +void +ssd_adjust_versions(as_namespace* ns, ssd_device_header* header) +{ + // Nothing to do - relevant for enterprise version only. +} + +conflict_resolution_pol +ssd_cold_start_policy(as_namespace *ns) +{ + return AS_NAMESPACE_CONFLICT_RESOLUTION_POLICY_LAST_UPDATE_TIME; +} + +void +ssd_cold_start_init_repl_state(as_namespace* ns, as_record* r) +{ + // Nothing to do - relevant for enterprise version only. +} + +void +ssd_init_commit(drv_ssd *ssd) +{ + // Nothing to do - relevant for enterprise version only. +} + +uint64_t +ssd_flush_max_us(const as_namespace *ns) +{ + return ns->storage_flush_max_us; +} + +int +ssd_write_bins(as_storage_rd *rd) +{ + return ssd_buffer_bins(rd); +} + +void +ssd_init_trusted(as_namespace* ns) +{ + // Nothing to do - relevant for enterprise version only. +} + +bool +ssd_is_untrusted(as_namespace *ns, uint8_t header_flags) +{ + return false; +} + +void +ssd_set_trusted(as_namespace* ns) +{ + // Nothing to do - relevant for enterprise version only. +} + +void +as_storage_start_tomb_raider_ssd(as_namespace* ns) +{ + // Tomb raider is for enterprise version only. +} + +int +as_storage_record_write_ssd(as_storage_rd* rd) +{ + // All record writes except defrag come through here! 
+ return as_bin_inuse_has(rd) ? ssd_write(rd) : 0; +} + +void +ssd_init_encryption_key(as_namespace* ns) +{ +} + +void +ssd_do_encrypt(const uint8_t* key, uint64_t off, drv_ssd_block* block) +{ + // Should not get here - for enterprise version only. + cf_crash(AS_DRV_SSD, "community edition called ssd_do_encrypt()"); +} + +void +ssd_do_decrypt(const uint8_t* key, uint64_t off, drv_ssd_block* block) +{ + // Should not get here - for enterprise version only. + cf_crash(AS_DRV_SSD, "community edition called ssd_do_decrypt()"); +} diff --git a/as/src/storage/storage.c b/as/src/storage/storage.c new file mode 100644 index 00000000..e4e1fb35 --- /dev/null +++ b/as/src/storage/storage.c @@ -0,0 +1,688 @@ +/* + * storage.c + * + * Copyright (C) 2009-2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +//========================================================== +// Includes. 
+// + +#include "storage/storage.h" + +#include +#include +#include +#include + +#include "citrusleaf/cf_digest.h" +#include "citrusleaf/cf_queue.h" + +#include "cf_mutex.h" +#include "fault.h" +#include "olock.h" + +#include "base/cfg.h" +#include "base/datamodel.h" +#include "base/index.h" +#include "base/rec_props.h" +#include "base/thr_info.h" +#include "fabric/partition.h" + + +//========================================================== +// Generic "base class" functions that call through +// storage-engine "v-tables". +// + +//-------------------------------------- +// as_storage_init +// + +typedef int (*as_storage_namespace_init_fn)(as_namespace *ns, cf_queue *complete_q, void *udata); +static const as_storage_namespace_init_fn as_storage_namespace_init_table[AS_NUM_STORAGE_ENGINES] = { + as_storage_namespace_init_memory, + as_storage_namespace_init_ssd +}; + +void +as_storage_init() +{ + cf_queue *complete_q = cf_queue_create(sizeof(void*), true); + + for (uint32_t i = 0; i < g_config.n_namespaces; i++) { + as_namespace *ns = g_config.namespaces[i]; + + if (as_storage_namespace_init_table[ns->storage_type]) { + if (0 != as_storage_namespace_init_table[ns->storage_type](ns, complete_q, NULL)) { + cf_crash(AS_STORAGE, "could not initialize storage for namespace %s", ns->name); + } + } + else { + cf_crash(AS_STORAGE, "invalid storage type for namespace %s", ns->name); + } + } + + for (uint32_t i = 0; i < g_config.n_namespaces; i++) { + void *_t; + + while (CF_QUEUE_OK != cf_queue_pop(complete_q, &_t, 2000)) { + as_storage_loading_records_ticker_ssd(); + } + } + + cf_queue_destroy(complete_q); +} + +//-------------------------------------- +// as_storage_start_tomb_raider +// + +typedef void (*as_storage_start_tomb_raider_fn)(as_namespace *ns); +static const as_storage_start_tomb_raider_fn as_storage_start_tomb_raider_table[AS_NUM_STORAGE_ENGINES] = { + as_storage_start_tomb_raider_memory, + as_storage_start_tomb_raider_ssd +}; + +void 
+as_storage_start_tomb_raider() +{ + for (uint32_t i = 0; i < g_config.n_namespaces; i++) { + as_namespace *ns = g_config.namespaces[i]; + + if (as_storage_start_tomb_raider_table[ns->storage_type]) { + as_storage_start_tomb_raider_table[ns->storage_type](ns); + } + } +} + +//-------------------------------------- +// as_storage_namespace_destroy +// + +typedef int (*as_storage_namespace_destroy_fn)(as_namespace *ns); +static const as_storage_namespace_destroy_fn as_storage_namespace_destroy_table[AS_NUM_STORAGE_ENGINES] = { + NULL, // memory has no destroy + as_storage_namespace_destroy_ssd +}; + +int +as_storage_namespace_destroy(as_namespace *ns) +{ + if (as_storage_namespace_destroy_table[ns->storage_type]) { + return as_storage_namespace_destroy_table[ns->storage_type](ns); + } + + return 0; +} + +//-------------------------------------- +// as_storage_record_destroy +// + +typedef int (*as_storage_record_destroy_fn)(as_namespace *ns, as_record *r); +static const as_storage_record_destroy_fn as_storage_record_destroy_table[AS_NUM_STORAGE_ENGINES] = { + NULL, // memory has no record destroy + as_storage_record_destroy_ssd +}; + +int +as_storage_record_destroy(as_namespace *ns, as_record *r) +{ + if (as_storage_record_destroy_table[ns->storage_type]) { + return as_storage_record_destroy_table[ns->storage_type](ns, r); + } + + return 0; +} + +//-------------------------------------- +// as_storage_record_create +// + +typedef int (*as_storage_record_create_fn)(as_storage_rd *rd); +static const as_storage_record_create_fn as_storage_record_create_table[AS_NUM_STORAGE_ENGINES] = { + NULL, // memory has no record create + as_storage_record_create_ssd +}; + +int +as_storage_record_create(as_namespace *ns, as_record *r, as_storage_rd *rd) +{ + rd->r = r; + rd->ns = ns; + as_rec_props_clear(&rd->rec_props); + rd->bins = 0; + rd->n_bins = 0; + rd->record_on_device = false; + rd->ignore_record_on_device = false; + rd->key_size = 0; + rd->key = NULL; + 
rd->is_durable_delete = false; + + if (as_storage_record_create_table[ns->storage_type]) { + return as_storage_record_create_table[ns->storage_type](rd); + } + + return 0; +} + +//-------------------------------------- +// as_storage_record_open +// + +typedef int (*as_storage_record_open_fn)(as_storage_rd *rd); +static const as_storage_record_open_fn as_storage_record_open_table[AS_NUM_STORAGE_ENGINES] = { + NULL, // memory has no record open + as_storage_record_open_ssd +}; + +int +as_storage_record_open(as_namespace *ns, as_record *r, as_storage_rd *rd) +{ + rd->r = r; + rd->ns = ns; + as_rec_props_clear(&rd->rec_props); + rd->bins = 0; + rd->n_bins = 0; + rd->record_on_device = true; + rd->ignore_record_on_device = false; + rd->key_size = 0; + rd->key = NULL; + rd->is_durable_delete = false; + + if (as_storage_record_open_table[ns->storage_type]) { + return as_storage_record_open_table[ns->storage_type](rd); + } + + return 0; +} + +//-------------------------------------- +// as_storage_record_close +// + +typedef int (*as_storage_record_close_fn)(as_storage_rd *rd); +static const as_storage_record_close_fn as_storage_record_close_table[AS_NUM_STORAGE_ENGINES] = { + NULL, // memory has no record close + as_storage_record_close_ssd +}; + +int +as_storage_record_close(as_storage_rd *rd) +{ + if (as_storage_record_close_table[rd->ns->storage_type]) { + return as_storage_record_close_table[rd->ns->storage_type](rd); + } + + return 0; +} + +//-------------------------------------- +// as_storage_record_load_n_bins +// + +typedef int (*as_storage_record_load_n_bins_fn)(as_storage_rd *rd); +static const as_storage_record_load_n_bins_fn as_storage_record_load_n_bins_table[AS_NUM_STORAGE_ENGINES] = { + NULL, // memory has no record load n bins + as_storage_record_load_n_bins_ssd +}; + +int +as_storage_record_load_n_bins(as_storage_rd *rd) +{ + if (as_storage_record_load_n_bins_table[rd->ns->storage_type]) { + return 
as_storage_record_load_n_bins_table[rd->ns->storage_type](rd); + } + + return 0; +} + +//-------------------------------------- +// as_storage_record_load_bins +// + +typedef int (*as_storage_record_load_bins_fn)(as_storage_rd *rd); +static const as_storage_record_load_bins_fn as_storage_record_load_bins_table[AS_NUM_STORAGE_ENGINES] = { + NULL, // memory has no record load bins + as_storage_record_load_bins_ssd +}; + +int +as_storage_record_load_bins(as_storage_rd *rd) +{ + if (as_storage_record_load_bins_table[rd->ns->storage_type]) { + return as_storage_record_load_bins_table[rd->ns->storage_type](rd); + } + + return 0; +} + +//-------------------------------------- +// as_storage_record_size_and_check +// + +typedef bool (*as_storage_record_size_and_check_fn)(as_storage_rd *rd); +static const as_storage_record_size_and_check_fn as_storage_record_size_and_check_table[AS_NUM_STORAGE_ENGINES] = { + NULL, // no limit if no persistent storage - flat size is irrelevant + as_storage_record_size_and_check_ssd +}; + +bool +as_storage_record_size_and_check(as_storage_rd *rd) +{ + if (as_storage_record_size_and_check_table[rd->ns->storage_type]) { + return as_storage_record_size_and_check_table[rd->ns->storage_type](rd); + } + + return true; +} + +//-------------------------------------- +// as_storage_record_write +// + +typedef int (*as_storage_record_write_fn)(as_storage_rd *rd); +static const as_storage_record_write_fn as_storage_record_write_table[AS_NUM_STORAGE_ENGINES] = { + as_storage_record_write_memory, + as_storage_record_write_ssd +}; + +int +as_storage_record_write(as_storage_rd *rd) +{ + if (as_storage_record_write_table[rd->ns->storage_type]) { + return as_storage_record_write_table[rd->ns->storage_type](rd); + } + + return 0; +} + +//-------------------------------------- +// as_storage_wait_for_defrag +// + +typedef void (*as_storage_wait_for_defrag_fn)(as_namespace *ns); +static const as_storage_wait_for_defrag_fn 
as_storage_wait_for_defrag_table[AS_NUM_STORAGE_ENGINES] = { + NULL, // memory doesn't do defrag + as_storage_wait_for_defrag_ssd +}; + +void +as_storage_wait_for_defrag() +{ + for (uint32_t i = 0; i < g_config.n_namespaces; i++) { + as_namespace *ns = g_config.namespaces[i]; + + if (as_storage_wait_for_defrag_table[ns->storage_type]) { + as_storage_wait_for_defrag_table[ns->storage_type](ns); + } + } +} + +//-------------------------------------- +// as_storage_overloaded +// + +typedef bool (*as_storage_overloaded_fn)(as_namespace *ns); +static const as_storage_overloaded_fn as_storage_overloaded_table[AS_NUM_STORAGE_ENGINES] = { + NULL, // memory has no overload check + as_storage_overloaded_ssd +}; + +bool +as_storage_overloaded(as_namespace *ns) +{ + if (as_storage_overloaded_table[ns->storage_type]) { + return as_storage_overloaded_table[ns->storage_type](ns); + } + + return false; +} + +//-------------------------------------- +// as_storage_has_space +// + +typedef bool (*as_storage_has_space_fn)(as_namespace *ns); +static const as_storage_has_space_fn as_storage_has_space_table[AS_NUM_STORAGE_ENGINES] = { + NULL, // memory has no space check + as_storage_has_space_ssd +}; + +bool +as_storage_has_space(as_namespace *ns) +{ + if (as_storage_has_space_table[ns->storage_type]) { + return as_storage_has_space_table[ns->storage_type](ns); + } + + return true; +} + +//-------------------------------------- +// as_storage_defrag_sweep +// + +typedef void (*as_storage_defrag_sweep_fn)(as_namespace *ns); +static const as_storage_defrag_sweep_fn as_storage_defrag_sweep_table[AS_NUM_STORAGE_ENGINES] = { + NULL, // memory doesn't do defrag + as_storage_defrag_sweep_ssd +}; + +void +as_storage_defrag_sweep(as_namespace *ns) +{ + if (as_storage_defrag_sweep_table[ns->storage_type]) { + as_storage_defrag_sweep_table[ns->storage_type](ns); + } +} + +//-------------------------------------- +// as_storage_info_set +// + +typedef void (*as_storage_info_set_fn)(as_namespace 
*ns, const as_partition *p, bool flush); +static const as_storage_info_set_fn as_storage_info_set_table[AS_NUM_STORAGE_ENGINES] = { + NULL, // memory doesn't support info + as_storage_info_set_ssd +}; + +void +as_storage_info_set(as_namespace *ns, const as_partition *p, bool flush) +{ + if (as_storage_info_set_table[ns->storage_type]) { + as_storage_info_set_table[ns->storage_type](ns, p, flush); + } +} + +//-------------------------------------- +// as_storage_info_get +// + +typedef void (*as_storage_info_get_fn)(as_namespace *ns, as_partition *p); +static const as_storage_info_get_fn as_storage_info_get_table[AS_NUM_STORAGE_ENGINES] = { + as_storage_info_get_memory, + as_storage_info_get_ssd +}; + +void +as_storage_info_get(as_namespace *ns, as_partition *p) +{ + if (as_storage_info_get_table[ns->storage_type]) { + as_storage_info_get_table[ns->storage_type](ns, p); + } +} + +//-------------------------------------- +// as_storage_info_flush +// + +typedef int (*as_storage_info_flush_fn)(as_namespace *ns); +static const as_storage_info_flush_fn as_storage_info_flush_table[AS_NUM_STORAGE_ENGINES] = { + NULL, // memory doesn't support info + as_storage_info_flush_ssd +}; + +int +as_storage_info_flush(as_namespace *ns) +{ + if (as_storage_info_flush_table[ns->storage_type]) { + return as_storage_info_flush_table[ns->storage_type](ns); + } + + return 0; +} + +//-------------------------------------- +// as_storage_save_evict_void_time +// + +typedef void (*as_storage_save_evict_void_time_fn)(as_namespace *ns, uint32_t evict_void_time); +static const as_storage_save_evict_void_time_fn as_storage_save_evict_void_time_table[AS_NUM_STORAGE_ENGINES] = { + NULL, // memory doesn't store info + as_storage_save_evict_void_time_ssd +}; + +void +as_storage_save_evict_void_time(as_namespace *ns, uint32_t evict_void_time) +{ + if (as_storage_save_evict_void_time_table[ns->storage_type]) { + as_storage_save_evict_void_time_table[ns->storage_type](ns, evict_void_time); + } +} + 
+//-------------------------------------- +// as_storage_stats +// + +typedef int (*as_storage_stats_fn)(as_namespace *ns, int *available_pct, uint64_t *used_disk_bytes); +static const as_storage_stats_fn as_storage_stats_table[AS_NUM_STORAGE_ENGINES] = { + as_storage_stats_memory, + as_storage_stats_ssd +}; + +int +as_storage_stats(as_namespace *ns, int *available_pct, uint64_t *used_disk_bytes) +{ + if (as_storage_stats_table[ns->storage_type]) { + return as_storage_stats_table[ns->storage_type](ns, available_pct, used_disk_bytes); + } + + return 0; +} + +//-------------------------------------- +// as_storage_ticker_stats +// + +typedef int (*as_storage_ticker_stats_fn)(as_namespace *ns); +static const as_storage_ticker_stats_fn as_storage_ticker_stats_table[AS_NUM_STORAGE_ENGINES] = { + NULL, // memory doesn't support per-disk histograms... for now. + as_storage_ticker_stats_ssd +}; + +int +as_storage_ticker_stats(as_namespace *ns) +{ + if (as_storage_ticker_stats_table[ns->storage_type]) { + return as_storage_ticker_stats_table[ns->storage_type](ns); + } + + return 0; +} + +//-------------------------------------- +// as_storage_histogram_clear_all +// + +typedef int (*as_storage_histogram_clear_fn)(as_namespace *ns); +static const as_storage_histogram_clear_fn as_storage_histogram_clear_table[AS_NUM_STORAGE_ENGINES] = { + NULL, // memory doesn't support per-disk histograms... for now. + as_storage_histogram_clear_ssd +}; + +int +as_storage_histogram_clear_all(as_namespace *ns) +{ + if (as_storage_histogram_clear_table[ns->storage_type]) { + return as_storage_histogram_clear_table[ns->storage_type](ns); + } + + return 0; +} + + +//========================================================== +// Generic functions that don't use "v-tables". +// + +// Get size of record's in-memory data - everything except index bytes. +uint64_t +as_storage_record_get_n_bytes_memory(as_storage_rd *rd) +{ + if (! 
rd->ns->storage_data_in_memory) { + return 0; + } + + uint64_t n_bytes_memory = 0; + + for (uint16_t i = 0; i < rd->n_bins; i++) { + n_bytes_memory += as_bin_particle_size(&rd->bins[i]); + } + + if (! rd->ns->single_bin) { + if (rd->r->key_stored == 1) { + n_bytes_memory += sizeof(as_rec_space) + + ((as_rec_space*)rd->r->dim)->key_size; + } + + if (as_index_get_bin_space(rd->r)) { + n_bytes_memory += sizeof(as_bin_space) + + (sizeof(as_bin) * rd->n_bins); + } + } + + return n_bytes_memory; +} + +void +as_storage_record_adjust_mem_stats(as_storage_rd *rd, uint64_t start_bytes) +{ + if (! rd->ns->storage_data_in_memory) { + return; + } + + uint64_t end_bytes = as_storage_record_get_n_bytes_memory(rd); + int64_t delta_bytes = (int64_t)end_bytes - (int64_t)start_bytes; + + if (delta_bytes != 0) { + cf_atomic_int_add(&rd->ns->n_bytes_memory, delta_bytes); + as_namespace_adjust_set_memory(rd->ns, as_index_get_set_id(rd->r), + delta_bytes); + } +} + +void +as_storage_record_drop_from_mem_stats(as_storage_rd *rd) +{ + if (! rd->ns->storage_data_in_memory) { + return; + } + + uint64_t drop_bytes = as_storage_record_get_n_bytes_memory(rd); + + cf_atomic_int_sub(&rd->ns->n_bytes_memory, drop_bytes); + as_namespace_adjust_set_memory(rd->ns, as_index_get_set_id(rd->r), + -(int64_t)drop_bytes); +} + +bool +as_storage_record_get_key(as_storage_rd *rd) +{ + if (rd->r->key_stored == 0) { + return false; + } + + if (rd->ns->storage_data_in_memory) { + rd->key_size = ((as_rec_space*)rd->r->dim)->key_size; + rd->key = ((as_rec_space*)rd->r->dim)->key; + return true; + } + + if (rd->record_on_device && ! 
rd->ignore_record_on_device) { + return as_storage_record_get_key_ssd(rd); + } + + return false; +} + +size_t +as_storage_record_rec_props_size(as_storage_rd *rd) +{ + size_t rec_props_data_size = 0; + + const char *set_name = as_index_get_set_name(rd->r, rd->ns); + + if (set_name) { + rec_props_data_size += as_rec_props_sizeof_field(strlen(set_name) + 1); + } + + if (rd->key) { + rec_props_data_size += as_rec_props_sizeof_field(rd->key_size); + } + + return rec_props_data_size; +} + +// Populates rec_props struct in rd, using index info where possible. Assumes +// relevant information is ready: +// - set name +// - record key +// Relies on caller's properly allocated rec_props_data. +void +as_storage_record_set_rec_props(as_storage_rd *rd, uint8_t* rec_props_data) +{ + as_rec_props_init(&(rd->rec_props), rec_props_data); + + if (as_index_has_set(rd->r)) { + const char *set_name = as_index_get_set_name(rd->r, rd->ns); + as_rec_props_add_field(&(rd->rec_props), CL_REC_PROPS_FIELD_SET_NAME, + strlen(set_name) + 1, (uint8_t *)set_name); + } + + if (rd->key) { + as_rec_props_add_field(&(rd->rec_props), CL_REC_PROPS_FIELD_KEY, + rd->key_size, rd->key); + } +} + +void +as_storage_shutdown(void) +{ + cf_info(AS_STORAGE, "initiating storage shutdown ..."); + + // Pull all record locks - stops everything writing to current swbs such + // that each write's record lock scope is either completed or never entered. + + for (uint32_t n = 0; n < g_record_locks->n_locks; n++) { + cf_mutex_lock(&g_record_locks->locks[n]); + } + + // Now flush everything outstanding to storage devices. + + cf_info(AS_STORAGE, "flushing data to storage ..."); + + for (uint32_t i = 0; i < g_config.n_namespaces; i++) { + as_namespace *ns = g_config.namespaces[i]; + + if (ns->storage_type == AS_STORAGE_ENGINE_SSD) { + + // For now this is only needed for warm-restartable namespaces. 
+ for (uint32_t pid = 0; pid < AS_PARTITIONS; pid++) { + as_partition_shutdown(ns, pid); + } + + as_storage_shutdown_ssd(ns); + as_namespace_xmem_trusted(ns); + } + } + + cf_info(AS_STORAGE, "completed flushing to storage"); +} diff --git a/as/src/transaction/delete.c b/as/src/transaction/delete.c new file mode 100644 index 00000000..9ab8387d --- /dev/null +++ b/as/src/transaction/delete.c @@ -0,0 +1,486 @@ +/* + * delete.c + * + * Copyright (C) 2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +//========================================================== +// Includes. 
+// + +#include "transaction/delete.h" + +#include +#include +#include +#include + +#include "citrusleaf/alloc.h" +#include "citrusleaf/cf_atomic.h" +#include "citrusleaf/cf_clock.h" + +#include "dynbuf.h" +#include "fault.h" + +#include "base/cfg.h" +#include "base/datamodel.h" +#include "base/index.h" +#include "base/proto.h" +#include "base/secondary_index.h" +#include "base/transaction.h" +#include "base/transaction_policy.h" +#include "base/xdr_serverside.h" +#include "fabric/partition.h" +#include "storage/storage.h" +#include "transaction/duplicate_resolve.h" +#include "transaction/proxy.h" +#include "transaction/replica_write.h" +#include "transaction/rw_request.h" +#include "transaction/rw_request_hash.h" +#include "transaction/rw_utils.h" + + +//========================================================== +// Forward declarations. +// + +void start_delete_dup_res(rw_request* rw, as_transaction* tr); +void start_delete_repl_write(rw_request* rw, as_transaction* tr); +void start_delete_repl_write_forget(rw_request* rw, as_transaction* tr); +bool delete_dup_res_cb(rw_request* rw); +void delete_repl_write_after_dup_res(rw_request* rw, as_transaction* tr); +void delete_repl_write_forget_after_dup_res(rw_request* rw, as_transaction* tr); +void delete_repl_write_cb(rw_request* rw); + +void send_delete_response(as_transaction* tr); +void delete_timeout_cb(rw_request* rw); + + +//========================================================== +// Inlines & macros. 
+// + +static inline void +client_delete_update_stats(as_namespace* ns, uint8_t result_code, + bool is_xdr_op) +{ + switch (result_code) { + case AS_PROTO_RESULT_OK: + cf_atomic64_incr(&ns->n_client_delete_success); + if (is_xdr_op) { + cf_atomic64_incr(&ns->n_xdr_delete_success); + } + break; + case AS_PROTO_RESULT_FAIL_TIMEOUT: + cf_atomic64_incr(&ns->n_client_delete_timeout); + if (is_xdr_op) { + cf_atomic64_incr(&ns->n_xdr_delete_timeout); + } + break; + default: + cf_atomic64_incr(&ns->n_client_delete_error); + if (is_xdr_op) { + cf_atomic64_incr(&ns->n_xdr_delete_error); + } + break; + case AS_PROTO_RESULT_FAIL_NOT_FOUND: + cf_atomic64_incr(&ns->n_client_delete_not_found); + if (is_xdr_op) { + cf_atomic64_incr(&ns->n_xdr_delete_not_found); + } + break; + } +} + + +//========================================================== +// Public API. +// + +transaction_status +as_delete_start(as_transaction* tr) +{ + // Apply XDR filter. + if (! xdr_allows_write(tr)) { + tr->result_code = AS_PROTO_RESULT_FAIL_ALWAYS_FORBIDDEN; + send_delete_response(tr); + return TRANS_DONE_ERROR; + } + + if (! validate_delete_durability(tr)) { + tr->result_code = AS_PROTO_RESULT_FAIL_FORBIDDEN; + send_delete_response(tr); + return TRANS_DONE_ERROR; + } + + if (delete_storage_overloaded(tr)) { + tr->result_code = AS_PROTO_RESULT_FAIL_DEVICE_OVERLOAD; + send_delete_response(tr); + return TRANS_DONE_ERROR; + } + + // Create rw_request and add to hash. + rw_request_hkey hkey = { tr->rsv.ns->id, tr->keyd }; + rw_request* rw = rw_request_create(&tr->keyd); + transaction_status status = rw_request_hash_insert(&hkey, rw, tr); + + // If rw_request wasn't inserted in hash, transaction is finished. + if (status != TRANS_IN_PROGRESS) { + rw_request_release(rw); + + if (status != TRANS_WAITING) { + send_delete_response(tr); + } + + return status; + } + // else - rw_request is now in hash, continue... 
+ + if (tr->rsv.ns->write_dup_res_disabled || + as_transaction_is_nsup_delete(tr)) { + // Note - preventing duplicate resolution this way allows + // rw_request_destroy() to handle dup_msg[] cleanup correctly. + tr->rsv.n_dupl = 0; + } + + // If there are duplicates to resolve, start doing so. + // TODO - should we bother if there's no generation check? + if (tr->rsv.n_dupl != 0) { + start_delete_dup_res(rw, tr); + + // Started duplicate resolution. + return TRANS_IN_PROGRESS; + } + // else - no duplicate resolution phase, apply operation to master. + + // Set up the nodes to which we'll write replicas. + rw->n_dest_nodes = as_partition_get_other_replicas(tr->rsv.p, + rw->dest_nodes); + + if (insufficient_replica_destinations(tr->rsv.ns, rw->n_dest_nodes)) { + rw_request_hash_delete(&hkey, rw); + tr->result_code = AS_PROTO_RESULT_FAIL_UNAVAILABLE; + send_delete_response(tr); + return TRANS_DONE_ERROR; + } + + // If error, transaction is finished. + if ((status = delete_master(tr, rw)) != TRANS_IN_PROGRESS) { + rw_request_hash_delete(&hkey, rw); + + if (status != TRANS_WAITING) { + send_delete_response(tr); + } + + return status; + } + + // If we don't need replica writes, transaction is finished. + if (rw->n_dest_nodes == 0) { + finished_replicated(tr); + rw_request_hash_delete(&hkey, rw); + send_delete_response(tr); + return TRANS_DONE_SUCCESS; + } + + // If we don't need to wait for replica write acks, fire and forget. + if (as_transaction_is_nsup_delete(tr) || respond_on_master_complete(tr)) { + start_delete_repl_write_forget(rw, tr); + rw_request_hash_delete(&hkey, rw); + send_delete_response(tr); + return TRANS_DONE_SUCCESS; + } + + start_delete_repl_write(rw, tr); + + // Started replica write. + return TRANS_IN_PROGRESS; +} + + +//========================================================== +// Local helpers - transaction flow. 
//

// Start the duplicate-resolution phase: build the dup-res message, then
// (under rw->lock) finish setting up rw and send the message.
void
start_delete_dup_res(rw_request* rw, as_transaction* tr)
{
	// Finish initializing rw, construct and send dup-res message.

	dup_res_make_message(rw, tr);

	pthread_mutex_lock(&rw->lock);

	dup_res_setup_rw(rw, tr, delete_dup_res_cb, delete_timeout_cb);
	send_rw_messages(rw);

	pthread_mutex_unlock(&rw->lock);
}


// Start the replica-write phase and wait for acks: build the replica write
// message, then (under rw->lock) finish setting up rw and send the message.
void
start_delete_repl_write(rw_request* rw, as_transaction* tr)
{
	// Finish initializing rw, construct and send repl-delete message.

	repl_write_make_message(rw, tr);

	pthread_mutex_lock(&rw->lock);

	repl_write_setup_rw(rw, tr, delete_repl_write_cb, delete_timeout_cb);
	send_rw_messages(rw);

	pthread_mutex_unlock(&rw->lock);
}


// Send replica writes without waiting for acks. Unlike the two functions
// above, rw->lock is not taken and rw is not further set up.
void
start_delete_repl_write_forget(rw_request* rw, as_transaction* tr)
{
	// Construct and send repl-write message. No need to finish rw setup.

	repl_write_make_message(rw, tr);
	send_rw_messages_forget(rw);
}


// Callback run when duplicate resolution completes. Rebuilds a transaction
// from rw and continues the delete on the master. Returns true when the
// transaction is finished here (or handed off), false when a replica write
// was started and rw must stay in the hash.
bool
delete_dup_res_cb(rw_request* rw)
{
	as_transaction tr;
	as_transaction_init_from_rw(&tr, rw);

	// Duplicate resolution itself failed - just report it.
	if (tr.result_code != AS_PROTO_RESULT_OK) {
		send_delete_response(&tr);
		return true;
	}

	// Set up the nodes to which we'll write replicas.
	rw->n_dest_nodes = as_partition_get_other_replicas(tr.rsv.p,
			rw->dest_nodes);

	if (insufficient_replica_destinations(tr.rsv.ns, rw->n_dest_nodes)) {
		tr.result_code = AS_PROTO_RESULT_FAIL_UNAVAILABLE;
		send_delete_response(&tr);
		return true;
	}

	transaction_status status = delete_master(&tr, rw);

	if (status == TRANS_WAITING) {
		// Note - new tr now owns msgp, make sure rw destructor doesn't free it.
		// Also, rw will release rsv - new tr will get a new one.
		rw->msgp = NULL;
		return true;
	}

	if (status == TRANS_DONE_ERROR) {
		send_delete_response(&tr);
		return true;
	}

	// If we don't need replica writes, transaction is finished.
	if (rw->n_dest_nodes == 0) {
		finished_replicated(&tr);
		send_delete_response(&tr);
		return true;
	}

	// If we don't need to wait for replica write acks, fire and forget.
	// (Remember that nsup deletes can't get here, so no need to check.)
	if (respond_on_master_complete(&tr)) {
		delete_repl_write_forget_after_dup_res(rw, &tr);
		send_delete_response(&tr);
		return true;
	}

	delete_repl_write_after_dup_res(rw, &tr);

	// Started replica write - don't delete rw_request from hash.
	return false;
}


// Reuse rw for the replica-write phase that follows duplicate resolution,
// and wait for acks via delete_repl_write_cb.
void
delete_repl_write_after_dup_res(rw_request* rw, as_transaction* tr)
{
	// Recycle rw_request that was just used for duplicate resolution to now do
	// replica writes. Note - we are under the rw_request lock here!

	repl_write_make_message(rw, tr);
	repl_write_reset_rw(rw, tr, delete_repl_write_cb);
	send_rw_messages(rw);
}


// Send replica writes after duplicate resolution without waiting for acks.
void
delete_repl_write_forget_after_dup_res(rw_request* rw, as_transaction* tr)
{
	// Send replica writes. Not waiting for acks, so no need to reset
	// rw_request. Note - we are under the rw_request lock here!

	repl_write_make_message(rw, tr);
	send_rw_messages_forget(rw);
}


// Callback run when the replica write phase completes: rebuild a transaction
// from rw, mark it replicated, and respond to the origin.
void
delete_repl_write_cb(rw_request* rw)
{
	as_transaction tr;
	as_transaction_init_from_rw(&tr, rw);

	finished_replicated(&tr);
	send_delete_response(&tr);

	// Finished transaction - rw_request cleans up reservation and msgp!
}


//==========================================================
// Local helpers - transaction end.
//

// Respond to the transaction's origin with tr->result_code. Client origins
// also get their delete stats updated; nsup origins get no response. Clears
// tr->from.any when done.
void
send_delete_response(as_transaction* tr)
{
	// Paranoia - shouldn't get here on losing race with timeout.
	if (! tr->from.any && tr->origin != FROM_NSUP) {
		cf_warning(AS_RW, "transaction origin %u has null 'from'", tr->origin);
		return;
	}

	// Note - if tr was setup from rw, rw->from.any has been set null and
	// informs timeout it lost the race.

	switch (tr->origin) {
	case FROM_CLIENT:
		as_msg_send_reply(tr->from.proto_fd_h, tr->result_code, 0, 0, NULL,
				NULL, 0, tr->rsv.ns, as_transaction_trid(tr));
		client_delete_update_stats(tr->rsv.ns, tr->result_code,
				as_transaction_is_xdr(tr));
		break;
	case FROM_PROXY:
		as_proxy_send_response(tr->from.proxy_node, tr->from_data.proxy_tid,
				tr->result_code, 0, 0, NULL, NULL, 0, tr->rsv.ns,
				as_transaction_trid(tr));
		break;
	case FROM_NSUP:
		// Nothing to send for an nsup-originated delete.
		break;
	default:
		cf_crash(AS_RW, "unexpected transaction origin %u", tr->origin);
		break;
	}

	tr->from.any = NULL; // pattern, not needed
}


// Timeout callback for a delete's rw_request. If the origin is still attached
// (rw->from.any non-null), marks the transaction not replicated and, for
// client origins, replies with a timeout and updates stats. Proxy origins get
// no reply here.
void
delete_timeout_cb(rw_request* rw)
{
	// Paranoia - remove eventually.
	cf_assert(rw->origin != FROM_NSUP, AS_RW, "nsup delete got timeout cb");

	if (! rw->from.any) {
		return; // lost race against dup-res or repl-write callback
	}

	finished_not_replicated(rw);

	switch (rw->origin) {
	case FROM_CLIENT:
		as_msg_send_reply(rw->from.proto_fd_h, AS_PROTO_RESULT_FAIL_TIMEOUT, 0,
				0, NULL, NULL, 0, rw->rsv.ns, rw_request_trid(rw));
		client_delete_update_stats(rw->rsv.ns, AS_PROTO_RESULT_FAIL_TIMEOUT,
				as_msg_is_xdr(&rw->msgp->msg));
		break;
	case FROM_PROXY:
		break;
	default:
		cf_crash(AS_RW, "unexpected transaction origin %u", rw->origin);
		break;
	}

	rw->from.any = NULL; // inform other callback it lost the race
}


//==========================================================
// Local helpers - delete master.
//

// Drop the record referenced by r_ref from the index, after checking the
// generation and (if the transaction carries one) the key. On success, leaves
// a two-byte zeroed pickle on rw for the replica write and returns
// TRANS_IN_PROGRESS. On a failed check, sets tr->result_code and returns
// TRANS_DONE_ERROR. r_ref is always released via as_record_done().
transaction_status
drop_master(as_transaction* tr, as_index_ref* r_ref, rw_request* rw)
{
	as_msg* m = &tr->msgp->msg;
	as_namespace* ns = tr->rsv.ns;
	as_index_tree* tree = tr->rsv.tree;
	as_record* r = r_ref->r;

	// Check generation requirement, if any.
	if (! generation_check(r, m, ns)) {
		as_record_done(r_ref, ns);
		cf_atomic64_incr(&ns->n_fail_generation);
		tr->result_code = AS_PROTO_RESULT_FAIL_GENERATION;
		return TRANS_DONE_ERROR;
	}

	bool check_key = as_transaction_has_key(tr);

	// Only open the storage record if there is something to do with it.
	if (ns->storage_data_in_memory || check_key) {
		as_storage_rd rd;
		as_storage_record_open(ns, r, &rd);

		// Check the key if required.
		// Note - for data-not-in-memory a key check is expensive!
		if (check_key && as_storage_record_get_key(&rd) &&
				! check_msg_key(m, &rd)) {
			as_storage_record_close(&rd);
			as_record_done(r_ref, ns);
			tr->result_code = AS_PROTO_RESULT_FAIL_KEY_MISMATCH;
			return TRANS_DONE_ERROR;
		}

		if (ns->storage_data_in_memory) {
			delete_adjust_sindex(&rd);
		}

		as_storage_record_close(&rd);
	}

	// Generate a binless pickle, but don't generate pickled rec-props - these
	// are useless for a drop.
	rw->pickled_sz = sizeof(uint16_t);
	rw->pickled_buf = cf_malloc(rw->pickled_sz);
	*(uint16_t*)rw->pickled_buf = 0;

	// Save the set-ID for XDR.
	uint16_t set_id = as_index_get_set_id(r);

	as_index_delete(tree, &tr->keyd);
	as_record_done(r_ref, ns);

	if (xdr_must_ship_delete(ns, as_transaction_is_nsup_delete(tr),
			as_msg_is_xdr(m))) {
		xdr_write(ns, &tr->keyd, 0, 0, XDR_OP_TYPE_DROP, set_id, NULL);
	}

	return TRANS_IN_PROGRESS;
}
diff --git a/as/src/transaction/delete_ce.c b/as/src/transaction/delete_ce.c
new file mode 100644
index 00000000..2872e806
--- /dev/null
+++ b/as/src/transaction/delete_ce.c
@@ -0,0 +1,69 @@
/*
 * delete_ce.c
 *
 * Copyright (C) 2016 Aerospike, Inc.
 *
 * Portions may be licensed to Aerospike, Inc. under one or more contributor
 * license agreements.
 *
 * This program is free software: you can redistribute it and/or modify it under
 * the terms of the GNU Affero General Public License as published by the Free
 * Software Foundation, either version 3 of the License, or (at your option) any
 * later version.
+ * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +//========================================================== +// Includes. +// + +#include "transaction/delete.h" + +#include + +#include "fault.h" + +#include "base/datamodel.h" +#include "base/index.h" +#include "base/proto.h" +#include "base/transaction.h" +#include "transaction/rw_request.h" + + +//========================================================== +// Private API - for enterprise separation only. +// + +bool +delete_storage_overloaded(as_transaction* tr) +{ + return false; +} + + +transaction_status +delete_master(as_transaction* tr, rw_request* rw) +{ + if (as_transaction_is_durable_delete(tr)) { + cf_warning(AS_RW, "durable delete is an enterprise feature"); + tr->result_code = AS_PROTO_RESULT_FAIL_ENTERPRISE_ONLY; + return TRANS_DONE_ERROR; + } + + as_index_ref r_ref; + r_ref.skip_lock = false; + + if (0 != as_record_get(tr->rsv.tree, &tr->keyd, &r_ref)) { + tr->result_code = AS_PROTO_RESULT_FAIL_NOT_FOUND; + return TRANS_DONE_ERROR; + } + + return drop_master(tr, &r_ref, rw); +} diff --git a/as/src/transaction/duplicate_resolve.c b/as/src/transaction/duplicate_resolve.c new file mode 100644 index 00000000..c131cf33 --- /dev/null +++ b/as/src/transaction/duplicate_resolve.c @@ -0,0 +1,578 @@ +/* + * duplicate_resolve.c + * + * Copyright (C) 2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. 
+ * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +//========================================================== +// Includes. +// + +#include "transaction/duplicate_resolve.h" + +#include +#include +#include +#include +#include + +#include "citrusleaf/cf_atomic.h" +#include "citrusleaf/cf_digest.h" + +#include "fault.h" +#include "msg.h" +#include "node.h" + +#include "base/datamodel.h" +#include "base/proto.h" +#include "base/thr_tsvc.h" +#include "base/transaction.h" +#include "fabric/exchange.h" +#include "fabric/fabric.h" +#include "fabric/partition.h" +#include "storage/storage.h" +#include "transaction/rw_request.h" +#include "transaction/rw_request_hash.h" +#include "transaction/rw_utils.h" + + +//========================================================== +// Forward declarations. +// + +void done_handle_request(as_partition_reservation* rsv, as_index_ref* r_ref, as_storage_rd* rd); +void send_dup_res_ack(cf_node node, msg* m, uint32_t result); +void send_ack_for_bad_request(cf_node node, msg* m); +uint32_t parse_dup_meta(msg* m, uint32_t* p_generation, uint64_t* p_last_update_time); +void apply_winner(rw_request* rw); + + +//========================================================== +// Public API. 
//

// Build the dup-res request message on rw->dest_msg: op, namespace name and
// id, digest, tid and cluster key. If the record exists locally, its
// generation and last-update-time are included too.
void
dup_res_make_message(rw_request* rw, as_transaction* tr)
{
	rw->dest_msg = as_fabric_msg_get(M_TYPE_RW);

	as_namespace* ns = tr->rsv.ns;
	msg* m = rw->dest_msg;

	msg_set_uint32(m, RW_FIELD_OP, RW_OP_DUP);
	msg_set_buf(m, RW_FIELD_NAMESPACE, (uint8_t*)ns->name, strlen(ns->name),
			MSG_SET_COPY);
	msg_set_uint32(m, RW_FIELD_NS_ID, ns->id);
	msg_set_buf(m, RW_FIELD_DIGEST, (void*)&tr->keyd, sizeof(cf_digest),
			MSG_SET_COPY);
	msg_set_uint32(m, RW_FIELD_TID, rw->tid);

	// TODO - JUMP - send this only because versions up to 3.14.x require it.
	msg_set_uint64(m, RW_FIELD_CLUSTER_KEY, as_exchange_cluster_key());

	as_index_ref r_ref;
	r_ref.skip_lock = false;

	// Local record metadata is optional - only sent if the record exists here.
	if (as_record_get(tr->rsv.tree, &tr->keyd, &r_ref) == 0) {
		as_record* r = r_ref.r;

		msg_set_uint32(m, RW_FIELD_GENERATION, r->generation);
		msg_set_uint64(m, RW_FIELD_LAST_UPDATE_TIME, r->last_update_time);

		as_record_done(&r_ref, ns);
	}
}


// Move the transaction's state (msgp, origin, 'from', timing, reservation)
// from tr into rw, install the callbacks, and set the duplicate nodes as the
// destination list. tr->msgp and tr->from.any are nulled in the process.
void
dup_res_setup_rw(rw_request* rw, as_transaction* tr, dup_res_done_cb dup_res_cb,
		timeout_done_cb timeout_cb)
{
	rw->msgp = tr->msgp;
	tr->msgp = NULL;

	rw->msg_fields = tr->msg_fields;
	rw->origin = tr->origin;
	rw->from_flags = tr->from_flags;

	rw->from.any = tr->from.any;
	rw->from_data.any = tr->from_data.any;
	tr->from.any = NULL;

	rw->start_time = tr->start_time;
	rw->benchmark_time = tr->benchmark_time;

	as_partition_reservation_copy(&rw->rsv, &tr->rsv);
	// Hereafter, rw must release the reservation - happens in destructor.

	rw->end_time = tr->end_time;
	// Note - don't need as_transaction's other 'container' members.

	rw->dup_res_cb = dup_res_cb;
	rw->timeout_cb = timeout_cb;

	rw->xmit_ms = cf_getms() + g_config.transaction_retry_ms;
	rw->retry_interval_ms = g_config.transaction_retry_ms;

	rw->n_dest_nodes = tr->rsv.n_dupl;

	for (uint32_t i = 0; i < rw->n_dest_nodes; i++) {
		rw->dest_complete[i] = false;
		rw->dest_nodes[i] = tr->rsv.dupl_nodes[i];
	}

	// Allow retransmit thread to destroy rw as soon as we unlock.
	rw->is_set_up = true;
}


// Handle a dup-res request from 'node'. The request message m is reused as
// the ack: it is sent back carrying either an error result, or
// AS_PROTO_RESULT_OK with the pickled local record and its metadata.
void
dup_res_handle_request(cf_node node, msg* m)
{
	cf_digest* keyd;

	if (msg_get_buf(m, RW_FIELD_DIGEST, (uint8_t**)&keyd, NULL,
			MSG_GET_DIRECT) != 0) {
		cf_warning(AS_RW, "dup-res handler: no digest");
		send_ack_for_bad_request(node, m);
		return;
	}

	uint8_t* ns_name;
	size_t ns_name_len;

	if (msg_get_buf(m, RW_FIELD_NAMESPACE, &ns_name, &ns_name_len,
			MSG_GET_DIRECT) != 0) {
		cf_warning(AS_RW, "dup-res handler: no namespace");
		send_ack_for_bad_request(node, m);
		return;
	}

	as_namespace* ns = as_namespace_get_bybuf(ns_name, ns_name_len);

	if (! ns) {
		cf_warning(AS_RW, "dup-res handler: invalid namespace");
		send_ack_for_bad_request(node, m);
		return;
	}

	uint32_t generation = 0;
	uint64_t last_update_time = 0;

	// The requester's metadata is only compared against the local record if
	// both fields are present in the message.
	bool local_conflict_check =
			msg_get_uint32(m, RW_FIELD_GENERATION, &generation) == 0 &&
			msg_get_uint64(m, RW_FIELD_LAST_UPDATE_TIME,
					&last_update_time) == 0;

	// Done reading message fields, may now set fields for ack.
	msg_preserve_fields(m, 3, RW_FIELD_NS_ID, RW_FIELD_DIGEST, RW_FIELD_TID);

	as_partition_reservation rsv;

	as_partition_reserve(ns, as_partition_getid(keyd), &rsv);

	as_index_ref r_ref;
	r_ref.skip_lock = false;

	if (as_record_get(rsv.tree, keyd, &r_ref) != 0) {
		done_handle_request(&rsv, NULL, NULL);
		send_dup_res_ack(node, m, AS_PROTO_RESULT_FAIL_NOT_FOUND);
		return;
	}

	as_record* r = r_ref.r;

	int result;

	if ((result = as_partition_check_source(ns, rsv.p, node, NULL)) !=
			AS_PROTO_RESULT_OK) {
		done_handle_request(&rsv, &r_ref, NULL);
		send_dup_res_ack(node, m, result);
		return;
	}

	// A conflict result <= 0 is acked without the record: 0 is reported as
	// record-exists, negative as a generation failure.
	if (local_conflict_check &&
			(result = as_record_resolve_conflict(ns->conflict_resolution_policy,
					generation, last_update_time, r->generation,
					r->last_update_time)) <= 0) {
		uint32_t info = dup_res_pack_repl_state_info(r, ns);

		if (info != 0) {
			msg_set_uint32(m, RW_FIELD_INFO, info);
		}

		done_handle_request(&rsv, &r_ref, NULL);
		send_dup_res_ack(node, m, result == 0 ?
				AS_PROTO_RESULT_FAIL_RECORD_EXISTS :
				AS_PROTO_RESULT_FAIL_GENERATION);
		return;
	}

	as_storage_rd rd;

	as_storage_record_open(ns, r, &rd);

	// Negative results from the load calls are negated to form the ack result.
	if ((result = as_storage_rd_load_n_bins(&rd)) < 0) {
		done_handle_request(&rsv, &r_ref, &rd);
		send_dup_res_ack(node, m, (uint32_t)-result);
		return;
	}

	as_bin stack_bins[rd.ns->storage_data_in_memory ? 0 : rd.n_bins];

	if ((result = as_storage_rd_load_bins(&rd, stack_bins)) < 0) {
		done_handle_request(&rsv, &r_ref, &rd);
		send_dup_res_ack(node, m, (uint32_t)-result);
		return;
	}

	size_t buf_len;
	uint8_t* buf = as_record_pickle(&rd, &buf_len);

	// The pickle buffer is handed off to the message, not copied.
	msg_set_buf(m, RW_FIELD_RECORD, (void*)buf, buf_len,
			MSG_SET_HANDOFF_MALLOC);

	const char* set_name = as_index_get_set_name(r, ns);

	if (set_name) {
		msg_set_buf(m, RW_FIELD_SET_NAME, (const uint8_t *)set_name,
				strlen(set_name), MSG_SET_COPY);
	}

	as_storage_record_get_key(&rd);

	if (rd.key) {
		msg_set_buf(m, RW_FIELD_KEY, rd.key, rd.key_size, MSG_SET_COPY);
	}

	msg_set_uint32(m, RW_FIELD_GENERATION, r->generation);
	msg_set_uint64(m, RW_FIELD_LAST_UPDATE_TIME, r->last_update_time);

	if (r->void_time != 0) {
		msg_set_uint32(m, RW_FIELD_VOID_TIME, r->void_time);
	}

	uint32_t info = dup_res_pack_info(r, ns);

	if (info != 0) {
		msg_set_uint32(m, RW_FIELD_INFO, info);
	}

	done_handle_request(&rsv, &r_ref, &rd);
	send_dup_res_ack(node, m, AS_PROTO_RESULT_OK);
}


// Handle a dup-res ack from 'node'. Matches it to the rw_request in the hash,
// records this node as complete, keeps the best duplicate seen so far, and
// once every destination has answered, applies the winner and invokes
// rw->dup_res_cb. Every exit path either saves m on rw or releases it.
void
dup_res_handle_ack(cf_node node, msg* m)
{
	uint32_t ns_id;

	if (msg_get_uint32(m, RW_FIELD_NS_ID, &ns_id) != 0) {
		cf_warning(AS_RW, "dup-res ack: no ns-id");
		as_fabric_msg_put(m);
		return;
	}

	cf_digest* keyd;

	if (msg_get_buf(m, RW_FIELD_DIGEST, (uint8_t**)&keyd, NULL,
			MSG_GET_DIRECT) != 0) {
		cf_warning(AS_RW, "dup-res ack: no digest");
		as_fabric_msg_put(m);
		return;
	}

	uint32_t tid;

	if (msg_get_uint32(m, RW_FIELD_TID, &tid) != 0) {
		cf_warning(AS_RW, "dup-res ack: no tid");
		as_fabric_msg_put(m);
		return;
	}

	rw_request_hkey hkey = { ns_id, *keyd };
	rw_request* rw = rw_request_hash_get(&hkey);

	if (! rw) {
		// Extra ack, after rw_request is already gone.
		as_fabric_msg_put(m);
		return;
	}

	// From here on, every exit must unlock rw->lock and release rw.
	pthread_mutex_lock(&rw->lock);

	if (rw->tid != tid || rw->dup_res_complete) {
		// Extra ack - rw_request is newer transaction for same digest, or ack
		// is arriving after rw_request was aborted or finished dup-res.
		pthread_mutex_unlock(&rw->lock);
		rw_request_release(rw);
		as_fabric_msg_put(m);
		return;
	}

	// Find remote node in duplicates list.
	int i = index_of_node(rw->dest_nodes, rw->n_dest_nodes, node);

	if (i == -1) {
		cf_warning(AS_RW, "dup-res ack: from non-dest node %lx", node);
		pthread_mutex_unlock(&rw->lock);
		rw_request_release(rw);
		as_fabric_msg_put(m);
		return;
	}

	if (rw->dest_complete[i]) {
		// Extra ack for this duplicate.
		pthread_mutex_unlock(&rw->lock);
		rw_request_release(rw);
		as_fabric_msg_put(m);
		return;
	}

	rw->dest_complete[i] = true;

	uint32_t generation = 0;
	uint64_t last_update_time = 0;
	uint32_t result_code = parse_dup_meta(m, &generation, &last_update_time);

	// If it makes sense, retry transaction from the beginning.
	// TODO - is this retry too fast? Should there be a throttle? If so, how?
	if (dup_res_should_retry_transaction(rw, result_code)) {
		if (! rw->from.any) {
			// Lost race against timeout in retransmit thread.
			pthread_mutex_unlock(&rw->lock);
			rw_request_release(rw);
			as_fabric_msg_put(m);
			return;
		}

		as_transaction tr;
		as_transaction_init_head_from_rw(&tr, rw);

		// Note that tr now owns msgp - make sure rw destructor doesn't free it.
		// Note also that rw will release rsv - tr will get a new one.
		rw->msgp = NULL;

		tr.from_flags |= FROM_FLAG_RESTART;
		as_tsvc_enqueue(&tr);

		rw->dup_res_complete = true;

		pthread_mutex_unlock(&rw->lock);
		rw_request_hash_delete(&hkey, rw);
		rw_request_release(rw);
		as_fabric_msg_put(m);
		return;
	}

	dup_res_handle_tie(rw, m, result_code);

	// Compare this duplicate with previous best, if any.
	// NOTE(review): the incoming generation is narrowed to uint16_t for this
	// comparison but stored un-narrowed in rw->best_dup_gen below - confirm
	// this is intended.
	bool keep_previous_best = rw->best_dup_msg &&
			as_record_resolve_conflict(rw->rsv.ns->conflict_resolution_policy,
					rw->best_dup_gen, rw->best_dup_lut,
					(uint16_t)generation, last_update_time) <= 0;

	if (keep_previous_best) {
		// This duplicate is no better than previous best - keep previous best.
		as_fabric_msg_put(m);
	}
	else {
		// No previous best, or this duplicate is better - keep this one.
		if (rw->best_dup_msg) {
			as_fabric_msg_put(rw->best_dup_msg);
		}

		msg_preserve_all_fields(m);
		rw->best_dup_msg = m;
		rw->best_dup_result_code = (uint8_t)result_code;
		rw->best_dup_gen = generation;
		rw->best_dup_lut = last_update_time;
	}

	// Saved or discarded m - from here down don't call as_fabric_msg_put(m)!

	for (uint32_t j = 0; j < rw->n_dest_nodes; j++) {
		if (! rw->dest_complete[j]) {
			// Still haven't heard from all duplicates.
			pthread_mutex_unlock(&rw->lock);
			rw_request_release(rw);
			return;
		}
	}

	if (rw->best_dup_result_code == AS_PROTO_RESULT_OK) {
		apply_winner(rw); // sets rw->result_code to pass along to callback
	}
	else {
		apply_if_tie(rw);
	}

	// Check for lost race against timeout in retransmit thread *after* applying
	// winner - may save a future transaction from re-fetching the duplicates.
	// Note - nsup deletes don't get here, so check using rw->from.any is ok.
	if (! rw->from.any) {
		pthread_mutex_unlock(&rw->lock);
		rw_request_release(rw);
		return;
	}

	dup_res_translate_result_code(rw);

	// The callback runs under rw->lock; its return value says whether rw
	// should be removed from the hash.
	bool delete_from_hash = rw->dup_res_cb(rw);

	rw->dup_res_complete = true;

	pthread_mutex_unlock(&rw->lock);

	if (delete_from_hash) {
		rw_request_hash_delete(&hkey, rw);
	}

	rw_request_release(rw);
}


//==========================================================
// Local helpers.
+// + +void +done_handle_request(as_partition_reservation* rsv, as_index_ref* r_ref, + as_storage_rd* rd) +{ + if (rd) { + as_storage_record_close(rd); + } + + if (r_ref) { + as_record_done(r_ref, rsv->ns); + } + + if (rsv) { + as_partition_release(rsv); + } +} + + +void +send_dup_res_ack(cf_node node, msg* m, uint32_t result) +{ + msg_set_uint32(m, RW_FIELD_OP, RW_OP_DUP_ACK); + msg_set_uint32(m, RW_FIELD_RESULT, result); + + if (as_fabric_send(node, m, AS_FABRIC_CHANNEL_RW) != AS_FABRIC_SUCCESS) { + as_fabric_msg_put(m); + } +} + + +void +send_ack_for_bad_request(cf_node node, msg* m) +{ + msg_preserve_fields(m, 3, RW_FIELD_NS_ID, RW_FIELD_DIGEST, RW_FIELD_TID); + + msg_set_uint32(m, RW_FIELD_OP, RW_OP_DUP_ACK); + msg_set_uint32(m, RW_FIELD_RESULT, AS_PROTO_RESULT_FAIL_UNKNOWN); // ??? + + if (as_fabric_send(node, m, AS_FABRIC_CHANNEL_RW) != AS_FABRIC_SUCCESS) { + as_fabric_msg_put(m); + } +} + + +uint32_t +parse_dup_meta(msg* m, uint32_t* p_generation, uint64_t* p_last_update_time) +{ + uint32_t result_code; + + if (msg_get_uint32(m, RW_FIELD_RESULT, &result_code) != 0) { + cf_warning(AS_RW, "dup-res ack: no result_code"); + return AS_PROTO_RESULT_FAIL_UNKNOWN; + } + + if (result_code != AS_PROTO_RESULT_OK) { + return result_code; + } + + if (msg_get_uint32(m, RW_FIELD_GENERATION, p_generation) != 0 || + *p_generation == 0) { + cf_warning(AS_RW, "dup-res ack: no or bad generation"); + return AS_PROTO_RESULT_FAIL_UNKNOWN; + } + + if (msg_get_uint64(m, RW_FIELD_LAST_UPDATE_TIME, p_last_update_time) != 0) { + cf_warning(AS_RW, "dup-res ack: no last-update-time"); + return AS_PROTO_RESULT_FAIL_UNKNOWN; + } + + return AS_PROTO_RESULT_OK; +} + + +void +apply_winner(rw_request* rw) +{ + msg* m = rw->best_dup_msg; + + as_remote_record rr = { + // Skipping .src for now. 
+ .rsv = &rw->rsv, + .keyd = &rw->keyd, + .generation = rw->best_dup_gen, + .last_update_time = rw->best_dup_lut + }; + + if (msg_get_buf(m, RW_FIELD_RECORD, &rr.record_buf, &rr.record_buf_sz, + MSG_GET_DIRECT) != 0 || rr.record_buf_sz < 2) { + cf_warning_digest(AS_RW, &rw->keyd, "dup-res ack: no record "); + rw->result_code = AS_PROTO_RESULT_FAIL_UNKNOWN; + return; + } + + uint32_t info = 0; + + msg_get_uint32(m, RW_FIELD_INFO, &info); + + if (dup_res_ignore_pickle(rr.record_buf, info)) { + cf_warning_digest(AS_RW, &rw->keyd, "dup-res ack: binless pickle "); + rw->result_code = AS_PROTO_RESULT_FAIL_UNKNOWN; + return; + } + + msg_get_uint32(m, RW_FIELD_VOID_TIME, &rr.void_time); + + msg_get_buf(m, RW_FIELD_SET_NAME, (uint8_t **)&rr.set_name, + &rr.set_name_len, MSG_GET_DIRECT); + + msg_get_buf(m, RW_FIELD_KEY, (uint8_t **)&rr.key, &rr.key_size, + MSG_GET_DIRECT); + + dup_res_init_repl_state(&rr, info); + + rw->result_code = (uint8_t)as_record_replace_if_better(&rr, false, false, + false); + + // Duplicate resolution just treats these errors as successful no-ops: + if (rw->result_code == AS_PROTO_RESULT_FAIL_RECORD_EXISTS || + rw->result_code == AS_PROTO_RESULT_FAIL_GENERATION) { + rw->result_code = AS_PROTO_RESULT_OK; + } +} diff --git a/as/src/transaction/proxy.c b/as/src/transaction/proxy.c new file mode 100644 index 00000000..d97ac7d9 --- /dev/null +++ b/as/src/transaction/proxy.c @@ -0,0 +1,698 @@ +/* + * proxy.c + * + * Copyright (C) 2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. 
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
 * details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see http://www.gnu.org/licenses/
 */

//==========================================================
// Includes.
//

#include "transaction/proxy.h"

// NOTE(review): the header names on the six #include lines below appear to
// have been lost - restore them from the original file.
#include
#include
#include
#include
#include
#include

#include "citrusleaf/alloc.h"
#include "citrusleaf/cf_atomic.h"
#include "citrusleaf/cf_clock.h"
#include "citrusleaf/cf_digest.h"

#include "dynbuf.h"
#include "fault.h"
#include "msg.h"
#include "node.h"
#include "shash.h"
#include "socket.h"

#include "base/batch.h"
#include "base/datamodel.h"
#include "base/proto.h"
#include "base/thr_tsvc.h"
#include "base/transaction.h"
#include "base/stats.h"
#include "fabric/exchange.h"
#include "fabric/fabric.h"
#include "fabric/partition.h"
#include "transaction/rw_request.h"
#include "transaction/rw_request_hash.h"
#include "transaction/rw_utils.h"
#include "transaction/udf.h"


//==========================================================
// Typedefs & constants.
//

// Field IDs of a proxy fabric message. Must stay in step with proxy_mt[]
// below (enforced by the COMPILER_ASSERT).
typedef enum {
	// These values go on the wire, so mind backward compatibility if changing.
	PROXY_FIELD_OP,
	PROXY_FIELD_TID,
	PROXY_FIELD_DIGEST,
	PROXY_FIELD_REDIRECT,
	PROXY_FIELD_AS_PROTO, // request as_proto - currently contains only as_msg's
	PROXY_FIELD_UNUSED_5,
	PROXY_FIELD_UNUSED_6,
	PROXY_FIELD_UNUSED_7,

	NUM_PROXY_FIELDS
} proxy_msg_field;

// Values carried in PROXY_FIELD_OP.
#define PROXY_OP_REQUEST 1
#define PROXY_OP_RESPONSE 2
#define PROXY_OP_RETURN_TO_SENDER 3

// Field type of each proxy message field, one entry per proxy_msg_field.
const msg_template proxy_mt[] = {
		{ PROXY_FIELD_OP, M_FT_UINT32 },
		{ PROXY_FIELD_TID, M_FT_UINT32 },
		{ PROXY_FIELD_DIGEST, M_FT_BUF },
		{ PROXY_FIELD_REDIRECT, M_FT_UINT64 },
		{ PROXY_FIELD_AS_PROTO, M_FT_BUF },
		{ PROXY_FIELD_UNUSED_5, M_FT_UINT64 },
		{ PROXY_FIELD_UNUSED_6, M_FT_UINT32 },
		{ PROXY_FIELD_UNUSED_7, M_FT_UINT32 },
};

COMPILER_ASSERT(sizeof(proxy_mt) / sizeof(msg_template) == NUM_PROXY_FIELDS);

#define PROXY_MSG_SCRATCH_SIZE 128

// State kept in g_proxy_hash, keyed by proxy tid, for each transaction that
// has been diverted to another node.
typedef struct proxy_request_s {
	uint32_t msg_fields;

	uint8_t origin;
	uint8_t from_flags;

	union {
		void* any;
		as_file_handle* proto_fd_h;
		as_batch_shared* batch_shared;
		// No need yet for other members of this union.
	} from;

	// No need yet for a 'from_data' union.
	uint32_t batch_index;

	uint64_t start_time;
	uint64_t end_time;

	// The original proxy message.
	msg* fab_msg;

	as_namespace* ns;
} proxy_request;


//==========================================================
// Globals.
//

static cf_shash* g_proxy_hash = NULL;
static cf_atomic32 g_proxy_tid = 0;


//==========================================================
// Forward declarations.
+// + +void* run_proxy_timeout(void* arg); +int proxy_timeout_reduce_fn(const void* key, void* data, void* udata); + +int proxy_msg_cb(cf_node src, msg* m, void* udata); + +void proxyer_handle_response(msg* m, uint32_t tid); +int proxyer_handle_client_response(msg* m, proxy_request* pr); +int proxyer_handle_batch_response(msg* m, proxy_request* pr); +void proxyer_handle_return_to_sender(msg* m, uint32_t tid); + +void proxyee_handle_request(cf_node src, msg* m, uint32_t tid); + + +//========================================================== +// Inlines & macros. +// + +static inline void +error_response(cf_node src, uint32_t tid, uint32_t error) +{ + as_proxy_send_response(src, tid, error, 0, 0, NULL, NULL, 0, NULL, 0); +} + +static inline void +client_proxy_update_stats(as_namespace* ns, uint8_t result_code) +{ + switch (result_code) { + case AS_PROTO_RESULT_OK: + cf_atomic64_incr(&ns->n_client_proxy_complete); + break; + case AS_PROTO_RESULT_FAIL_TIMEOUT: + cf_atomic64_incr(&ns->n_client_proxy_timeout); + break; + default: + cf_atomic64_incr(&ns->n_client_proxy_error); + break; + } +} + +static inline void +batch_sub_proxy_update_stats(as_namespace* ns, uint8_t result_code) +{ + switch (result_code) { + case AS_PROTO_RESULT_OK: + cf_atomic64_incr(&ns->n_batch_sub_proxy_complete); + break; + case AS_PROTO_RESULT_FAIL_TIMEOUT: + cf_atomic64_incr(&ns->n_batch_sub_proxy_timeout); + break; + default: + cf_atomic64_incr(&ns->n_batch_sub_proxy_error); + break; + } +} + + +//========================================================== +// Public API. 
+//
+
+// One-time setup of the proxy subsystem: creates the tid -> proxy_request hash,
+// starts the detached timeout thread, and registers proxy_msg_cb() as the
+// fabric handler for M_TYPE_PROXY messages. Crashes if the thread cannot be
+// created.
+void
+as_proxy_init()
+{
+	// Keys are uint32_t proxy tids, values are proxy_request structs (stored by
+	// value).
+	g_proxy_hash = cf_shash_create(cf_shash_fn_u32, sizeof(uint32_t),
+			sizeof(proxy_request), 4 * 1024, CF_SHASH_MANY_LOCK);
+
+	pthread_t thread;
+	pthread_attr_t attrs;
+
+	pthread_attr_init(&attrs);
+	pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED);
+
+	if (pthread_create(&thread, &attrs, run_proxy_timeout, NULL) != 0) {
+		cf_crash(AS_PROXY, "failed to create proxy timeout thread");
+	}
+
+	as_fabric_register_msg_fn(M_TYPE_PROXY, proxy_mt, sizeof(proxy_mt),
+			PROXY_MSG_SCRATCH_SIZE, proxy_msg_cb, NULL);
+}
+
+
+// Returns the current number of entries in the proxy hash, i.e. proxy requests
+// that have been diverted and not yet finished or timed out.
+uint32_t
+as_proxy_hash_count()
+{
+	return cf_shash_get_size(g_proxy_hash);
+}
+
+
+// Proxyer - divert a transaction request to another node.
+//
+// Builds a PROXY_OP_REQUEST fabric message for 'tr', records a proxy_request
+// under a fresh tid in g_proxy_hash, then sends the message to 'dst'. Only
+// FROM_CLIENT and FROM_BATCH origins are accepted - anything else crashes.
+// On return tr->msgp and tr->from.any have been nulled.
+void
+as_proxy_divert(cf_node dst, as_transaction* tr, as_namespace* ns)
+{
+	// Special log detail.
+	switch (tr->origin) {
+	case FROM_CLIENT:
+		cf_detail_digest(AS_PROXY_DIVERT, &tr->keyd,
+				"{%s} diverting from client %s to node %lx ",
+				ns->name, tr->from.proto_fd_h->client, dst);
+		break;
+	case FROM_BATCH:
+		cf_detail_digest(AS_PROXY_DIVERT, &tr->keyd,
+				"{%s} diverting batch-sub from client %s to node %lx ",
+				ns->name, as_batch_get_fd_h(tr->from.batch_shared)->client,
+				dst);
+		break;
+	default:
+		cf_crash(AS_PROXY, "unexpected transaction origin %u", tr->origin);
+		break;
+	}
+
+	// Get a fabric message and fill it out.
+
+	msg* m = as_fabric_msg_get(M_TYPE_PROXY);
+
+	uint32_t tid = cf_atomic32_incr(&g_proxy_tid);
+
+	// Batch sub-transactions have their msgp copied into the fabric message;
+	// for client transactions the msgp buffer itself is handed off to it.
+	msg_set_type set_type = tr->origin == FROM_BATCH ?
+			MSG_SET_COPY : MSG_SET_HANDOFF_MALLOC;
+
+	msg_set_uint32(m, PROXY_FIELD_OP, PROXY_OP_REQUEST);
+	msg_set_uint32(m, PROXY_FIELD_TID, tid);
+	msg_set_buf(m, PROXY_FIELD_DIGEST, (void*)&tr->keyd, sizeof(cf_digest),
+			MSG_SET_COPY);
+	msg_set_buf(m, PROXY_FIELD_AS_PROTO, (void*)tr->msgp,
+			as_proto_size_get(&tr->msgp->proto), set_type);
+
+	// Set up a proxy_request and insert it in the hash.
+
+	proxy_request pr;
+
+	pr.msg_fields = tr->msg_fields;
+
+	pr.origin = tr->origin;
+	pr.from_flags = tr->from_flags;
+	pr.from.any = tr->from.any;
+	pr.batch_index = tr->from_data.batch_index;
+
+	pr.start_time = tr->start_time;
+	pr.end_time = tr->end_time;
+
+	// The proxy_request keeps the fabric message - it is re-sent on redirect
+	// and is the source of the digest and original proto later on.
+	pr.fab_msg = m;
+
+	pr.ns = ns;
+
+	// Inserted before sending - presumably so a fast response always finds its
+	// proxy_request (NOTE(review): confirm intent).
+	cf_shash_put(g_proxy_hash, &tid, &pr);
+
+	tr->msgp = NULL; // pattern, not needed
+	tr->from.any = NULL; // pattern, not needed
+
+	// Send fabric message to remote node.
+
+	// Extra reference for the send - the hash entry holds the other one.
+	msg_incr_ref(m);
+
+	if (as_fabric_send(dst, m, AS_FABRIC_CHANNEL_RW) != AS_FABRIC_SUCCESS) {
+		// Send failed - drop the send's reference. The hash entry stays, so the
+		// timeout thread will eventually finish this request.
+		as_fabric_msg_put(m);
+	}
+}
+
+
+// Proxyee - transaction reservation failed here, tell proxyer to try again.
+//
+// Sends PROXY_OP_RETURN_TO_SENDER carrying the proxyer's tid and a redirect
+// node: the node from as_partition_proxyee_redirect(), or the proxyer itself
+// when that returns 0.
+void
+as_proxy_return_to_sender(const as_transaction* tr, as_namespace* ns)
+{
+	msg* m = as_fabric_msg_get(M_TYPE_PROXY);
+	uint32_t pid = as_partition_getid(&tr->keyd);
+	cf_node redirect_node = as_partition_proxyee_redirect(ns, pid);
+
+	msg_set_uint32(m, PROXY_FIELD_OP, PROXY_OP_RETURN_TO_SENDER);
+	msg_set_uint32(m, PROXY_FIELD_TID, tr->from_data.proxy_tid);
+	msg_set_uint64(m, PROXY_FIELD_REDIRECT,
+			redirect_node == (cf_node)0 ? tr->from.proxy_node : redirect_node);
+
+	if (as_fabric_send(tr->from.proxy_node, m, AS_FABRIC_CHANNEL_RW) !=
+			AS_FABRIC_SUCCESS) {
+		as_fabric_msg_put(m);
+	}
+}
+
+
+// Proxyee - transaction completed here, send response to proxyer.
+// Builds an as_msg response from the given result code, metadata, ops and bins,
+// hands the buffer to a PROXY_OP_RESPONSE fabric message tagged with
+// 'proxy_tid', and sends it to 'dst'.
+void
+as_proxy_send_response(cf_node dst, uint32_t proxy_tid, uint32_t result_code,
+		uint32_t generation, uint32_t void_time, as_msg_op** ops, as_bin** bins,
+		uint16_t bin_count, as_namespace* ns, uint64_t trid)
+{
+	msg* m = as_fabric_msg_get(M_TYPE_PROXY);
+
+	msg_set_uint32(m, PROXY_FIELD_OP, PROXY_OP_RESPONSE);
+	msg_set_uint32(m, PROXY_FIELD_TID, proxy_tid);
+
+	size_t msg_sz = 0;
+	uint8_t* msgp = (uint8_t*)as_msg_make_response_msg(result_code, generation,
+			void_time, ops, bins, bin_count, ns, 0, &msg_sz, trid);
+
+	msg_set_buf(m, PROXY_FIELD_AS_PROTO, msgp, msg_sz, MSG_SET_HANDOFF_MALLOC);
+
+	if (as_fabric_send(dst, m, AS_FABRIC_CHANNEL_RW) != AS_FABRIC_SUCCESS) {
+		as_fabric_msg_put(m);
+	}
+}
+
+
+// Proxyee - transaction completed here, send response to proxyer.
+//
+// Variant for a response that was already built in a cf_dyn_buf. A stack
+// buffer is copied into the fabric message; a non-stack buffer is handed off
+// and db->buf is nulled so the caller can't free it.
+void
+as_proxy_send_ops_response(cf_node dst, uint32_t proxy_tid, cf_dyn_buf* db)
+{
+	msg* m = as_fabric_msg_get(M_TYPE_PROXY);
+
+	msg_set_uint32(m, PROXY_FIELD_OP, PROXY_OP_RESPONSE);
+	msg_set_uint32(m, PROXY_FIELD_TID, proxy_tid);
+
+	uint8_t* msgp = db->buf;
+	size_t msg_sz = db->used_sz;
+
+	if (db->is_stack) {
+		msg_set_buf(m, PROXY_FIELD_AS_PROTO, msgp, msg_sz, MSG_SET_COPY);
+	}
+	else {
+		msg_set_buf(m, PROXY_FIELD_AS_PROTO, msgp, msg_sz,
+				MSG_SET_HANDOFF_MALLOC);
+		db->buf = NULL; // the fabric owns the buffer now
+	}
+
+	if (as_fabric_send(dst, m, AS_FABRIC_CHANNEL_RW) != AS_FABRIC_SUCCESS) {
+		as_fabric_msg_put(m);
+	}
+}
+
+
+//==========================================================
+// Local helpers - proxyer.
+//
+
+// Handle a PROXY_OP_RESPONSE for proxy transaction 'tid': remove the
+// proxy_request from the hash, relay the result to the origin (client or
+// batch), update stats and the proxy histogram, and release the stored request
+// message. Does not release 'm' - proxy_msg_cb() does that.
+void
+proxyer_handle_response(msg* m, uint32_t tid)
+{
+	proxy_request pr;
+
+	if (cf_shash_get_and_delete(g_proxy_hash, &tid, &pr) != CF_SHASH_OK) {
+		// Some other response (or timeout) has already finished this pr.
+		return;
+	}
+
+	cf_assert(pr.from.any, AS_PROXY, "origin %u has null 'from'", pr.origin);
+
+	int result;
+
+	switch (pr.origin) {
+	case FROM_CLIENT:
+		result = proxyer_handle_client_response(m, &pr);
+		client_proxy_update_stats(pr.ns, result);
+		break;
+	case FROM_BATCH:
+		result = proxyer_handle_batch_response(m, &pr);
+		batch_sub_proxy_update_stats(pr.ns, result);
+		// Note - no worries about msgp, proxy divert copied it.
+		break;
+	default:
+		cf_crash(AS_PROXY, "unexpected transaction origin %u", pr.origin);
+		break;
+	}
+
+	pr.from.any = NULL; // pattern, not needed
+
+	as_fabric_msg_put(pr.fab_msg);
+
+	// Note that this includes both origins.
+	if (pr.ns->proxy_hist_enabled) {
+		histogram_insert_data_point(pr.ns->proxy_hist, pr.start_time);
+	}
+}
+
+
+// Relay the proxyee's as_proto response to the originating client socket.
+// Returns AS_PROTO_RESULT_OK if it was sent, AS_PROTO_RESULT_FAIL_UNKNOWN
+// otherwise. In every case the client's file handle is finished here, since
+// the caller has already removed 'pr' from the hash.
+int
+proxyer_handle_client_response(msg* m, proxy_request* pr)
+{
+	as_file_handle* fd_h = pr->from.proto_fd_h;
+
+	uint8_t* proto;
+	size_t proto_sz;
+
+	if (msg_get_buf(m, PROXY_FIELD_AS_PROTO, &proto, &proto_sz,
+			MSG_GET_DIRECT) != 0) {
+		cf_warning(AS_PROXY, "msg get for proto failed");
+		// pr is no longer in the hash, so neither a later response nor the
+		// timeout thread can answer this client - fail it here, the same way
+		// the timeout path replies.
+		as_msg_send_reply(fd_h, AS_PROTO_RESULT_FAIL_UNKNOWN, 0, 0, NULL, NULL,
+				0, pr->ns, 0);
+		return AS_PROTO_RESULT_FAIL_UNKNOWN;
+	}
+
+	if (cf_socket_send_all(&fd_h->sock, proto, proto_sz, MSG_NOSIGNAL,
+			CF_SOCKET_TIMEOUT) < 0) {
+		// Common when a client aborts.
+		as_end_of_transaction_force_close(fd_h);
+		return AS_PROTO_RESULT_FAIL_UNKNOWN;
+	}
+
+	as_end_of_transaction_ok(fd_h);
+	return AS_PROTO_RESULT_OK;
+}
+
+
+// Add the proxyee's response to the batch response for this sub-transaction.
+// The digest comes from the original request message kept in 'pr'. Returns
+// AS_PROTO_RESULT_OK, or AS_PROTO_RESULT_FAIL_UNKNOWN if the response carries
+// no proto - in which case an error is recorded for this batch index.
+int
+proxyer_handle_batch_response(msg* m, proxy_request* pr)
+{
+	cl_msg* msgp;
+	size_t msgp_sz;
+
+	if (msg_get_buf(m, PROXY_FIELD_AS_PROTO, (uint8_t**)&msgp, &msgp_sz,
+			MSG_GET_DIRECT) != 0) {
+		cf_warning(AS_PROXY, "msg get for proto failed");
+		// pr is no longer in the hash - report a result for this index now, or
+		// nothing ever will.
+		as_batch_add_error(pr->from.batch_shared, pr->batch_index,
+				AS_PROTO_RESULT_FAIL_UNKNOWN);
+		return AS_PROTO_RESULT_FAIL_UNKNOWN;
+	}
+
+	cf_digest* keyd;
+
+	if (msg_get_buf(pr->fab_msg, PROXY_FIELD_DIGEST, (uint8_t**)&keyd, NULL,
+			MSG_GET_DIRECT) != 0) {
+		cf_crash(AS_PROXY, "original msg get for digest failed");
+	}
+
+	as_batch_add_proxy_result(pr->from.batch_shared, pr->batch_index, keyd,
+			msgp, msgp_sz);
+
+	return AS_PROTO_RESULT_OK;
+}
+
+
+// Handle PROXY_OP_RETURN_TO_SENDER: the proxyee couldn't service the request.
+// If it named a usable redirect node, re-send the stored request there and
+// leave the proxy_request in the hash. Otherwise rebuild a transaction from
+// the stored request, enqueue it locally, and delete the hash entry. Runs
+// under the hash element lock obtained by cf_shash_get_vlock().
+void
+proxyer_handle_return_to_sender(msg* m, uint32_t tid)
+{
+	proxy_request* pr;
+	pthread_mutex_t* lock;
+
+	if (cf_shash_get_vlock(g_proxy_hash, &tid, (void**)&pr, &lock) !=
+			CF_SHASH_OK) {
+		// Some other response (or timeout) has already finished this pr.
+		return;
+	}
+
+	cf_node redirect_node;
+
+	if (msg_get_uint64(m, PROXY_FIELD_REDIRECT, &redirect_node) == 0
+			&& redirect_node != g_config.self_node
+			&& redirect_node != (cf_node)0) {
+		// If this node was a "random" node, i.e. neither acting nor eventual
+		// master, it diverts to the eventual master (the best it can do.) The
+		// eventual master must inform this node about the acting master.
+
+		msg_incr_ref(pr->fab_msg);
+
+		if (as_fabric_send(redirect_node, pr->fab_msg, AS_FABRIC_CHANNEL_RW) !=
+				AS_FABRIC_SUCCESS) {
+			as_fabric_msg_put(pr->fab_msg);
+		}
+
+		pthread_mutex_unlock(lock);
+		return;
+	}
+
+	cf_digest* keyd;
+
+	if (msg_get_buf(pr->fab_msg, PROXY_FIELD_DIGEST, (uint8_t**)&keyd, NULL,
+			MSG_GET_DIRECT) != 0) {
+		cf_crash(AS_PROXY, "original msg get for digest failed");
+	}
+
+	cl_msg* msgp;
+
+	// TODO - inefficient! Should be a way to 'take' a buffer from msg.
+	if (msg_get_buf(pr->fab_msg, PROXY_FIELD_AS_PROTO, (uint8_t**)&msgp, NULL,
+			MSG_GET_COPY_MALLOC) != 0) {
+		cf_crash(AS_PROXY, "original msg get for proto failed");
+	}
+
+	// Put the as_msg on the normal queue for processing.
+	as_transaction tr;
+	as_transaction_init_head(&tr, keyd, msgp);
+	// msgp might not have digest - batch sub-transactions, old clients.
+	// For old clients, will compute it again from msgp key and set.
+
+	tr.msg_fields = pr->msg_fields;
+	tr.origin = pr->origin;
+	tr.from_flags = pr->from_flags;
+	tr.from.any = pr->from.any;
+	tr.from_data.batch_index = pr->batch_index;
+	tr.start_time = pr->start_time;
+
+	as_tsvc_enqueue(&tr);
+
+	as_fabric_msg_put(pr->fab_msg);
+
+	cf_shash_delete_lockfree(g_proxy_hash, &tid);
+	pthread_mutex_unlock(lock);
+}
+
+
+//==========================================================
+// Local helpers - proxyee.
+//
+
+// Handle PROXY_OP_REQUEST from node 'src': extract digest and as_proto from
+// the fabric message, validate, and enqueue a FROM_PROXY transaction. On any
+// validation failure an AS_PROTO_RESULT_FAIL_UNKNOWN response is sent back and
+// nothing is enqueued.
+void
+proxyee_handle_request(cf_node src, msg* m, uint32_t tid)
+{
+	cf_digest* keyd;
+
+	if (msg_get_buf(m, PROXY_FIELD_DIGEST, (uint8_t**)&keyd, NULL,
+			MSG_GET_DIRECT) != 0) {
+		cf_warning(AS_PROXY, "msg get for digest failed");
+		error_response(src, tid, AS_PROTO_RESULT_FAIL_UNKNOWN);
+		return;
+	}
+
+	cl_msg* msgp;
+	size_t msgp_sz;
+
+	// From here on msgp is our own allocated copy - every path that doesn't
+	// hand it to the enqueued transaction must free it.
+	if (msg_get_buf(m, PROXY_FIELD_AS_PROTO, (uint8_t**)&msgp, &msgp_sz,
+			MSG_GET_COPY_MALLOC) != 0) {
+		cf_warning(AS_PROXY, "msg get for proto failed");
+		error_response(src, tid, AS_PROTO_RESULT_FAIL_UNKNOWN);
+		return;
+	}
+
+	// Sanity check as_proto fields.
+	as_proto* proto = &msgp->proto;
+
+	if (! as_proto_wrapped_is_valid(proto, msgp_sz)) {
+		cf_warning(AS_PROXY, "bad proto: version %u, type %u, sz %lu [%lu]",
+				proto->version, proto->type, (uint64_t)proto->sz, msgp_sz);
+		error_response(src, tid, AS_PROTO_RESULT_FAIL_UNKNOWN);
+		// NOTE(review): assumes the copy was made with cf_malloc() and that
+		// citrusleaf/alloc.h is included by this file - confirm.
+		cf_free(msgp);
+		return;
+	}
+
+	// Put the as_msg on the normal queue for processing.
+	as_transaction tr;
+	as_transaction_init_head(&tr, keyd, msgp);
+	// msgp might not have digest - batch sub-transactions, old clients.
+	// For old clients, will compute it again from msgp key and set.
+
+	tr.start_time = cf_getns();
+
+	tr.origin = FROM_PROXY;
+	tr.from.proxy_node = src;
+	tr.from_data.proxy_tid = tid;
+
+	// Proxyer has already done byte swapping in as_msg.
+	if (! as_transaction_prepare(&tr, false)) {
+		cf_warning(AS_PROXY, "bad proxy msg");
+		error_response(src, tid, AS_PROTO_RESULT_FAIL_UNKNOWN);
+		// tr is dropped here without being enqueued - release its msgp.
+		cf_free(msgp);
+		return;
+	}
+
+	// For batch sub-transactions, make sure we flag them so they're not
+	// mistaken for multi-record transactions (which never proxy).
+	if (as_transaction_has_no_key_or_digest(&tr)) {
+		tr.from_flags |= FROM_FLAG_BATCH_SUB;
+	}
+
+	as_tsvc_enqueue(&tr);
+}
+
+
+//==========================================================
+// Local helpers - timeout.
+//
+
+// Thread body - every 75 ms, sweep the proxy hash and time out any
+// proxy_request whose end_time has passed. Never returns in practice.
+void*
+run_proxy_timeout(void* arg)
+{
+	while (true) {
+		usleep(75 * 1000);
+
+		now_times now;
+
+		now.now_ns = cf_getns();
+		now.now_ms = now.now_ns / 1000000;
+
+		cf_shash_reduce(g_proxy_hash, proxy_timeout_reduce_fn, &now);
+	}
+
+	return NULL;
+}
+
+
+// Hash reduce callback - 'data' is a proxy_request, 'udata' a now_times.
+// Leaves unexpired entries alone (CF_SHASH_OK). For an expired entry, reports
+// AS_PROTO_RESULT_FAIL_TIMEOUT to the origin, updates stats, releases the
+// stored request message and asks the hash to delete the entry.
+int
+proxy_timeout_reduce_fn(const void* key, void* data, void* udata)
+{
+	proxy_request* pr = data;
+	now_times* now = (now_times*)udata;
+
+	if (now->now_ns < pr->end_time) {
+		return CF_SHASH_OK;
+	}
+
+	// Handle timeouts.
+
+	cf_assert(pr->from.any, AS_PROXY, "origin %u has null 'from'", pr->origin);
+
+	switch (pr->origin) {
+	case FROM_CLIENT:
+		// TODO - when it becomes important enough, find a way to echo trid.
+		as_msg_send_reply(pr->from.proto_fd_h, AS_PROTO_RESULT_FAIL_TIMEOUT, 0,
+				0, NULL, NULL, 0, pr->ns, 0);
+		client_proxy_update_stats(pr->ns, AS_PROTO_RESULT_FAIL_TIMEOUT);
+		break;
+	case FROM_BATCH:
+		as_batch_add_error(pr->from.batch_shared, pr->batch_index,
+				AS_PROTO_RESULT_FAIL_TIMEOUT);
+		// Note - no worries about msgp, proxy divert copied it.
+		batch_sub_proxy_update_stats(pr->ns, AS_PROTO_RESULT_FAIL_TIMEOUT);
+		break;
+	default:
+		cf_crash(AS_PROXY, "unexpected transaction origin %u", pr->origin);
+		break;
+	}
+
+	pr->from.any = NULL; // pattern, not needed
+	as_fabric_msg_put(pr->fab_msg);
+
+	return CF_SHASH_REDUCE_DELETE;
+}
+
+
+//==========================================================
+// Local helpers - handle PROXY fabric messages.
+//
+
+// Fabric callback for M_TYPE_PROXY messages. Reads the op and tid fields and
+// dispatches to the proxyee/proxyer handler. Always releases 'm' and returns 0.
+int
+proxy_msg_cb(cf_node src, msg* m, void* udata)
+{
+	uint32_t op;
+
+	if (msg_get_uint32(m, PROXY_FIELD_OP, &op) != 0) {
+		cf_warning(AS_PROXY, "msg get for op failed");
+		as_fabric_msg_put(m);
+		return 0;
+	}
+
+	uint32_t tid;
+
+	if (msg_get_uint32(m, PROXY_FIELD_TID, &tid) != 0) {
+		cf_warning(AS_PROXY, "msg get for tid failed");
+		as_fabric_msg_put(m);
+		return 0;
+	}
+
+	switch (op) {
+	case PROXY_OP_REQUEST:
+		proxyee_handle_request(src, m, tid);
+		break;
+	case PROXY_OP_RESPONSE:
+		proxyer_handle_response(m, tid);
+		break;
+	case PROXY_OP_RETURN_TO_SENDER:
+		proxyer_handle_return_to_sender(m, tid);
+		break;
+	default:
+		cf_warning(AS_PROXY, "received unexpected message op %u", op);
+		break;
+	}
+
+	as_fabric_msg_put(m);
+	return 0;
+}
diff --git a/as/src/transaction/re_replicate_ce.c b/as/src/transaction/re_replicate_ce.c
new file mode 100644
index 00000000..ffb518e6
--- /dev/null
+++ b/as/src/transaction/re_replicate_ce.c
@@ -0,0 +1,43 @@
+/*
+ * re_replicate_ce.c
+ *
+ * Copyright (C) 2017-2018 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+//==========================================================
+// Includes.
+//
+
+#include "transaction/re_replicate.h"
+
+#include "fault.h"
+
+#include "base/transaction.h"
+
+
+//==========================================================
+// Public API.
+//
+
+// Community-edition stub - crashes if ever called. The return statement only
+// satisfies the signature.
+transaction_status
+as_re_replicate_start(as_transaction* tr)
+{
+	cf_crash(AS_RW, "CE code called as_re_replicate_start()");
+	return TRANS_DONE_ERROR;
+}
diff --git a/as/src/transaction/read.c b/as/src/transaction/read.c
new file mode 100644
index 00000000..7d9d7949
--- /dev/null
+++ b/as/src/transaction/read.c
@@ -0,0 +1,625 @@
+/*
+ * read.c
+ *
+ * Copyright (C) 2016 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+//==========================================================
+// Includes.
+// + +#include "transaction/read.h" + +#include +#include +#include +#include + +#include "citrusleaf/alloc.h" +#include "citrusleaf/cf_atomic.h" +#include "citrusleaf/cf_clock.h" + +#include "dynbuf.h" +#include "fault.h" + +#include "base/batch.h" +#include "base/cfg.h" +#include "base/datamodel.h" +#include "base/index.h" +#include "base/proto.h" +#include "base/transaction.h" +#include "base/transaction_policy.h" +#include "fabric/partition.h" +#include "storage/storage.h" +#include "transaction/duplicate_resolve.h" +#include "transaction/proxy.h" +#include "transaction/replica_ping.h" +#include "transaction/rw_request.h" +#include "transaction/rw_request_hash.h" +#include "transaction/rw_utils.h" + + +//========================================================== +// Forward declarations. +// + +void start_read_dup_res(rw_request* rw, as_transaction* tr); +void start_repl_ping(rw_request* rw, as_transaction* tr); +bool read_dup_res_cb(rw_request* rw); +void repl_ping_after_dup_res(rw_request* rw, as_transaction* tr); +void repl_ping_cb(rw_request* rw); + +void send_read_response(as_transaction* tr, as_msg_op** ops, + as_bin** response_bins, uint16_t n_bins, cf_dyn_buf* db); +void read_timeout_cb(rw_request* rw); + +transaction_status read_local(as_transaction* tr); +void read_local_done(as_transaction* tr, as_index_ref* r_ref, as_storage_rd* rd, + int result_code); + + +//========================================================== +// Inlines & macros. 
+//
+
+// True when the reservation has duplicates and the transaction's read
+// consistency level is ALL.
+static inline bool
+read_must_duplicate_resolve(const as_transaction* tr)
+{
+	return tr->rsv.n_dupl != 0 &&
+			TR_READ_CONSISTENCY_LEVEL(tr) == AS_READ_CONSISTENCY_LEVEL_ALL;
+}
+
+// True when the transaction carries the MUST_PING flag.
+static inline bool
+read_must_ping(const as_transaction *tr)
+{
+	return (tr->flags & AS_TRANSACTION_FLAG_MUST_PING) != 0;
+}
+
+// Bump exactly one client-read counter for this result code: success, timeout,
+// not-found, or (for anything else) error.
+static inline void
+client_read_update_stats(as_namespace* ns, uint8_t result_code)
+{
+	switch (result_code) {
+	case AS_PROTO_RESULT_OK:
+		cf_atomic64_incr(&ns->n_client_read_success);
+		break;
+	case AS_PROTO_RESULT_FAIL_TIMEOUT:
+		cf_atomic64_incr(&ns->n_client_read_timeout);
+		break;
+	default:
+		cf_atomic64_incr(&ns->n_client_read_error);
+		break;
+	// Note - a case label after 'default' is still matched normally.
+	case AS_PROTO_RESULT_FAIL_NOT_FOUND:
+		cf_atomic64_incr(&ns->n_client_read_not_found);
+		break;
+	}
+}
+
+// Same classification as above, for the batch sub-transaction read counters.
+static inline void
+batch_sub_read_update_stats(as_namespace* ns, uint8_t result_code)
+{
+	switch (result_code) {
+	case AS_PROTO_RESULT_OK:
+		cf_atomic64_incr(&ns->n_batch_sub_read_success);
+		break;
+	case AS_PROTO_RESULT_FAIL_TIMEOUT:
+		cf_atomic64_incr(&ns->n_batch_sub_read_timeout);
+		break;
+	default:
+		cf_atomic64_incr(&ns->n_batch_sub_read_error);
+		break;
+	case AS_PROTO_RESULT_FAIL_NOT_FOUND:
+		cf_atomic64_incr(&ns->n_batch_sub_read_not_found);
+		break;
+	}
+}
+
+
+//==========================================================
+// Public API.
+//
+
+// Entry point for a read transaction. Reads locally right away when neither
+// duplicate resolution nor replica ping is needed; otherwise (or if the local
+// read asks for a retry) creates an rw_request, inserts it in the rw_request
+// hash, and starts dup-res or repl-ping, or retries the local read under the
+// hash. Returns the resulting transaction_status.
+transaction_status
+as_read_start(as_transaction* tr)
+{
+	BENCHMARK_START(tr, read, FROM_CLIENT);
+	BENCHMARK_START(tr, batch_sub, FROM_BATCH);
+
+	if (! repl_ping_check(tr)) {
+		send_read_response(tr, NULL, NULL, 0, NULL);
+		return TRANS_DONE_ERROR;
+	}
+
+	transaction_status status;
+	bool must_duplicate_resolve = read_must_duplicate_resolve(tr);
+	bool must_ping = read_must_ping(tr);
+
+	if (! must_duplicate_resolve && ! must_ping) {
+		// No network hops needed, try reading.
+		if ((status = read_local(tr)) != TRANS_IN_PROGRESS) {
+			return status;
+		}
+		// else - must try again under hash.
+	}
+	// else - there are duplicates, and we're configured to resolve them, or
+	// we're required to ping replicas.
+
+	// Create rw_request and add to hash.
+	rw_request_hkey hkey = { tr->rsv.ns->id, tr->keyd };
+	rw_request* rw = rw_request_create(&tr->keyd);
+
+	// If rw_request isn't inserted in hash, transaction is finished.
+	if ((status = rw_request_hash_insert(&hkey, rw, tr)) != TRANS_IN_PROGRESS) {
+		rw_request_release(rw);
+
+		if (status != TRANS_WAITING) {
+			send_read_response(tr, NULL, NULL, 0, NULL);
+		}
+
+		return status;
+	}
+	// else - rw_request is now in hash, continue...
+
+	if (must_duplicate_resolve) {
+		start_read_dup_res(rw, tr);
+
+		// Started duplicate resolution.
+		return TRANS_IN_PROGRESS;
+	}
+
+	if (must_ping) {
+		// Set up the nodes to which we'll ping.
+		rw->n_dest_nodes = as_partition_get_other_replicas(tr->rsv.p,
+				rw->dest_nodes);
+
+		if (insufficient_replica_destinations(tr->rsv.ns, rw->n_dest_nodes)) {
+			rw_request_hash_delete(&hkey, rw);
+			tr->result_code = AS_PROTO_RESULT_FAIL_UNAVAILABLE;
+			send_read_response(tr, NULL, NULL, 0, NULL);
+			return TRANS_DONE_ERROR;
+		}
+
+		start_repl_ping(rw, tr);
+
+		// Started replica ping.
+		return TRANS_IN_PROGRESS;
+	}
+
+	// Trying again under hash.
+	status = read_local(tr);
+	cf_assert(status != TRANS_IN_PROGRESS, AS_RW, "read in-progress");
+	rw_request_hash_delete(&hkey, rw);
+
+	return status;
+}
+
+
+//==========================================================
+// Local helpers - transaction flow.
+//
+
+// Build the dup-res message, then - under the rw_request lock - finish setting
+// up 'rw' with the dup-res and timeout callbacks and send the messages.
+void
+start_read_dup_res(rw_request* rw, as_transaction* tr)
+{
+	// Finish initializing rw_request, construct and send dup-res message.
+
+	dup_res_make_message(rw, tr);
+
+	pthread_mutex_lock(&rw->lock);
+
+	dup_res_setup_rw(rw, tr, read_dup_res_cb, read_timeout_cb);
+	send_rw_messages(rw);
+
+	pthread_mutex_unlock(&rw->lock);
+}
+
+
+// Build the repl-ping message, then - under the rw_request lock - finish
+// setting up 'rw' with the repl-ping and timeout callbacks and send the
+// messages.
+void
+start_repl_ping(rw_request* rw, as_transaction* tr)
+{
+	// Finish initializing rw, construct and send repl-ping message.
+
+	repl_ping_make_message(rw, tr);
+
+	pthread_mutex_lock(&rw->lock);
+
+	repl_ping_setup_rw(rw, tr, repl_ping_cb, read_timeout_cb);
+	send_rw_messages(rw);
+
+	pthread_mutex_unlock(&rw->lock);
+}
+
+
+// Callback when duplicate resolution completes. Returns true when the
+// transaction is finished with 'rw', false when 'rw' has been recycled to do
+// replica pings.
+bool
+read_dup_res_cb(rw_request* rw)
+{
+	BENCHMARK_NEXT_DATA_POINT(rw, read, dup_res);
+	BENCHMARK_NEXT_DATA_POINT(rw, batch_sub, dup_res);
+
+	as_transaction tr;
+	as_transaction_init_from_rw(&tr, rw);
+
+	if (tr.result_code != AS_PROTO_RESULT_OK) {
+		send_read_response(&tr, NULL, NULL, 0, NULL);
+		return true;
+	}
+
+	if (read_must_ping(&tr)) {
+		// Set up the nodes to which we'll ping.
+		rw->n_dest_nodes = as_partition_get_other_replicas(tr.rsv.p,
+				rw->dest_nodes);
+
+		if (insufficient_replica_destinations(tr.rsv.ns, rw->n_dest_nodes)) {
+			tr.result_code = AS_PROTO_RESULT_FAIL_UNAVAILABLE;
+			send_read_response(&tr, NULL, NULL, 0, NULL);
+			return true;
+		}
+
+		repl_ping_after_dup_res(rw, &tr);
+
+		return false;
+	}
+
+	// Read the local copy and respond to origin.
+	transaction_status status = read_local(&tr);
+
+	cf_assert(status != TRANS_IN_PROGRESS, AS_RW, "read in-progress");
+
+	if (status == TRANS_WAITING) {
+		// Note - new tr now owns msgp, make sure rw destructor doesn't free it.
+		// Also, rw will release rsv - new tr will get a new one.
+		rw->msgp = NULL;
+	}
+
+	// Finished transaction - rw_request cleans up reservation and msgp!
+	return true;
+}
+
+
+// Reuse 'rw' for replica pings once duplicate resolution is done. Takes no
+// lock itself - see the note below.
+void
+repl_ping_after_dup_res(rw_request* rw, as_transaction* tr)
+{
+	// Recycle rw_request that was just used for duplicate resolution to now do
+	// replica pings. Note - we are under the rw_request lock here!
+
+	repl_ping_make_message(rw, tr);
+	repl_ping_reset_rw(rw, tr, repl_ping_cb);
+	send_rw_messages(rw);
+}
+
+
+// Callback when replica pings complete - read the local copy and respond.
+void
+repl_ping_cb(rw_request* rw)
+{
+	BENCHMARK_NEXT_DATA_POINT(rw, read, repl_ping);
+	BENCHMARK_NEXT_DATA_POINT(rw, batch_sub, repl_ping);
+
+	as_transaction tr;
+	as_transaction_init_from_rw(&tr, rw);
+
+	// Read the local copy and respond to origin.
+	transaction_status status = read_local(&tr);
+
+	cf_assert(status != TRANS_IN_PROGRESS, AS_RW, "read in-progress");
+
+	if (status == TRANS_WAITING) {
+		// Note - new tr now owns msgp, make sure rw destructor doesn't free it.
+		// Also, rw will release rsv - new tr will get a new one.
+		rw->msgp = NULL;
+	}
+}
+
+
+//==========================================================
+// Local helpers - transaction end.
+//
+
+// Deliver the read result to the transaction's origin (client, proxy, or
+// batch). If 'db' is non-null with used_sz != 0 it holds a pre-built response
+// and is sent as-is (client and proxy origins only); otherwise the response is
+// built from tr's result_code/generation/void_time plus 'ops' and
+// 'response_bins'. Nulls tr->from.any when done.
+void
+send_read_response(as_transaction* tr, as_msg_op** ops, as_bin** response_bins,
+		uint16_t n_bins, cf_dyn_buf* db)
+{
+	// Paranoia - shouldn't get here on losing race with timeout.
+	if (! tr->from.any) {
+		cf_warning(AS_RW, "transaction origin %u has null 'from'", tr->origin);
+		return;
+	}
+
+	// Note - if tr was setup from rw, rw->from.any has been set null and
+	// informs timeout it lost the race.
+
+	switch (tr->origin) {
+	case FROM_CLIENT:
+		BENCHMARK_NEXT_DATA_POINT(tr, read, local);
+		if (db && db->used_sz != 0) {
+			as_msg_send_ops_reply(tr->from.proto_fd_h, db);
+		}
+		else {
+			as_msg_send_reply(tr->from.proto_fd_h, tr->result_code,
+					tr->generation, tr->void_time, ops, response_bins, n_bins,
+					tr->rsv.ns, as_transaction_trid(tr));
+		}
+		BENCHMARK_NEXT_DATA_POINT(tr, read, response);
+		HIST_TRACK_ACTIVATE_INSERT_DATA_POINT(tr, read_hist);
+		client_read_update_stats(tr->rsv.ns, tr->result_code);
+		break;
+	case FROM_PROXY:
+		if (db && db->used_sz != 0) {
+			as_proxy_send_ops_response(tr->from.proxy_node,
+					tr->from_data.proxy_tid, db);
+		}
+		else {
+			as_proxy_send_response(tr->from.proxy_node, tr->from_data.proxy_tid,
+					tr->result_code, tr->generation, tr->void_time, ops,
+					response_bins, n_bins, tr->rsv.ns, as_transaction_trid(tr));
+		}
+		break;
+	case FROM_BATCH:
+		BENCHMARK_NEXT_DATA_POINT(tr, batch_sub, read_local);
+		as_batch_add_result(tr, n_bins, response_bins, ops);
+		BENCHMARK_NEXT_DATA_POINT(tr, batch_sub, response);
+		batch_sub_read_update_stats(tr->rsv.ns, tr->result_code);
+		break;
+	default:
+		cf_crash(AS_RW, "unexpected transaction origin %u", tr->origin);
+		break;
+	}
+
+	tr->from.any = NULL; // pattern, not needed
+}
+
+
+// Timeout callback for an rw_request used by a read. Reports
+// AS_PROTO_RESULT_FAIL_TIMEOUT to client or batch origins (nothing is sent for
+// proxy origin) and nulls rw->from.any.
+void
+read_timeout_cb(rw_request* rw)
+{
+	if (! rw->from.any) {
+		return; // lost race against dup-res callback
+	}
+
+	switch (rw->origin) {
+	case FROM_CLIENT:
+		as_msg_send_reply(rw->from.proto_fd_h, AS_PROTO_RESULT_FAIL_TIMEOUT, 0,
+				0, NULL, NULL, 0, rw->rsv.ns, rw_request_trid(rw));
+		// Timeouts aren't included in histograms.
+		client_read_update_stats(rw->rsv.ns, AS_PROTO_RESULT_FAIL_TIMEOUT);
+		break;
+	case FROM_PROXY:
+		break;
+	case FROM_BATCH:
+		as_batch_add_error(rw->from.batch_shared, rw->from_data.batch_index,
+				AS_PROTO_RESULT_FAIL_TIMEOUT);
+		// Timeouts aren't included in histograms.
+		batch_sub_read_update_stats(rw->rsv.ns, AS_PROTO_RESULT_FAIL_TIMEOUT);
+		break;
+	default:
+		cf_crash(AS_RW, "unexpected transaction origin %u", rw->origin);
+		break;
+	}
+
+	rw->from.any = NULL; // inform other callback it lost the race
+}
+
+
+//==========================================================
+// Local helpers - read local.
+//
+
+// Read the record from this node and respond to the origin.
+//
+// Returns TRANS_DONE_SUCCESS or TRANS_DONE_ERROR once a response has been
+// sent. When repl_state_check() returns non-zero, no response is sent and the
+// function returns TRANS_IN_PROGRESS (result 1) or TRANS_WAITING (any other
+// non-zero result).
+transaction_status
+read_local(as_transaction* tr)
+{
+	as_msg* m = &tr->msgp->msg;
+	as_namespace* ns = tr->rsv.ns;
+
+	as_index_ref r_ref;
+	r_ref.skip_lock = false;
+
+	if (as_record_get(tr->rsv.tree, &tr->keyd, &r_ref) != 0) {
+		read_local_done(tr, NULL, NULL, AS_PROTO_RESULT_FAIL_NOT_FOUND);
+		return TRANS_DONE_ERROR;
+	}
+
+	as_record* r = r_ref.r;
+
+	// Check if it's an expired or truncated record.
+	if (as_record_is_doomed(r, ns)) {
+		read_local_done(tr, &r_ref, NULL, AS_PROTO_RESULT_FAIL_NOT_FOUND);
+		return TRANS_DONE_ERROR;
+	}
+
+	int result = repl_state_check(r, tr);
+
+	if (result != 0) {
+		// No response sent to origin.
+		as_record_done(&r_ref, ns);
+		return result == 1 ? TRANS_IN_PROGRESS : TRANS_WAITING;
+	}
+
+	// Check if it's a tombstone.
+	if (! as_record_is_live(r)) {
+		read_local_done(tr, &r_ref, NULL, AS_PROTO_RESULT_FAIL_NOT_FOUND);
+		return TRANS_DONE_ERROR;
+	}
+
+	as_storage_rd rd;
+
+	as_storage_record_open(ns, r, &rd);
+
+	// Check the key if required.
+	// Note - for data-not-in-memory "exists" ops, key check is expensive!
+	if (as_transaction_has_key(tr) &&
+			as_storage_record_get_key(&rd) && ! check_msg_key(m, &rd)) {
+		read_local_done(tr, &r_ref, &rd, AS_PROTO_RESULT_FAIL_KEY_MISMATCH);
+		return TRANS_DONE_ERROR;
+	}
+
+	// Metadata-only request - respond without loading bins.
+	if ((m->info1 & AS_MSG_INFO1_GET_NO_BINS) != 0) {
+		tr->generation = r->generation;
+		tr->void_time = r->void_time;
+		tr->last_update_time = r->last_update_time;
+
+		read_local_done(tr, &r_ref, &rd, AS_PROTO_RESULT_OK);
+		return TRANS_DONE_SUCCESS;
+	}
+
+	// Negative results from the storage loaders are negated to become the
+	// result code sent to the origin.
+	if ((result = as_storage_rd_load_n_bins(&rd)) < 0) {
+		cf_warning_digest(AS_RW, &tr->keyd, "{%s} read_local: failed as_storage_rd_load_n_bins() ", ns->name);
+		read_local_done(tr, &r_ref, &rd, -result);
+		return TRANS_DONE_ERROR;
+	}
+
+	// Stack space for bins is only reserved when data is not in memory.
+	as_bin stack_bins[ns->storage_data_in_memory ? 0 : rd.n_bins];
+
+	if ((result = as_storage_rd_load_bins(&rd, stack_bins)) < 0) {
+		cf_warning_digest(AS_RW, &tr->keyd, "{%s} read_local: failed as_storage_rd_load_bins() ", ns->name);
+		read_local_done(tr, &r_ref, &rd, -result);
+		return TRANS_DONE_ERROR;
+	}
+
+	if (! as_bin_inuse_has(&rd)) {
+		cf_warning_digest(AS_RW, &tr->keyd, "{%s} read_local: found record with no bins ", ns->name);
+		read_local_done(tr, &r_ref, &rd, AS_PROTO_RESULT_FAIL_UNKNOWN);
+		return TRANS_DONE_ERROR;
+	}
+
+	// Upper bound on response entries - all bins for get-all, else one per op.
+	uint32_t bin_count = (m->info1 & AS_MSG_INFO1_GET_ALL) != 0 ?
+			rd.n_bins : m->n_ops;
+
+	as_msg_op* ops[bin_count];
+	as_msg_op** p_ops = ops;
+	as_bin* response_bins[bin_count];
+	uint16_t n_bins = 0;
+
+	// Holds bins produced by CDT reads - destroyed via destroy_stack_bins()
+	// on every exit path below.
+	as_bin result_bins[bin_count];
+	uint32_t n_result_bins = 0;
+
+	if ((m->info1 & AS_MSG_INFO1_GET_ALL) != 0) {
+		p_ops = NULL;
+		n_bins = as_bin_inuse_count(&rd);
+		as_bin_get_all_p(&rd, response_bins);
+	}
+	else {
+		if (m->n_ops == 0) {
+			cf_warning_digest(AS_RW, &tr->keyd, "{%s} read_local: bin op(s) expected, none present ", ns->name);
+			read_local_done(tr, &r_ref, &rd, AS_PROTO_RESULT_FAIL_PARAMETER);
+			return TRANS_DONE_ERROR;
+		}
+
+		// With respond-all-ops, an op whose bin is missing (or whose CDT read
+		// yields nothing) still gets a response slot, with a NULL bin.
+		bool respond_all_ops = (m->info2 & AS_MSG_INFO2_RESPOND_ALL_OPS) != 0;
+
+		as_msg_op* op = 0;
+		int n = 0;
+
+		while ((op = as_msg_op_iterate(m, op, &n)) != NULL) {
+			if (op->op == AS_MSG_OP_READ) {
+				as_bin* b = as_bin_get_from_buf(&rd, op->name, op->name_sz);
+
+				if (b || respond_all_ops) {
+					ops[n_bins] = op;
+					response_bins[n_bins++] = b;
+				}
+			}
+			else if (op->op == AS_MSG_OP_CDT_READ) {
+				as_bin* b = as_bin_get_from_buf(&rd, op->name, op->name_sz);
+
+				if (b) {
+					as_bin* rb = &result_bins[n_result_bins];
+					as_bin_set_empty(rb);
+
+					if ((result = as_bin_cdt_read_from_client(b, op, rb)) < 0) {
+						cf_warning_digest(AS_RW, &tr->keyd, "{%s} read_local: failed as_bin_cdt_read_from_client() ", ns->name);
+						destroy_stack_bins(result_bins, n_result_bins);
+						read_local_done(tr, &r_ref, &rd, -result);
+						return TRANS_DONE_ERROR;
+					}
+
+					if (as_bin_inuse(rb)) {
+						n_result_bins++;
+						ops[n_bins] = op;
+						response_bins[n_bins++] = rb;
+					}
+					else if (respond_all_ops) {
+						ops[n_bins] = op;
+						response_bins[n_bins++] = NULL;
+					}
+				}
+				else if (respond_all_ops) {
+					ops[n_bins] = op;
+					response_bins[n_bins++] = NULL;
+				}
+			}
+			else {
+				cf_warning_digest(AS_RW, &tr->keyd, "{%s} read_local: unexpected bin op %u ", ns->name, op->op);
+				destroy_stack_bins(result_bins, n_result_bins);
+				read_local_done(tr, &r_ref, &rd, AS_PROTO_RESULT_FAIL_PARAMETER);
+				return TRANS_DONE_ERROR;
+			}
+		}
+	}
+
+	// Declares dyn-buf 'db' and its stack buffer 'dyn_bufdb' (both used below).
+	// NOTE(review): the check after the record lock relies on db.used_sz being
+	// 0 here for the batch path - confirm against the macro.
+	cf_dyn_buf_define_size(db, 16 * 1024);
+
+	if (tr->origin != FROM_BATCH) {
+		// Build the response while still holding the record, send it later.
+		db.used_sz = db.alloc_sz;
+		db.buf = (uint8_t*)as_msg_make_response_msg(tr->result_code,
+				r->generation, r->void_time, p_ops, response_bins, n_bins, ns,
+				(cl_msg*)dyn_bufdb, &db.used_sz, as_transaction_trid(tr));
+
+		db.is_stack = db.buf == dyn_bufdb;
+		// Note - not bothering to correct alloc_sz if buf was allocated.
+	}
+	else {
+		tr->generation = r->generation;
+		tr->void_time = r->void_time;
+		tr->last_update_time = r->last_update_time;
+
+		// Since as_batch_add_result() constructs response directly in shared
+		// buffer to avoid extra copies, can't use db.
+		send_read_response(tr, p_ops, response_bins, n_bins, NULL);
+	}
+
+	destroy_stack_bins(result_bins, n_result_bins);
+	as_storage_record_close(&rd);
+	as_record_done(&r_ref, ns);
+
+	// Now that we're not under the record lock, send the message we just built.
+	if (db.used_sz != 0) {
+		send_read_response(tr, NULL, NULL, 0, &db);
+
+		cf_dyn_buf_free(&db);
+		tr->from.proto_fd_h = NULL;
+	}
+
+	return TRANS_DONE_SUCCESS;
+}
+
+
+// Common exit for read_local() paths that respond with only a result code:
+// closes the storage record (if 'rd' given) and releases the record (if
+// 'r_ref' given), stores 'result_code' in tr, and sends the response.
+void
+read_local_done(as_transaction* tr, as_index_ref* r_ref, as_storage_rd* rd,
+		int result_code)
+{
+	if (r_ref) {
+		if (rd) {
+			as_storage_record_close(rd);
+		}
+
+		as_record_done(r_ref, tr->rsv.ns);
+	}
+
+	tr->result_code = (uint8_t)result_code;
+
+	send_read_response(tr, NULL, NULL, 0, NULL);
+}
diff --git a/as/src/transaction/replica_ping_ce.c b/as/src/transaction/replica_ping_ce.c
new file mode 100644
index 00000000..c4a09df0
--- /dev/null
+++ b/as/src/transaction/replica_ping_ce.c
@@ -0,0 +1,88 @@
+/*
+ * replica_ping_ce.c
+ *
+ * Copyright (C) 2017-2018 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +//========================================================== +// Includes. +// + +#include "transaction/replica_ping.h" + +#include + +#include "fault.h" +#include "msg.h" +#include "node.h" + +#include "base/datamodel.h" +#include "base/transaction.h" +#include "fabric/fabric.h" +#include "transaction/rw_request.h" + + +//========================================================== +// Public API. 
+//
+
+// Community-edition gate: a linearized read is rejected with
+// AS_PROTO_RESULT_FAIL_ENTERPRISE_ONLY (returns false); any other transaction
+// passes (returns true).
+bool
+repl_ping_check(as_transaction* tr)
+{
+	if (as_transaction_is_linearized_read(tr)) {
+		cf_warning(AS_RW, "linearized read is an enterprise feature");
+		tr->result_code = AS_PROTO_RESULT_FAIL_ENTERPRISE_ONLY;
+		return false;
+	}
+
+	return true;
+}
+
+// Community-edition stub - crashes if ever called.
+void
+repl_ping_make_message(rw_request* rw, as_transaction* tr)
+{
+	cf_crash(AS_RW, "CE code called repl_ping_make_message()");
+}
+
+// Community-edition stub - crashes if ever called.
+void
+repl_ping_setup_rw(rw_request* rw, as_transaction* tr,
+		repl_ping_done_cb repl_ping_cb, timeout_done_cb timeout_cb)
+{
+	cf_crash(AS_RW, "CE code called repl_ping_setup_rw()");
+}
+
+// Community-edition stub - crashes if ever called.
+void
+repl_ping_reset_rw(rw_request* rw, as_transaction* tr, repl_ping_done_cb cb)
+{
+	cf_crash(AS_RW, "CE code called repl_ping_reset_rw()");
+}
+
+// Community-edition handler for an incoming repl-ping op - unlike the stubs
+// above it only warns, then releases the fabric message.
+void
+repl_ping_handle_op(cf_node node, msg* m)
+{
+	cf_warning(AS_RW, "CE code called repl_ping_handle_op()");
+	as_fabric_msg_put(m);
+}
+
+// Community-edition handler for an incoming repl-ping ack - warns, then
+// releases the fabric message.
+void
+repl_ping_handle_ack(cf_node node, msg* m)
+{
+	cf_warning(AS_RW, "CE code called repl_ping_handle_ack()");
+	as_fabric_msg_put(m);
+}
diff --git a/as/src/transaction/replica_write.c b/as/src/transaction/replica_write.c
new file mode 100644
index 00000000..ae0b45a0
--- /dev/null
+++ b/as/src/transaction/replica_write.c
@@ -0,0 +1,520 @@
+/*
+ * replica_write.c
+ *
+ * Copyright (C) 2016 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +//========================================================== +// Includes. +// + +#include "transaction/replica_write.h" + +#include +#include +#include +#include +#include + +#include "citrusleaf/cf_clock.h" +#include "citrusleaf/cf_digest.h" + +#include "fault.h" +#include "msg.h" +#include "node.h" + +#include "base/cfg.h" +#include "base/datamodel.h" +#include "base/index.h" +#include "base/proto.h" +#include "base/rec_props.h" +#include "base/secondary_index.h" +#include "base/transaction.h" +#include "base/xdr_serverside.h" +#include "fabric/fabric.h" +#include "fabric/partition.h" +#include "transaction/delete.h" +#include "transaction/rw_request.h" +#include "transaction/rw_request_hash.h" +#include "transaction/rw_utils.h" + + +//========================================================== +// Forward declarations. +// + +uint32_t pack_info_bits(as_transaction* tr); +void send_repl_write_ack(cf_node node, msg* m, uint32_t result); +uint32_t parse_result_code(msg* m); +void drop_replica(as_partition_reservation* rsv, cf_digest* keyd, + bool is_nsup_delete, bool is_xdr_op, cf_node master); + + +//========================================================== +// Public API. 
+//
+
+// Build (or reset and reuse) rw->dest_msg as an RW_OP_WRITE fabric message
+// for this transaction: namespace, ns-id, digest, tid, generation,
+// last-update-time, optional void-time, the pickled record, optional set name
+// and key from the pickled rec-props, and the info bits if any are set.
+// The pickled buffer is handed off to the message, so rw->pickled_buf is
+// nulled afterwards.
+void
+repl_write_make_message(rw_request* rw, as_transaction* tr)
+{
+	if (rw->dest_msg) {
+		msg_reset(rw->dest_msg);
+	}
+	else {
+		rw->dest_msg = as_fabric_msg_get(M_TYPE_RW);
+	}
+
+	// TODO - remove this when we're comfortable:
+	cf_assert(rw->pickled_buf, AS_RW, "making repl-write msg with null pickle");
+
+	as_namespace* ns = tr->rsv.ns;
+	msg* m = rw->dest_msg;
+
+	msg_set_uint32(m, RW_FIELD_OP, RW_OP_WRITE);
+	msg_set_buf(m, RW_FIELD_NAMESPACE, (uint8_t*)ns->name, strlen(ns->name),
+			MSG_SET_COPY);
+	msg_set_uint32(m, RW_FIELD_NS_ID, ns->id);
+	msg_set_buf(m, RW_FIELD_DIGEST, (void*)&tr->keyd, sizeof(cf_digest),
+			MSG_SET_COPY);
+	msg_set_uint32(m, RW_FIELD_TID, rw->tid);
+	msg_set_uint32(m, RW_FIELD_GENERATION, tr->generation);
+	msg_set_uint64(m, RW_FIELD_LAST_UPDATE_TIME, tr->last_update_time);
+
+	if (tr->void_time != 0) {
+		msg_set_uint32(m, RW_FIELD_VOID_TIME, tr->void_time);
+	}
+
+	uint32_t info = pack_info_bits(tr);
+
+	repl_write_flag_pickle(tr, rw->pickled_buf, &info);
+
+	msg_set_buf(m, RW_FIELD_RECORD, (void*)rw->pickled_buf, rw->pickled_sz,
+			MSG_SET_HANDOFF_MALLOC);
+
+	// Make sure destructor doesn't free this.
+	rw->pickled_buf = NULL;
+
+	// TODO - replace rw->pickled_rec_props with individual fields.
+	if (rw->pickled_rec_props.p_data) {
+		const char* set_name;
+		uint32_t set_name_size;
+
+		if (as_rec_props_get_value(&rw->pickled_rec_props,
+				CL_REC_PROPS_FIELD_SET_NAME, &set_name_size,
+				(uint8_t**)&set_name) == 0) {
+			// Sent length is one less than the stored size.
+			msg_set_buf(m, RW_FIELD_SET_NAME, (const uint8_t *)set_name,
+					set_name_size - 1, MSG_SET_COPY);
+		}
+
+		uint32_t key_size;
+		uint8_t* key;
+
+		if (as_rec_props_get_value(&rw->pickled_rec_props,
+				CL_REC_PROPS_FIELD_KEY, &key_size, &key) == 0) {
+			msg_set_buf(m, RW_FIELD_KEY, key, key_size, MSG_SET_COPY);
+		}
+	}
+
+	if (info != 0) {
+		msg_set_uint32(m, RW_FIELD_INFO, info);
+	}
+}
+
+
+// Move transaction state from tr into rw ahead of replica writes: takes over
+// tr->msgp and tr->from (both nulled in tr), copies the partition reservation
+// and record metadata, installs the callbacks, arms the first retransmit
+// deadline, clears the per-destination completion flags, and marks rw set up.
+void
+repl_write_setup_rw(rw_request* rw, as_transaction* tr,
+		repl_write_done_cb repl_write_cb, timeout_done_cb timeout_cb)
+{
+	rw->msgp = tr->msgp;
+	tr->msgp = NULL;
+
+	rw->msg_fields = tr->msg_fields;
+	rw->origin = tr->origin;
+	rw->from_flags = tr->from_flags;
+
+	rw->from.any = tr->from.any;
+	rw->from_data.any = tr->from_data.any;
+	tr->from.any = NULL;
+
+	rw->start_time = tr->start_time;
+	rw->benchmark_time = tr->benchmark_time;
+
+	as_partition_reservation_copy(&rw->rsv, &tr->rsv);
+	// Hereafter, rw_request must release reservation - happens in destructor.
+
+	rw->end_time = tr->end_time;
+	rw->flags = tr->flags;
+	rw->generation = tr->generation;
+	rw->void_time = tr->void_time;
+	rw->last_update_time = tr->last_update_time;
+
+	rw->repl_write_cb = repl_write_cb;
+	rw->timeout_cb = timeout_cb;
+
+	rw->xmit_ms = cf_getms() + g_config.transaction_retry_ms;
+	rw->retry_interval_ms = g_config.transaction_retry_ms;
+
+	for (uint32_t i = 0; i < rw->n_dest_nodes; i++) {
+		rw->dest_complete[i] = false;
+	}
+
+	// Allow retransmit thread to destroy rw_request as soon as we unlock.
+	rw->is_set_up = true;
+}
+
+
+// Re-arm an already set up rw for replica writes: restores rw->from, refreshes
+// the metadata needed for the response, installs the replica-write callback,
+// restarts the retransmit schedule and clears the completion flags.
+void
+repl_write_reset_rw(rw_request* rw, as_transaction* tr, repl_write_done_cb cb)
+{
+	// Reset rw->from.any which was set null in tr setup.
+	rw->from.any = tr->from.any;
+
+	// Needed for response to origin.
+	rw->flags = tr->flags;
+	rw->generation = tr->generation;
+	rw->void_time = tr->void_time;
+	rw->last_update_time = tr->last_update_time;
+
+	rw->repl_write_cb = cb;
+
+	// TODO - is this better than not resetting? Note - xmit_ms not volatile.
+	rw->xmit_ms = cf_getms() + g_config.transaction_retry_ms;
+	rw->retry_interval_ms = g_config.transaction_retry_ms;
+
+	for (uint32_t i = 0; i < rw->n_dest_nodes; i++) {
+		rw->dest_complete[i] = false;
+	}
+}
+
+
+// Handle an incoming RW_OP_WRITE message from 'node': validate the fields,
+// reserve the partition as replica, then either drop the local record or
+// apply the remote record via as_record_replace_if_better(). Every exit path
+// passes 'm' to send_repl_write_ack(), which reuses or releases it, and every
+// path after a successful reservation releases the reservation first.
+void
+repl_write_handle_op(cf_node node, msg* m)
+{
+	uint8_t* ns_name;
+	size_t ns_name_len;
+
+	if (msg_get_buf(m, RW_FIELD_NAMESPACE, &ns_name, &ns_name_len,
+			MSG_GET_DIRECT) != 0) {
+		cf_warning(AS_RW, "repl_write_handle_op: no namespace");
+		send_repl_write_ack(node, m, AS_PROTO_RESULT_FAIL_UNKNOWN);
+		return;
+	}
+
+	as_namespace* ns = as_namespace_get_bybuf(ns_name, ns_name_len);
+
+	if (! ns) {
+		cf_warning(AS_RW, "repl_write_handle_op: invalid namespace");
+		send_repl_write_ack(node, m, AS_PROTO_RESULT_FAIL_UNKNOWN);
+		return;
+	}
+
+	cf_digest* keyd;
+	size_t keyd_sz;
+
+	// The digest is read in place as a full cf_digest below, so a buffer of
+	// any other size must be rejected before it is dereferenced.
+	if (msg_get_buf(m, RW_FIELD_DIGEST, (uint8_t**)&keyd, &keyd_sz,
+			MSG_GET_DIRECT) != 0 || keyd_sz != sizeof(cf_digest)) {
+		cf_warning(AS_RW, "repl_write_handle_op: no or bad digest");
+		send_repl_write_ack(node, m, AS_PROTO_RESULT_FAIL_UNKNOWN);
+		return;
+	}
+
+	as_partition_reservation rsv;
+	uint32_t result = as_partition_reserve_replica(ns, as_partition_getid(keyd),
+			&rsv);
+
+	if (result != AS_PROTO_RESULT_OK) {
+		send_repl_write_ack(node, m, result);
+		return;
+	}
+
+	as_remote_record rr = { .src = node, .rsv = &rsv, .keyd = keyd };
+
+	if (msg_get_buf(m, RW_FIELD_RECORD, (uint8_t**)&rr.record_buf,
+			&rr.record_buf_sz, MSG_GET_DIRECT) != 0 || rr.record_buf_sz < 2) {
+		cf_warning(AS_RW, "repl_write_handle_op: no or bad record");
+		as_partition_release(&rsv);
+		send_repl_write_ack(node, m, AS_PROTO_RESULT_FAIL_UNKNOWN);
+		return;
+	}
+
+	// Info field is optional - absent means no bits set.
+	uint32_t info = 0;
+
+	msg_get_uint32(m, RW_FIELD_INFO, &info);
+
+	if (repl_write_pickle_is_drop(rr.record_buf, info)) {
+		drop_replica(&rsv, keyd,
+				(info & RW_INFO_NSUP_DELETE) != 0,
+				(info & RW_INFO_XDR) != 0,
+				node);
+
+		as_partition_release(&rsv);
+		send_repl_write_ack(node, m, AS_PROTO_RESULT_OK);
+
+		return;
+	}
+
+	if (msg_get_uint32(m, RW_FIELD_GENERATION, &rr.generation) != 0 ||
+			rr.generation == 0) {
+		cf_warning(AS_RW, "repl_write_handle_op: no or bad generation");
+		as_partition_release(&rsv);
+		send_repl_write_ack(node, m, AS_PROTO_RESULT_FAIL_UNKNOWN);
+		return;
+	}
+
+	if (msg_get_uint64(m, RW_FIELD_LAST_UPDATE_TIME,
+			&rr.last_update_time) != 0) {
+		cf_warning(AS_RW, "repl_write_handle_op: no last-update-time");
+		as_partition_release(&rsv);
+		send_repl_write_ack(node, m, AS_PROTO_RESULT_FAIL_UNKNOWN);
+		return;
+	}
+
+	// Optional fields - rr members stay zeroed by the initializer if absent.
+	msg_get_uint32(m, RW_FIELD_VOID_TIME, &rr.void_time);
+
+	msg_get_buf(m, RW_FIELD_SET_NAME, (uint8_t **)&rr.set_name,
+			&rr.set_name_len, MSG_GET_DIRECT);
+
+	msg_get_buf(m, RW_FIELD_KEY, (uint8_t **)&rr.key, &rr.key_size,
+			MSG_GET_DIRECT);
+
+	// Do XDR write if the write is a non-XDR write or forwarding is enabled.
+	bool do_xdr_write = (info & RW_INFO_XDR) == 0 ||
+			is_xdr_forwarding_enabled() || ns->ns_forward_xdr_writes;
+
+	// If source didn't touch sindex, may not need to touch it locally.
+	bool skip_sindex = (info & RW_INFO_SINDEX_TOUCHED) == 0;
+
+	result = (uint32_t)as_record_replace_if_better(&rr, true, skip_sindex,
+			do_xdr_write);
+
+	as_partition_release(&rsv);
+	send_repl_write_ack(node, m, result);
+}
+
+
+// Handle an incoming RW_OP_WRITE_ACK from 'node': find the matching
+// rw_request by (ns-id, digest), discard stale or duplicate acks, mark this
+// destination complete, and once all destinations are complete run the
+// replica-write callback, send the confirmation and remove rw from the hash.
+// 'm' is released on every path.
+void
+repl_write_handle_ack(cf_node node, msg* m)
+{
+	uint32_t ns_id;
+
+	if (msg_get_uint32(m, RW_FIELD_NS_ID, &ns_id) != 0) {
+		cf_warning(AS_RW, "repl-write ack: no ns-id");
+		as_fabric_msg_put(m);
+		return;
+	}
+
+	cf_digest* keyd;
+	size_t keyd_sz;
+
+	// The digest is copied by value into the hash key below, so a buffer of
+	// any other size must be rejected before it is dereferenced.
+	if (msg_get_buf(m, RW_FIELD_DIGEST, (uint8_t**)&keyd, &keyd_sz,
+			MSG_GET_DIRECT) != 0 || keyd_sz != sizeof(cf_digest)) {
+		cf_warning(AS_RW, "repl-write ack: no or bad digest");
+		as_fabric_msg_put(m);
+		return;
+	}
+
+	uint32_t tid;
+
+	if (msg_get_uint32(m, RW_FIELD_TID, &tid) != 0) {
+		cf_warning(AS_RW, "repl-write ack: no tid");
+		as_fabric_msg_put(m);
+		return;
+	}
+
+	rw_request_hkey hkey = { ns_id, *keyd };
+	rw_request* rw = rw_request_hash_get(&hkey);
+
+	if (! rw) {
+		// Extra ack, after rw_request is already gone.
+		as_fabric_msg_put(m);
+		return;
+	}
+
+	pthread_mutex_lock(&rw->lock);
+
+	if (rw->tid != tid || rw->repl_write_complete) {
+		// Extra ack - rw_request is newer transaction for same digest, or ack
+		// is arriving after rw_request was aborted.
+		pthread_mutex_unlock(&rw->lock);
+		rw_request_release(rw);
+		as_fabric_msg_put(m);
+		return;
+	}
+
+	// Paranoia - remove eventually.
+	cf_assert(rw->origin != FROM_NSUP, AS_RW, "nsup delete got repl-write ack");
+
+	if (! rw->from.any) {
+		// Lost race against timeout in retransmit thread.
+		pthread_mutex_unlock(&rw->lock);
+		rw_request_release(rw);
+		as_fabric_msg_put(m);
+		return;
+	}
+
+	// Find remote node in replicas list.
+	int i = index_of_node(rw->dest_nodes, rw->n_dest_nodes, node);
+
+	if (i == -1) {
+		cf_warning(AS_RW, "repl-write ack: from non-dest node %lx", node);
+		pthread_mutex_unlock(&rw->lock);
+		rw_request_release(rw);
+		as_fabric_msg_put(m);
+		return;
+	}
+
+	if (rw->dest_complete[i]) {
+		// Extra ack for this replica write.
+		pthread_mutex_unlock(&rw->lock);
+		rw_request_release(rw);
+		as_fabric_msg_put(m);
+		return;
+	}
+
+	uint32_t result_code = parse_result_code(m);
+
+	// If it makes sense, retransmit replicas. Note - rw->dest_complete[i] not
+	// yet set true, so that retransmit will go to this remote node.
+	if (repl_write_should_retransmit_replicas(rw, result_code)) {
+		pthread_mutex_unlock(&rw->lock);
+		rw_request_release(rw);
+		as_fabric_msg_put(m);
+		return;
+	}
+
+	rw->dest_complete[i] = true;
+
+	for (uint32_t j = 0; j < rw->n_dest_nodes; j++) {
+		if (! rw->dest_complete[j]) {
+			// Still haven't heard from all replicas.
+			pthread_mutex_unlock(&rw->lock);
+			rw_request_release(rw);
+			as_fabric_msg_put(m);
+			return;
+		}
+	}
+
+	// Success for all replicas.
+	rw->repl_write_cb(rw);
+	repl_write_send_confirmation(rw);
+
+	rw->repl_write_complete = true;
+
+	pthread_mutex_unlock(&rw->lock);
+	rw_request_hash_delete(&hkey, rw);
+	rw_request_release(rw);
+	as_fabric_msg_put(m);
+}
+
+
+//==========================================================
+// Local helpers.
+//
+
+// Translate transaction properties into the RW_INFO_* bits carried in a
+// replica-write message.
+uint32_t
+pack_info_bits(as_transaction* tr)
+{
+	uint32_t bits = as_transaction_is_xdr(tr) ? RW_INFO_XDR : 0;
+
+	bool sindex_touched =
+			(tr->flags & AS_TRANSACTION_FLAG_SINDEX_TOUCHED) != 0;
+
+	if (sindex_touched) {
+		bits |= RW_INFO_SINDEX_TOUCHED;
+	}
+
+	// An nsup delete also asks the replica not to ack.
+	if (as_transaction_is_nsup_delete(tr)) {
+		bits |= RW_INFO_NSUP_DELETE;
+		bits |= RW_INFO_NO_REPL_ACK;
+	}
+
+	if (respond_on_master_complete(tr)) {
+		bits |= RW_INFO_NO_REPL_ACK;
+	}
+
+	return bits;
+}
+
+
+// Turn the received message into an RW_OP_WRITE_ACK carrying 'result' and
+// send it back to 'node' - unless the sender set RW_INFO_NO_REPL_ACK. The
+// message is released here whenever it is not successfully handed to fabric.
+void
+send_repl_write_ack(cf_node node, msg* m, uint32_t result)
+{
+	uint32_t info = 0;
+
+	msg_get_uint32(m, RW_FIELD_INFO, &info);
+
+	bool ack_wanted = (info & RW_INFO_NO_REPL_ACK) == 0;
+
+	if (ack_wanted) {
+		msg_preserve_fields(m, 3, RW_FIELD_NS_ID, RW_FIELD_DIGEST,
+				RW_FIELD_TID);
+
+		msg_set_uint32(m, RW_FIELD_OP, RW_OP_WRITE_ACK);
+		msg_set_uint32(m, RW_FIELD_RESULT, result);
+
+		if (as_fabric_send(node, m, AS_FABRIC_CHANNEL_RW) ==
+				AS_FABRIC_SUCCESS) {
+			return;
+		}
+	}
+
+	as_fabric_msg_put(m);
+}
+
+
+// Read the result code from an ack, falling back to
+// AS_PROTO_RESULT_FAIL_UNKNOWN (with a warning) if the field is missing.
+uint32_t
+parse_result_code(msg* m)
+{
+	uint32_t code;
+
+	if (msg_get_uint32(m, RW_FIELD_RESULT, &code) == 0) {
+		return code;
+	}
+
+	cf_warning(AS_RW, "repl-write ack: no result_code");
+
+	return AS_PROTO_RESULT_FAIL_UNKNOWN;
+}
+
+
+// Delete the local copy of a record, adjusting the secondary index first for
+// data-in-memory namespaces, and ship the drop to XDR if required.
+void
+drop_replica(as_partition_reservation* rsv, cf_digest* keyd,
+		bool is_nsup_delete, bool is_xdr_op, cf_node master)
+{
+	as_namespace* ns = rsv->ns;
+	as_index_tree* tree = rsv->tree;
+
+	as_index_ref ref;
+
+	ref.skip_lock = false;
+
+	if (as_record_get(tree, keyd, &ref) != 0) {
+		// Record not found - that's fine from the master's perspective.
+		return;
+	}
+
+	if (ns->storage_data_in_memory) {
+		record_delete_adjust_sindex(ref.r, ns);
+	}
+
+	// The set-ID must be captured before the record is deleted - XDR needs it.
+	uint16_t set_id = as_index_get_set_id(ref.r);
+
+	as_index_delete(tree, keyd);
+	as_record_done(&ref, ns);
+
+	if (! xdr_must_ship_delete(ns, is_nsup_delete, is_xdr_op)) {
+		return;
+	}
+
+	xdr_write(ns, keyd, 0, master, XDR_OP_TYPE_DROP, set_id, NULL);
+}
diff --git a/as/src/transaction/rw_request.c b/as/src/transaction/rw_request.c
new file mode 100644
index 00000000..4493db09
--- /dev/null
+++ b/as/src/transaction/rw_request.c
@@ -0,0 +1,223 @@
+/*
+ * rw_request.c
+ *
+ * Copyright (C) 2016 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+//==========================================================
+// Includes.
+//
+
+#include "transaction/rw_request.h"
+
+#include
+#include
+#include
+#include
+
+#include "citrusleaf/alloc.h"
+#include "citrusleaf/cf_atomic.h"
+#include "citrusleaf/cf_digest.h"
+
+#include "dynbuf.h"
+#include "fault.h"
+
+#include "base/datamodel.h"
+#include "base/proto.h"
+#include "base/rec_props.h"
+#include "base/thr_tsvc.h"
+#include "base/transaction.h"
+#include "fabric/fabric.h"
+#include "fabric/partition.h"
+
+
+//==========================================================
+// Globals.
+//
+
+// Source of transaction ids - each new rw_request takes the next value.
+static cf_atomic32 g_rw_tid = 0;
+
+
+//==========================================================
+// Public API.
+//
+
+// Allocate a ref-counted rw_request for the given digest with every member
+// set to its idle value and a fresh tid. The request is not "set up" yet.
+rw_request*
+rw_request_create(cf_digest* keyd)
+{
+	rw_request* rw = cf_rc_alloc(sizeof(rw_request));
+
+	// Same allocation check as rw_request_wait_q_push_head().
+	cf_assert(rw, AS_RW, "alloc rw_request");
+
+	// as_transaction look-alike:
+	rw->msgp = NULL;
+	rw->msg_fields = 0;
+	rw->origin = 0;
+	rw->from_flags = 0;
+	rw->from.any = NULL;
+	rw->from_data.any = 0;
+	rw->keyd = *keyd;
+	rw->start_time = 0;
+	rw->benchmark_time = 0;
+
+	AS_PARTITION_RESERVATION_INIT(rw->rsv);
+
+	rw->end_time = 0;
+	rw->result_code = AS_PROTO_RESULT_OK;
+	rw->flags = 0;
+	rw->generation = 0;
+	rw->void_time = 0;
+	rw->last_update_time = 0;
+	// End of as_transaction look-alike.
+
+	pthread_mutex_init(&rw->lock, NULL);
+
+	rw->wait_queue_head = NULL;
+	rw->wait_queue_tail = NULL;
+	rw->wait_queue_depth = 0;
+
+	rw->is_set_up = false;
+
+	rw->pickled_buf = NULL;
+	rw->pickled_sz = 0;
+	as_rec_props_clear(&rw->pickled_rec_props);
+
+	rw->response_db.buf = NULL;
+	rw->response_db.is_stack = false;
+	rw->response_db.alloc_sz = 0;
+	rw->response_db.used_sz = 0;
+
+	rw->tid = cf_atomic32_incr(&g_rw_tid);
+	rw->dup_res_complete = false;
+	rw->repl_write_complete = false;
+	rw->repl_ping_complete = false;
+	rw->dup_res_cb = NULL;
+	rw->repl_write_cb = NULL;
+	rw->repl_ping_cb = NULL;
+	rw->timeout_cb = NULL;
+
+	rw->dest_msg = NULL;
+	rw->xmit_ms = 0;
+	rw->retry_interval_ms = 0;
+
+	rw->n_dest_nodes = 0;
+
+	rw->best_dup_msg = NULL;
+	rw->best_dup_result_code = AS_PROTO_RESULT_OK;
+	rw->best_dup_gen = 0;
+	rw->best_dup_lut = 0;
+
+	rw->tie_was_replicated = false;
+
+	return rw;
+}
+
+
+// Release everything the rw_request still owns, then re-enqueue every
+// transaction parked on its wait queue, flagged as a restart. Does not free
+// 'rw' itself.
+void
+rw_request_destroy(rw_request* rw)
+{
+	// Paranoia:
+	if (rw->from.any) {
+		cf_crash(AS_RW, "rw_request_destroy: origin %d has non-null 'from'",
+				rw->origin);
+	}
+
+	// For batch origin the message is not freed here.
+	if (rw->msgp && rw->origin != FROM_BATCH) {
+		cf_free(rw->msgp);
+	}
+
+	if (rw->pickled_buf) {
+		cf_free(rw->pickled_buf);
+	}
+
+	if (rw->pickled_rec_props.p_data) {
+		cf_free(rw->pickled_rec_props.p_data);
+	}
+
+	cf_dyn_buf_free(&rw->response_db);
+
+	if (rw->dest_msg) {
+		as_fabric_msg_put(rw->dest_msg);
+	}
+
+	// The reservation is only released for a set up request.
+	if (rw->is_set_up) {
+		if (rw->best_dup_msg) {
+			as_fabric_msg_put(rw->best_dup_msg);
+		}
+
+		as_partition_release(&rw->rsv);
+	}
+
+	pthread_mutex_destroy(&rw->lock);
+
+	rw_wait_ele* e = rw->wait_queue_head;
+
+	while (e) {
+		rw_wait_ele* next = e->next;
+
+		e->tr.from_flags |= FROM_FLAG_RESTART;
+		as_tsvc_enqueue(&e->tr);
+
+		cf_free(e);
+		e = next;
+	}
+}
+
+
+// Park a transaction at the tail of rw's wait queue. The queue element takes
+// over the transaction head, so tr->from and tr->msgp are nulled.
+void
+rw_request_wait_q_push(rw_request* rw, as_transaction* tr)
+{
+	rw_wait_ele* e = cf_malloc(sizeof(rw_wait_ele));
+
+	// Same allocation check as rw_request_wait_q_push_head().
+	cf_assert(e, AS_RW, "alloc rw_wait_ele");
+
+	as_transaction_copy_head(&e->tr, tr);
+	tr->from.any = NULL;
+	tr->msgp = NULL;
+
+	e->next = NULL;
+
+	if (rw->wait_queue_tail) {
+		rw->wait_queue_tail->next = e;
+		rw->wait_queue_tail = e;
+	}
+	else {
+		rw->wait_queue_head = e;
+		rw->wait_queue_tail = e;
+	}
+
+	rw->wait_queue_depth++;
+}
+
+
+// Park a transaction at the head of rw's wait queue. The queue element takes
+// over the transaction head, so tr->from and tr->msgp are nulled.
+void
+rw_request_wait_q_push_head(rw_request* rw, as_transaction* tr)
+{
+	rw_wait_ele* e = cf_malloc(sizeof(rw_wait_ele));
+	cf_assert(e, AS_RW, "alloc rw_wait_ele");
+
+	as_transaction_copy_head(&e->tr, tr);
+	tr->from.any = NULL;
+	tr->msgp = NULL;
+
+	e->next = rw->wait_queue_head;
+	rw->wait_queue_head = e;
+
+	if (! rw->wait_queue_tail) {
+		rw->wait_queue_tail = e;
+	}
+
+	rw->wait_queue_depth++;
+}
diff --git a/as/src/transaction/rw_request_hash.c b/as/src/transaction/rw_request_hash.c
new file mode 100644
index 00000000..4b97d85a
--- /dev/null
+++ b/as/src/transaction/rw_request_hash.c
@@ -0,0 +1,448 @@
+/*
+ * rw_request_hash.c
+ *
+ * Copyright (C) 2016 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +//========================================================== +// Includes. +// + +#include "transaction/rw_request_hash.h" + +#include +#include +#include +#include +#include +#include + +#include "citrusleaf/alloc.h" +#include "citrusleaf/cf_atomic.h" +#include "citrusleaf/cf_clock.h" +#include "citrusleaf/cf_rchash.h" + +#include "fault.h" +#include "msg.h" +#include "node.h" + +#include "base/cfg.h" +#include "base/datamodel.h" +#include "base/proto.h" +#include "base/transaction.h" +#include "base/transaction_policy.h" +#include "fabric/fabric.h" +#include "transaction/duplicate_resolve.h" +#include "transaction/replica_ping.h" +#include "transaction/replica_write.h" +#include "transaction/rw_request.h" +#include "transaction/rw_utils.h" + + +//========================================================== +// Typedefs & constants. 
+//
+
+// Field-id to field-type template for M_TYPE_RW fabric messages. It is handed
+// to as_fabric_register_msg_fn() in as_rw_init().
+const msg_template rw_mt[] = {
+	{ RW_FIELD_OP, M_FT_UINT32 },
+	{ RW_FIELD_RESULT, M_FT_UINT32 },
+	{ RW_FIELD_NAMESPACE, M_FT_BUF },
+	{ RW_FIELD_NS_ID, M_FT_UINT32 },
+	{ RW_FIELD_GENERATION, M_FT_UINT32 },
+	{ RW_FIELD_DIGEST, M_FT_BUF },
+	{ RW_FIELD_UNUSED_6, M_FT_BUF },
+	{ RW_FIELD_UNUSED_7, M_FT_BUF },
+	{ RW_FIELD_CLUSTER_KEY, M_FT_UINT64 },
+	{ RW_FIELD_RECORD, M_FT_BUF },
+	{ RW_FIELD_TID, M_FT_UINT32 },
+	{ RW_FIELD_VOID_TIME, M_FT_UINT32 },
+	{ RW_FIELD_INFO, M_FT_UINT32 },
+	{ RW_FIELD_UNUSED_13, M_FT_BUF },
+	{ RW_FIELD_UNUSED_14, M_FT_BUF },
+	{ RW_FIELD_UNUSED_15, M_FT_UINT64 },
+	{ RW_FIELD_LAST_UPDATE_TIME, M_FT_UINT64 },
+	{ RW_FIELD_SET_NAME, M_FT_BUF },
+	{ RW_FIELD_KEY, M_FT_BUF },
+	{ RW_FIELD_REGIME, M_FT_UINT32 }
+};
+
+// The template must have exactly one entry per RW field.
+COMPILER_ASSERT(sizeof(rw_mt) / sizeof(msg_template) == NUM_RW_FIELDS);
+
+// Scratch size passed to as_fabric_register_msg_fn() for M_TYPE_RW messages.
+#define RW_MSG_SCRATCH_SIZE 192
+
+
+//==========================================================
+// Globals.
+//
+
+// Hash of in-progress rw_requests, created in as_rw_init().
+static cf_rchash* g_rw_request_hash = NULL;
+
+
+//==========================================================
+// Forward declarations.
+//
+
+uint32_t rw_request_hash_fn(const void* value, uint32_t value_len);
+transaction_status handle_hot_key(rw_request* rw0, as_transaction* tr);
+
+void* run_retransmit(void* arg);
+int retransmit_reduce_fn(const void* key, uint32_t keylen, void* data, void* udata);
+void update_retransmit_stats(const rw_request* rw);
+
+int rw_msg_cb(cf_node id, msg* m, void* udata);
+
+
+//==========================================================
+// Public API.
+//
+
+// One-time initialization of the RW subsystem: creates the rw_request hash,
+// starts the detached retransmit thread, and registers the M_TYPE_RW fabric
+// message template and callback.
+void
+as_rw_init()
+{
+	cf_rchash_create(&g_rw_request_hash, rw_request_hash_fn,
+			rw_request_hdestroy, sizeof(rw_request_hkey), 32 * 1024,
+			CF_RCHASH_MANY_LOCK);
+
+	pthread_t thread;
+	pthread_attr_t attrs;
+
+	pthread_attr_init(&attrs);
+	pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED);
+
+	if (pthread_create(&thread, &attrs, run_retransmit, NULL) != 0) {
+		cf_crash(AS_RW, "failed to create retransmit thread");
+	}
+
+	// The attributes object is only needed for the create call - destroying
+	// it afterwards does not affect the thread already created with it.
+	pthread_attr_destroy(&attrs);
+
+	as_fabric_register_msg_fn(M_TYPE_RW, rw_mt, sizeof(rw_mt),
+			RW_MSG_SCRATCH_SIZE, rw_msg_cb, NULL);
+}
+
+
+// Number of rw_requests currently in the hash.
+uint32_t
+rw_request_hash_count()
+{
+	return cf_rchash_get_size(g_rw_request_hash);
+}
+
+
+// Try to insert 'rw' under 'hkey'. If another rw_request already occupies the
+// key, the transaction is handed to handle_hot_key() under that request's
+// lock and its status is returned - 'rw' is then NOT in the hash. Returns
+// TRANS_IN_PROGRESS when 'rw' was inserted.
+transaction_status
+rw_request_hash_insert(rw_request_hkey* hkey, rw_request* rw,
+		as_transaction* tr)
+{
+	int insert_rv;
+
+	while ((insert_rv = cf_rchash_put_unique(g_rw_request_hash, hkey,
+			sizeof(*hkey), rw)) != CF_RCHASH_OK) {
+		cf_assert(insert_rv == CF_RCHASH_ERR_FOUND, AS_RW, "put-unique error");
+		// rw_request with this digest already in hash - get it.
+
+		rw_request* rw0;
+		int get_rv = cf_rchash_get(g_rw_request_hash, hkey, sizeof(*hkey),
+				(void**)&rw0);
+
+		if (get_rv == CF_RCHASH_ERR_NOT_FOUND) {
+			// Try insertion again immediately.
+			continue;
+		}
+		// else - got it - handle "hot key" scenario.
+		cf_assert(get_rv == CF_RCHASH_OK, AS_RW, "cf_rchash_get error");
+
+		pthread_mutex_lock(&rw0->lock);
+
+		transaction_status status = handle_hot_key(rw0, tr);
+
+		pthread_mutex_unlock(&rw0->lock);
+		rw_request_release(rw0);
+
+		return status; // rw_request was not inserted in the hash
+	}
+
+	return TRANS_IN_PROGRESS; // rw_request was inserted in the hash
+}
+
+
+// Remove 'rw' from the hash via cf_rchash_delete_object().
+void
+rw_request_hash_delete(rw_request_hkey* hkey, rw_request* rw)
+{
+	cf_rchash_delete_object(g_rw_request_hash, hkey, sizeof(*hkey), rw);
+}
+
+
+// Look up the rw_request stored under 'hkey'; returns NULL if the lookup
+// does not fill one in. Callers in this patch hand a non-NULL result to
+// rw_request_release() when done with it.
+rw_request*
+rw_request_hash_get(rw_request_hkey* hkey)
+{
+	rw_request* rw = NULL;
+
+	cf_rchash_get(g_rw_request_hash, hkey, sizeof(*hkey), (void**)&rw);
+
+	return rw;
+}
+
+
+// For debugging only.
+void
+rw_request_hash_dump()
+{
+	cf_info(AS_RW, "rw_request_hash dump not yet implemented");
+	// TODO - implement something, or deprecate.
+}
+
+
+//==========================================================
+// Local helpers - hash insertion.
+//
+
+// Hash function for the rw_request hash: the four digest bytes starting at
+// DIGEST_SCRAMBLE_BYTE1, taken in native byte order. The bytes are copied one
+// at a time rather than loaded through a uint32_t pointer, which avoids an
+// aliasing violation and any alignment assumption about the digest array
+// while yielding the same value.
+uint32_t
+rw_request_hash_fn(const void* key, uint32_t key_size)
+{
+	const rw_request_hkey* hkey = (const rw_request_hkey*)key;
+	const uint8_t* src = &hkey->keyd.digest[DIGEST_SCRAMBLE_BYTE1];
+	uint32_t hash;
+	uint8_t* dst = (uint8_t*)&hash;
+
+	for (uint32_t i = 0; i < (uint32_t)sizeof(hash); i++) {
+		dst[i] = src[i];
+	}
+
+	return hash;
+}
+
+
+// Decide what to do with a transaction whose digest already has an
+// rw_request (rw0) in the hash: drop a retransmitted proxy request, queue a
+// re-replication at the head, fail with key-busy when the pending limit is
+// exceeded, or otherwise queue at the tail. rw_request_hash_insert() calls
+// this with rw0->lock held.
+transaction_status
+handle_hot_key(rw_request* rw0, as_transaction* tr)
+{
+	if (rw0->is_set_up &&
+			rw0->origin == FROM_PROXY && tr->origin == FROM_PROXY &&
+			rw0->from.proxy_node == tr->from.proxy_node &&
+			rw0->from_data.proxy_tid == tr->from_data.proxy_tid) {
+		// If the new transaction is a retransmitted proxy request, don't
+		// queue it or reply to origin, just drop it and feign success. (Older
+		// servers will retransmit proxy requests - must handle them.)
+
+		return TRANS_DONE_SUCCESS;
+	}
+	else if (tr->origin == FROM_RE_REPL) {
+		// Always put this transaction at the head of the original rw_request's
+		// queue - it will be retried (first) when the original is complete.
+		rw_request_wait_q_push_head(rw0, tr);
+
+		return TRANS_WAITING;
+	}
+	else if (g_config.transaction_pending_limit != 0 &&
+			rw0->wait_queue_depth > g_config.transaction_pending_limit) {
+		// If we're over the hot key pending limit, fail this transaction.
+		cf_detail_digest(AS_RW, &tr->keyd, "{%s} key busy ", tr->rsv.ns->name);
+
+		cf_atomic64_incr(&tr->rsv.ns->n_fail_key_busy);
+		tr->result_code = AS_PROTO_RESULT_FAIL_KEY_BUSY;
+
+		return TRANS_DONE_ERROR;
+	}
+	else {
+		// Queue this transaction on the original rw_request - it will be
+		// retried when the original is complete.
+		rw_request_wait_q_push(rw0, tr);
+
+		return TRANS_WAITING;
+	}
+}
+
+
+//==========================================================
+// Local helpers - retransmit.
+//
+
+// Body of the retransmit thread: every 130 ms, take one timestamp and run
+// retransmit_reduce_fn() over every rw_request in the hash. Never returns.
+void*
+run_retransmit(void* arg)
+{
+	while (true) {
+		usleep(130 * 1000);
+
+		now_times now;
+
+		// Both clocks come from the same reading so they agree for one pass.
+		now.now_ns = cf_getns();
+		now.now_ms = now.now_ns / 1000000;
+
+		cf_rchash_reduce(g_rw_request_hash, retransmit_reduce_fn, &now);
+	}
+
+	return NULL;
+}
+
+
+// Reduce callback for one rw_request ('data'); 'udata' is the now_times of
+// this pass. A request that is not set up is skipped. A request past its
+// end_time gets its timeout callback (under the request lock) and is removed
+// from the hash. A request past its retransmit time gets its messages resent
+// and its retry interval doubled.
+int
+retransmit_reduce_fn(const void* key, uint32_t keylen, void* data, void* udata)
+{
+	rw_request* rw = data;
+	now_times* now = (now_times*)udata;
+
+	// Checked without the request lock.
+	if (! rw->is_set_up) {
+		return 0;
+	}
+
+	if (now->now_ns > rw->end_time) {
+		pthread_mutex_lock(&rw->lock);
+
+		rw->timeout_cb(rw);
+
+		pthread_mutex_unlock(&rw->lock);
+
+		return CF_RCHASH_REDUCE_DELETE;
+	}
+
+	if (rw->xmit_ms < now->now_ms) {
+		pthread_mutex_lock(&rw->lock);
+
+		// rw->from is re-checked under the lock before sending.
+		if (rw->from.any) {
+			rw->xmit_ms = now->now_ms + rw->retry_interval_ms;
+			rw->retry_interval_ms *= 2;
+
+			send_rw_messages(rw);
+			update_retransmit_stats(rw);
+		}
+		// else - lost race against dup-res or repl-write callback.
+
+		pthread_mutex_unlock(&rw->lock);
+	}
+
+	return 0;
+}
+
+
+// Bump the per-namespace retransmit counter that matches this request's
+// origin, operation type and phase. A request with no replica-write callback
+// installed is counted as duplicate resolution.
+void
+update_retransmit_stats(const rw_request* rw)
+{
+	as_namespace* ns = rw->rsv.ns;
+	as_msg* m = &rw->msgp->msg;
+	bool is_dup_res = rw->repl_write_cb == NULL;
+
+	// Note - only one retransmit thread, so no need for atomic increments.
+
+	switch (rw->origin) {
+	case FROM_CLIENT: {
+			bool is_write = (m->info2 & AS_MSG_INFO2_WRITE) != 0;
+			bool is_delete = (m->info2 & AS_MSG_INFO2_DELETE) != 0;
+			bool is_udf = (rw->msg_fields & AS_MSG_FIELD_BIT_UDF_FILENAME) != 0;
+
+			if (is_dup_res) {
+				if (is_write) {
+					if (is_delete) {
+						ns->n_retransmit_client_delete_dup_res++;
+					}
+					else if (is_udf) {
+						ns->n_retransmit_client_udf_dup_res++;
+					}
+					else {
+						ns->n_retransmit_client_write_dup_res++;
+					}
+				}
+				else {
+					ns->n_retransmit_client_read_dup_res++;
+				}
+			}
+			else {
+				cf_assert(is_write, AS_RW, "read doing replica write");
+
+				if (is_delete) {
+					ns->n_retransmit_client_delete_repl_write++;
+				}
+				else if (is_udf) {
+					ns->n_retransmit_client_udf_repl_write++;
+				}
+				else {
+					ns->n_retransmit_client_write_repl_write++;
+				}
+			}
+		}
+		break;
+	case FROM_PROXY:
+		// For now we don't report proxyee stats.
+		break;
+	case FROM_BATCH:
+		// For now batch sub transactions are read-only.
+		ns->n_retransmit_batch_sub_dup_res++;
+		break;
+	case FROM_IUDF:
+		if (is_dup_res) {
+			ns->n_retransmit_udf_sub_dup_res++;
+		}
+		else {
+			ns->n_retransmit_udf_sub_repl_write++;
+		}
+		break;
+	case FROM_RE_REPL:
+		// For now we don't report re-replication retransmit stats.
+		break;
+	default:
+		cf_crash(AS_RW, "unexpected transaction origin %u", rw->origin);
+		break;
+	}
+}
+
+
+//==========================================================
+// Local helpers - handle RW fabric messages.
+//
+
+// Fabric callback for M_TYPE_RW messages: route the message to the handler
+// for its op. A message with no op field, or with an op not listed here, is
+// logged and released. Always returns 0.
+int
+rw_msg_cb(cf_node id, msg* m, void* udata)
+{
+	uint32_t op;
+	bool has_op = msg_get_uint32(m, RW_FIELD_OP, &op) == 0;
+
+	if (has_op) {
+		switch (op) {
+		// Duplicate resolution.
+		case RW_OP_DUP:
+			dup_res_handle_request(id, m);
+			return 0;
+		case RW_OP_DUP_ACK:
+			dup_res_handle_ack(id, m);
+			return 0;
+
+		// Replica writes.
+		case RW_OP_WRITE:
+			repl_write_handle_op(id, m);
+			return 0;
+		case RW_OP_WRITE_ACK:
+			repl_write_handle_ack(id, m);
+			return 0;
+		case RW_OP_REPL_CONFIRM:
+			repl_write_handle_confirmation(m);
+			return 0;
+
+		// Replica pings.
+		case RW_OP_REPL_PING:
+			repl_ping_handle_op(id, m);
+			return 0;
+		case RW_OP_REPL_PING_ACK:
+			repl_ping_handle_ack(id, m);
+			return 0;
+
+		default:
+			break;
+		}
+
+		cf_warning(AS_RW, "got rw msg with unrecognized op %u", op);
+	}
+	else {
+		cf_warning(AS_RW, "got rw msg without op field");
+	}
+
+	// No handler took the message - release it here.
+	as_fabric_msg_put(m);
+
+	return 0;
+}
diff --git a/as/src/transaction/rw_utils.c b/as/src/transaction/rw_utils.c
new file mode 100644
index 00000000..72bb0c21
--- /dev/null
+++ b/as/src/transaction/rw_utils.c
@@ -0,0 +1,470 @@
+/*
+ * rw_utils.c
+ *
+ * Copyright (C) 2016 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +//========================================================== +// Includes. +// + +#include "transaction/rw_utils.h" + +#include +#include +#include +#include + +#include "citrusleaf/cf_atomic.h" // xdr_allows_write +#include "citrusleaf/cf_clock.h" +#include "citrusleaf/cf_digest.h" + +#include "fault.h" +#include "msg.h" + +#include "base/cfg.h" // xdr_allows_write +#include "base/datamodel.h" +#include "base/proto.h" // xdr_allows_write +#include "base/secondary_index.h" +#include "base/transaction.h" +#include "base/xdr_serverside.h" +#include "fabric/fabric.h" +#include "storage/storage.h" +#include "transaction/rw_request.h" + + +//========================================================== +// Public API. +// + +// TODO - really? we can't hide this behind an XDR stub? +bool +xdr_allows_write(as_transaction* tr) +{ + if (as_transaction_is_xdr(tr)) { + if (tr->rsv.ns->ns_allow_xdr_writes) { + return true; + } + } + else { + if (tr->rsv.ns->ns_allow_nonxdr_writes || tr->origin == FROM_NSUP) { + return true; + } + } + + cf_atomic_int_incr(&tr->rsv.ns->n_fail_xdr_forbidden); + + return false; +} + + +void +send_rw_messages(rw_request* rw) +{ + for (uint32_t i = 0; i < rw->n_dest_nodes; i++) { + if (rw->dest_complete[i]) { + continue; + } + + msg_incr_ref(rw->dest_msg); + + if (as_fabric_send(rw->dest_nodes[i], rw->dest_msg, + AS_FABRIC_CHANNEL_RW) != AS_FABRIC_SUCCESS) { + as_fabric_msg_put(rw->dest_msg); + rw->xmit_ms = 0; // force a retransmit on next cycle + } + } +} + + +void +send_rw_messages_forget(rw_request* rw) +{ + for (uint32_t i = 0; i < rw->n_dest_nodes; i++) { + msg_incr_ref(rw->dest_msg); + + if (as_fabric_send(rw->dest_nodes[i], rw->dest_msg, + AS_FABRIC_CHANNEL_RW) != AS_FABRIC_SUCCESS) { + as_fabric_msg_put(rw->dest_msg); + } + } +} + + +int +set_set_from_msg(as_record* r, as_namespace* ns, 
as_msg* m) +{ + as_msg_field* f = as_msg_field_get(m, AS_MSG_FIELD_TYPE_SET); + size_t name_len = (size_t)as_msg_field_get_value_sz(f); + + if (name_len == 0) { + return 0; + } + + // Given the name, find/assign the set-ID and write it in the as_index. + return as_index_set_set_w_len(r, ns, (const char*)f->data, name_len, true); +} + + +// Caller must have checked that key is present in message. +bool +check_msg_key(as_msg* m, as_storage_rd* rd) +{ + as_msg_field* f = as_msg_field_get(m, AS_MSG_FIELD_TYPE_KEY); + uint32_t key_size = as_msg_field_get_value_sz(f); + uint8_t* key = f->data; + + if (key_size != rd->key_size || memcmp(key, rd->key, key_size) != 0) { + cf_warning(AS_RW, "key mismatch - end of universe?"); + return false; + } + + return true; +} + + +bool +get_msg_key(as_transaction* tr, as_storage_rd* rd) +{ + if (! as_transaction_has_key(tr)) { + return true; + } + + as_msg_field* f = as_msg_field_get(&tr->msgp->msg, AS_MSG_FIELD_TYPE_KEY); + + if (rd->ns->single_bin && rd->ns->storage_data_in_memory) { + cf_warning(AS_RW, "{%s} can't store key if data-in-memory & single-bin", + tr->rsv.ns->name); + return false; + } + + rd->key_size = as_msg_field_get_value_sz(f); + rd->key = f->data; + + return true; +} + + +int +handle_msg_key(as_transaction* tr, as_storage_rd* rd) +{ + // Shortcut pointers. + as_msg* m = &tr->msgp->msg; + as_namespace* ns = tr->rsv.ns; + + if (rd->r->key_stored == 1) { + // Key stored for this record - be sure it gets rewritten. + + // This will force a device read for non-data-in-memory, even if + // must_fetch_data is false! Since there's no advantage to using the + // loaded block after this if must_fetch_data is false, leave the + // subsequent code as-is. + if (! as_storage_record_get_key(rd)) { + cf_warning_digest(AS_RW, &tr->keyd, "{%s} can't get stored key ", + ns->name); + return AS_PROTO_RESULT_FAIL_UNKNOWN; + } + + // Check the client-sent key, if any, against the stored key. + if (as_transaction_has_key(tr) && ! 
check_msg_key(m, rd)) { + cf_warning_digest(AS_RW, &tr->keyd, "{%s} key mismatch ", ns->name); + return AS_PROTO_RESULT_FAIL_KEY_MISMATCH; + } + } + // If we got a key without a digest, it's an old client, not a cue to store + // the key. (Remove this check when we're sure all old C clients are gone.) + else if (as_transaction_has_digest(tr)) { + // Key not stored for this record - store one if sent from client. For + // data-in-memory, don't allocate the key until we reach the point of no + // return. Also don't set AS_INDEX_FLAG_KEY_STORED flag until then. + if (! get_msg_key(tr, rd)) { + return AS_PROTO_RESULT_FAIL_UNSUPPORTED_FEATURE; + } + } + + return 0; +} + + +void +update_metadata_in_index(as_transaction* tr, bool increment_generation, + as_record* r) +{ + // Shortcut pointers. + as_msg* m = &tr->msgp->msg; + as_namespace* ns = tr->rsv.ns; + + uint64_t now = cf_clepoch_milliseconds(); + + switch (m->record_ttl) { + case TTL_NAMESPACE_DEFAULT: + if (ns->default_ttl != 0) { + // Set record void-time using default TTL value. + r->void_time = (now / 1000) + ns->default_ttl; + } + else { + // Default TTL is "never expire". + r->void_time = 0; + } + break; + case TTL_NEVER_EXPIRE: + // Set record to "never expire". + r->void_time = 0; + break; + case TTL_DONT_UPDATE: + // Do not change record's void time. + break; + default: + // Apply non-special m->record_ttl directly. Have already checked + // m->record_ttl <= 10 years, so no overflow etc. + r->void_time = (now / 1000) + m->record_ttl; + break; + } + + as_record_set_lut(r, tr->rsv.regime, now, ns); + + if (increment_generation) { + as_record_increment_generation(r, ns); + } +} + + +void +pickle_all(as_storage_rd* rd, rw_request* rw) +{ + if (rw->n_dest_nodes == 0) { + return; + } + + rw->pickled_buf = as_record_pickle(rd, &rw->pickled_sz); + + // TODO - we could avoid this copy (and maybe even not do this here at all) + // if all callers malloc'd rd->rec_props.p_data upstream for hand-off... 
// Bring the secondary indexes in line with a record write by diffing the
// record's old bins against its new bins.
//
// ns, set_name - namespace and set the record belongs to.
// keyd - record digest, passed through to as_sindex_update_by_sbin().
// old_bins, n_old_bins - bins as they were before the write.
// new_bins, n_new_bins - bins as they are after the write.
//
// Returns true if at least one sindex bin entry was populated (and therefore
// applied via as_sindex_update_by_sbin()), false if nothing needed updating.
bool
write_sindex_update(as_namespace* ns, const char* set_name, cf_digest* keyd,
		as_bin* old_bins, uint32_t n_old_bins, as_bin* new_bins,
		uint32_t n_new_bins)
{
	int n_populated = 0;
	// One flag per new bin - set when the new bin matched some old bin, i.e.
	// it is a modification rather than a newly created bin.
	bool not_just_created[n_new_bins];

	memset(not_just_created, 0, sizeof(not_just_created));

	// Maximum number of sindexes which can be changed in one transaction is
	// 2 * ns->sindex_cnt.

	SINDEX_GRLOCK();
	SINDEX_BINS_SETUP(sbins, 2 * ns->sindex_cnt);
	as_sindex* si_arr[2 * ns->sindex_cnt];
	int si_arr_index = 0;

	// Reserve matching SIs.
	// Each lookup appends to si_arr and returns how many entries it added;
	// everything collected here is released at the end of the function.

	for (int i = 0; i < n_old_bins; i++) {
		si_arr_index += as_sindex_arr_lookup_by_set_binid_lockfree(ns, set_name,
				old_bins[i].id, &si_arr[si_arr_index]);
	}

	for (int i = 0; i < n_new_bins; i++) {
		si_arr_index += as_sindex_arr_lookup_by_set_binid_lockfree(ns, set_name,
				new_bins[i].id, &si_arr[si_arr_index]);
	}

	// For every old bin, find the corresponding new bin (if any) and adjust the
	// secondary index if the bin was modified. If no corresponding new bin is
	// found, it means the old bin was deleted - also adjust the secondary index
	// accordingly.

	for (int32_t i_old = 0; i_old < (int32_t)n_old_bins; i_old++) {
		as_bin* b_old = &old_bins[i_old];
		bool found = false;

		// Loop over new bins. Start at old bin index (if possible) and go down,
		// wrapping around to do the higher indexes last. This will find a match
		// (if any) very quickly - instantly, unless there were bins deleted.

		bool any_new = n_new_bins != 0;
		int32_t n_new_minus_1 = (int32_t)n_new_bins - 1;
		// Starting index: i_old, clamped to the last new bin.
		int32_t i_new = n_new_minus_1 < i_old ? n_new_minus_1 : i_old;

		while (any_new) {
			as_bin* b_new = &new_bins[i_new];

			if (b_old->id == b_new->id) {
				// Same bin id - treated as modified if either the particle
				// type or the particle pointer differs.
				if (as_bin_get_particle_type(b_old) !=
						as_bin_get_particle_type(b_new) ||
						b_old->particle != b_new->particle) {
					n_populated += as_sindex_sbins_populate(
							&sbins[n_populated], ns, set_name, b_old, b_new);
				}

				found = true;
				not_just_created[i_new] = true;
				break;
			}

			// Step down. On passing index 0, wrap to the last new bin - but if
			// that is not above i_old, every new bin has already been visited.
			if (--i_new < 0 && (i_new = n_new_minus_1) <= i_old) {
				break;
			}

			// After wrapping, arriving back at i_old means the full circle is
			// done.
			if (i_new == i_old) {
				break;
			}
		}

		if (! found) {
			// Old bin has no counterpart among the new bins - queue a delete.
			n_populated += as_sindex_sbins_from_bin(ns, set_name, b_old,
					&sbins[n_populated], AS_SINDEX_OP_DELETE);
		}
	}

	// Now find the new bins that are just-created bins. We've marked the others
	// in the loop above, so any left are just-created.

	for (uint32_t i_new = 0; i_new < n_new_bins; i_new++) {
		if (not_just_created[i_new]) {
			continue;
		}

		n_populated += as_sindex_sbins_from_bin(ns, set_name, &new_bins[i_new],
				&sbins[n_populated], AS_SINDEX_OP_INSERT);
	}

	SINDEX_GRUNLOCK();

	// The populated sbins are applied and freed after SINDEX_GRUNLOCK().
	if (n_populated != 0) {
		as_sindex_update_by_sbin(ns, set_name, sbins, n_populated, keyd);
		as_sindex_sbin_freeall(sbins, n_populated);
	}

	as_sindex_release_arr(si_arr, si_arr_index);

	return n_populated != 0;
}
If +// data-not-in-memory, existing record is not read, and secondary index entry is +// cleaned up by background sindex defrag thread. +// TODO - rename as as_record_... and move to record.c? +void +delete_adjust_sindex(as_storage_rd* rd) +{ + as_namespace* ns = rd->ns; + + if (! record_has_sindex(rd->r, ns)) { + return; + } + + as_storage_rd_load_n_bins(rd); + as_storage_rd_load_bins(rd, NULL); + + remove_from_sindex(ns, as_index_get_set_name(rd->r, ns), &rd->r->keyd, + rd->bins, rd->n_bins); +} + + +// TODO - rename as as_record_..., move to record.c, take r instead of set_name, +// and lose keyd parameter? +void +remove_from_sindex(as_namespace* ns, const char* set_name, cf_digest* keyd, + as_bin* bins, uint32_t n_bins) +{ + SINDEX_GRLOCK(); + + SINDEX_BINS_SETUP(sbins, ns->sindex_cnt); + + as_sindex* si_arr[ns->sindex_cnt]; + int si_arr_index = 0; + int sbins_populated = 0; + + // Reserve matching sindexes. + for (int i = 0; i < (int)n_bins; i++) { + si_arr_index += as_sindex_arr_lookup_by_set_binid_lockfree(ns, set_name, + bins[i].id, &si_arr[si_arr_index]); + } + + for (int i = 0; i < (int)n_bins; i++) { + sbins_populated += as_sindex_sbins_from_bin(ns, set_name, &bins[i], + &sbins[sbins_populated], AS_SINDEX_OP_DELETE); + } + + SINDEX_GRUNLOCK(); + + if (sbins_populated) { + as_sindex_update_by_sbin(ns, set_name, sbins, sbins_populated, keyd); + as_sindex_sbin_freeall(sbins, sbins_populated); + } + + as_sindex_release_arr(si_arr, si_arr_index); +} + + +bool +xdr_must_ship_delete(as_namespace* ns, bool is_nsup_delete, bool is_xdr_op) +{ + if (! is_xdr_delete_shipping_enabled()) { + return false; + } + + // If this delete is a result of expiration/eviction, don't ship it unless + // configured to do so. + if (is_nsup_delete && ! is_xdr_nsup_deletes_enabled()) { + return false; + } + + return ! is_xdr_op || + // If this delete is a result of XDR shipping, don't ship it unless + // configured to do so. 
+ is_xdr_forwarding_enabled() || ns->ns_forward_xdr_writes; +} diff --git a/as/src/transaction/rw_utils_ce.c b/as/src/transaction/rw_utils_ce.c new file mode 100644 index 00000000..931c91b5 --- /dev/null +++ b/as/src/transaction/rw_utils_ce.c @@ -0,0 +1,259 @@ +/* + * rw_utils_ce.c + * + * Copyright (C) 2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +//========================================================== +// Includes. +// + +#include "transaction/rw_utils.h" + +#include +#include + +#include "fault.h" +#include "msg.h" + +#include "base/datamodel.h" +#include "base/index.h" +#include "base/proto.h" +#include "base/transaction.h" +#include "base/udf_record.h" +#include "storage/storage.h" +#include "transaction/rw_request.h" +#include "transaction/udf.h" + + +//========================================================== +// Public API. 
+// + +bool +validate_delete_durability(as_transaction* tr) +{ + return true; +} + + +int +repl_state_check(as_record* r, as_transaction* tr) +{ + return 0; +} + + +void +will_replicate(as_record* r, as_namespace* ns) +{ +} + + +bool +insufficient_replica_destinations(const as_namespace* ns, uint32_t n_dests) +{ + return false; +} + + +void +finished_replicated(as_transaction* tr) +{ +} + + +void +finished_not_replicated(rw_request* rw) +{ +} + + +bool +generation_check(const as_record* r, const as_msg* m, const as_namespace* ns) +{ + if ((m->info2 & AS_MSG_INFO2_GENERATION) != 0) { + return m->generation == r->generation; + } + + if ((m->info2 & AS_MSG_INFO2_GENERATION_GT) != 0) { + return m->generation > r->generation; + } + + return true; // no generation requirement +} + + +int +set_delete_durablility(const as_transaction* tr, as_storage_rd* rd) +{ + if (as_transaction_is_durable_delete(tr)) { + cf_warning(AS_RW, "durable delete is an enterprise feature"); + return AS_PROTO_RESULT_FAIL_ENTERPRISE_ONLY; + } + + return 0; +} + + +//========================================================== +// Private API - for enterprise separation only. +// + +bool +create_only_check(const as_record* r, const as_msg* m) +{ + // Ok (return true) if no requirement. + return (m->info2 & AS_MSG_INFO2_CREATE_ONLY) == 0; +} + + +void +write_delete_record(as_record* r, as_index_tree* tree) +{ + as_index_delete(tree, &r->keyd); +} + + +udf_optype +udf_finish_delete(udf_record* urecord) +{ + return (urecord->flag & UDF_RECORD_FLAG_PREEXISTS) != 0 ? + UDF_OPTYPE_DELETE : UDF_OPTYPE_NONE; +} + + +uint32_t +dup_res_pack_repl_state_info(const as_record* r, as_namespace* ns) +{ + return 0; +} + + +uint32_t +dup_res_pack_info(const as_record* r, as_namespace* ns) +{ + return 0; +} + + +bool +dup_res_should_retry_transaction(rw_request* rw, uint32_t result_code) +{ + // TODO - JUMP - can get this from 3.14.x nodes or older - retry if so. 
+ return result_code == AS_PROTO_RESULT_FAIL_CLUSTER_KEY_MISMATCH; +} + + +void +dup_res_handle_tie(rw_request* rw, const msg* m, uint32_t result_code) +{ +} + + +void +apply_if_tie(rw_request* rw) +{ +} + + +void +dup_res_translate_result_code(rw_request* rw) +{ + rw->result_code = AS_PROTO_RESULT_OK; +} + + +bool +dup_res_ignore_pickle(const uint8_t* buf, uint32_t info) +{ + return as_record_pickle_is_binless(buf); +} + + +void +dup_res_init_repl_state(as_remote_record* rr, uint32_t info) +{ +} + + +void +repl_write_flag_pickle(const as_transaction* tr, const uint8_t* buf, + uint32_t* info) +{ + // Do nothing. +} + + +bool +repl_write_pickle_is_drop(const uint8_t* buf, uint32_t info) +{ + return as_record_pickle_is_binless(buf); +} + + +void +repl_write_init_repl_state(as_remote_record* rr, bool from_replica) +{ +} + + +conflict_resolution_pol +repl_write_conflict_resolution_policy(const as_namespace* ns) +{ + return AS_NAMESPACE_CONFLICT_RESOLUTION_POLICY_LAST_UPDATE_TIME; +} + + +bool +repl_write_should_retransmit_replicas(rw_request* rw, uint32_t result_code) +{ + switch (result_code) { + case AS_PROTO_RESULT_FAIL_CLUSTER_KEY_MISMATCH: + rw->xmit_ms = 0; // force retransmit on next cycle + return true; + default: + return false; + } +} + + +void +repl_write_send_confirmation(rw_request* rw) +{ +} + + +void +repl_write_handle_confirmation(msg* m) +{ +} + + +int +record_replace_check(as_record* r, as_namespace* ns) +{ + return 0; +} + + +void +record_replaced(as_record* r, as_remote_record* rr) +{ +} diff --git a/as/src/transaction/udf.c b/as/src/transaction/udf.c new file mode 100644 index 00000000..62cf9158 --- /dev/null +++ b/as/src/transaction/udf.c @@ -0,0 +1,1094 @@ +/* + * udf.c + * + * Copyright (C) 2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. 
+ * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +//========================================================== +// Includes. +// + +#include "transaction/udf.h" + +#include +#include +#include +#include + +#include "aerospike/as_aerospike.h" +#include "aerospike/as_buffer.h" +#include "aerospike/as_log.h" +#include "aerospike/as_list.h" +#include "aerospike/as_module.h" +#include "aerospike/as_msgpack.h" +#include "aerospike/as_serializer.h" +#include "aerospike/as_types.h" +#include "aerospike/as_udf_context.h" +#include "aerospike/mod_lua.h" + +#include "citrusleaf/alloc.h" +#include "citrusleaf/cf_atomic.h" +#include "citrusleaf/cf_clock.h" + +#include "dynbuf.h" +#include "fault.h" + +#include "base/cfg.h" +#include "base/datamodel.h" +#include "base/proto.h" +#include "base/secondary_index.h" +#include "base/transaction.h" +#include "base/transaction_policy.h" +#include "base/udf_aerospike.h" +#include "base/udf_arglist.h" +#include "base/udf_cask.h" +#include "base/udf_record.h" +#include "base/udf_timer.h" +#include "fabric/partition.h" +#include "storage/storage.h" +#include "transaction/duplicate_resolve.h" +#include "transaction/proxy.h" +#include "transaction/replica_write.h" +#include "transaction/rw_request.h" +#include "transaction/rw_request_hash.h" +#include "transaction/rw_utils.h" + + 
+//========================================================== +// Typedefs & constants. +// + +static const cf_fault_severity as_log_level_map[5] = { + [AS_LOG_LEVEL_ERROR] = CF_WARNING, + [AS_LOG_LEVEL_WARN] = CF_WARNING, + [AS_LOG_LEVEL_INFO] = CF_INFO, + [AS_LOG_LEVEL_DEBUG] = CF_DEBUG, + [AS_LOG_LEVEL_TRACE] = CF_DETAIL +}; + +typedef struct udf_call_s { + udf_def* def; + as_transaction* tr; +} udf_call; + + +//========================================================== +// Globals. +// + +as_aerospike g_as_aerospike; + + +//========================================================== +// Forward declarations. +// + +bool log_callback(as_log_level level, const char* func, const char* file, + uint32_t line, const char* fmt, ...); + +void start_udf_dup_res(rw_request* rw, as_transaction* tr); +void start_udf_repl_write(rw_request* rw, as_transaction* tr); +void start_udf_repl_write_forget(rw_request* rw, as_transaction* tr); +bool udf_dup_res_cb(rw_request* rw); +void udf_repl_write_after_dup_res(rw_request* rw, as_transaction* tr); +void udf_repl_write_forget_after_dup_res(rw_request* rw, as_transaction* tr); +void udf_repl_write_cb(rw_request* rw); + +void send_udf_response(as_transaction* tr, cf_dyn_buf* db); +void udf_timeout_cb(rw_request* rw); + +transaction_status udf_master(rw_request* rw, as_transaction* tr); +udf_optype udf_master_apply(udf_call* call, rw_request* rw); +int udf_apply_record(udf_call* call, as_rec* rec, as_result* result); +uint64_t udf_end_time(time_tracker* tt); +void udf_finish(udf_record* urecord, rw_request* rw, udf_optype* record_op); +udf_optype udf_finish_op(udf_record* urecord); +void udf_post_processing(udf_record* urecord, rw_request* rw, + udf_optype urecord_op); + +void update_lua_complete_stats(uint8_t origin, as_namespace* ns, udf_optype op, + int ret, bool is_success); + +void process_failure_str(udf_call* call, const char* err_str, size_t len, + cf_dyn_buf* db); +void process_result(const as_result* result, udf_call* call, 
cf_dyn_buf* db); +void process_response(udf_call* call, bool success, const as_val* val, + cf_dyn_buf* db); + + +//========================================================== +// Inlines & macros. +// + +static inline void +client_udf_update_stats(as_namespace* ns, uint8_t result_code) +{ + switch (result_code) { + case AS_PROTO_RESULT_OK: + cf_atomic64_incr(&ns->n_client_udf_complete); + break; + case AS_PROTO_RESULT_FAIL_TIMEOUT: + cf_atomic64_incr(&ns->n_client_udf_timeout); + break; + default: + cf_atomic64_incr(&ns->n_client_udf_error); + break; + } +} + +static inline void +udf_sub_udf_update_stats(as_namespace* ns, uint8_t result_code) +{ + switch (result_code) { + case AS_PROTO_RESULT_OK: + cf_atomic64_incr(&ns->n_udf_sub_udf_complete); + break; + case AS_PROTO_RESULT_FAIL_TIMEOUT: + cf_atomic64_incr(&ns->n_udf_sub_udf_timeout); + break; + default: + cf_atomic64_incr(&ns->n_udf_sub_udf_error); + break; + } +} + +static inline bool +udf_zero_bins_left(udf_record* urecord) +{ + return (urecord->flag & UDF_RECORD_FLAG_OPEN) != 0 && + ! as_bin_inuse_has(urecord->rd); +} + +static inline void +process_failure(udf_call* call, const as_val* val, cf_dyn_buf* db) +{ + process_response(call, false, val, db); +} + +static inline void +process_success(udf_call* call, const as_val* val, cf_dyn_buf* db) +{ + process_response(call, true, val, db); +} + + +//========================================================== +// Public API. +// + +void +as_udf_init() +{ + as_module_configure(&mod_lua, &g_config.mod_lua); + as_log_set_callback(log_callback); + udf_cask_init(); + as_aerospike_init(&g_as_aerospike, NULL, &udf_aerospike_hooks); +} + + +// Public API for udf_def class, not big enough for it's own file. +udf_def* +udf_def_init_from_msg(udf_def* def, const as_transaction* tr) +{ + def->arglist = NULL; + + as_msg* m = &tr->msgp->msg; + as_msg_field* filename = + as_msg_field_get(m, AS_MSG_FIELD_TYPE_UDF_FILENAME); + + if (! 
filename) { + return NULL; + } + + as_msg_field* function = + as_msg_field_get(m, AS_MSG_FIELD_TYPE_UDF_FUNCTION); + + if (! function) { + return NULL; + } + + as_msg_field* arglist = as_msg_field_get(m, AS_MSG_FIELD_TYPE_UDF_ARGLIST); + + if (! arglist) { + return NULL; + } + + as_msg_field_get_strncpy(filename, def->filename, sizeof(def->filename)); + as_msg_field_get_strncpy(function, def->function, sizeof(def->function)); + + as_unpacker unpacker; + + unpacker.buffer = (const unsigned char*)arglist->data; + unpacker.length = as_msg_field_get_value_sz(arglist); + unpacker.offset = 0; + + if (unpacker.length > 0) { + as_val* val = NULL; + int ret = as_unpack_val(&unpacker, &val); + + if (ret == 0 && as_val_type(val) == AS_LIST) { + def->arglist = (as_list*)val; + } + } + + as_msg_field* op = as_transaction_has_udf_op(tr) ? + as_msg_field_get(m, AS_MSG_FIELD_TYPE_UDF_OP) : NULL; + + def->type = op ? *op->data : AS_UDF_OP_KVS; + + return def; +} + + +transaction_status +as_udf_start(as_transaction* tr) +{ + BENCHMARK_START(tr, udf, FROM_CLIENT); + BENCHMARK_START(tr, udf_sub, FROM_IUDF); + + // Apply XDR filter. + if (! xdr_allows_write(tr)) { + tr->result_code = AS_PROTO_RESULT_FAIL_ALWAYS_FORBIDDEN; + send_udf_response(tr, NULL); + return TRANS_DONE_ERROR; + } + + // Don't know if UDF is read or delete - check that we aren't backed up. + if (as_storage_overloaded(tr->rsv.ns)) { + tr->result_code = AS_PROTO_RESULT_FAIL_DEVICE_OVERLOAD; + send_udf_response(tr, NULL); + return TRANS_DONE_ERROR; + } + + // Create rw_request and add to hash. + rw_request_hkey hkey = { tr->rsv.ns->id, tr->keyd }; + rw_request* rw = rw_request_create(&tr->keyd); + transaction_status status = rw_request_hash_insert(&hkey, rw, tr); + + // If rw_request wasn't inserted in hash, transaction is finished. 
+ if (status != TRANS_IN_PROGRESS) { + rw_request_release(rw); + + if (status != TRANS_WAITING) { + send_udf_response(tr, NULL); + } + + return status; + } + // else - rw_request is now in hash, continue... + + if (tr->rsv.ns->write_dup_res_disabled) { + // Note - preventing duplicate resolution this way allows + // rw_request_destroy() to handle dup_msg[] cleanup correctly. + tr->rsv.n_dupl = 0; + } + + // If there are duplicates to resolve, start doing so. + if (tr->rsv.n_dupl != 0) { + start_udf_dup_res(rw, tr); + + // Started duplicate resolution. + return TRANS_IN_PROGRESS; + } + // else - no duplicate resolution phase, apply operation to master. + + // Set up the nodes to which we'll write replicas. + rw->n_dest_nodes = as_partition_get_other_replicas(tr->rsv.p, + rw->dest_nodes); + + if (insufficient_replica_destinations(tr->rsv.ns, rw->n_dest_nodes)) { + rw_request_hash_delete(&hkey, rw); + tr->result_code = AS_PROTO_RESULT_FAIL_UNAVAILABLE; + send_udf_response(tr, NULL); + return TRANS_DONE_ERROR; + } + + status = udf_master(rw, tr); + + BENCHMARK_NEXT_DATA_POINT(tr, udf, master); + BENCHMARK_NEXT_DATA_POINT(tr, udf_sub, master); + + // If error or UDF was a read, transaction is finished. + if (status != TRANS_IN_PROGRESS) { + if (status != TRANS_WAITING) { + send_udf_response(tr, &rw->response_db); + } + + rw_request_hash_delete(&hkey, rw); + return status; + } + + // If we don't need replica writes, transaction is finished. + if (rw->n_dest_nodes == 0) { + finished_replicated(tr); + send_udf_response(tr, &rw->response_db); + rw_request_hash_delete(&hkey, rw); + return TRANS_DONE_SUCCESS; + } + + // If we don't need to wait for replica write acks, fire and forget. + if (respond_on_master_complete(tr)) { + start_udf_repl_write_forget(rw, tr); + send_udf_response(tr, &rw->response_db); + rw_request_hash_delete(&hkey, rw); + return TRANS_DONE_SUCCESS; + } + + start_udf_repl_write(rw, tr); + + // Started replica write. 
+ return TRANS_IN_PROGRESS; +} + + +//========================================================== +// Local helpers - initialization. +// + +bool +log_callback(as_log_level level, const char* func, const char* file, + uint32_t line, const char* fmt, ...) +{ + cf_fault_severity severity = as_log_level_map[level]; + + if (severity > cf_fault_filter[AS_UDF]) { + return true; + } + + va_list ap; + + va_start(ap, fmt); + char message[1024] = { '\0' }; + vsnprintf(message, 1024, fmt, ap); + va_end(ap); + + cf_fault_event(AS_UDF, severity, file, line, "%s", message); + + return true; +} + + +//========================================================== +// Local helpers - transaction flow. +// + +void +start_udf_dup_res(rw_request* rw, as_transaction* tr) +{ + // Finish initializing rw, construct and send dup-res message. + + dup_res_make_message(rw, tr); + + pthread_mutex_lock(&rw->lock); + + dup_res_setup_rw(rw, tr, udf_dup_res_cb, udf_timeout_cb); + send_rw_messages(rw); + + pthread_mutex_unlock(&rw->lock); +} + + +void +start_udf_repl_write(rw_request* rw, as_transaction* tr) +{ + // Finish initializing rw, construct and send repl-write message. + + repl_write_make_message(rw, tr); + + pthread_mutex_lock(&rw->lock); + + repl_write_setup_rw(rw, tr, udf_repl_write_cb, udf_timeout_cb); + send_rw_messages(rw); + + pthread_mutex_unlock(&rw->lock); +} + + +void +start_udf_repl_write_forget(rw_request* rw, as_transaction* tr) +{ + // Construct and send repl-write message. No need to finish rw setup. + + repl_write_make_message(rw, tr); + send_rw_messages_forget(rw); +} + + +bool +udf_dup_res_cb(rw_request* rw) +{ + BENCHMARK_NEXT_DATA_POINT(rw, udf, dup_res); + BENCHMARK_NEXT_DATA_POINT(rw, udf_sub, dup_res); + + as_transaction tr; + as_transaction_init_from_rw(&tr, rw); + + if (tr.result_code != AS_PROTO_RESULT_OK) { + send_udf_response(&tr, NULL); + return true; + } + + // Set up the nodes to which we'll write replicas. 
// Fire-and-forget replica write issued from the duplicate resolution callback:
// builds the replica write message on rw and sends it to all destination nodes
// without tracking acks.
void
udf_repl_write_forget_after_dup_res(rw_request* rw, as_transaction* tr)
{
	// Send replica writes. Not waiting for acks, so no need to reset
	// rw_request (unlike udf_repl_write_after_dup_res(), which calls
	// repl_write_reset_rw() before sending).
	// Note - we are under the rw_request lock here!

	repl_write_make_message(rw, tr);
	send_rw_messages_forget(rw);
}
udf_sub_udf_update_stats(tr->rsv.ns, tr->result_code); + break; + default: + cf_crash(AS_RW, "unexpected transaction origin %u", tr->origin); + break; + } + + tr->from.any = NULL; // pattern, not needed +} + + +void +udf_timeout_cb(rw_request* rw) +{ + if (! rw->from.any) { + return; // lost race against dup-res or repl-write callback + } + + finished_not_replicated(rw); + + switch (rw->origin) { + case FROM_CLIENT: + as_msg_send_reply(rw->from.proto_fd_h, AS_PROTO_RESULT_FAIL_TIMEOUT, 0, + 0, NULL, NULL, 0, rw->rsv.ns, rw_request_trid(rw)); + // Timeouts aren't included in histograms. + client_udf_update_stats(rw->rsv.ns, AS_PROTO_RESULT_FAIL_TIMEOUT); + break; + case FROM_PROXY: + break; + case FROM_IUDF: + rw->from.iudf_orig->cb(rw->from.iudf_orig->udata, + AS_PROTO_RESULT_FAIL_TIMEOUT); + // Timeouts aren't included in histograms. + udf_sub_udf_update_stats(rw->rsv.ns, AS_PROTO_RESULT_FAIL_TIMEOUT); + break; + default: + cf_crash(AS_RW, "unexpected transaction origin %u", rw->origin); + break; + } + + rw->from.any = NULL; // inform other callback it lost the race +} + + +//========================================================== +// Local helpers - UDF. +// + +transaction_status +udf_master(rw_request* rw, as_transaction* tr) +{ + CF_ALLOC_SET_NS_ARENA(tr->rsv.ns); + + udf_def def; + udf_call call = { &def, tr }; + + if (tr->origin == FROM_IUDF) { + call.def = &tr->from.iudf_orig->def; + } + else if (! udf_def_init_from_msg(call.def, tr)) { + cf_warning(AS_UDF, "failed udf_def_init_from_msg"); + tr->result_code = AS_PROTO_RESULT_FAIL_PARAMETER; + return TRANS_DONE_ERROR; + } + + udf_optype optype = udf_master_apply(&call, rw); + + if (tr->origin != FROM_IUDF && call.def->arglist) { + as_list_destroy(call.def->arglist); + } + + if (optype == UDF_OPTYPE_READ || optype == UDF_OPTYPE_NONE) { + // UDF is done, no replica writes needed. + return TRANS_DONE_SUCCESS; + } + + return optype == UDF_OPTYPE_WAITING ? 
TRANS_WAITING : TRANS_IN_PROGRESS; +} + + +udf_optype +udf_master_apply(udf_call* call, rw_request* rw) +{ + as_transaction* tr = call->tr; + as_namespace* ns = tr->rsv.ns; + + // Find record in index. + + as_index_ref r_ref; + r_ref.skip_lock = false; + + int get_rv = as_record_get(tr->rsv.tree, &tr->keyd, &r_ref); + + if (get_rv == 0 && as_record_is_doomed(r_ref.r, ns)) { + // If record is expired or truncated, pretend it was not found. + as_record_done(&r_ref, ns); + get_rv = -1; + } + + if (get_rv == 0 && repl_state_check(r_ref.r, tr) < 0) { + as_record_done(&r_ref, ns); + return UDF_OPTYPE_WAITING; + } + + if (tr->origin == FROM_IUDF && + (get_rv == -1 || ! as_record_is_live(r_ref.r))) { + // Internal UDFs must not create records. + tr->result_code = AS_PROTO_RESULT_FAIL_NOT_FOUND; + process_failure(call, NULL, &rw->response_db); + return UDF_OPTYPE_NONE; + } + + // Open storage record. + + as_storage_rd rd; + + udf_record urecord; + udf_record_init(&urecord, true); + + xdr_dirty_bins dirty_bins; + xdr_clear_dirty_bins(&dirty_bins); + + urecord.r_ref = &r_ref; + urecord.tr = tr; + urecord.rd = &rd; + urecord.dirty = &dirty_bins; + urecord.keyd = tr->keyd; + + if (get_rv == 0) { + urecord.flag |= (UDF_RECORD_FLAG_OPEN | UDF_RECORD_FLAG_PREEXISTS); + + if (udf_storage_record_open(&urecord) != 0) { + udf_record_close(&urecord); + tr->result_code = AS_PROTO_RESULT_FAIL_BIN_NAME; // overloaded... add bin_count error? + process_failure(call, NULL, &rw->response_db); + return UDF_OPTYPE_NONE; + } + + if (tr->origin == FROM_IUDF && tr->from.iudf_orig->predexp) { + predexp_args_t predargs = { + .ns = ns, .md = r_ref.r, .vl = NULL, .rd = &rd + }; + + if (! 
predexp_matches_record(tr->from.iudf_orig->predexp, + &predargs)) { + udf_record_close(&urecord); + tr->result_code = AS_PROTO_RESULT_FAIL_NOT_FOUND; // not ideal + process_failure(call, NULL, &rw->response_db); + return UDF_OPTYPE_NONE; + } + } + + as_msg* m = &tr->msgp->msg; + + // If both the record and the message have keys, check them. + if (rd.key) { + if (as_transaction_has_key(tr) && ! check_msg_key(m, &rd)) { + udf_record_close(&urecord); + tr->result_code = AS_PROTO_RESULT_FAIL_KEY_MISMATCH; + process_failure(call, NULL, &rw->response_db); + return UDF_OPTYPE_NONE; + } + } + else { + // If the message has a key, apply it to the record. + if (! get_msg_key(tr, &rd)) { + udf_record_close(&urecord); + tr->result_code = AS_PROTO_RESULT_FAIL_UNSUPPORTED_FEATURE; + process_failure(call, NULL, &rw->response_db); + return UDF_OPTYPE_NONE; + } + + urecord.flag |= UDF_RECORD_FLAG_METADATA_UPDATED; + } + } + else { + urecord.flag &= ~(UDF_RECORD_FLAG_OPEN | + UDF_RECORD_FLAG_STORAGE_OPEN | + UDF_RECORD_FLAG_PREEXISTS); + } + + // Run UDF. + + // This as_rec needs to be in the heap - once passed into the lua scope it + // gets garbage collected later. Also, the destroy hook is set to NULL so + // garbage collection has nothing to do. 
+ as_rec* urec = as_rec_new(&urecord, &udf_record_hooks); + + as_val_reserve(urec); // for lua + + as_result result; + as_result_init(&result); + + int apply_rv = udf_apply_record(call, urec, &result); + + udf_optype optype = UDF_OPTYPE_NONE; + + if (apply_rv == 0) { + udf_finish(&urecord, rw, &optype); + process_result(&result, call, &rw->response_db); + } + else { + udf_record_close(&urecord); + + char* rs = as_module_err_string(apply_rv); + + tr->result_code = AS_PROTO_RESULT_FAIL_UDF_EXECUTION; + process_failure_str(call, rs, strlen(rs), &rw->response_db); + cf_free(rs); + } + + update_lua_complete_stats(tr->origin, ns, optype, apply_rv, + result.is_success); + + as_result_destroy(&result); + udf_record_destroy(urec); + + return optype; +} + + +int +udf_apply_record(udf_call* call, as_rec* rec, as_result* result) +{ + time_tracker udf_timer_tracker = { + .udata = as_rec_source(rec), + .end_time = udf_end_time + }; + + udf_timer_setup(&udf_timer_tracker); + + as_timer timer; + as_timer_init(&timer, &udf_timer_tracker, &udf_timer_hooks); + + as_udf_context ctx = { + .as = &g_as_aerospike, + .timer = &timer, + .memtracker = NULL + }; + + int apply_rv = as_module_apply_record(&mod_lua, &ctx, call->def->filename, + call->def->function, rec, call->def->arglist, result); + + udf_timer_cleanup(); + + return apply_rv; +} + + +uint64_t +udf_end_time(time_tracker* tt) +{ + udf_record* urecord = (udf_record*)tt->udata; + + if (! urecord) { + return -1; // TODO - should be impossible. 
+ } + + return urecord->tr->end_time; +} + + +void +udf_finish(udf_record* urecord, rw_request* rw, udf_optype* record_op) +{ + *record_op = UDF_OPTYPE_READ; + + udf_optype final_op = udf_finish_op(urecord); + + if (final_op == UDF_OPTYPE_DELETE) { + *record_op = UDF_OPTYPE_DELETE; + urecord->tr->flags |= AS_TRANSACTION_FLAG_IS_DELETE; + } + else if (final_op == UDF_OPTYPE_WRITE) { + *record_op = UDF_OPTYPE_WRITE; + } + + udf_post_processing(urecord, rw, final_op); +} + + +udf_optype +udf_finish_op(udf_record* urecord) +{ + if (udf_zero_bins_left(urecord)) { + // Amazingly, with respect to stored key, memory statistics work out + // correctly regardless of what this returns. + return udf_finish_delete(urecord); + } + + if ((urecord->flag & UDF_RECORD_FLAG_HAS_UPDATES) != 0) { + if ((urecord->flag & UDF_RECORD_FLAG_OPEN) == 0) { + cf_crash(AS_UDF, "updated record not open"); + } + + return UDF_OPTYPE_WRITE; + } + + return UDF_OPTYPE_READ; +} + + +void +udf_post_processing(udf_record* urecord, rw_request* rw, udf_optype urecord_op) +{ + as_storage_rd* rd = urecord->rd; + as_transaction* tr = urecord->tr; + as_record* r = rd->r; + + uint16_t generation = 0; + uint16_t set_id = 0; + xdr_dirty_bins dirty_bins; + + if (urecord_op == UDF_OPTYPE_WRITE || urecord_op == UDF_OPTYPE_DELETE) { + size_t rec_props_data_size = as_storage_record_rec_props_size(rd); + uint8_t rec_props_data[rec_props_data_size]; + + if (rec_props_data_size > 0) { + as_storage_record_set_rec_props(rd, rec_props_data); + } + + as_msg* m = &tr->msgp->msg; + + // Convert message TTL special value if appropriate. + if (m->record_ttl == TTL_DONT_UPDATE && + (urecord->flag & UDF_RECORD_FLAG_PREEXISTS) == 0) { + m->record_ttl = TTL_NAMESPACE_DEFAULT; + } + + update_metadata_in_index(tr, true, r); + + pickle_all(rd, rw); + + tr->generation = r->generation; + tr->void_time = r->void_time; + tr->last_update_time = r->last_update_time; + + // Now ok to accommodate a new stored key... 
+ if (r->key_stored == 0 && rd->key) { + if (rd->ns->storage_data_in_memory) { + as_record_allocate_key(r, rd->key, rd->key_size); + } + + r->key_stored = 1; + } + // ... or drop a stored key. + else if (r->key_stored == 1 && ! rd->key) { + if (rd->ns->storage_data_in_memory) { + as_record_remove_key(r); + } + + r->key_stored = 0; + } + + as_storage_record_adjust_mem_stats(rd, urecord->starting_memory_bytes); + + will_replicate(r, rd->ns); + + // Collect information for XDR before closing the record. + generation = plain_generation(r->generation, rd->ns); + set_id = as_index_get_set_id(r); + + if (urecord->dirty && urecord_op == UDF_OPTYPE_WRITE) { + xdr_clear_dirty_bins(&dirty_bins); + xdr_copy_dirty_bins(urecord->dirty, &dirty_bins); + } + } + + // Close the record for all the cases. + udf_record_close(urecord); + + // Write to XDR pipe. + if (urecord_op == UDF_OPTYPE_WRITE) { + xdr_write(tr->rsv.ns, &tr->keyd, generation, 0, XDR_OP_TYPE_WRITE, + set_id, &dirty_bins); + } + else if (urecord_op == UDF_OPTYPE_DELETE) { + xdr_write(tr->rsv.ns, &tr->keyd, 0, 0, + as_transaction_is_durable_delete(tr) ? + XDR_OP_TYPE_DURABLE_DELETE : XDR_OP_TYPE_DROP, + set_id, NULL); + } +} + + +//========================================================== +// Local helpers - statistics. +// + +void +update_lua_complete_stats(uint8_t origin, as_namespace* ns, udf_optype op, + int ret, bool is_success) +{ + switch (origin) { + case FROM_CLIENT: + if (ret == 0 && is_success) { + if (op == UDF_OPTYPE_READ) { + cf_atomic_int_incr(&ns->n_client_lang_read_success); + } + else if (op == UDF_OPTYPE_DELETE) { + cf_atomic_int_incr(&ns->n_client_lang_delete_success); + } + else if (op == UDF_OPTYPE_WRITE) { + cf_atomic_int_incr(&ns->n_client_lang_write_success); + } + } + else { + cf_info(AS_UDF, "lua error, ret:%d", ret); + cf_atomic_int_incr(&ns->n_client_lang_error); + } + break; + case FROM_PROXY: + // TODO? 
+ break; + case FROM_IUDF: + if (ret == 0 && is_success) { + if (op == UDF_OPTYPE_READ) { + // Note - this would be weird, since there's nowhere for a + // response to go in our current UDF scans & queries. + cf_atomic_int_incr(&ns->n_udf_sub_lang_read_success); + } + else if (op == UDF_OPTYPE_DELETE) { + cf_atomic_int_incr(&ns->n_udf_sub_lang_delete_success); + } + else if (op == UDF_OPTYPE_WRITE) { + cf_atomic_int_incr(&ns->n_udf_sub_lang_write_success); + } + } + else { + cf_info(AS_UDF, "lua error, ret:%d", ret); + cf_atomic_int_incr(&ns->n_udf_sub_lang_error); + } + break; + default: + cf_crash(AS_UDF, "unexpected transaction origin %u", origin); + break; + } +} + + +//========================================================== +// Local helpers - construct response to be sent to origin. +// + +void +process_failure_str(udf_call* call, const char* err_str, size_t len, + cf_dyn_buf* db) +{ + if (! err_str) { + // Better than sending an as_string with null value. + process_failure(call, NULL, db); + return; + } + + as_string stack_s; + as_string_init_wlen(&stack_s, (char*)err_str, len, false); + + process_failure(call, as_string_toval(&stack_s), db); +} + + +void +process_result(const as_result* result, udf_call* call, cf_dyn_buf* db) +{ + as_val* val = result->value; + + if (result->is_success) { + process_success(call, val, db); + return; + } + + // Failures... + + if (as_val_type(val) == AS_STRING) { + call->tr->result_code = AS_PROTO_RESULT_FAIL_UDF_EXECUTION; + process_failure(call, val, db); + return; + } + + char lua_err_str[1024]; + size_t len = (size_t)sprintf(lua_err_str, + "%s:0: in function %s() - error() argument type not handled", + call->def->filename, call->def->function); + + call->tr->result_code = AS_PROTO_RESULT_FAIL_UDF_EXECUTION; + process_failure_str(call, lua_err_str, len, db); +} + + +void +process_response(udf_call* call, bool success, const as_val* val, + cf_dyn_buf* db) +{ + // No response for background (internal) UDF. 
+ if (call->def->type == AS_UDF_OP_BACKGROUND) { + return; + } + + as_transaction* tr = call->tr; + + // Note - this function quietly handles a null val. The response call will + // be given a bin with a name but not 'in use', and it does the right thing. + + size_t msg_sz = 0; + + db->buf = (uint8_t *)as_msg_make_val_response(success, val, tr->result_code, + tr->generation, tr->void_time, as_transaction_trid(tr), &msg_sz); + + db->is_stack = false; + db->alloc_sz = msg_sz; + db->used_sz = msg_sz; +} diff --git a/as/src/transaction/write.c b/as/src/transaction/write.c new file mode 100644 index 00000000..c17bfcff --- /dev/null +++ b/as/src/transaction/write.c @@ -0,0 +1,1958 @@ +/* + * write.c + * + * Copyright (C) 2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +//========================================================== +// Includes. 
+// + +#include "transaction/write.h" + +#include +#include +#include +#include + +#include "citrusleaf/alloc.h" +#include "citrusleaf/cf_atomic.h" +#include "citrusleaf/cf_clock.h" + +#include "dynbuf.h" +#include "fault.h" + +#include "base/cfg.h" +#include "base/datamodel.h" +#include "base/index.h" +#include "base/proto.h" +#include "base/secondary_index.h" +#include "base/transaction.h" +#include "base/transaction_policy.h" +#include "base/truncate.h" +#include "base/xdr_serverside.h" +#include "fabric/partition.h" +#include "storage/storage.h" +#include "transaction/duplicate_resolve.h" +#include "transaction/proxy.h" +#include "transaction/replica_write.h" +#include "transaction/rw_request.h" +#include "transaction/rw_request_hash.h" +#include "transaction/rw_utils.h" + + +//========================================================== +// Typedefs & constants. +// + +#define STACK_PARTICLES_SIZE (1024 * 1024) + + +//========================================================== +// Forward declarations. 
//

void start_write_dup_res(rw_request* rw, as_transaction* tr);
void start_write_repl_write(rw_request* rw, as_transaction* tr);
void start_write_repl_write_forget(rw_request* rw, as_transaction* tr);
bool write_dup_res_cb(rw_request* rw);
void write_repl_write_after_dup_res(rw_request* rw, as_transaction* tr);
void write_repl_write_forget_after_dup_res(rw_request* rw, as_transaction* tr);
void write_repl_write_cb(rw_request* rw);

void send_write_response(as_transaction* tr, cf_dyn_buf* db);
void write_timeout_cb(rw_request* rw);

transaction_status write_master(rw_request* rw, as_transaction* tr);
void write_master_failed(as_transaction* tr, as_index_ref* r_ref,
		bool record_created, as_index_tree* tree, as_storage_rd* rd,
		int result_code);
int write_master_preprocessing(as_transaction* tr);
int write_master_policies(as_transaction* tr, bool* p_must_not_create,
		bool* p_record_level_replace, bool* p_must_fetch_data,
		bool* p_increment_generation);
bool check_msg_set_name(as_transaction* tr, const char* set_name);

int write_master_dim_single_bin(as_transaction* tr, as_storage_rd* rd,
		bool increment_generation, rw_request* rw, bool* is_delete,
		xdr_dirty_bins* dirty_bins);
int write_master_dim(as_transaction* tr, const char* set_name,
		as_storage_rd* rd, bool record_level_replace, bool increment_generation,
		rw_request* rw, bool* is_delete, xdr_dirty_bins* dirty_bins);
int write_master_ssd_single_bin(as_transaction* tr, as_storage_rd* rd,
		bool must_fetch_data, bool increment_generation, rw_request* rw,
		bool* is_delete, xdr_dirty_bins* dirty_bins);
int write_master_ssd(as_transaction* tr, const char* set_name,
		as_storage_rd* rd, bool must_fetch_data, bool record_level_replace,
		bool increment_generation, rw_request* rw, bool* is_delete,
		xdr_dirty_bins* dirty_bins);

void write_master_update_index_metadata(as_transaction* tr,
		bool increment_generation, index_metadata* old, as_record* r);
int write_master_bin_ops(as_transaction* tr, as_storage_rd* rd,
		cf_ll_buf* particles_llb, as_bin* cleanup_bins,
		uint32_t* p_n_cleanup_bins, cf_dyn_buf* db, uint32_t* p_n_final_bins,
		xdr_dirty_bins* dirty_bins);
int write_master_bin_ops_loop(as_transaction* tr, as_storage_rd* rd,
		as_msg_op** ops, as_bin* response_bins, uint32_t* p_n_response_bins,
		as_bin* result_bins, uint32_t* p_n_result_bins,
		cf_ll_buf* particles_llb, as_bin* cleanup_bins,
		uint32_t* p_n_cleanup_bins, xdr_dirty_bins* dirty_bins);

void write_master_index_metadata_unwind(index_metadata* old, as_record* r);
void write_master_dim_single_bin_unwind(as_bin* old_bin, as_bin* new_bin,
		as_bin* cleanup_bins, uint32_t n_cleanup_bins);
void write_master_dim_unwind(as_bin* old_bins, uint32_t n_old_bins,
		as_bin* new_bins, uint32_t n_new_bins, as_bin* cleanup_bins,
		uint32_t n_cleanup_bins);


//==========================================================
// Inlines & macros.
//

// Count a finished client write in the namespace statistics - success, timeout
// or (any other result code) error. When is_xdr_op is true the matching XDR
// counter is incremented as well, in addition to the client counter.
static inline void
client_write_update_stats(as_namespace* ns, uint8_t result_code, bool is_xdr_op)
{
	switch (result_code) {
	case AS_PROTO_RESULT_OK:
		cf_atomic64_incr(&ns->n_client_write_success);
		if (is_xdr_op) {
			cf_atomic64_incr(&ns->n_xdr_write_success);
		}
		break;
	case AS_PROTO_RESULT_FAIL_TIMEOUT:
		cf_atomic64_incr(&ns->n_client_write_timeout);
		if (is_xdr_op) {
			cf_atomic64_incr(&ns->n_xdr_write_timeout);
		}
		break;
	default:
		cf_atomic64_incr(&ns->n_client_write_error);
		if (is_xdr_op) {
			cf_atomic64_incr(&ns->n_xdr_write_error);
		}
		break;
	}
}

// If bin b holds an external particle, append a copy of the bin to bins[] and
// advance *p_n_bins. No bounds check is done here - the caller must have sized
// bins[] appropriately.
static inline void
append_bin_to_destroy(as_bin* b, as_bin* bins, uint32_t* p_n_bins)
{
	if (as_bin_is_external_particle(b)) {
		bins[(*p_n_bins)++] = *b;
	}
}


//==========================================================
// Public API.
//

// Entry point for a write transaction.
//
// Rejects the write up front if XDR filtering or storage overload says so,
// then creates an rw_request and inserts it in the rw_request hash. From there
// the transaction either starts duplicate resolution, or applies the write on
// the master and - depending on replica destinations and the
// respond-on-master-complete policy - finishes immediately, fires replica
// writes without waiting, or starts replica writes and waits for acks.
//
// Returns:
// - TRANS_DONE_ERROR / TRANS_DONE_SUCCESS when the transaction finished here
//   (a response has been sent),
// - TRANS_WAITING as passed through from the hash insert or write_master()
//   (no response sent),
// - TRANS_IN_PROGRESS when duplicate resolution or replica writes were started
//   and the rw_request remains in the hash.
transaction_status
as_write_start(as_transaction* tr)
{
	BENCHMARK_START(tr, write, FROM_CLIENT);

	// Apply XDR filter.
	if (! xdr_allows_write(tr)) {
		tr->result_code = AS_PROTO_RESULT_FAIL_ALWAYS_FORBIDDEN;
		send_write_response(tr, NULL);
		return TRANS_DONE_ERROR;
	}

	// Check that we aren't backed up.
	if (as_storage_overloaded(tr->rsv.ns)) {
		tr->result_code = AS_PROTO_RESULT_FAIL_DEVICE_OVERLOAD;
		send_write_response(tr, NULL);
		return TRANS_DONE_ERROR;
	}

	// Create rw_request and add to hash.
	rw_request_hkey hkey = { tr->rsv.ns->id, tr->keyd };
	rw_request* rw = rw_request_create(&tr->keyd);
	transaction_status status = rw_request_hash_insert(&hkey, rw, tr);

	// If rw_request wasn't inserted in hash, transaction is finished.
	if (status != TRANS_IN_PROGRESS) {
		rw_request_release(rw);

		if (status != TRANS_WAITING) {
			send_write_response(tr, NULL);
		}

		return status;
	}
	// else - rw_request is now in hash, continue...

	if (tr->rsv.ns->write_dup_res_disabled) {
		// Note - preventing duplicate resolution this way allows
		// rw_request_destroy() to handle dup_msg[] cleanup correctly.
		tr->rsv.n_dupl = 0;
	}

	// If there are duplicates to resolve, start doing so.
	if (tr->rsv.n_dupl != 0) {
		start_write_dup_res(rw, tr);

		// Started duplicate resolution.
		return TRANS_IN_PROGRESS;
	}
	// else - no duplicate resolution phase, apply operation to master.

	// Set up the nodes to which we'll write replicas.
	rw->n_dest_nodes = as_partition_get_other_replicas(tr->rsv.p,
			rw->dest_nodes);

	if (insufficient_replica_destinations(tr->rsv.ns, rw->n_dest_nodes)) {
		rw_request_hash_delete(&hkey, rw);
		tr->result_code = AS_PROTO_RESULT_FAIL_UNAVAILABLE;
		send_write_response(tr, NULL);
		return TRANS_DONE_ERROR;
	}

	status = write_master(rw, tr);

	BENCHMARK_NEXT_DATA_POINT(tr, write, master);

	// If error, transaction is finished.
	if (status != TRANS_IN_PROGRESS) {
		rw_request_hash_delete(&hkey, rw);

		if (status != TRANS_WAITING) {
			send_write_response(tr, NULL);
		}

		return status;
	}

	// If we don't need replica writes, transaction is finished.
	if (rw->n_dest_nodes == 0) {
		finished_replicated(tr);
		send_write_response(tr, &rw->response_db);
		rw_request_hash_delete(&hkey, rw);
		return TRANS_DONE_SUCCESS;
	}

	// If we don't need to wait for replica write acks, fire and forget.
	if (respond_on_master_complete(tr)) {
		start_write_repl_write_forget(rw, tr);
		send_write_response(tr, &rw->response_db);
		rw_request_hash_delete(&hkey, rw);
		return TRANS_DONE_SUCCESS;
	}

	start_write_repl_write(rw, tr);

	// Started replica write.
	return TRANS_IN_PROGRESS;
}


//==========================================================
// Local helpers - transaction flow.
//

// Start the duplicate resolution phase: build the dup-res message, then (under
// the rw_request lock) finish setting up rw with the dup-res and timeout
// callbacks and send the messages.
void
start_write_dup_res(rw_request* rw, as_transaction* tr)
{
	// Finish initializing rw, construct and send dup-res message.

	dup_res_make_message(rw, tr);

	pthread_mutex_lock(&rw->lock);

	dup_res_setup_rw(rw, tr, write_dup_res_cb, write_timeout_cb);
	send_rw_messages(rw);

	pthread_mutex_unlock(&rw->lock);
}


// Start the replica write phase when acks are awaited: build the repl-write
// message, then (under the rw_request lock) finish setting up rw with the
// repl-write and timeout callbacks and send the messages.
void
start_write_repl_write(rw_request* rw, as_transaction* tr)
{
	// Finish initializing rw, construct and send repl-write message.

	repl_write_make_message(rw, tr);

	pthread_mutex_lock(&rw->lock);

	repl_write_setup_rw(rw, tr, write_repl_write_cb, write_timeout_cb);
	send_rw_messages(rw);

	pthread_mutex_unlock(&rw->lock);
}


// Send replica writes without waiting for acks. The rw_request lock is not
// taken here and no callbacks are registered.
void
start_write_repl_write_forget(rw_request* rw, as_transaction* tr)
{
	// Construct and send repl-write message. No need to finish rw setup.

	repl_write_make_message(rw, tr);
	send_rw_messages_forget(rw);
}


// Callback run when duplicate resolution completes. Rebuilds a transaction
// from rw and continues the write flow from the "apply to master" step.
//
// Returns true if the transaction is done with this rw_request (a response was
// sent, or write_master() returned TRANS_WAITING), false if the rw_request has
// been recycled for replica writes and must stay in the hash.
bool
write_dup_res_cb(rw_request* rw)
{
	BENCHMARK_NEXT_DATA_POINT(rw, write, dup_res);

	as_transaction tr;
	as_transaction_init_from_rw(&tr, rw);

	if (tr.result_code != AS_PROTO_RESULT_OK) {
		send_write_response(&tr, NULL);
		return true;
	}

	// Set up the nodes to which we'll write replicas.
	rw->n_dest_nodes = as_partition_get_other_replicas(tr.rsv.p,
			rw->dest_nodes);

	if (insufficient_replica_destinations(tr.rsv.ns, rw->n_dest_nodes)) {
		tr.result_code = AS_PROTO_RESULT_FAIL_UNAVAILABLE;
		send_write_response(&tr, NULL);
		return true;
	}

	transaction_status status = write_master(rw, &tr);

	BENCHMARK_NEXT_DATA_POINT((&tr), write, master);

	if (status == TRANS_WAITING) {
		// Note - new tr now owns msgp, make sure rw destructor doesn't free it.
		// Also, rw will release rsv - new tr will get a new one.
		rw->msgp = NULL;
		return true;
	}

	if (status == TRANS_DONE_ERROR) {
		send_write_response(&tr, NULL);
		return true;
	}

	// If we don't need replica writes, transaction is finished.
	if (rw->n_dest_nodes == 0) {
		finished_replicated(&tr);
		send_write_response(&tr, &rw->response_db);
		return true;
	}

	// If we don't need to wait for replica write acks, fire and forget.
	if (respond_on_master_complete(&tr)) {
		write_repl_write_forget_after_dup_res(rw, &tr);
		send_write_response(&tr, &rw->response_db);
		return true;
	}

	write_repl_write_after_dup_res(rw, &tr);

	// Started replica write - don't delete rw_request from hash.
	return false;
}


// Start replica writes (acks awaited) from within the dup-res callback,
// re-using the same rw_request. Unlike start_write_repl_write() this does not
// take the rw_request lock itself.
void
write_repl_write_after_dup_res(rw_request* rw, as_transaction* tr)
{
	// Recycle rw_request that was just used for duplicate resolution to now do
	// replica writes. Note - we are under the rw_request lock here!

	repl_write_make_message(rw, tr);
	repl_write_reset_rw(rw, tr, write_repl_write_cb);
	send_rw_messages(rw);
}


// Send replica writes without waiting for acks, from within the dup-res
// callback.
void
write_repl_write_forget_after_dup_res(rw_request* rw, as_transaction* tr)
{
	// Send replica writes. Not waiting for acks, so no need to reset
	// rw_request. Note - we are under the rw_request lock here!

	repl_write_make_message(rw, tr);
	send_rw_messages_forget(rw);
}


// Callback run when replica writes complete. Rebuilds a transaction from rw,
// marks it replicated and sends the response built by the master write.
void
write_repl_write_cb(rw_request* rw)
{
	BENCHMARK_NEXT_DATA_POINT(rw, write, repl_write);

	as_transaction tr;
	as_transaction_init_from_rw(&tr, rw);

	finished_replicated(&tr);
	send_write_response(&tr, &rw->response_db);

	// Finished transaction - rw_request cleans up reservation and msgp!
}


//==========================================================
// Local helpers - transaction end.
//

// Send the write's response to wherever the transaction came from (client or
// proxying node) and, for clients, record benchmarks/histograms/stats.
//
// db, if non-null and non-empty, is a pre-built ops response to send as-is;
// otherwise a plain reply is built from tr's result code, generation and
// void-time. Any origin other than client or proxy crashes.
void
send_write_response(as_transaction* tr, cf_dyn_buf* db)
{
	// Paranoia - shouldn't get here on losing race with timeout.
	if (! tr->from.any) {
		cf_warning(AS_RW, "transaction origin %u has null 'from'", tr->origin);
		return;
	}

	// Note - if tr was setup from rw, rw->from.any has been set null and
	// informs timeout it lost the race.

	clear_delete_response_metadata(tr);

	switch (tr->origin) {
	case FROM_CLIENT:
		if (db && db->used_sz != 0) {
			as_msg_send_ops_reply(tr->from.proto_fd_h, db);
		}
		else {
			as_msg_send_reply(tr->from.proto_fd_h, tr->result_code,
					tr->generation, tr->void_time, NULL, NULL, 0, tr->rsv.ns,
					as_transaction_trid(tr));
		}
		BENCHMARK_NEXT_DATA_POINT(tr, write, response);
		HIST_TRACK_ACTIVATE_INSERT_DATA_POINT(tr, write_hist);
		client_write_update_stats(tr->rsv.ns, tr->result_code,
				as_transaction_is_xdr(tr));
		break;
	case FROM_PROXY:
		if (db && db->used_sz != 0) {
			as_proxy_send_ops_response(tr->from.proxy_node,
					tr->from_data.proxy_tid, db);
		}
		else {
			as_proxy_send_response(tr->from.proxy_node, tr->from_data.proxy_tid,
					tr->result_code, tr->generation, tr->void_time, NULL, NULL,
					0, tr->rsv.ns, as_transaction_trid(tr));
		}
		break;
	default:
		cf_crash(AS_RW, "unexpected transaction origin %u", tr->origin);
		break;
	}

	tr->from.any = NULL; // pattern, not needed
}


// Timeout callback for a write rw_request.
//
// Does nothing if rw->from.any is already NULL (the other callback won the
// race). Otherwise marks the request as not replicated, sends a timeout reply
// and updates stats for a client originator (a proxy originator gets no reply
// here), and NULLs rw->from.any so the other callback knows it lost the race.
void
write_timeout_cb(rw_request* rw)
{
	if (! rw->from.any) {
		return; // lost race against dup-res or repl-write callback
	}

	finished_not_replicated(rw);

	switch (rw->origin) {
	case FROM_CLIENT:
		as_msg_send_reply(rw->from.proto_fd_h, AS_PROTO_RESULT_FAIL_TIMEOUT, 0,
				0, NULL, NULL, 0, rw->rsv.ns, rw_request_trid(rw));
		// Timeouts aren't included in histograms.
		client_write_update_stats(rw->rsv.ns, AS_PROTO_RESULT_FAIL_TIMEOUT,
				as_msg_is_xdr(&rw->msgp->msg));
		break;
	case FROM_PROXY:
		break;
	default:
		cf_crash(AS_RW, "unexpected transaction origin %u", rw->origin);
		break;
	}

	rw->from.any = NULL; // inform other callback it lost the race
}


//==========================================================
// Local helpers - write master.
//

// Apply a write to the master copy of the record.
//
// Stages, in order: namespace-level pre-checks; policy flags derived from the
// message ops; find or create the index entry (which locks the record) and run
// the checks that need only the index; open or create the storage record and
// handle key/TTL metadata; dispatch to the bin-writing routine matching the
// namespace configuration (data-in-memory vs. not, single-bin vs. not);
// finally publish metadata to tr, handle a resulting delete, release storage
// and the record lock, and emit an XDR entry if appropriate.
//
// Returns TRANS_IN_PROGRESS on success (caller goes on to replication),
// TRANS_WAITING if repl_state_check() reports < 0 (record released, nothing
// else done), or TRANS_DONE_ERROR with tr->result_code set via
// write_master_failed() (or by write_master_preprocessing()).
transaction_status
write_master(rw_request* rw, as_transaction* tr)
{
	CF_ALLOC_SET_NS_ARENA(tr->rsv.ns);

	//------------------------------------------------------
	// Perform checks that don't need to loop over ops, or
	// create or find (and lock) the as_index.
	//

	if (! write_master_preprocessing(tr)) {
		// Failure cases all call write_master_failed().
		return TRANS_DONE_ERROR;
	}

	//------------------------------------------------------
	// Loop over ops to set some essential policy flags.
	//

	bool must_not_create;
	bool record_level_replace;
	bool must_fetch_data;
	bool increment_generation;

	int result = write_master_policies(tr, &must_not_create,
			&record_level_replace, &must_fetch_data, &increment_generation);

	if (result != 0) {
		write_master_failed(tr, 0, false, 0, 0, result);
		return TRANS_DONE_ERROR;
	}

	//------------------------------------------------------
	// Find or create the as_index and get a reference -
	// this locks the record. Perform all checks that don't
	// need the as_storage_rd.
	//

	// Shortcut pointers.
	as_msg* m = &tr->msgp->msg;
	as_namespace* ns = tr->rsv.ns;
	as_index_tree* tree = tr->rsv.tree;

	// Find or create as_index, populate as_index_ref, lock record.
	as_index_ref r_ref;
	r_ref.skip_lock = false;
	as_record* r = NULL;
	bool record_created = false;

	if (must_not_create) {
		// Nothing was locked if the get failed - pass no r_ref.
		if (as_record_get(tree, &tr->keyd, &r_ref) != 0) {
			write_master_failed(tr, 0, record_created, tree, 0, AS_PROTO_RESULT_FAIL_NOT_FOUND);
			return TRANS_DONE_ERROR;
		}

		r = r_ref.r;

		if (as_record_is_doomed(r, ns)) {
			write_master_failed(tr, &r_ref, record_created, tree, 0, AS_PROTO_RESULT_FAIL_NOT_FOUND);
			return TRANS_DONE_ERROR;
		}

		if (repl_state_check(r, tr) < 0) {
			as_record_done(&r_ref, ns);
			return TRANS_WAITING;
		}

		if (! as_record_is_live(r)) {
			write_master_failed(tr, &r_ref, record_created, tree, 0, AS_PROTO_RESULT_FAIL_NOT_FOUND);
			return TRANS_DONE_ERROR;
		}
	}
	else {
		int rv = as_record_get_create(tree, &tr->keyd, &r_ref, ns);

		if (rv < 0) {
			cf_warning_digest(AS_RW, &tr->keyd, "{%s} write_master: fail as_record_get_create() ", ns->name);
			write_master_failed(tr, 0, record_created, tree, 0, AS_PROTO_RESULT_FAIL_UNKNOWN);
			return TRANS_DONE_ERROR;
		}

		r = r_ref.r;
		record_created = rv == 1;

		bool is_doomed = as_record_is_doomed(r, ns);

		if (! record_created && ! is_doomed && repl_state_check(r, tr) < 0) {
			as_record_done(&r_ref, ns);
			return TRANS_WAITING;
		}

		// If it's an expired or truncated record, pretend it's a fresh create.
		if (! record_created && is_doomed) {
			as_record_rescue(&r_ref, ns);
			record_created = true;
		}
	}

	// Enforce record-level create-only existence policy.
	if (! record_created && ! create_only_check(r, m)) {
		write_master_failed(tr, &r_ref, record_created, tree, 0, AS_PROTO_RESULT_FAIL_RECORD_EXISTS);
		return TRANS_DONE_ERROR;
	}

	// Check generation requirement, if any.
	if (! generation_check(r, m, ns)) {
		write_master_failed(tr, &r_ref, record_created, tree, 0, AS_PROTO_RESULT_FAIL_GENERATION);
		return TRANS_DONE_ERROR;
	}

	// If creating record, write set-ID into index.
	if (record_created) {
		int rv_set = as_transaction_has_set(tr) ?
				set_set_from_msg(r, ns, m) : 0;

		if (rv_set == -1) {
			cf_warning_digest(AS_RW, &tr->keyd, "{%s} write_master: set can't be added ", ns->name);
			write_master_failed(tr, &r_ref, record_created, tree, 0, AS_PROTO_RESULT_FAIL_PARAMETER);
			return TRANS_DONE_ERROR;
		}
		else if (rv_set == -2) {
			write_master_failed(tr, &r_ref, record_created, tree, 0, AS_PROTO_RESULT_FAIL_FORBIDDEN);
			return TRANS_DONE_ERROR;
		}

		// Don't write record if it would be truncated.
		if (as_truncate_now_is_truncated(ns, as_index_get_set_id(r))) {
			write_master_failed(tr, &r_ref, record_created, tree, 0, AS_PROTO_RESULT_FAIL_FORBIDDEN);
			return TRANS_DONE_ERROR;
		}
	}

	// Shortcut set name.
	const char* set_name = as_index_get_set_name(r, ns);

	// If record existed, check that as_msg set name matches.
	if (! record_created && ! check_msg_set_name(tr, set_name)) {
		write_master_failed(tr, &r_ref, record_created, tree, 0, AS_PROTO_RESULT_FAIL_PARAMETER);
		return TRANS_DONE_ERROR;
	}

	//------------------------------------------------------
	// Open or create the as_storage_rd, and handle record
	// metadata.
	//

	// From here on, failure paths must pass &rd so storage gets closed.
	as_storage_rd rd;

	if (record_created) {
		as_storage_record_create(ns, r, &rd);
	}
	else {
		as_storage_record_open(ns, r, &rd);
	}

	// Deal with delete durability (enterprise only).
	if ((result = set_delete_durablility(tr, &rd)) != 0) {
		write_master_failed(tr, &r_ref, record_created, tree, &rd, result);
		return TRANS_DONE_ERROR;
	}

	// Deal with key storage as needed.
	if ((result = handle_msg_key(tr, &rd)) != 0) {
		write_master_failed(tr, &r_ref, record_created, tree, &rd, result);
		return TRANS_DONE_ERROR;
	}

	// Assemble record properties from index information.
	// (The buffer is a stack array in this function's scope, and is handed to
	// rd below - rd is closed before this function returns.)
	size_t rec_props_data_size = as_storage_record_rec_props_size(&rd);
	uint8_t rec_props_data[rec_props_data_size];

	if (rec_props_data_size > 0) {
		as_storage_record_set_rec_props(&rd, rec_props_data);
	}

	// Convert message TTL special value if appropriate.
	if (record_created && m->record_ttl == TTL_DONT_UPDATE) {
		m->record_ttl = TTL_NAMESPACE_DEFAULT;
	}

	//------------------------------------------------------
	// Split write_master() according to configuration to
	// handle record bins.
	//

	xdr_dirty_bins dirty_bins;
	xdr_clear_dirty_bins(&dirty_bins);

	bool is_delete = false;

	if (ns->storage_data_in_memory) {
		if (ns->single_bin) {
			result = write_master_dim_single_bin(tr, &rd,
					increment_generation,
					rw, &is_delete, &dirty_bins);
		}
		else {
			result = write_master_dim(tr, set_name, &rd,
					record_level_replace, increment_generation,
					rw, &is_delete, &dirty_bins);
		}
	}
	else {
		if (ns->single_bin) {
			result = write_master_ssd_single_bin(tr, &rd,
					must_fetch_data, increment_generation,
					rw, &is_delete, &dirty_bins);
		}
		else {
			result = write_master_ssd(tr, set_name, &rd,
					must_fetch_data, record_level_replace, increment_generation,
					rw, &is_delete, &dirty_bins);
		}
	}

	if (result != 0) {
		write_master_failed(tr, &r_ref, record_created, tree, &rd, result);
		return TRANS_DONE_ERROR;
	}

	//------------------------------------------------------
	// Done - complete function's output, release the record
	// lock, and do XDR write if appropriate.
	//

	tr->generation = r->generation;
	tr->void_time = r->void_time;
	tr->last_update_time = r->last_update_time;

	// Get set-id before releasing.
	uint16_t set_id = as_index_get_set_id(r_ref.r);

	// Collect more info for XDR.
	uint16_t generation = plain_generation(r->generation, ns);
	xdr_op_type op_type = XDR_OP_TYPE_WRITE;

	// Handle deletion if appropriate.
	if (is_delete) {
		write_delete_record(r_ref.r, tree);
		cf_atomic64_incr(&ns->n_deleted_last_bin);
		tr->flags |= AS_TRANSACTION_FLAG_IS_DELETE;

		generation = 0;
		op_type = as_transaction_is_durable_delete(tr) ?
				XDR_OP_TYPE_DURABLE_DELETE : XDR_OP_TYPE_DROP;
	}
	// Or (normally) adjust max void-time.
	else if (r->void_time != 0) {
		cf_atomic64_setmax(&tr->rsv.p->max_void_time, r->void_time);
	}

	will_replicate(r, ns);

	as_storage_record_close(&rd);
	as_record_done(&r_ref, ns);

	// Don't send an XDR delete if it's disallowed.
	if (is_delete && ! is_xdr_delete_shipping_enabled()) {
		return TRANS_IN_PROGRESS;
	}

	// Do an XDR write if the write is a non-XDR write or is an XDR write with
	// forwarding enabled.
	if (! as_msg_is_xdr(m) || is_xdr_forwarding_enabled() ||
			ns->ns_forward_xdr_writes) {
		xdr_write(ns, &tr->keyd, generation, 0, op_type, set_id, &dirty_bins);
	}

	return TRANS_IN_PROGRESS;
}


// Common failure exit for write_master() and its helpers.
//
// If r_ref is given, the record is locked and must be released: a record this
// transaction created is first deleted from the index (tree), an open storage
// record (rd, may be null) is closed, then the record reference is released.
// With a null r_ref nothing is released and tree/rd are not touched.
//
// Also bumps the generation / record-too-big failure counters for those result
// codes, and stores result_code (narrowed to uint8_t) in tr->result_code.
void
write_master_failed(as_transaction* tr, as_index_ref* r_ref,
		bool record_created, as_index_tree* tree, as_storage_rd* rd,
		int result_code)
{
	as_namespace* ns = tr->rsv.ns;

	if (r_ref) {
		if (record_created) {
			as_index_delete(tree, &tr->keyd);
		}

		if (rd) {
			as_storage_record_close(rd);
		}

		as_record_done(r_ref, ns);
	}

	switch (result_code) {
	case AS_PROTO_RESULT_FAIL_GENERATION:
		cf_atomic64_incr(&ns->n_fail_generation);
		break;
	case AS_PROTO_RESULT_FAIL_RECORD_TOO_BIG:
		cf_detail_digest(AS_RW, &tr->keyd, "{%s} write_master: record too big ", ns->name);
		cf_atomic64_incr(&ns->n_fail_record_too_big);
		break;
	default:
		// These either log warnings or aren't interesting enough to count.
		break;
	}

	tr->result_code = (uint8_t)result_code;
}


// Namespace-level checks that need neither the ops nor the record: clock-skew
// stop-writes, stop-writes threshold, storage space, TTL validity, and (if the
// namespace disallows it) a missing or empty set name.
//
// Note the return convention - although declared int, this returns true when
// all checks pass and false on failure (it is used as a boolean by
// write_master()). On failure write_master_failed() has already been called,
// so tr->result_code is set.
int
write_master_preprocessing(as_transaction* tr)
{
	as_namespace* ns = tr->rsv.ns;
	as_msg* m = &tr->msgp->msg;

	if (ns->clock_skew_stop_writes) {
		// TODO - new error code?
		write_master_failed(tr, 0, false, 0, 0, AS_PROTO_RESULT_FAIL_FORBIDDEN);
		return false;
	}

	// ns->stop_writes is set by thr_nsup if configured threshold is breached.
	if (cf_atomic32_get(ns->stop_writes) == 1) {
		write_master_failed(tr, 0, false, 0, 0, AS_PROTO_RESULT_FAIL_OUT_OF_SPACE);
		return false;
	}

	if (! as_storage_has_space(ns)) {
		cf_warning(AS_RW, "{%s}: write_master: drives full", ns->name);
		write_master_failed(tr, 0, false, 0, 0, AS_PROTO_RESULT_FAIL_OUT_OF_SPACE);
		return false;
	}

	if (! is_valid_ttl(ns, m->record_ttl)) {
		cf_warning(AS_RW, "write_master: invalid ttl %u", m->record_ttl);
		write_master_failed(tr, 0, false, 0, 0, AS_PROTO_RESULT_FAIL_PARAMETER);
		return false;
	}

	// Fail if disallow_null_setname is true and set name is absent or empty.
	if (ns->disallow_null_setname) {
		as_msg_field* f = as_transaction_has_set(tr) ?
				as_msg_field_get(m, AS_MSG_FIELD_TYPE_SET) : NULL;

		if (! f || as_msg_field_get_value_sz(f) == 0) {
			cf_warning(AS_RW, "write_master: null/empty set name not allowed for namespace %s", ns->name);
			write_master_failed(tr, 0, false, 0, 0, AS_PROTO_RESULT_FAIL_PARAMETER);
			return false;
		}
	}

	return true;
}


int
write_master_policies(as_transaction* tr, bool* p_must_not_create,
		bool* p_record_level_replace, bool* p_must_fetch_data,
		bool* p_increment_generation)
{
	// Shortcut pointers.
	as_msg* m = &tr->msgp->msg;
	as_namespace* ns = tr->rsv.ns;

	if (m->n_ops == 0) {
		cf_warning_digest(AS_RW, &tr->keyd, "{%s} write_master: bin op(s) expected, none present ", ns->name);
		return AS_PROTO_RESULT_FAIL_PARAMETER;
	}

	bool info1_get_all = (m->info1 & AS_MSG_INFO1_GET_ALL) != 0;
	bool respond_all_ops = (m->info2 & AS_MSG_INFO2_RESPOND_ALL_OPS) != 0;

	bool must_not_create =
			(m->info3 & AS_MSG_INFO3_UPDATE_ONLY) != 0 ||
			(m->info3 & AS_MSG_INFO3_REPLACE_ONLY) != 0;

	bool record_level_replace =
			(m->info3 & AS_MSG_INFO3_CREATE_OR_REPLACE) != 0 ||
			(m->info3 & AS_MSG_INFO3_REPLACE_ONLY) != 0;

	bool must_fetch_data = false;

	bool increment_generation = false;

	bool has_read_all_op = false;
	bool generates_response_bin = false;

	// Loop over ops to check and modify flags.
+ as_msg_op* op = NULL; + int i = 0; + + while ((op = as_msg_op_iterate(m, op, &i)) != NULL) { + if (op->op != AS_MSG_OP_MC_TOUCH) { + increment_generation = true; + } + + if (OP_IS_TOUCH(op->op)) { + if (record_level_replace) { + cf_warning_digest(AS_RW, &tr->keyd, "{%s} write_master: touch op can't have record-level replace flag ", ns->name); + return AS_PROTO_RESULT_FAIL_PARAMETER; + } + + must_not_create = true; + must_fetch_data = true; + continue; + } + + if (ns->data_in_index && + ! is_embedded_particle_type(op->particle_type) && + // Allow AS_PARTICLE_TYPE_NULL, although bin-delete operations + // are not likely in single-bin configuration. + op->particle_type != AS_PARTICLE_TYPE_NULL) { + cf_warning_digest(AS_RW, &tr->keyd, "{%s} write_master: can't write data type %u in data-in-index configuration ", ns->name, op->particle_type); + return AS_PROTO_RESULT_FAIL_INCOMPATIBLE_TYPE; + } + + if (op->name_sz >= AS_ID_BIN_SZ) { + cf_warning_digest(AS_RW, &tr->keyd, "{%s} write_master: bin name too long (%d) ", ns->name, op->name_sz); + return AS_PROTO_RESULT_FAIL_BIN_NAME; + } + + if (op->op == AS_MSG_OP_WRITE) { + if (op->particle_type == AS_PARTICLE_TYPE_NULL && + record_level_replace) { + cf_warning_digest(AS_RW, &tr->keyd, "{%s} write_master: bin delete can't have record-level replace flag ", ns->name); + return AS_PROTO_RESULT_FAIL_PARAMETER; + } + } + else if (OP_IS_MODIFY(op->op)) { + if (record_level_replace) { + cf_warning_digest(AS_RW, &tr->keyd, "{%s} write_master: modify op can't have record-level replace flag ", ns->name); + return AS_PROTO_RESULT_FAIL_PARAMETER; + } + + must_fetch_data = true; + } + else if (op_is_read_all(op, m)) { + if (respond_all_ops) { + cf_warning_digest(AS_RW, &tr->keyd, "{%s} write_master: read-all op can't have respond-all-ops flag ", ns->name); + return AS_PROTO_RESULT_FAIL_PARAMETER; + } + + if (has_read_all_op) { + cf_warning_digest(AS_RW, &tr->keyd, "{%s} write_master: can't have more than one read-all op ", ns->name); 
+ return AS_PROTO_RESULT_FAIL_PARAMETER; + } + + has_read_all_op = true; + must_fetch_data = true; + } + else if (op->op == AS_MSG_OP_READ) { + generates_response_bin = true; + must_fetch_data = true; + } + else if (op->op == AS_MSG_OP_CDT_MODIFY) { + if (record_level_replace) { + cf_warning_digest(AS_RW, &tr->keyd, "{%s} write_master: cdt modify op can't have record-level replace flag ", ns->name); + return AS_PROTO_RESULT_FAIL_PARAMETER; + } + + generates_response_bin = true; // CDT modify may generate a response bin + must_fetch_data = true; + } + else if (op->op == AS_MSG_OP_CDT_READ) { + generates_response_bin = true; + must_fetch_data = true; + } + } + + if (has_read_all_op && generates_response_bin) { + cf_warning_digest(AS_RW, &tr->keyd, "{%s} write_master: read-all op can't mix with ops that generate response bins ", ns->name); + return AS_PROTO_RESULT_FAIL_PARAMETER; + } + + if (info1_get_all && ! has_read_all_op) { + cf_warning_digest(AS_RW, &tr->keyd, "{%s} write_master: get-all flag set with no read-all op ", ns->name); + return AS_PROTO_RESULT_FAIL_PARAMETER; + } + + *p_must_not_create = must_not_create; + *p_record_level_replace = record_level_replace; + *p_must_fetch_data = must_fetch_data; + *p_increment_generation = increment_generation; + + return 0; +} + + +bool +check_msg_set_name(as_transaction* tr, const char* set_name) +{ + as_msg_field* f = as_transaction_has_set(tr) ? + as_msg_field_get(&tr->msgp->msg, AS_MSG_FIELD_TYPE_SET) : NULL; + + if (! f || as_msg_field_get_value_sz(f) == 0) { + if (set_name) { + cf_warning_digest(AS_RW, &tr->keyd, "overwriting record in set '%s' but msg has no set name ", + set_name); + } + + return true; + } + + size_t msg_set_name_len = as_msg_field_get_value_sz(f); + + if (! 
set_name || + strncmp(set_name, (const char*)f->data, msg_set_name_len) != 0 || + set_name[msg_set_name_len] != 0) { + CF_ZSTR_DEFINE(msg_set_name, AS_SET_NAME_MAX_SIZE + 4, f->data, + msg_set_name_len); + + cf_warning_digest(AS_RW, &tr->keyd, "overwriting record in set '%s' but msg has different set name '%s' ", + set_name ? set_name : "(null)", msg_set_name); + return false; + } + + return true; +} + + +//========================================================== +// write_master() splits based on configuration - +// data-in-memory & single-bin. +// +// These handle the bin operations part of write_master() +// which are very different per configuration. +// + +int +write_master_dim_single_bin(as_transaction* tr, as_storage_rd* rd, + bool increment_generation, rw_request* rw, bool* is_delete, + xdr_dirty_bins* dirty_bins) +{ + // Shortcut pointers. + as_msg* m = &tr->msgp->msg; + as_namespace* ns = tr->rsv.ns; + as_record* r = rd->r; + + rd->n_bins = 1; + + // Set rd->bins! + // For data-in-memory: + // - if just created record - sets rd->bins to empty bin embedded in index + // - otherwise - sets rd->bins to existing embedded bin + as_storage_rd_load_bins(rd, NULL); + + // For memory accounting, note current usage. + uint64_t memory_bytes = 0; + + if (as_bin_inuse(rd->bins)) { + memory_bytes = as_storage_record_get_n_bytes_memory(rd); + } + + //------------------------------------------------------ + // Copy existing bin into old_bin to enable unwinding. + // + + uint32_t n_old_bins = as_bin_inuse(rd->bins) ? 1 : 0; + as_bin old_bin; + + as_single_bin_copy(&old_bin, rd->bins); + + // Collect bins (old or intermediate versions) to destroy on cleanup. + as_bin cleanup_bins[m->n_ops]; + uint32_t n_cleanup_bins = 0; + + //------------------------------------------------------ + // Apply changes to metadata in as_index needed for + // response, pickling, and writing. 
+ // + + index_metadata old_metadata; + + write_master_update_index_metadata(tr, increment_generation, &old_metadata, r); + + //------------------------------------------------------ + // Loop over bin ops to affect new bin space, creating + // the new record bin to write. + // + + uint32_t n_new_bins = 0; + int result = write_master_bin_ops(tr, rd, NULL, cleanup_bins, + &n_cleanup_bins, &rw->response_db, &n_new_bins, dirty_bins); + + if (result != 0) { + write_master_index_metadata_unwind(&old_metadata, r); + write_master_dim_single_bin_unwind(&old_bin, rd->bins, cleanup_bins, n_cleanup_bins); + return result; + } + + //------------------------------------------------------ + // Created the new bin to write. + // + + if (n_new_bins == 0) { + if (n_old_bins == 0) { + write_master_index_metadata_unwind(&old_metadata, r); + write_master_dim_single_bin_unwind(&old_bin, rd->bins, cleanup_bins, n_cleanup_bins); + return AS_PROTO_RESULT_FAIL_NOT_FOUND; + } + + if (! validate_delete_durability(tr)) { + write_master_index_metadata_unwind(&old_metadata, r); + write_master_dim_single_bin_unwind(&old_bin, rd->bins, cleanup_bins, n_cleanup_bins); + return AS_PROTO_RESULT_FAIL_FORBIDDEN; + } + + *is_delete = true; + } + + // Pickle before writing - can't fail after. (Historic - now can't fail.) + pickle_all(rd, rw); + + //------------------------------------------------------ + // Write the record to storage. + // + + if ((result = as_storage_record_write(rd)) < 0) { + cf_warning_digest(AS_RW, &tr->keyd, "{%s} write_master: failed as_storage_record_write() ", ns->name); + write_master_index_metadata_unwind(&old_metadata, r); + write_master_dim_single_bin_unwind(&old_bin, rd->bins, cleanup_bins, n_cleanup_bins); + return -result; + } + + //------------------------------------------------------ + // Cleanup - destroy relevant bins, can't unwind after. 
+ // + + destroy_stack_bins(cleanup_bins, n_cleanup_bins); + + as_storage_record_adjust_mem_stats(rd, memory_bytes); + + return 0; +} + + +int +write_master_dim(as_transaction* tr, const char* set_name, as_storage_rd* rd, + bool record_level_replace, bool increment_generation, rw_request* rw, + bool* is_delete, xdr_dirty_bins* dirty_bins) +{ + // Shortcut pointers. + as_msg* m = &tr->msgp->msg; + as_namespace* ns = tr->rsv.ns; + as_record* r = rd->r; + + // Set rd->n_bins! + // For data-in-memory - number of bins in existing record. + as_storage_rd_load_n_bins(rd); + + // Set rd->bins! + // For data-in-memory: + // - if just created record - sets rd->bins to NULL + // - otherwise - sets rd->bins to existing (already populated) bins array + as_storage_rd_load_bins(rd, NULL); + + // For memory accounting, note current usage. + uint64_t memory_bytes = as_storage_record_get_n_bytes_memory(rd); + + //------------------------------------------------------ + // Copy existing bins to new space, and keep old bins + // intact for sindex adjustment and so it's possible to + // unwind on failure. + // + + uint32_t n_old_bins = (uint32_t)rd->n_bins; + uint32_t n_new_bins = n_old_bins + m->n_ops; // can't be more than this + + size_t old_bins_size = n_old_bins * sizeof(as_bin); + size_t new_bins_size = n_new_bins * sizeof(as_bin); + + as_bin* old_bins = rd->bins; + as_bin new_bins[n_new_bins]; + + if (old_bins_size == 0 || record_level_replace) { + memset(new_bins, 0, new_bins_size); + } + else { + memcpy(new_bins, old_bins, old_bins_size); + memset(new_bins + n_old_bins, 0, new_bins_size - old_bins_size); + } + + rd->n_bins = (uint16_t)n_new_bins; + rd->bins = new_bins; + + // Collect bins (old or intermediate versions) to destroy on cleanup. + as_bin cleanup_bins[m->n_ops]; + uint32_t n_cleanup_bins = 0; + + //------------------------------------------------------ + // Apply changes to metadata in as_index needed for + // response, pickling, and writing. 
+ // + + index_metadata old_metadata; + + write_master_update_index_metadata(tr, increment_generation, &old_metadata, r); + + //------------------------------------------------------ + // Loop over bin ops to affect new bin space, creating + // the new record bins to write. + // + + int result = write_master_bin_ops(tr, rd, NULL, cleanup_bins, + &n_cleanup_bins, &rw->response_db, &n_new_bins, dirty_bins); + + if (result != 0) { + write_master_index_metadata_unwind(&old_metadata, r); + write_master_dim_unwind(old_bins, n_old_bins, new_bins, n_new_bins, cleanup_bins, n_cleanup_bins); + return result; + } + + //------------------------------------------------------ + // Created the new bins to write. + // + + as_bin_space* new_bin_space = NULL; + + // Adjust - the actual number of new bins. + rd->n_bins = n_new_bins; + + if (n_new_bins != 0) { + new_bins_size = n_new_bins * sizeof(as_bin); + new_bin_space = (as_bin_space*) + cf_malloc_ns(sizeof(as_bin_space) + new_bins_size); + } + else { + if (n_old_bins == 0) { + write_master_index_metadata_unwind(&old_metadata, r); + write_master_dim_unwind(old_bins, n_old_bins, new_bins, n_new_bins, cleanup_bins, n_cleanup_bins); + return AS_PROTO_RESULT_FAIL_NOT_FOUND; + } + + if (! validate_delete_durability(tr)) { + write_master_index_metadata_unwind(&old_metadata, r); + write_master_dim_unwind(old_bins, n_old_bins, new_bins, n_new_bins, cleanup_bins, n_cleanup_bins); + return AS_PROTO_RESULT_FAIL_FORBIDDEN; + } + + *is_delete = true; + } + + // Pickle before writing - can't fail after. (Historic - now can't fail.) + pickle_all(rd, rw); + + //------------------------------------------------------ + // Write the record to storage. 
/*
 * Bin-operations part of write_master() for a single-bin namespace whose data
 * is not in memory.
 *
 * Sequence: load the one bin into a stack as_bin, update the index metadata
 * (remembering the old values), apply the message's bin ops, detect the case
 * where no bin is left, pickle, and write the record to storage.
 *
 * Returns 0 on success. Negative results from as_storage_rd_load_bins() and
 * as_storage_record_write() are negated before being returned; a non-zero
 * result from write_master_bin_ops() is passed through unchanged.
 *
 * On every failure after the metadata update, the particle buffer is freed
 * and the saved index metadata is restored. The load failure happens before
 * the metadata update, so it returns without unwinding.
 *
 * *is_delete is set to true when no bin is in use after the ops but one was
 * in use before; it is never set to false here.
 */
int
write_master_ssd_single_bin(as_transaction* tr, as_storage_rd* rd,
		bool must_fetch_data, bool increment_generation, rw_request* rw,
		bool* is_delete, xdr_dirty_bins* dirty_bins)
{
	// Shortcut pointers.
	as_namespace* ns = tr->rsv.ns;
	as_record* r = rd->r;

	rd->ignore_record_on_device = ! must_fetch_data;
	rd->n_bins = 1;

	as_bin stack_bin;

	// Set rd->bins!
	// For non-data-in-memory:
	// - if just created record, or must_fetch_data is false - sets rd->bins to
	//		empty stack_bin
	// - otherwise - sets rd->bins to stack_bin, reads existing record off
	//		device and populates bin (including particle pointer into block
	//		buffer)
	int result = as_storage_rd_load_bins(rd, &stack_bin);

	if (result < 0) {
		cf_warning_digest(AS_RW, &tr->keyd, "{%s} write_master: failed as_storage_rd_load_bins()", ns->name);
		return -result;
	}

	// Remember whether a bin existed before the ops - needed below to tell a
	// delete from a write that never produced a bin.
	uint32_t n_old_bins = as_bin_inuse(rd->bins) ? 1 : 0;

	//------------------------------------------------------
	// Apply changes to metadata in as_index needed for
	// response, pickling, and writing.
	//

	index_metadata old_metadata;

	write_master_update_index_metadata(tr, increment_generation, &old_metadata, r);

	//------------------------------------------------------
	// Loop over bin ops to affect new bin space, creating
	// the new record bin to write.
	//

	cf_ll_buf_define(particles_llb, STACK_PARTICLES_SIZE);

	uint32_t n_new_bins = 0;

	// No cleanup-bins array is passed (NULL, NULL) - particles for this
	// configuration come from particles_llb instead.
	if ((result = write_master_bin_ops(tr, rd, &particles_llb, NULL, NULL,
			&rw->response_db, &n_new_bins, dirty_bins)) != 0) {
		cf_ll_buf_free(&particles_llb);
		write_master_index_metadata_unwind(&old_metadata, r);
		return result;
	}

	//------------------------------------------------------
	// Created the new bin to write.
	//

	if (n_new_bins == 0) {
		// No bin before and none after - nothing was written or deleted.
		if (n_old_bins == 0) {
			cf_ll_buf_free(&particles_llb);
			write_master_index_metadata_unwind(&old_metadata, r);
			return AS_PROTO_RESULT_FAIL_NOT_FOUND;
		}

		if (! validate_delete_durability(tr)) {
			cf_ll_buf_free(&particles_llb);
			write_master_index_metadata_unwind(&old_metadata, r);
			return AS_PROTO_RESULT_FAIL_FORBIDDEN;
		}

		*is_delete = true;
	}

	// Pickle before writing - bins may disappear on as_storage_record_close().
	pickle_all(rd, rw);

	//------------------------------------------------------
	// Write the record to storage.
	//

	if ((result = as_storage_record_write(rd)) < 0) {
		cf_warning_digest(AS_RW, &tr->keyd, "{%s} write_master: failed as_storage_record_write() ", ns->name);
		cf_ll_buf_free(&particles_llb);
		write_master_index_metadata_unwind(&old_metadata, r);
		return -result;
	}

	//------------------------------------------------------
	// Final changes to record data in as_index.
	//

	// Accommodate a new stored key - wasn't needed for pickling and writing.
	if (r->key_stored == 0 && rd->key) {
		r->key_stored = 1;
	}

	cf_ll_buf_free(&particles_llb);

	return 0;
}
+ // For non-data-in-memory: + // - if just created record, or must_fetch_data is false - 0 + // - otherwise - number of bins in existing record + int result = as_storage_rd_load_n_bins(rd); + + if (result < 0) { + cf_warning_digest(AS_RW, &tr->keyd, "{%s} write_master: failed as_storage_rd_load_n_bins()", ns->name); + return -result; + } + + uint32_t n_old_bins = (uint32_t)rd->n_bins; + uint32_t n_new_bins = n_old_bins + m->n_ops; // can't be more than this + + // Needed for as_storage_rd_load_bins() to clear all unused bins. + rd->n_bins = (uint16_t)n_new_bins; + + // Stack space for resulting record's bins. + as_bin old_bins[n_old_bins]; + as_bin new_bins[n_new_bins]; + + // Set rd->bins! + // For non-data-in-memory: + // - if just created record, or must_fetch_data is false - sets rd->bins to + // empty new_bins + // - otherwise - sets rd->bins to new_bins, reads existing record off device + // and populates bins (including particle pointers into block buffer) + if ((result = as_storage_rd_load_bins(rd, new_bins)) < 0) { + cf_warning_digest(AS_RW, &tr->keyd, "{%s} write_master: failed as_storage_rd_load_bins()", ns->name); + return -result; + } + + //------------------------------------------------------ + // Copy old bins (if any) - which are currently in new + // bins array - to old bins array, for sindex purposes. + // + + if (has_sindex && n_old_bins != 0) { + memcpy(old_bins, new_bins, n_old_bins * sizeof(as_bin)); + + // If it's a replace, clear the new bins array. + if (record_level_replace) { + as_bin_set_all_empty(rd); + } + } + + //------------------------------------------------------ + // Apply changes to metadata in as_index needed for + // response, pickling, and writing. + // + + index_metadata old_metadata; + + write_master_update_index_metadata(tr, increment_generation, &old_metadata, r); + + //------------------------------------------------------ + // Loop over bin ops to affect new bin space, creating + // the new record bins to write. 
+ // + + cf_ll_buf_define(particles_llb, STACK_PARTICLES_SIZE); + + if ((result = write_master_bin_ops(tr, rd, &particles_llb, NULL, NULL, + &rw->response_db, &n_new_bins, dirty_bins)) != 0) { + cf_ll_buf_free(&particles_llb); + write_master_index_metadata_unwind(&old_metadata, r); + return result; + } + + //------------------------------------------------------ + // Created the new bins to write. + // + + // Adjust - the actual number of new bins. + rd->n_bins = n_new_bins; + + if (n_new_bins == 0) { + if (n_old_bins == 0) { + cf_ll_buf_free(&particles_llb); + write_master_index_metadata_unwind(&old_metadata, r); + return AS_PROTO_RESULT_FAIL_NOT_FOUND; + } + + if (! validate_delete_durability(tr)) { + cf_ll_buf_free(&particles_llb); + write_master_index_metadata_unwind(&old_metadata, r); + return AS_PROTO_RESULT_FAIL_FORBIDDEN; + } + + *is_delete = true; + } + + // Pickle before writing - bins may disappear on as_storage_record_close(). + pickle_all(rd, rw); + + //------------------------------------------------------ + // Write the record to storage. + // + + if ((result = as_storage_record_write(rd)) < 0) { + cf_warning_digest(AS_RW, &tr->keyd, "{%s} write_master: failed as_storage_record_write() ", ns->name); + cf_ll_buf_free(&particles_llb); + write_master_index_metadata_unwind(&old_metadata, r); + return -result; + } + + //------------------------------------------------------ + // Success - adjust sindex, looking at old and new bins. + // + + if (has_sindex && + write_sindex_update(ns, set_name, &tr->keyd, old_bins, n_old_bins, + new_bins, n_new_bins)) { + tr->flags |= AS_TRANSACTION_FLAG_SINDEX_TOUCHED; + } + + //------------------------------------------------------ + // Final changes to record data in as_index. + // + + // Accommodate a new stored key - wasn't needed for pickling and writing. 
+ if (r->key_stored == 0 && rd->key) { + r->key_stored = 1; + } + + cf_ll_buf_free(&particles_llb); + + return 0; +} + + +//========================================================== +// write_master() - apply record updates. +// + +void +write_master_update_index_metadata(as_transaction* tr, + bool increment_generation, index_metadata* old, as_record* r) +{ + old->void_time = r->void_time; + old->last_update_time = r->last_update_time; + old->generation = r->generation; + + update_metadata_in_index(tr, increment_generation, r); +} + + +int +write_master_bin_ops(as_transaction* tr, as_storage_rd* rd, + cf_ll_buf* particles_llb, as_bin* cleanup_bins, + uint32_t* p_n_cleanup_bins, cf_dyn_buf* db, uint32_t* p_n_final_bins, + xdr_dirty_bins* dirty_bins) +{ + // Shortcut pointers. + as_msg* m = &tr->msgp->msg; + as_namespace* ns = tr->rsv.ns; + as_record* r = rd->r; + bool has_read_all_op = (m->info1 & AS_MSG_INFO1_GET_ALL) != 0; + + as_msg_op* ops[m->n_ops]; + as_bin response_bins[has_read_all_op ? rd->n_bins : m->n_ops]; + as_bin result_bins[m->n_ops]; + + uint32_t n_response_bins = 0; + uint32_t n_result_bins = 0; + + int result = write_master_bin_ops_loop(tr, rd, ops, response_bins, + &n_response_bins, result_bins, &n_result_bins, particles_llb, + cleanup_bins, p_n_cleanup_bins, dirty_bins); + + if (result != 0) { + destroy_stack_bins(result_bins, n_result_bins); + return result; + } + + *p_n_final_bins = as_bin_inuse_count(rd); + + if (n_response_bins == 0) { + // If 'ordered-ops' flag was not set, and there were no read ops or CDT + // ops with results, there's no response to build and send later. + return 0; + } + + as_bin* bins[n_response_bins]; + + for (uint32_t i = 0; i < n_response_bins; i++) { + as_bin* b = &response_bins[i]; + + bins[i] = as_bin_inuse(b) ? b : NULL; + } + + uint32_t generation = r->generation; + uint32_t void_time = r->void_time; + + // Deletes don't return metadata. 
+ if (*p_n_final_bins == 0) { + generation = 0; + void_time = 0; + } + + size_t msg_sz = 0; + uint8_t* msgp = (uint8_t*)as_msg_make_response_msg(AS_PROTO_RESULT_OK, + generation, void_time, has_read_all_op ? NULL : ops, bins, + (uint16_t)n_response_bins, ns, NULL, &msg_sz, + as_transaction_trid(tr)); + + destroy_stack_bins(result_bins, n_result_bins); + + // Stash the message, to be sent later. + db->buf = msgp; + db->is_stack = false; + db->alloc_sz = msg_sz; + db->used_sz = msg_sz; + + return 0; +} + + +int +write_master_bin_ops_loop(as_transaction* tr, as_storage_rd* rd, + as_msg_op** ops, as_bin* response_bins, uint32_t* p_n_response_bins, + as_bin* result_bins, uint32_t* p_n_result_bins, + cf_ll_buf* particles_llb, as_bin* cleanup_bins, + uint32_t* p_n_cleanup_bins, xdr_dirty_bins* dirty_bins) +{ + // Shortcut pointers. + as_msg* m = &tr->msgp->msg; + as_namespace* ns = tr->rsv.ns; + bool respond_all_ops = (m->info2 & AS_MSG_INFO2_RESPOND_ALL_OPS) != 0; + + int result; + + as_msg_op* op = NULL; + int i = 0; + + while ((op = as_msg_op_iterate(m, op, &i)) != NULL) { + if (OP_IS_TOUCH(op->op)) { + continue; + } + + if (op->op == AS_MSG_OP_WRITE) { + // AS_PARTICLE_TYPE_NULL means delete the bin. + // TODO - should this even be allowed for single-bin? + if (op->particle_type == AS_PARTICLE_TYPE_NULL) { + int32_t j = as_bin_get_index_from_buf(rd, op->name, op->name_sz); + + if (j != -1) { + if (ns->storage_data_in_memory) { + // Double copy necessary for single-bin, but doing it + // generally for code simplicity. + as_bin cleanup_bin; + as_bin_copy(ns, &cleanup_bin, &rd->bins[j]); + + append_bin_to_destroy(&cleanup_bin, cleanup_bins, p_n_cleanup_bins); + } + + as_bin_set_empty_shift(rd, j); + xdr_fill_dirty_bins(dirty_bins); + } + } + // It's a regular bin write. + else { + as_bin* b = as_bin_get_or_create_from_buf(rd, op->name, op->name_sz, &result); + + if (! 
b) { + return result; + } + + if (ns->storage_data_in_memory) { + as_bin cleanup_bin; + as_bin_copy(ns, &cleanup_bin, b); + + if ((result = as_bin_particle_alloc_from_client(b, op)) < 0) { + cf_warning_digest(AS_RW, &tr->keyd, "{%s} write_master: failed as_bin_particle_alloc_from_client() ", ns->name); + return -result; + } + + append_bin_to_destroy(&cleanup_bin, cleanup_bins, p_n_cleanup_bins); + } + else { + if ((result = as_bin_particle_stack_from_client(b, particles_llb, op)) < 0) { + cf_warning_digest(AS_RW, &tr->keyd, "{%s} write_master: failed as_bin_particle_stack_from_client() ", ns->name); + return -result; + } + } + + xdr_add_dirty_bin(ns, dirty_bins, (const char*)op->name, op->name_sz); + } + + if (respond_all_ops) { + ops[*p_n_response_bins] = op; + as_bin_set_empty(&response_bins[(*p_n_response_bins)++]); + } + } + // Modify an existing bin value. + else if (OP_IS_MODIFY(op->op)) { + as_bin* b = as_bin_get_or_create_from_buf(rd, op->name, op->name_sz, &result); + + if (! b) { + return result; + } + + if (ns->storage_data_in_memory) { + as_bin cleanup_bin; + as_bin_copy(ns, &cleanup_bin, b); + + if ((result = as_bin_particle_alloc_modify_from_client(b, op)) < 0) { + cf_warning_digest(AS_RW, &tr->keyd, "{%s} write_master: failed as_bin_particle_alloc_modify_from_client() ", ns->name); + return -result; + } + + append_bin_to_destroy(&cleanup_bin, cleanup_bins, p_n_cleanup_bins); + } + else { + if ((result = as_bin_particle_stack_modify_from_client(b, particles_llb, op)) < 0) { + cf_warning_digest(AS_RW, &tr->keyd, "{%s} write_master: failed as_bin_particle_stack_modify_from_client() ", ns->name); + return -result; + } + } + + xdr_add_dirty_bin(ns, dirty_bins, (const char*)op->name, op->name_sz); + + if (respond_all_ops) { + ops[*p_n_response_bins] = op; + as_bin_set_empty(&response_bins[(*p_n_response_bins)++]); + } + } + else if (op_is_read_all(op, m)) { + for (uint16_t i = 0; i < rd->n_bins; i++) { + as_bin* b = &rd->bins[i]; + + if (! 
as_bin_inuse(b)) { + break; + } + + // ops array will not be not used in this case. + as_bin_copy(ns, &response_bins[(*p_n_response_bins)++], b); + } + } + else if (op->op == AS_MSG_OP_READ) { + as_bin* b = as_bin_get_from_buf(rd, op->name, op->name_sz); + + if (b) { + ops[*p_n_response_bins] = op; + as_bin_copy(ns, &response_bins[(*p_n_response_bins)++], b); + } + else if (respond_all_ops) { + ops[*p_n_response_bins] = op; + as_bin_set_empty(&response_bins[(*p_n_response_bins)++]); + } + } + else if (op->op == AS_MSG_OP_CDT_MODIFY) { + as_bin* b = as_bin_get_or_create_from_buf(rd, op->name, op->name_sz, &result); + + if (! b) { + return result; + } + + as_bin result_bin; + as_bin_set_empty(&result_bin); + + if (ns->storage_data_in_memory) { + as_bin cleanup_bin; + as_bin_copy(ns, &cleanup_bin, b); + + if ((result = as_bin_cdt_alloc_modify_from_client(b, op, &result_bin)) < 0) { + cf_warning_digest(AS_RW, &tr->keyd, "{%s} write_master: failed as_bin_cdt_alloc_modify_from_client() ", ns->name); + return -result; + } + + // Account for noop CDT operations. Modifying non-mutable + // particle contents in-place is still disallowed. + if (cleanup_bin.particle != b->particle) { + append_bin_to_destroy(&cleanup_bin, cleanup_bins, p_n_cleanup_bins); + } + } + else { + if ((result = as_bin_cdt_stack_modify_from_client(b, particles_llb, op, &result_bin)) < 0) { + cf_warning_digest(AS_RW, &tr->keyd, "{%s} write_master: failed as_bin_cdt_stack_modify_from_client() ", ns->name); + return -result; + } + } + + if (respond_all_ops || as_bin_inuse(&result_bin)) { + ops[*p_n_response_bins] = op; + response_bins[(*p_n_response_bins)++] = result_bin; + append_bin_to_destroy(&result_bin, result_bins, p_n_result_bins); + } + + if (! as_bin_inuse(b)) { + // TODO - could do better than finding index from name. 
+ int32_t index = as_bin_get_index_from_buf(rd, op->name, op->name_sz); + + if (index >= 0) { + as_bin_set_empty_shift(rd, (uint32_t)index); + xdr_fill_dirty_bins(dirty_bins); + } + } + else { + xdr_add_dirty_bin(ns, dirty_bins, (const char*)op->name, op->name_sz); + } + } + else if (op->op == AS_MSG_OP_CDT_READ) { + as_bin* b = as_bin_get_from_buf(rd, op->name, op->name_sz); + + if (b) { + as_bin result_bin; + as_bin_set_empty(&result_bin); + + if ((result = as_bin_cdt_read_from_client(b, op, &result_bin)) < 0) { + cf_warning_digest(AS_RW, &tr->keyd, "{%s} write_master: failed as_bin_cdt_read_from_client() ", ns->name); + return -result; + } + + ops[*p_n_response_bins] = op; + response_bins[(*p_n_response_bins)++] = result_bin; + append_bin_to_destroy(&result_bin, result_bins, p_n_result_bins); + } + else if (respond_all_ops) { + ops[*p_n_response_bins] = op; + as_bin_set_empty(&response_bins[(*p_n_response_bins)++]); + } + } + else { + cf_warning_digest(AS_RW, &tr->keyd, "{%s} write_master: unknown bin op %u ", ns->name, op->op); + return AS_PROTO_RESULT_FAIL_PARAMETER; + } + } + + return 0; +} + + +//========================================================== +// write_master() - unwind on failure or cleanup. 
+// + +void +write_master_index_metadata_unwind(index_metadata* old, as_record* r) +{ + r->void_time = old->void_time; + r->last_update_time = old->last_update_time; + r->generation = old->generation; +} + + +void +write_master_dim_single_bin_unwind(as_bin* old_bin, as_bin* new_bin, + as_bin* cleanup_bins, uint32_t n_cleanup_bins) +{ + as_particle* p_old = as_bin_get_particle(old_bin); + + if (as_bin_is_external_particle(new_bin) && new_bin->particle != p_old) { + as_bin_particle_destroy(new_bin, true); + } + + for (uint32_t i_cleanup = 0; i_cleanup < n_cleanup_bins; i_cleanup++) { + as_bin* b_cleanup = &cleanup_bins[i_cleanup]; + + if (b_cleanup->particle != p_old) { + as_bin_particle_destroy(b_cleanup, true); + } + } + + as_single_bin_copy(new_bin, old_bin); +} + + +void +write_master_dim_unwind(as_bin* old_bins, uint32_t n_old_bins, as_bin* new_bins, + uint32_t n_new_bins, as_bin* cleanup_bins, uint32_t n_cleanup_bins) +{ + for (uint32_t i_new = 0; i_new < n_new_bins; i_new++) { + as_bin* b_new = &new_bins[i_new]; + + if (! as_bin_inuse(b_new)) { + break; + } + + // Embedded particles have no-op destructors - skip loop over old bins. 
+ if (as_bin_is_embedded_particle(b_new)) { + continue; + } + + as_particle* p_new = b_new->particle; + uint32_t i_old; + + for (i_old = 0; i_old < n_old_bins; i_old++) { + as_bin* b_old = &old_bins[i_old]; + + if (b_new->id == b_old->id) { + if (p_new != as_bin_get_particle(b_old)) { + as_bin_particle_destroy(b_new, true); + } + + break; + } + } + + if (i_old == n_old_bins) { + as_bin_particle_destroy(b_new, true); + } + } + + for (uint32_t i_cleanup = 0; i_cleanup < n_cleanup_bins; i_cleanup++) { + as_bin* b_cleanup = &cleanup_bins[i_cleanup]; + as_particle* p_cleanup = b_cleanup->particle; + uint32_t i_old; + + for (i_old = 0; i_old < n_old_bins; i_old++) { + as_bin* b_old = &old_bins[i_old]; + + if (b_cleanup->id == b_old->id) { + if (p_cleanup != as_bin_get_particle(b_old)) { + as_bin_particle_destroy(b_cleanup, true); + } + + break; + } + } + + if (i_old == n_old_bins) { + as_bin_particle_destroy(b_cleanup, true); + } + } + + // The index element's as_bin_space pointer still points at old bins. +} diff --git a/build/VersionCheck.py b/build/VersionCheck.py new file mode 100755 index 00000000..5fffe13e --- /dev/null +++ b/build/VersionCheck.py @@ -0,0 +1,32 @@ +#!/usr/bin/python + +# +# VersionCheck.py: +# Execute the given command, which must output a version of the form: +# +# {..}, where all three fields are non-negative integers and missing components default to 0 +# +# and check against the supplied minimum version components. +# +# Returns 1 if the version is at least the minimum, 0 if not, or else -1 if an error occurs. 
+#
+
+import os, sys
+
+def VersionCheck(command, minVersion):
+    try:
+        minVers = minVersion.split('.')
+        while (len(minVers) < 3):
+            minVers.append(0)
+        minMajor, minMinor, minPatch = [int(c) for c in minVers]
+        vers = os.popen(command).read().strip().split('.')
+        while (len(vers) < 3):
+            vers.append(0)
+        major, minor, patch = [int(c) for c in vers]
+        return 1 if (major > minMajor or
+                     (major == minMajor and (minor > minMinor or
+                                             (minor == minMinor and patch >= minPatch)))) else 0
+    except Exception:  # unparsable version / failed command => -1; SystemExit and KeyboardInterrupt still propagate
+        return -1
+
+sys.stdout.write(str(VersionCheck(*sys.argv[1:3]) if len(sys.argv) >= 3 else -1))  # too few arguments is an error (-1), not a traceback
diff --git a/build/gen_version b/build/gen_version
new file mode 100755
index 00000000..fdef4958
--- /dev/null
+++ b/build/gen_version
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+EDITION=${1:-community}
+BUILD_OS=${2:-unknown}
+FEATURES=""
+if [ "$EDITION" = enterprise ]; then
+	FEATURES=";xdr"
+fi
+
+echo "////
+//// AUTOMATICALLY GENERATED BY BUILD SYSTEM
+////
+const char aerospike_build_id[] = \"`git describe`\";
+const char aerospike_build_time[] = \"`date`\";
+const char aerospike_build_type[] = \"Aerospike ${EDITION^} Edition\";
+const char aerospike_build_os[] = \"${BUILD_OS}\";
+const char aerospike_build_features[] = \"${FEATURES}\";
+"
diff --git a/build/os_version b/build/os_version
new file mode 100755
index 00000000..e3d1afb2
--- /dev/null
+++ b/build/os_version
@@ -0,0 +1,135 @@
+#!/usr/bin/env bash
+# ------------------------------------------------------------------------------
+# Copyright 2012-2015 Aerospike, Inc.
+#
+# Portions may be licensed to Aerospike, Inc. under one or more contributor
+# license agreements.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not
+# use this file except in compliance with the License.
You may obtain a copy of
+# the License at http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations under
+# the License.
+# ------------------------------------------------------------------------------
+
+OPT_LONG=0
+
+if [ "$1" = "-long" ]
+then
+	OPT_LONG=1
+fi
+
+error() {
+	echo 'error:' "$*" >&2
+}
+
+main() {
+
+	local kernel=''
+	local distro_id=''
+	local distro_version=''
+	local distro_long=''
+	local distro_short=''
+
+	# Make sure this script is running on Linux
+	# The script is not designed to work on non-Linux
+	# operating systems.
+	kernel=$(uname -s | tr '[:upper:]' '[:lower:]')
+	if [ "$kernel" != 'linux' ]
+	then
+		error "$kernel is not supported."
+		exit 1
+	fi
+
+	if [ -f /etc/os-release ]
+	then
+		. /etc/os-release
+		distro_id=${ID,,}
+		distro_version=${VERSION_ID}
+	elif [ -f /etc/issue ]
+	then
+		issue=$(cat /etc/issue | tr '[:upper:]' '[:lower:]')
+		case "$issue" in
+			*'centos'* )
+				distro_id='centos'
+				;;
+			*'redhat'* )
+				distro_id='redhat'
+				;;
+			*'debian'* )
+				distro_id='debian'
+				;;
+			* )
+				error "/etc/issue contained an unsupported linux distribution: $issue"
+				exit 1
+				;;
+		esac
+
+		case "$distro_id" in
+			'centos' | 'redhat' )
+				local release=''
+				if [ -f /etc/centos-release ]; then
+					release=$(cat /etc/centos-release | tr '[:upper:]' '[:lower:]')
+				elif [ -f /etc/redhat-release ]; then
+					release=$(cat /etc/redhat-release | tr '[:upper:]' '[:lower:]')
+				fi
+				release_version=${release##*release}
+				distro_version=${release_version%%.*} # major version only, same as the debian branch
+				;;
+			'debian' )
+				debian_version=$(cat /etc/debian_version | tr '[:upper:]' '[:lower:]')
+				distro_version=${debian_version%%.*}
+				;;
+			* )
+				error "/etc/issue contained an unsupported linux distribution: $issue"
+				exit 1
+				;;
+		esac
+	fi
+
+	distro_id=${distro_id//[[:space:]]/}
+	distro_version=${distro_version//[[:space:]]/}
+
+	case "$distro_id" in
+		'centos' | 'redhat' )
+			distro_long="centos${distro_version}"
+			distro_short="el${distro_version}"
+			;;
+		'fedora' )
+			if [ "$distro_version" -gt "15" ]
+			then
+				distro_version=7
+			elif [ "$distro_version" -gt "10" ]
+			then
+				distro_version=6
+			else
+				error "Unsupported linux distribution: $distro_id $distro_version"
+				exit 1
+			fi
+			distro_long="centos${distro_version}"
+			distro_short="el${distro_version}"
+			;;
+		'amzn' )
+			distro_long="ami"
+			distro_short="ami"
+			;;
+		* )
+			distro_long="${distro_id}${distro_version}"
+			distro_short="${distro_id}${distro_version}"
+			;;
+	esac
+
+	if [ "$OPT_LONG" = "1" ]
+	then
+		echo "${distro_long}"
+	else
+		echo "${distro_short}"
+	fi
+	exit 0
+}
+
+main
diff --git a/build/prep-ce b/build/prep-ce
new file mode 100755
index 00000000..5f0b8f83
--- /dev/null
+++ b/build/prep-ce
@@ -0,0 +1,19 @@
+#!/bin/bash
+#
+# File: build/prep-ce
+#
Description: Prepare for building from Community Edition source distribution. +# Usage: prompt$ build/prep-ce +# +# Executing this script replaces the version-related build scripts with versions +# using frozen version information instead of regenerating it every time. +# + +mv build/gen_version{,.ORIG} +cat > build/gen_version < build/version +chmod ugo+x build/version diff --git a/build/version b/build/version new file mode 100755 index 00000000..2545f25d --- /dev/null +++ b/build/version @@ -0,0 +1,10 @@ +#!/bin/bash + +rev=`git describe` +subbuild=`echo $rev | awk -F'-' '{print $2}'` + +if [ "$subbuild" != "" ] +then + rev=`echo $rev | awk -F'-' '{printf("%s-%s\n",$1,$2)}'` +fi +echo $rev diff --git a/cf/.gitignore b/cf/.gitignore new file mode 100644 index 00000000..4c16076a --- /dev/null +++ b/cf/.gitignore @@ -0,0 +1,2 @@ +.DS_Store +target \ No newline at end of file diff --git a/cf/Makefile b/cf/Makefile new file mode 100644 index 00000000..3acd1a10 --- /dev/null +++ b/cf/Makefile @@ -0,0 +1,9 @@ +# Citrusleaf Foundation +# Makefile + +.PHONY: default +default: all + @echo "done." + +%: + $(MAKE) -C src $@ diff --git a/cf/README.md b/cf/README.md new file mode 100644 index 00000000..05cbd02c --- /dev/null +++ b/cf/README.md @@ -0,0 +1,13 @@ +# Aerospike CF + +Library of objects shared between ASD and XDR. + +## Build + +To build + + $ make + +To clean: + + $ make clean diff --git a/cf/include/arenax.h b/cf/include/arenax.h new file mode 100644 index 00000000..53e39d12 --- /dev/null +++ b/cf/include/arenax.h @@ -0,0 +1,131 @@ +/* + * arenax.h + * + * Copyright (C) 2012-2014 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. 
+ * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +//========================================================== +// Includes. +// + +#include +#include +#include +#include +#include + + +//========================================================== +// Typedefs & constants. +// + +#define CF_ARENAX_BIGLOCK (1 << 0) +#define CF_ARENAX_CALLOC (1 << 1) + +#ifndef CF_ARENAX_MAX_STAGES +#define CF_ARENAX_MAX_STAGES 256 +#endif + +typedef uint64_t cf_arenax_handle; + +// Must be in-sync with internal array ARENAX_ERR_STRINGS[]: +typedef enum { + CF_ARENAX_OK = 0, + CF_ARENAX_ERR_BAD_PARAM, + CF_ARENAX_ERR_STAGE_CREATE, + CF_ARENAX_ERR_STAGE_ATTACH, + CF_ARENAX_ERR_STAGE_DETACH, + CF_ARENAX_ERR_UNKNOWN +} cf_arenax_err; + +//------------------------------------------------ +// For enterprise separation only. +// + +// Element is indexed by 24 bits. +#define ELEMENT_ID_NUM_BITS 24 +#define ELEMENT_ID_MASK ((1UL << ELEMENT_ID_NUM_BITS) - 1) // 0xFFffff + +#define MAX_STAGE_CAPACITY (1 << ELEMENT_ID_NUM_BITS) // 16 M + +// DO NOT access this member data directly - use the API! +typedef struct cf_arenax_s { + // Configuration (passed in constructors). + key_t key_base; + uint32_t element_size; + uint32_t stage_capacity; + uint32_t max_stages; + uint32_t flags; + + // Configuration (derived). + size_t stage_size; + + // Free-element list. + cf_arenax_handle free_h; + + // Where to end-allocate. + uint32_t at_stage_id; + uint32_t at_element_id; + + // Thread safety. + pthread_mutex_t lock; + + // Current stages. 
+ uint32_t stage_count; + uint8_t* stages[CF_ARENAX_MAX_STAGES]; +} cf_arenax; + +typedef struct free_element_s { + uint32_t magic; + cf_arenax_handle next_h; +} free_element; + +#define FREE_MAGIC 0xff1234ff + + +//========================================================== +// Public API. +// + +size_t cf_arenax_sizeof(); +const char* cf_arenax_errstr(cf_arenax_err err); + +void cf_arenax_init(cf_arenax* arena, key_t key_base, uint32_t element_size, + uint32_t stage_capacity, uint32_t max_stages, uint32_t flags); + +cf_arenax_handle cf_arenax_alloc(cf_arenax* arena); +void cf_arenax_free(cf_arenax* arena, cf_arenax_handle h); + +void* cf_arenax_resolve(cf_arenax* arena, cf_arenax_handle h); + + +//========================================================== +// Private API - for enterprise separation only. +// + +static inline void +cf_arenax_set_handle(cf_arenax_handle* h, uint32_t stage_id, + uint32_t element_id) +{ + *h = ((uint64_t)stage_id << ELEMENT_ID_NUM_BITS) | element_id; +} + +cf_arenax_err cf_arenax_add_stage(cf_arenax* arena); diff --git a/cf/include/bits.h b/cf/include/bits.h new file mode 100644 index 00000000..458cf179 --- /dev/null +++ b/cf/include/bits.h @@ -0,0 +1,80 @@ +/* + * bits.h + * + * Copyright (C) 2018 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. 
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+#pragma once
+
+//==========================================================
+// Includes.
+//
+
+#include <stdint.h>
+
+
+//==========================================================
+// Public API.
+//
+
+// Position of most significant bit, 0 ... 63 from low to high. -1 for value 0.
+static inline int
+cf_msb(uint64_t value)
+{
+	int n = -1;
+
+	while (value != 0) {
+		value >>= 1;
+		n++;
+	}
+
+	return n;
+}
+
+// Returns number of trailing zeros in a uint64_t, 64 for x == 0.
+static inline uint32_t
+cf_lsb64(uint64_t x)
+{
+	if (x == 0) {
+		return 64;
+	}
+
+	return (uint32_t)__builtin_ctzll(x);
+}
+
+// Returns number of leading zeros in a uint64_t, 64 for x == 0.
+static inline uint32_t
+cf_msb64(uint64_t x)
+{
+	if (x == 0) {
+		return 64;
+	}
+
+	return (uint32_t)__builtin_clzll(x);
+}
+
+static inline uint32_t
+cf_bit_count64(uint64_t x) // number of set bits in x
+{
+	x -= (x >> 1) & 0x5555555555555555; // per-2-bit counts
+	x = (x & 0x3333333333333333) + ((x >> 2) & 0x3333333333333333); // per-4-bit counts
+	x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0f; // per-byte counts
+
+	return (uint32_t)((x * 0x0101010101010101) >> 56); // sum of all bytes lands in the top byte
+}
diff --git a/cf/include/cf_mutex.h b/cf/include/cf_mutex.h
new file mode 100644
index 00000000..eaced35e
--- /dev/null
+++ b/cf/include/cf_mutex.h
@@ -0,0 +1,63 @@
+/*
+ * cf_mutex.h
+ *
+ * Copyright (C) 2017 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/
+ */
+
+#pragma once
+
+
+//==========================================================
+// Includes.
+//
+
+#include <stdbool.h>
+#include <stdint.h>
+
+
+//==========================================================
+// Typedefs & constants.
+//
+
+typedef struct cf_mutex_s {
+	uint32_t u32;
+} cf_mutex __attribute__ ((aligned(4)));
+
+typedef struct cf_condition_s {
+	uint32_t seq;
+} cf_condition __attribute__ ((aligned(4)));
+
+#define CF_MUTEX_INIT { 0 }
+#define cf_mutex_init(__m) ((__m)->u32 = 0) // fully parenthesized so it is safe inside larger expressions
+#define cf_mutex_destroy(__m) // no-op
+
+
+//==========================================================
+// Public API.
+//
+
+void cf_mutex_lock(cf_mutex *m);
+void cf_mutex_unlock(cf_mutex *m);
+bool cf_mutex_trylock(cf_mutex *m);
+
+void cf_mutex_lock_spin(cf_mutex *m);
+void cf_mutex_unlock_spin(cf_mutex *m);
+
+void cf_condition_wait(cf_condition *c, cf_mutex *m);
+void cf_condition_signal(cf_condition *c);
diff --git a/cf/include/cf_str.h b/cf/include/cf_str.h
new file mode 100644
index 00000000..7feb6a36
--- /dev/null
+++ b/cf/include/cf_str.h
@@ -0,0 +1,73 @@
+/*
+ * cf_str.h
+ *
+ * Copyright (C) 2008-2017 Aerospike, Inc.
+ *
+ * Portions may be licensed to Aerospike, Inc. under one or more contributor
+ * license agreements.
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Affero General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +#include + +// These functions convert integers into a string, writing into the provided +// buffer, and return the number of bytes written. +unsigned int cf_str_itoa(int value, char *s, int radix); +unsigned int cf_str_itoa_u64(uint64_t value, char *s, int radix); +unsigned int cf_str_itoa_u32(uint32_t value, char *s, int radix); + +// These functions convert a string to a number of different integer types, and +// returns 0 on success. +int cf_str_atoi(char *s, int *value); +int cf_str_atoi_u32(char *s, uint32_t *value); +int cf_str_atoi_64(char *s, int64_t *value); +int cf_str_atoi_u64(char *s, uint64_t *value); +int cf_str_atoi_x64(const char *s, uint64_t *value); +int cf_str_atoi_seconds(char *s, uint64_t *value); + +// And this does the same, with radix. +int cf_str_atoi_u64_x(char *s, uint64_t *value, int radix); + +// Split the string 'str' based on input breaks in 'fmt'. +// - The splitting is destructive. +// - The pointers will be added to the end of vector '*v'. +// - The vector better be created with object size 'void *'. +struct cf_vector_s; +extern void cf_str_split(char *fmt, char *str, struct cf_vector_s *v); + +static inline int +cf_str_strnchr(uint8_t *s, int sz, int c) +{ + for (int i = 0; i < sz; i++) { + if (s[i] == c) { + return i; + } + } + return -1; +} + +static inline const char * +cf_str_safe_as_empty(const char *s) +{ + return s ? s : ""; +} + +static inline const char * +cf_str_safe_as_null(const char *s) +{ + return s ? 
s : "null"; +} diff --git a/cf/include/compare.h b/cf/include/compare.h new file mode 100644 index 00000000..f2066806 --- /dev/null +++ b/cf/include/compare.h @@ -0,0 +1,52 @@ +/* + * compare.h + * + * Copyright (C) 2018 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +//========================================================== +// Includes. +// + +#include + + +//========================================================== +// Public API - qsort() comparators. +// + +static inline int +cf_compare_uint64_desc(const void* pa, const void* pb) +{ + uint64_t a = *(const uint64_t*)pa; + uint64_t b = *(const uint64_t*)pb; + + return a > b ? -1 : (a == b ? 0 : 1); +} + +static inline int +cf_compare_uint32_desc(const void* pa, const void* pb) +{ + uint32_t a = *(const uint32_t*)pa; + uint32_t b = *(const uint32_t*)pb; + + return a > b ? -1 : (a == b ? 0 : 1); +} diff --git a/cf/include/daemon.h b/cf/include/daemon.h new file mode 100644 index 00000000..cb325bc8 --- /dev/null +++ b/cf/include/daemon.h @@ -0,0 +1,30 @@ +/* + * daemon.h + * + * Copyright (C) 2017 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. 
+ * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +#include + +void cf_process_daemonize(int *fd_ignore_list, int list_size); +void cf_process_privsep(uid_t uid, gid_t gid); +void cf_process_holdcap(void); +void cf_process_clearcap(void); diff --git a/cf/include/dynbuf.h b/cf/include/dynbuf.h new file mode 100644 index 00000000..5c6fd93b --- /dev/null +++ b/cf/include/dynbuf.h @@ -0,0 +1,126 @@ +/* + * dynbuf.h + * + * Copyright (C) 2009 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. 
If not, see http://www.gnu.org/licenses/ + */ + +/* + * A simple dynamic buffer implementation + * Allows the first, simpler part of the buffer to be on the stack + * which is usually all that's needed + * + */ + +#pragma once + +#include +#include +#include + +typedef struct cf_dyn_buf_s { + uint8_t *buf; + bool is_stack; + size_t alloc_sz; + size_t used_sz; +} cf_dyn_buf; + +#define cf_dyn_buf_define(__x) uint8_t dyn_buf##__x[1024]; cf_dyn_buf __x = { dyn_buf##__x, true, 1024, 0 } +#define cf_dyn_buf_define_size(__x, __sz) uint8_t dyn_buf##__x[__sz]; cf_dyn_buf __x = { dyn_buf##__x, true, __sz, 0 } + +extern void cf_dyn_buf_init_heap(cf_dyn_buf *db, size_t sz); +extern void cf_dyn_buf_reserve(cf_dyn_buf *db, size_t sz, uint8_t **from); +extern void cf_dyn_buf_append_string(cf_dyn_buf *db, const char *s); +extern void cf_dyn_buf_append_char(cf_dyn_buf *db, char c); +extern void cf_dyn_buf_append_bool(cf_dyn_buf *db, bool b); +extern void cf_dyn_buf_append_buf(cf_dyn_buf *db, uint8_t *buf, size_t sz); +extern void cf_dyn_buf_append_int(cf_dyn_buf *db, int i); +extern void cf_dyn_buf_append_uint64_x(cf_dyn_buf *db, uint64_t i); // HEX FORMAT! 
+extern void cf_dyn_buf_append_uint64(cf_dyn_buf *db, uint64_t i); +extern void cf_dyn_buf_append_uint32(cf_dyn_buf *db, uint32_t i); +extern void cf_dyn_buf_chomp(cf_dyn_buf *db); +extern char *cf_dyn_buf_strdup(cf_dyn_buf *db); +extern void cf_dyn_buf_free(cf_dyn_buf *db); + +// Helpers to append name value pairs to a cf_dyn_buf in pattern: name=value; +void info_append_bool(cf_dyn_buf *db, const char *name, bool value); +void info_append_int(cf_dyn_buf *db, const char *name, int value); +void info_append_string(cf_dyn_buf *db, const char *name, const char *value); +void info_append_string_safe(cf_dyn_buf *db, const char *name, const char *value); +void info_append_uint32(cf_dyn_buf *db, const char *name, uint32_t value); +void info_append_uint64(cf_dyn_buf *db, const char *name, uint64_t value); +void info_append_uint64_x(cf_dyn_buf *db, const char *name, uint64_t value); + +typedef struct cf_buf_builder_s { + size_t alloc_sz; + size_t used_sz; + uint8_t buf[]; +} cf_buf_builder; + +extern cf_buf_builder *cf_buf_builder_create(); +extern cf_buf_builder *cf_buf_builder_create_size(size_t sz); +extern void cf_buf_builder_free(cf_buf_builder *bb); +extern void cf_buf_builder_reset(cf_buf_builder *bb); +extern void cf_buf_builder_chomp(cf_buf_builder *bb_r); +// If you use any binary components, this strdup thing is a bad idea: +extern char *cf_buf_builder_strdup(cf_buf_builder *bb_r); + +extern void cf_buf_builder_append_string(cf_buf_builder **bb_r, const char *s); +extern void cf_buf_builder_append_char(cf_buf_builder **bb_r, char c); +extern void cf_buf_builder_append_buf(cf_buf_builder **bb_r, uint8_t *buf, size_t sz); +// These append ASCII versions: +extern void cf_buf_builder_append_ascii_uint64_x(cf_buf_builder **bb_r, uint64_t i); // HEX FORMAT! 
+extern void cf_buf_builder_append_ascii_uint64(cf_buf_builder **bb_r, uint64_t i); +extern void cf_buf_builder_append_ascii_uint32(cf_buf_builder **bb_r, uint32_t i); +extern void cf_buf_builder_append_ascii_int(cf_buf_builder **bb_r, int i); +// These append network-order bytes: +extern void cf_buf_builder_append_uint64(cf_buf_builder **bb_r, uint64_t i); +extern void cf_buf_builder_append_uint32(cf_buf_builder **bb_r, uint32_t i); +extern void cf_buf_builder_append_uint16(cf_buf_builder **bb_r, uint16_t i); +extern void cf_buf_builder_append_uint8(cf_buf_builder **bb_r, uint8_t i); +// Reserve the bytes and give me the handle to the spot reserved: +extern void cf_buf_builder_reserve(cf_buf_builder **bb_r, int sz, uint8_t **buf); +extern int cf_buf_builder_size(cf_buf_builder *bb); +extern size_t get_new_size(int alloc, int used, int requested); + +// TODO - We've only implemented a few cf_ll_buf methods for now. We'll add more +// functionality if and when it's needed. + +typedef struct cf_ll_buf_stage_s { + struct cf_ll_buf_stage_s *next; + size_t buf_sz; + size_t used_sz; + uint8_t buf[]; +} cf_ll_buf_stage; + +typedef struct cf_ll_buf_s { + bool head_is_stack; + cf_ll_buf_stage *head; + cf_ll_buf_stage *tail; +} cf_ll_buf; + +#define cf_ll_buf_define(__x, __sz) \ + uint8_t llb_stage##__x[sizeof(cf_ll_buf_stage) + __sz]; \ + cf_ll_buf_stage* ll_buf_stage##__x = (cf_ll_buf_stage*)llb_stage##__x; \ + ll_buf_stage##__x->next = NULL; \ + ll_buf_stage##__x->buf_sz = __sz; \ + ll_buf_stage##__x->used_sz = 0; \ + cf_ll_buf __x = { true, ll_buf_stage##__x, ll_buf_stage##__x } + +extern void cf_ll_buf_reserve(cf_ll_buf *llb, size_t sz, uint8_t **from); +extern void cf_ll_buf_free(cf_ll_buf *llb); diff --git a/cf/include/enhanced_alloc.h b/cf/include/enhanced_alloc.h new file mode 100644 index 00000000..cb6b49fb --- /dev/null +++ b/cf/include/enhanced_alloc.h @@ -0,0 +1,126 @@ +/* + * enhanced_alloc.h + * + * Copyright (C) 2013-2017 Aerospike, Inc. 
+ * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +//========================================================== +// Includes. +// + +#include +#include +#include +#include + +#include "citrusleaf/cf_atomic.h" + + +//========================================================== +// Typedefs & constants. +// + +typedef struct cf_rc_header_s { + cf_atomic32 rc; + uint32_t sz; +} cf_rc_header; + +typedef enum { + CF_ALLOC_DEBUG_NONE, + CF_ALLOC_DEBUG_TRANSIENT, + CF_ALLOC_DEBUG_PERSISTENT, + CF_ALLOC_DEBUG_ALL +} cf_alloc_debug; + + +//========================================================== +// Public API - arena management and stats. +// + +extern __thread int32_t g_ns_arena; + +void cf_alloc_init(void); +void cf_alloc_set_debug(cf_alloc_debug debug); +int32_t cf_alloc_create_arena(void); + +#define CF_ALLOC_SET_NS_ARENA(_ns) \ + (g_ns_arena = _ns->storage_data_in_memory ? 
_ns->jem_arena : -1)
+
+static inline int32_t
+cf_alloc_clear_ns_arena(void)
+{
+	int32_t old_arena = g_ns_arena;
+	g_ns_arena = -1;
+	return old_arena;
+}
+
+static inline void
+cf_alloc_restore_ns_arena(int32_t old_arena)
+{
+	g_ns_arena = old_arena;
+}
+
+void cf_alloc_heap_stats(size_t *allocated_kbytes, size_t *active_kbytes, size_t *mapped_kbytes, double *efficiency_pct, uint32_t *site_count);
+void cf_alloc_log_stats(const char *file, const char *opts);
+void cf_alloc_log_site_infos(const char *file);
+
+
+//==========================================================
+// Public API - ordinary allocation.
+//
+
+// Don't call these directly - use wrappers below.
+void *cf_alloc_try_malloc(size_t sz);
+void *cf_alloc_malloc_arena(size_t sz, int32_t arena);
+void *cf_alloc_calloc_arena(size_t n, size_t sz, int32_t arena);
+void *cf_alloc_realloc_arena(void *p, size_t sz, int32_t arena);
+
+#define cf_try_malloc(_sz) cf_alloc_try_malloc(_sz)
+
+#define cf_malloc(_sz) malloc(_sz)
+#define cf_malloc_ns(_sz) cf_alloc_malloc_arena(_sz, g_ns_arena)
+
+#define cf_calloc(_n, _sz) calloc(_n, _sz)
+#define cf_calloc_ns(_n, _sz) cf_alloc_calloc_arena(_n, _sz, g_ns_arena)
+
+#define cf_realloc(_p, _sz) realloc(_p, _sz)
+#define cf_realloc_ns(_p, _sz) cf_alloc_realloc_arena(_p, _sz, g_ns_arena)
+
+#define cf_valloc(_sz) valloc(_sz)
+
+#define cf_strdup(_s) strdup(_s)
+#define cf_strndup(_s, _n) strndup(_s, _n)
+#define cf_asprintf(_s, _f, ...) asprintf(_s, _f, ##__VA_ARGS__) // '##' drops the comma when no format arguments are given
+
+#define cf_free(_p) free(_p)
+
+
+//==========================================================
+// Public API - reference-counted allocation.
+// + +void *cf_rc_alloc(size_t sz); +void cf_rc_free(void *p); + +int32_t cf_rc_count(const void *p); +int32_t cf_rc_reserve(void *p); +int32_t cf_rc_release(void *p); +int32_t cf_rc_releaseandfree(void *p); diff --git a/cf/include/fault.h b/cf/include/fault.h new file mode 100644 index 00000000..d0fc3c8c --- /dev/null +++ b/cf/include/fault.h @@ -0,0 +1,434 @@ +/* + * fault.h + * + * Copyright (C) 2008-2014 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include "dynbuf.h" + + +// Use COMPILER_ASSERT() for compile-time verification. +// +// Usage does not add any compiled code, or cost anything at runtime. When the +// evaluated expression is false, it causes a compile error which will draw +// attention to the relevant line. +// +// e.g. +// COMPILER_ASSERT(sizeof(my_int_array) / sizeof(int) == MY_INT_ARRAY_SIZE); +// +#define CGLUE(a, b) a##b +#define CVERIFY(expr, line) typedef char CGLUE(compiler_assert_failed_on_line_, line)[(expr) ? 1 : -1] +#define COMPILER_ASSERT(expr) CVERIFY(expr, __LINE__) + +// Use CF_MUST_CHECK with declarations to force caller to handle return value. +// +// e.g. 
+//     CF_MUST_CHECK int my_function();
+//
+#define CF_MUST_CHECK __attribute__((warn_unused_result))
+
+// Use CF_IGNORE_ERROR() as caller to override CF_MUST_CHECK in declaration.
+//
+// e.g.
+//     CF_IGNORE_ERROR(my_function());
+//
+#define CF_IGNORE_ERROR(x) ((void)((x) == 12345))
+
+// Use CF_NEVER_FAILS() as caller to assert that returned value is not negative.
+//
+// e.g.
+//     CF_NEVER_FAILS(my_function());
+//
+#define CF_NEVER_FAILS(x) \
+do { \
+	if ((x) < 0) { \
+		cf_crash(CF_MISC, "this cannot happen..."); \
+	} \
+} while (false) // no ';' here - the caller's own ';' ends the statement, keeping if/else safe
+
+// Use CF_ZSTR_DEFINE() to null-terminate strings conveniently.
+//
+// e.g.
+//     CF_ZSTR_DEFINE(zstr, 40, ns_name, name_sz);
+//     cf_warning(AS_NAMESPACE, "got namespace %s", zstr);
+//
+#define CF_ZSTR_DEFINE(zstr, max_sz, str, sz) \
+	char zstr[max_sz]; \
+	size_t zstr##len = (sz) < (max_sz) ? (sz) : (max_sz) - 1; \
+	memcpy(zstr, (str), zstr##len); \
+	zstr[zstr##len] = 0;
+
+// Use CF_ZSTRxx() to null-terminate strings conveniently. Useful especially as
+// cf_detail & cf_debug parameters where there's no cost unless the log level
+// is enabled. (Cost may be more than CF_ZSTR_DEFINE() due to copying struct on
+// function return.)
+//
+// e.g.
+//     cf_debug(AS_NAMESPACE, "got namespace %s", CF_ZSTR64(ns_name, name_sz));
+//
+
+typedef struct cf_zstr64_s {
+	char s[64];
+} cf_zstr64;
+
+typedef struct cf_zstr1k_s {
+	char s[1024];
+} cf_zstr1k;
+
+static inline cf_zstr64
+cf_null_terminate_64(const char *str, size_t sz)
+{
+	cf_zstr64 zstr;
+	size_t len = sz < sizeof(zstr.s) ? sz : sizeof(zstr.s) - 1;
+
+	memcpy(zstr.s, str, len);
+	zstr.s[len] = 0;
+
+	return zstr;
+}
+
+static inline cf_zstr1k
+cf_null_terminate_1k(const char *str, size_t sz)
+{
+	cf_zstr1k zstr;
+	size_t len = sz < sizeof(zstr.s) ?
sz : sizeof(zstr.s) - 1; + + memcpy(zstr.s, str, len); + zstr.s[len] = 0; + + return zstr; +} + +#define CF_ZSTR64(str, sz) (cf_null_terminate_64((const char *)str, sz).s) +#define CF_ZSTR1K(str, sz) (cf_null_terminate_1k((const char *)str, sz).s) + + +/* SYNOPSIS + * Fault scoping + * + * Faults are identified by a context and severity. The context describes where + * the fault occurred, and the severity determines the required action. + * + * Examples: + * cf_info(CF_MISC, "important message: %s", my_msg); + * cf_crash(CF_MISC, "doom!"); + * cf_assert(my_test, CF_MISC, "gloom!"); + */ + +/* cf_fault_context + * NB: if you add or remove entries from this enum, you must also change + * the corresponding strings structure in fault.c */ +typedef enum { + CF_MISC, + + CF_ALLOC, + CF_ARENAX, + CF_HARDWARE, + CF_MSG, + CF_RBUFFER, + CF_SOCKET, + CF_TLS, + CF_VMAPX, + + AS_AGGR, + AS_APPEAL, + AS_AS, + AS_BATCH, + AS_BIN, + AS_CFG, + AS_CLUSTERING, + AS_COMPRESSION, + AS_DEMARSHAL, + AS_DRV_SSD, + AS_EXCHANGE, + AS_FABRIC, + AS_GEO, + AS_HB, + AS_HLC, + AS_INDEX, + AS_INFO, + AS_INFO_PORT, + AS_JOB, + AS_MIGRATE, + AS_MON, + AS_NAMESPACE, + AS_NSUP, + AS_PARTICLE, + AS_PARTITION, + AS_PAXOS, + AS_PREDEXP, + AS_PROTO, + AS_PROXY, + AS_PROXY_DIVERT, // special detail context + AS_QUERY, + AS_RECORD, + AS_ROSTER, + AS_RW, + AS_RW_CLIENT, // special detail context + AS_SCAN, + AS_SECURITY, + AS_SINDEX, + AS_SKEW, + AS_SMD, + AS_STORAGE, + AS_TRUNCATE, + AS_TSVC, + AS_UDF, + AS_XDR, + CF_FAULT_CONTEXT_UNDEF +} cf_fault_context; + +extern char *cf_fault_context_strings[]; + +/* cf_fault_severity + * CRITICAL fatal runtime panics + * WARNING runtime errors + * INFO informational or advisory messages + * DEBUG debugging messages + * DETAIL detailed debugging messages + */ +typedef enum { + CF_CRITICAL = 0, + CF_WARNING = 1, + CF_INFO = 2, + CF_DEBUG = 3, + CF_DETAIL = 4, + CF_FAULT_SEVERITY_UNDEF = 5 +} cf_fault_severity; + +/* cf_fault_sink + * An endpoint (sink) for a flow of 
fault messages */ +typedef struct cf_fault_sink { + int fd; + char *path; + int limit[CF_FAULT_CONTEXT_UNDEF]; +} cf_fault_sink; + +#define CF_FAULT_SINKS_MAX 8 + +/** + * When we want to dump out some binary data (like a digest, a bit string + * or a buffer), we want to be able to specify how we'll display the data. + * We expect this list to grow over time, as more binary representations + * are needed. (2014_03_20 tjl). + */ +typedef enum { + CF_DISPLAY_HEX_DIGEST, // Show Special Case DIGEST in Packed Hex + CF_DISPLAY_HEX_SPACED, // Show binary value in regular spaced hex + CF_DISPLAY_HEX_PACKED, // Show binary value in packed hex + CF_DISPLAY_HEX_COLUMNS, // Show binary value in Column Oriented Hex + CF_DISPLAY_BASE64, // Show binary value in Base64 + CF_DISPLAY_BITS_SPACED, // Show binary value in a spaced bit string + CF_DISPLAY_BITS_COLUMNS // Show binary value in Column Oriented Bits +} cf_display_type; + + +/* Function declarations */ + +// note: passing a null sink sets for all currently known sinks +extern int cf_fault_sink_addcontext(cf_fault_sink *s, char *context, char *severity); +extern cf_fault_sink *cf_fault_sink_add(char *path); + +extern cf_fault_sink *cf_fault_sink_hold(char *path); +extern bool cf_fault_console_is_held(); +extern int cf_fault_sink_activate_all_held(); +extern int cf_fault_sink_get_fd_list(int *fds); + +extern int cf_fault_sink_strlist(cf_dyn_buf *db); // pack all contexts into a string - using ids +extern int cf_fault_sink_context_all_strlist(int sink_id, cf_dyn_buf *db); +extern int cf_fault_sink_context_strlist(int sink_id, char *context, cf_dyn_buf *db); + +extern cf_fault_sink *cf_fault_sink_get_id(int id); + +extern void cf_fault_sink_logroll(void); + +extern void cf_fault_use_local_time(bool val); +extern bool cf_fault_is_using_local_time(); + +extern void cf_fault_log_millis(bool log_millis); +extern bool cf_fault_is_logging_millis(); + +// TODO: Rework cf_display_type-based logging to have a more useful +// output 
format, instead of having this separate function. +extern void cf_fault_hex_dump(const char *title, const void *data, size_t len); + +extern cf_fault_severity cf_fault_filter[]; + +// Define the mechanism that we'll use to write into the Server Log. +// cf_fault_event() is "regular" logging +extern void cf_fault_event(const cf_fault_context, + const cf_fault_severity severity, const char *file_name, + const int line, const char *msg, ...) + __attribute__ ((format (printf, 5, 6))); + +// cf_fault_event2() is for advanced logging, where we want to print some +// binary object (often a digest). +extern void cf_fault_event2(const cf_fault_context, + const cf_fault_severity severity, const char *file_name, const int line, + const void *mem_ptr, size_t len, cf_display_type dt, const char *msg, ...) + __attribute__ ((format (printf, 8, 9))); + +extern void cf_fault_event_nostack(const cf_fault_context, + const cf_fault_severity severity, const char *fn, const int line, + const char *msg, ...) + __attribute__ ((format (printf, 5, 6))); + +// For now there's only one cache, dumped by the ticker. +extern void cf_fault_cache_event(cf_fault_context context, + cf_fault_severity severity, const char *file_name, int line, + char *msg, ...) + __attribute__ ((format (printf, 5, 6))); + +// This is ONLY to keep Eclipse happy without having to tell it __FILENAME__ is +// defined. The make process will define it via the -D mechanism. +#ifndef __FILENAME__ +#define __FILENAME__ "" +#endif + +// The "regular" version. +#define cf_assert(a, context, __msg, ...) \ + ((a) ? (void)0 : \ + cf_fault_event((context), CF_CRITICAL, __FILENAME__, __LINE__, (__msg), ##__VA_ARGS__)) + +// The "no stack" versions. +#define cf_assert_nostack(a, context, __msg, ...) \ + ((a) ? (void)0 : \ + cf_fault_event_nostack((context), CF_CRITICAL, __FILENAME__, __LINE__, (__msg), ##__VA_ARGS__)) +#define cf_crash_nostack(context, __msg, ...) 
\ + cf_fault_event_nostack((context), CF_CRITICAL, __FILENAME__, __LINE__, (__msg), ##__VA_ARGS__) + +#define MAX_BACKTRACE_DEPTH 50 + +// This must literally be the direct clib "free()", because "strings" is +// allocated by "backtrace_symbols()". +#define PRINT_STACKTRACE() \ +do { \ + void *bt[MAX_BACKTRACE_DEPTH]; \ + int sz = backtrace(bt, MAX_BACKTRACE_DEPTH); \ + cf_fault_event(AS_AS, CF_WARNING, __FILENAME__, __LINE__, "stacktrace: found %d frames", sz); \ + char **strings = backtrace_symbols(bt, sz); \ + if (strings) { \ + for (int i = 0; i < sz; i++) { \ + cf_fault_event(AS_AS, CF_WARNING, __FILENAME__, __LINE__, "stacktrace: frame %d: %s", i, strings[i]); \ + } \ + free(strings); \ + } \ + else { \ + cf_fault_event(AS_AS, CF_WARNING, __FILENAME__, __LINE__, "stacktrace: found no symbols"); \ + } \ +} while (0); +// Like PRINT_STACKTRACE(), but logs at the given severity with a "call stack:" prefix. +#define PRINT_CALL_STACK(severity) \ +do { \ + void *bt[MAX_BACKTRACE_DEPTH]; \ + int sz = backtrace(bt, MAX_BACKTRACE_DEPTH); \ + cf_fault_event(AS_AS, severity, __FILENAME__, __LINE__, "call stack: found %d frames", sz); \ + char **strings = backtrace_symbols(bt, sz); \ + if (strings) { \ + for (int i = 0; i < sz; i++) { \ + cf_fault_event(AS_AS, severity, __FILENAME__, __LINE__, "call stack: frame %d: %s", i, strings[i]); \ + } \ + free(strings); \ + } \ + else { \ + cf_fault_event(AS_AS, severity, __FILENAME__, __LINE__, "call stack: found no symbols"); \ + } \ +} while (0); + +// The "regular" versions - args are not evaluated when severity is filtered out. +#define __SEVLOG(severity, context, __msg, ...) \ + (severity > cf_fault_filter[context] ? \ + (void)0 : \ + cf_fault_event((context), severity, __FILENAME__, __LINE__, (__msg), ##__VA_ARGS__)) + +#define cf_crash(context, __msg, ...) \ + cf_fault_event((context), CF_CRITICAL, __FILENAME__, __LINE__, (__msg), ##__VA_ARGS__) + +#define cf_warning(...) __SEVLOG(CF_WARNING, ##__VA_ARGS__) +#define cf_info(...) __SEVLOG(CF_INFO, ##__VA_ARGS__) +#define cf_debug(...) __SEVLOG(CF_DEBUG, ##__VA_ARGS__) +#define cf_detail(...)
__SEVLOG(CF_DETAIL, ##__VA_ARGS__) + +// In addition to the existing LOG calls, we will now add a new mechanism +// that will add the ability to print out a BINARY ARRAY, in a general manner, at +// the end of the passed in PRINT STRING. +// This is a general mechanism that can be used to express a binary array as +// a hex or Base64 value, but we'll often use it to print a full Digest Value, +// in either Hex format or Base64 format. +#define __BINARY_SEVLOG(severity, context, ptr, len, DT, __msg, ...) \ + (severity > cf_fault_filter[context] ? \ + (void)0 : \ + cf_fault_event2((context), severity, __FILENAME__, __LINE__, ptr, len, DT, (__msg), ##__VA_ARGS__)) + +#define cf_crash_binary(context, ptr, len, DT, __msg, ...) \ + cf_fault_event2((context), CF_CRITICAL, __FILENAME__, __LINE__, ptr, len, DT, (__msg), ##__VA_ARGS__) + +#define cf_warning_binary(...) __BINARY_SEVLOG(CF_WARNING, ##__VA_ARGS__) +#define cf_info_binary(...) __BINARY_SEVLOG(CF_INFO, ##__VA_ARGS__) +#define cf_debug_binary(...) __BINARY_SEVLOG(CF_DEBUG, ##__VA_ARGS__) +#define cf_detail_binary(...) __BINARY_SEVLOG(CF_DETAIL, ##__VA_ARGS__) + +// This set of log calls specifically handles DIGEST values (ptr is logged with a fixed length of 20 and CF_DISPLAY_HEX_DIGEST). +#define __DIGEST_SEVLOG(severity, context, ptr,__msg, ...) \ + (severity > cf_fault_filter[context] ? \ + (void)0 : \ + cf_fault_event2((context), severity, __FILENAME__, __LINE__, ptr, 20, CF_DISPLAY_HEX_DIGEST, (__msg), ##__VA_ARGS__)) + +#define cf_crash_digest(context, ptr,__msg, ...) \ + cf_fault_event2((context), CF_CRITICAL, __FILENAME__, __LINE__, ptr, 20, CF_DISPLAY_HEX_DIGEST, (__msg), ##__VA_ARGS__) + +#define cf_warning_digest(...) __DIGEST_SEVLOG(CF_WARNING, ##__VA_ARGS__) +#define cf_info_digest(...) __DIGEST_SEVLOG(CF_INFO, ##__VA_ARGS__) +#define cf_debug_digest(...) __DIGEST_SEVLOG(CF_DEBUG, ##__VA_ARGS__) +#define cf_detail_digest(...) __DIGEST_SEVLOG(CF_DETAIL, ##__VA_ARGS__) + +// _GNU_SOURCE gives us a strerror_r() that returns (char *).
+#define cf_strerror(err) strerror_r(err, (char *)alloca(200), 200) +// NB: cf_strerror()'s buffer is alloca()'d in the calling function's stack frame, so the string is valid only until that function returns. +/* cf_context_at_severity + * Return whether the given context is set to this severity level or higher. */ +extern bool cf_context_at_severity(const cf_fault_context context, const cf_fault_severity severity); + +extern void cf_fault_init(); + +int generate_packed_hex_string(const void *mem_ptr, uint32_t len, char* output); + +// For now there's only one cache, dumped by the ticker. +extern void cf_fault_dump_cache(); + +#define cf_dump_ticker_cache() cf_fault_dump_cache() + +#define __CACHE_SEVLOG(severity, context, __msg, ...) \ + (severity > cf_fault_filter[context] ? \ + (void)0 : \ + cf_fault_cache_event((context), severity, __FILENAME__, __LINE__, (__msg), ##__VA_ARGS__)) + +#define cf_ticker_warning(...) __CACHE_SEVLOG(CF_WARNING, ##__VA_ARGS__) +#define cf_ticker_info(...) __CACHE_SEVLOG(CF_INFO, ##__VA_ARGS__) +#define cf_ticker_debug(...) __CACHE_SEVLOG(CF_DEBUG, ##__VA_ARGS__) +#define cf_ticker_detail(...) __CACHE_SEVLOG(CF_DETAIL, ##__VA_ARGS__) diff --git a/cf/include/hardware.h b/cf/include/hardware.h new file mode 100644 index 00000000..87ac526b --- /dev/null +++ b/cf/include/hardware.h @@ -0,0 +1,56 @@ +/* + * hardware.h + * + * Copyright (C) 2016-2017 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details.
+ * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +#include +#include +#include + +#include + +typedef enum { + CF_TOPO_AUTO_PIN_NONE, + CF_TOPO_AUTO_PIN_CPU, + CF_TOPO_AUTO_PIN_NUMA +} cf_topo_auto_pin; + +typedef uint16_t cf_topo_os_cpu_index; + +typedef uint16_t cf_topo_numa_node_index; +typedef uint16_t cf_topo_core_index; +typedef uint16_t cf_topo_cpu_index; + +void cf_topo_config(cf_topo_auto_pin auto_pin, cf_topo_numa_node_index a_numa_node, + const cf_addr_list *addrs); +void cf_topo_force_map_memory(const uint8_t *from, size_t size); +void cf_topo_migrate_memory(void); +void cf_topo_info(void); + +uint16_t cf_topo_count_cores(void); +uint16_t cf_topo_count_cpus(void); + +cf_topo_cpu_index cf_topo_current_cpu(void); +cf_topo_cpu_index cf_topo_socket_cpu(const cf_socket *sock); + +void cf_topo_pin_to_core(cf_topo_core_index i_core); +void cf_topo_pin_to_cpu(cf_topo_cpu_index i_cpu); diff --git a/cf/include/hist.h b/cf/include/hist.h new file mode 100644 index 00000000..341e268f --- /dev/null +++ b/cf/include/hist.h @@ -0,0 +1,67 @@ +/* + * hist.h + * + * Copyright (C) 2009-2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. 
If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +#include +#include +#include +#include "citrusleaf/cf_atomic.h" +#include "dynbuf.h" + + +//========================================================== +// Histogram with logarithmic buckets, used for all the +// latency metrics. +// + +#define N_BUCKETS (1 + 64) +#define HISTOGRAM_NAME_SIZE 512 + +typedef enum { + HIST_MILLISECONDS, + HIST_MICROSECONDS, + HIST_SIZE, + HIST_COUNT, + HIST_SCALE_MAX_PLUS_1 +} histogram_scale; + +#define HIST_TAG_MILLISECONDS "msec" +#define HIST_TAG_MICROSECONDS "usec" +#define HIST_TAG_SIZE "bytes" +#define HIST_TAG_COUNT "count" + +// DO NOT access this member data directly - use the API! +// (Except for cf_hist_track, for which histogram is a base class.) +typedef struct histogram_s { + char name[HISTOGRAM_NAME_SIZE]; + const char* scale_tag; + uint32_t time_div; + cf_atomic64 counts[N_BUCKETS]; +} histogram; + +extern histogram *histogram_create(const char *name, histogram_scale scale); +extern void histogram_clear(histogram *h); +extern void histogram_dump(histogram *h ); + +extern uint64_t histogram_insert_data_point(histogram *h, uint64_t start_ns); +extern void histogram_insert_raw(histogram *h, uint64_t value); diff --git a/cf/include/hist_track.h b/cf/include/hist_track.h new file mode 100644 index 00000000..8dfdb287 --- /dev/null +++ b/cf/include/hist_track.h @@ -0,0 +1,86 @@ +/* + * hist_track.h + * + * Copyright (C) 2012-2014 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. 
+ * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + + +//========================================================== +// Includes +// + +#include +#include +#include "dynbuf.h" +#include "hist.h" + + +//========================================================== +// Typedefs +// + +typedef struct cf_hist_track_s cf_hist_track; + +typedef enum { + CF_HIST_TRACK_FMT_PACKED, + CF_HIST_TRACK_FMT_TABLE +} cf_hist_track_info_format; + + +//========================================================== +// Public API +// + +//------------------------------------------------ +// Constructor/Destructor +// +cf_hist_track* cf_hist_track_create(const char* name, histogram_scale scale); +void cf_hist_track_destroy(cf_hist_track* _this); + +//------------------------------------------------ +// Start/Stop Caching Data +// +bool cf_hist_track_start(cf_hist_track* _this, uint32_t back_sec, + uint32_t slice_sec, const char* thresholds); +void cf_hist_track_stop(cf_hist_track* _this); + +//------------------------------------------------ +// Histogram API "Overrides" +// +void cf_hist_track_clear(cf_hist_track* _this); +void cf_hist_track_dump(cf_hist_track* _this); + +// These are just pass-throughs to histogram insertion methods: +uint64_t cf_hist_track_insert_data_point(cf_hist_track* _this, + uint64_t start_ns); +void cf_hist_track_insert_raw(cf_hist_track* _this, uint64_t value); + +//------------------------------------------------ +// Get Statistics from Cached Data +// +void cf_hist_track_get_info(cf_hist_track* _this, uint32_t back_sec, + uint32_t duration_sec, uint32_t slice_sec, bool throughput_only, + 
cf_hist_track_info_format info_fmt, cf_dyn_buf* db_p); + +//------------------------------------------------ +// Get Current Settings +// +void cf_hist_track_get_settings(cf_hist_track* _this, cf_dyn_buf* db_p); diff --git a/cf/include/linear_hist.h b/cf/include/linear_hist.h new file mode 100644 index 00000000..da558f09 --- /dev/null +++ b/cf/include/linear_hist.h @@ -0,0 +1,61 @@ +/* + * linear_hist.h + * + * Copyright (C) 2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +#include +#include "dynbuf.h" + + +typedef struct linear_hist_s linear_hist; + +typedef struct linear_hist_threshold_s { + uint32_t value; + uint32_t bucket_index; + uint32_t bucket_width; + uint64_t bucket_count; + uint64_t target_count; +} linear_hist_threshold; + +//------------------------------------------------ +// These must all be called from the same thread! 
+// + +linear_hist *linear_hist_create(const char *name, uint32_t start, uint32_t max_offset, uint32_t num_buckets); +void linear_hist_destroy(linear_hist *h); +void linear_hist_reset(linear_hist *h, uint32_t start, uint32_t max_offset, uint32_t num_buckets); +void linear_hist_clear(linear_hist *h, uint32_t start, uint32_t max_offset); + +uint64_t linear_hist_get_total(linear_hist *h); +void linear_hist_merge(linear_hist *h1, linear_hist *h2); +void linear_hist_insert_data_point(linear_hist *h, uint32_t point); +uint64_t linear_hist_get_threshold_for_fraction(linear_hist *h, uint32_t tenths_pct, linear_hist_threshold *p_threshold); +uint64_t linear_hist_get_threshold_for_subtotal(linear_hist *h, uint64_t subtotal, linear_hist_threshold *p_threshold); + +void linear_hist_dump(linear_hist *h); +void linear_hist_save_info(linear_hist *h); + +//------------------------------------------------ +// This call is thread-safe. +// + +void linear_hist_get_info(linear_hist *h, cf_dyn_buf *db); diff --git a/cf/include/mem_count.h b/cf/include/mem_count.h new file mode 100644 index 00000000..71652749 --- /dev/null +++ b/cf/include/mem_count.h @@ -0,0 +1,51 @@ +/* + * mem_count.h + * + * Copyright (C) 2008-2014 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. 
If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +#include "dynbuf.h" + +/* + * Type for selecting the field to be sorted on for memory count reporting. + */ +typedef enum sort_field_e { + CF_ALLOC_SORT_NET_SZ, + CF_ALLOC_SORT_DELTA_SZ, + CF_ALLOC_SORT_NET_ALLOC_COUNT, + CF_ALLOC_SORT_TOTAL_ALLOC_COUNT, + CF_ALLOC_SORT_TIME_LAST_MODIFIED +} sort_field_t; + +/* + * Type for mode of enabling / disabling memory accounting. + */ +typedef enum mem_count_mode_e { + MEM_COUNT_DISABLE, // Disable memory accounting. + MEM_COUNT_ENABLE, // Enable memory accounting at daemon start-up time. + MEM_COUNT_ENABLE_DYNAMIC // Enable memory accounting at run-time. +} mem_count_mode_t; + +int mem_count_init(mem_count_mode_t mode); +void mem_count_stats(void); +int mem_count_alloc_info(char *file, int line, cf_dyn_buf *db); +int mem_count_report(sort_field_t sort_field, int top_n, cf_dyn_buf *db); +void mem_count_shutdown(void); diff --git a/cf/include/meminfo.h b/cf/include/meminfo.h new file mode 100644 index 00000000..9df7af07 --- /dev/null +++ b/cf/include/meminfo.h @@ -0,0 +1,33 @@ +/* + * meminfo.h + * + * Copyright (C) 2010 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. 
If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +#include +#include + +/* SYNOPSIS + * We have the ability to evict data to protect the server. + */ + +int +cf_meminfo(uint64_t *physmem, uint64_t *freemem, int *freepct, bool *swapping); diff --git a/cf/include/msg.h b/cf/include/msg.h new file mode 100644 index 00000000..a2ef961c --- /dev/null +++ b/cf/include/msg.h @@ -0,0 +1,232 @@ +/* + * msg.h + * + * Copyright (C) 2008-2017 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +#include +#include +#include +#include + +#include "citrusleaf/cf_atomic.h" +#include "citrusleaf/cf_vector.h" + +#include "dynbuf.h" + + +//========================================================== +// Typedefs & constants. +// + +// These values are used on the wire - don't change them. +typedef enum { + M_FT_UINT32 = 1, + M_FT_UNUSED_2 = 2, + M_FT_UINT64 = 3, + M_FT_UNUSED_4 = 4, + M_FT_STR = 5, + M_FT_BUF = 6, + M_FT_ARRAY_UINT32 = 7, + M_FT_ARRAY_UINT64 = 8, + M_FT_ARRAY_BUF = 9, + M_FT_ARRAY_STR = 10, + M_FT_MSGPACK = 11 +} msg_field_type; // encoded in uint8_t + +// These values are used on the wire - don't change them. 
+typedef enum { + M_TYPE_FABRIC = 0, + M_TYPE_HEARTBEAT_V2 = 1, + M_TYPE_PAXOS = 2, + M_TYPE_MIGRATE = 3, + M_TYPE_PROXY = 4, + M_TYPE_HEARTBEAT = 5, + M_TYPE_CLUSTERING = 6, + M_TYPE_RW = 7, + M_TYPE_INFO = 8, + M_TYPE_EXCHANGE = 9, + M_TYPE_APPEAL = 10, + M_TYPE_XDR = 11, + M_TYPE_UNUSED_12 = 12, + M_TYPE_UNUSED_13 = 13, + M_TYPE_UNUSED_14 = 14, + M_TYPE_SMD = 15, + M_TYPE_UNUSED_16 = 16, + M_TYPE_UNUSED_17 = 17, + M_TYPE_MAX = 18 +} msg_type; // encoded in uint16_t + +typedef struct msg_template_s { + uint16_t id; + msg_field_type type; +} msg_template; + +struct msg_str_array_s; +struct msg_buf_array_s; + +typedef struct msg_field_s { + uint16_t id; + bool is_set; + bool is_free; + uint32_t field_sz; + + union { + uint32_t ui32; + uint64_t ui64; + char *str; + uint8_t *buf; + uint32_t *ui32_a; + uint64_t *ui64_a; + struct msg_str_array_s *str_a; + struct msg_buf_array_s *buf_a; + void *any_buf; + } u; +} msg_field; + +typedef struct msg_s { + msg_type type; + uint16_t n_fields; + bool just_parsed; // fields point into fabric buffer + uint32_t bytes_used; + uint32_t bytes_alloc; + uint64_t benchmark_time; + msg_field f[]; // indexed by id +} msg; + +// msg header on wire. +typedef struct msg_hdr_s { + uint32_t size; + uint16_t type; +} __attribute__ ((__packed__)) msg_hdr; + +typedef enum { + MSG_GET_DIRECT, + MSG_GET_COPY_MALLOC +} msg_get_type; + +typedef enum { + MSG_SET_HANDOFF_MALLOC, + MSG_SET_COPY +} msg_set_type; + +typedef struct msg_buf_ele_s { + uint32_t sz; + uint8_t *ptr; +} msg_buf_ele; + + +//========================================================== +// Globals. +// + +extern cf_atomic_int g_num_msgs; +extern cf_atomic_int g_num_msgs_by_type[M_TYPE_MAX]; + + +//========================================================== +// Public API. +// + +//------------------------------------------------ +// Object accounting. +// + +// Free up a "msg" object. Call this function instead of freeing the msg +// directly in order to keep track of all msgs. 
+void msg_put(msg *m); + +//------------------------------------------------ +// Lifecycle. +// + +void msg_type_register(msg_type type, const msg_template *mt, size_t mt_sz, size_t scratch_sz); +msg *msg_create(msg_type type); +void msg_destroy(msg *m); +void msg_incr_ref(msg *m); + +//------------------------------------------------ +// Pack messages into flattened data. +// + +size_t msg_get_wire_size(const msg *m); +size_t msg_get_template_fixed_sz(const msg_template *mt, size_t mt_count); +size_t msg_to_wire(const msg *m, uint8_t *buf); + +//------------------------------------------------ +// Parse flattened data into messages. +// + +int msg_parse(msg *m, const uint8_t *buf, size_t bufsz); +int msg_get_initial(uint32_t *size_r, msg_type *type_r, const uint8_t *buf, uint32_t bufsz); + +void msg_reset(msg *m); +void msg_preserve_fields(msg *m, uint32_t n_field_ids, ...); +void msg_preserve_all_fields(msg *m); + +//------------------------------------------------ +// Set fields in messages. +// + +int msg_set_uint32(msg *m, int field_id, uint32_t v); +int msg_set_uint64(msg *m, int field_id, uint64_t v); +int msg_set_str(msg *m, int field_id, const char *v, msg_set_type type); +int msg_set_buf(msg *m, int field_id, const uint8_t *v, size_t sz, msg_set_type type); + +int msg_set_uint32_array_size(msg *m, int field_id, uint32_t count); +int msg_set_uint32_array(msg *m, int field_id, uint32_t idx, uint32_t v); +int msg_set_uint64_array_size(msg *m, int field_id, uint32_t count); +int msg_set_uint64_array(msg *m, int field_id, uint32_t idx, uint64_t v); + +void msg_msgpack_list_set_uint32(msg *m, int field_id, const uint32_t *buf, uint32_t count); +void msg_msgpack_list_set_buf(msg *m, int field_id, const cf_vector *v); + +//------------------------------------------------ +// Get fields from messages. 
+// + +msg_field_type msg_field_get_type(const msg *m, int field_id); +bool msg_is_set(const msg *m, int field_id); +int msg_get_uint32(const msg *m, int field_id, uint32_t *val_r); +int msg_get_uint64(const msg *m, int field_id, uint64_t *val_r); +int msg_get_str(const msg *m, int field_id, char **str_r, size_t *sz_r, msg_get_type type); +int msg_get_buf(const msg *m, int field_id, uint8_t **buf_r, size_t *sz_r, msg_get_type type); + +int msg_get_uint32_array(const msg *m, int field_id, uint32_t idx, uint32_t *val_r); +int msg_get_uint64_array_count(const msg *m, int field_id, uint32_t *count_r); +int msg_get_uint64_array(const msg *m, int field_id, uint32_t idx, uint64_t *val_r); + +bool msg_msgpack_container_get_count(const msg *m, int field_id, uint32_t *count_r); +bool msg_msgpack_list_get_uint32_array(const msg *m, int field_id, uint32_t *buf_r, uint32_t *count_r); +bool msg_msgpack_list_get_buf_array(const msg *m, int field_id, cf_vector *v_r, bool init_vec); + +static inline bool +msg_msgpack_list_get_buf_array_presized(const msg *m, int field_id, cf_vector *v_r) +{ + return msg_msgpack_list_get_buf_array(m, field_id, v_r, false); +} + + +//========================================================== +// Debugging API. +// + +void msg_dump(const msg *m, const char *info); diff --git a/cf/include/node.h b/cf/include/node.h new file mode 100644 index 00000000..4e2f81eb --- /dev/null +++ b/cf/include/node.h @@ -0,0 +1,71 @@ +/* + * node.h + * + * Copyright (C) 2017 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. 
+ * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +#include +#include + +#include "compare.h" + +typedef uint64_t cf_node; + +uint32_t cf_nodeid_shash_fn(const void *key); +uint32_t cf_nodeid_rchash_fn(const void *key, uint32_t key_size); +char *cf_node_name(); +// Linear search of nodes[0..n_nodes): returns the index of the first match, or -1 if node is absent. +static inline int +index_of_node(const cf_node* nodes, uint32_t n_nodes, cf_node node) +{ + for (uint32_t n = 0; n < n_nodes; n++) { + if (node == nodes[n]) { + return (int)n; + } + } + + return -1; +} +// Returns true if node appears among the first n_nodes entries of nodes. +static inline bool +contains_node(const cf_node* nodes, uint32_t n_nodes, cf_node node) +{ + return index_of_node(nodes, n_nodes, node) != -1; +} +// Removes the first match by overwriting it with the last entry (order is not preserved); returns the new count. +static inline uint32_t +remove_node(cf_node* nodes, uint32_t n_nodes, cf_node node) +{ + int n = index_of_node(nodes, n_nodes, node); + + if (n != -1) { + nodes[n] = nodes[--n_nodes]; + } + + return n_nodes; +} +// Comparator taking pointers to cf_node values; delegates to cf_compare_uint64_desc(). +static inline int +cf_node_compare_desc(const void* pa, const void* pb) +{ + // Relies on cf_node being uint64_t. + return cf_compare_uint64_desc(pa, pb); +} diff --git a/cf/include/olock.h b/cf/include/olock.h new file mode 100644 index 00000000..a7907b7d --- /dev/null +++ b/cf/include/olock.h @@ -0,0 +1,49 @@ +/* + * olock.h + * + * Copyright (C) 2008-2014 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version.
+ * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +/* + * An object lock system allows fewer locks to be created + */ + +#pragma once + +#include +#include + +#include + +#include + + +typedef struct olock_s { + uint32_t n_locks; + uint32_t mask; + cf_mutex locks[]; +} olock; + +void olock_lock(olock *ol, cf_digest *d); +void olock_vlock(olock *ol, cf_digest *d, cf_mutex **vlock); +void olock_unlock(olock *ol, cf_digest *d); +olock *olock_create(uint32_t n_locks, bool mutex); +void olock_destroy(olock *o); + +extern olock *g_record_locks; diff --git a/cf/include/shash.h b/cf/include/shash.h new file mode 100644 index 00000000..c70e4d11 --- /dev/null +++ b/cf/include/shash.h @@ -0,0 +1,110 @@ +/* + * shash.h + * + * Copyright (C) 2017 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. 
If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +//========================================================== +// Includes. +// + +#include +#include + +#include + + +//========================================================== +// Typedefs & constants. +// + +// Return codes. +#define CF_SHASH_ERR_FOUND -4 +#define CF_SHASH_ERR_NOT_FOUND -3 +#define CF_SHASH_ERR -1 +#define CF_SHASH_OK 0 +#define CF_SHASH_REDUCE_DELETE 1 + +// Bit-values for 'flags' parameter. +#define CF_SHASH_BIG_LOCK 0x01 // thread-safe with single big lock +#define CF_SHASH_MANY_LOCK 0x02 // thread-safe with lock per bucket + +// User must provide the hash function at create time. +typedef uint32_t (*cf_shash_hash_fn)(const void *key); + +// FIXME - explain or replace. +typedef void (*cf_shash_update_fn)(const void *key, void *value_old, void *value_new, void *udata); + +// The "reduce" function called for every element. Returned value governs +// behavior during reduce as follows: +// - CF_SHASH_OK - continue iterating +// - CF_SHASH_REDUCE_DELETE - delete the current element, continue iterating +// - anything else (e.g. CF_SHASH_ERR) - stop iterating and return reduce_fn's +// returned value +typedef int (*cf_shash_reduce_fn)(const void *key, void *value, void *udata); + +// Private data. +typedef struct cf_shash_s { + cf_shash_hash_fn h_fn; + uint32_t key_size; + uint32_t value_size; + uint32_t ele_size; + uint32_t n_buckets; + uint32_t flags; + cf_atomic32 n_elements; + void *table; + pthread_mutex_t *bucket_locks; + pthread_mutex_t big_lock; +} cf_shash; + + +//========================================================== +// Public API - useful hash functions. +// + +// TODO - hash function signature may change. +uint32_t cf_shash_fn_u32(const void *key); +uint32_t cf_shash_fn_ptr(const void *key); +uint32_t cf_shash_fn_zstr(const void *key); + + +//========================================================== +// Public API. 
+// + +cf_shash *cf_shash_create(cf_shash_hash_fn h_fn, uint32_t key_size, uint32_t value_size, uint32_t n_buckets, uint32_t flags); +void cf_shash_destroy(cf_shash *h); +uint32_t cf_shash_get_size(cf_shash *h); + +void cf_shash_put(cf_shash *h, const void *key, const void *value); +int cf_shash_put_unique(cf_shash *h, const void *key, const void *value); + +void cf_shash_update(cf_shash *h, const void *key, void *value_old, void *value_new, cf_shash_update_fn update_fn, void *udata); + +int cf_shash_get(cf_shash *h, const void *key, void *value); +int cf_shash_get_vlock(cf_shash *h, const void *key, void **value_r, pthread_mutex_t **vlock_r); + +int cf_shash_delete(cf_shash *h, const void *key); +int cf_shash_delete_lockfree(cf_shash *h, const void *key); +int cf_shash_get_and_delete(cf_shash *h, const void *key, void *value); +void cf_shash_delete_all(cf_shash *h); + +int cf_shash_reduce(cf_shash *h, cf_shash_reduce_fn reduce_fn, void *udata); diff --git a/cf/include/socket.h b/cf/include/socket.h new file mode 100644 index 00000000..ca5f8d29 --- /dev/null +++ b/cf/include/socket.h @@ -0,0 +1,340 @@ +/* + * socket.h + * + * Copyright (C) 2008-2017 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. 
If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +#include +#include +#include +#include + +#include +#include +#include + +#include "fault.h" +#include "msg.h" +#include "node.h" + +// Use forward declaration instead of including openssl/ssl.h here. +struct ssl_st; + +#define CF_SOCKET_TIMEOUT 10000 +#define CF_SOCK_CFG_MAX 250 + +// Accesses the socket file descriptor as an rvalue, i.e., the socket file descriptor +// cannot be modified. +#define CSFD(sock) ((int32_t)((sock)->fd)) + +// CSFD() for epoll file descriptors. +#define CEFD(poll) ((int32_t)(poll).fd) + +// Like CEFD(), but produces an lvalue, i.e., the epoll file descriptor can be modified. +#define EFD(poll) ((poll).fd) + +#define cf_ip_addr_print(_addr) ({ \ + char *_tmp = alloca(250); \ + cf_ip_addr_to_string_safe(_addr, _tmp, 250); \ + _tmp; \ +}) + +#define cf_ip_addr_print_multi(_addrs, _n_addrs) ({ \ + char *_tmp = alloca(2500); \ + cf_ip_addr_to_string_multi_safe(_addrs, _n_addrs, _tmp, 2500); \ + _tmp; \ +}) + +#define cf_ip_port_print(_port) ({ \ + char *_tmp = alloca(25); \ + cf_ip_port_to_string_safe(_port, _tmp, 25); \ + _tmp; \ +}) + +#define cf_sock_addr_print(_addr) ({ \ + char *_tmp = alloca(250); \ + cf_sock_addr_to_string_safe(_addr, _tmp, 250); \ + _tmp; \ +}) + +typedef struct cf_ip_addr_s { + sa_family_t family; + + union { + struct in_addr v4; + struct in6_addr v6; + }; +} cf_ip_addr; + +typedef uint16_t cf_ip_port; + +typedef struct cf_addr_list_s { + uint32_t n_addrs; + const char *addrs[CF_SOCK_CFG_MAX]; +} cf_addr_list; + +typedef struct cf_serv_spec_s { + cf_ip_port bind_port; + cf_addr_list bind; + cf_ip_port std_port; + cf_addr_list std; + cf_ip_port alt_port; + cf_addr_list alt; + char *tls_our_name; + uint32_t n_tls_peer_names; + char *tls_peer_names[CF_SOCK_CFG_MAX]; +} cf_serv_spec; + +typedef struct cf_sock_addr_s { + cf_ip_addr addr; + cf_ip_port port; +} cf_sock_addr; + +typedef enum { + CF_SOCKET_STATE_NON_TLS, + CF_SOCKET_STATE_TLS_HANDSHAKE, + 
CF_SOCKET_STATE_TLS_READY +} cf_socket_state; + +typedef struct cf_socket_s { + int32_t fd; + cf_socket_state state; + void *cfg; + struct ssl_st *ssl; +} cf_socket; + +typedef struct cf_sockets_s { + uint32_t n_socks; + cf_socket socks[CF_SOCK_CFG_MAX]; +} cf_sockets; + +typedef enum { + CF_SOCK_OWNER_SERVICE, + CF_SOCK_OWNER_SERVICE_TLS, + CF_SOCK_OWNER_HEARTBEAT, + CF_SOCK_OWNER_HEARTBEAT_TLS, + CF_SOCK_OWNER_FABRIC, + CF_SOCK_OWNER_FABRIC_TLS, + CF_SOCK_OWNER_INFO, + CF_SOCK_OWNER_XDR, + CF_SOCK_OWNER_INVALID +} cf_sock_owner; + +typedef struct cf_sock_cfg_s { + cf_sock_owner owner; + cf_ip_port port; + cf_ip_addr addr; +} cf_sock_cfg; + +typedef struct cf_serv_cfg_s { + uint32_t n_cfgs; + cf_sock_cfg cfgs[CF_SOCK_CFG_MAX]; +} cf_serv_cfg; + +typedef struct cf_poll_s { + int32_t fd; +} __attribute__((packed)) cf_poll; + +// This precisely matches the epoll_event struct. +typedef struct cf_poll_event_s { + uint32_t events; + void *data; +} __attribute__((packed)) cf_poll_event; + +typedef struct cf_msock_cfg_s { + cf_sock_owner owner; + cf_ip_port port; + cf_ip_addr addr; + cf_ip_addr if_addr; + uint8_t ttl; +} cf_msock_cfg; + +typedef struct cf_mserv_cfg_s { + uint32_t n_cfgs; + cf_msock_cfg cfgs[CF_SOCK_CFG_MAX]; +} cf_mserv_cfg; + +void cf_socket_set_advertise_ipv6(bool advertise); +bool cf_socket_advertises_ipv6(void); + +CF_MUST_CHECK int32_t cf_ip_addr_from_string(const char *string, cf_ip_addr *addr); +CF_MUST_CHECK int32_t cf_ip_addr_from_string_multi(const char *string, cf_ip_addr *addrs, uint32_t *n_addrs); +CF_MUST_CHECK int32_t cf_ip_addr_to_string(const cf_ip_addr *addr, char *string, size_t size); +void cf_ip_addr_to_string_safe(const cf_ip_addr *addr, char *string, size_t size); +CF_MUST_CHECK int32_t cf_ip_addr_to_string_multi(const cf_ip_addr *addrs, uint32_t n_addrs, char *string, size_t size); +void cf_ip_addr_to_string_multi_safe(const cf_ip_addr *addrs, uint32_t n_addrs, char *string, size_t size); +CF_MUST_CHECK int32_t 
cf_ip_addr_from_binary(const uint8_t *binary, size_t size, cf_ip_addr *addr); +CF_MUST_CHECK int32_t cf_ip_addr_to_binary(const cf_ip_addr *addr, uint8_t *binary, size_t size); +void cf_ip_addr_to_rack_aware_id(const cf_ip_addr *addr, uint32_t *id); + +CF_MUST_CHECK int32_t cf_ip_addr_compare(const cf_ip_addr *lhs, const cf_ip_addr *rhs); +void cf_ip_addr_copy(const cf_ip_addr *from, cf_ip_addr *to); +void cf_ip_addr_sort(cf_ip_addr *addrs, uint32_t n_addrs); + +bool cf_ip_addr_is_dns_name(const char *string); +bool cf_ip_addr_str_is_legacy(const char *string); +bool cf_ip_addr_is_legacy(const cf_ip_addr *addr); +bool cf_ip_addr_legacy_only(void); + +void cf_ip_addr_set_local(cf_ip_addr *addr); +CF_MUST_CHECK bool cf_ip_addr_is_local(const cf_ip_addr *addr); + +void cf_ip_addr_set_any(cf_ip_addr *addr); +CF_MUST_CHECK bool cf_ip_addr_is_any(const cf_ip_addr *addr); + +CF_MUST_CHECK int32_t cf_ip_port_from_string(const char *string, cf_ip_port *port); +CF_MUST_CHECK int32_t cf_ip_port_to_string(cf_ip_port port, char *string, size_t size); +void cf_ip_port_to_string_safe(cf_ip_port port, char *string, size_t size); +CF_MUST_CHECK int32_t cf_ip_port_from_binary(const uint8_t *binary, size_t size, cf_ip_port *port); +CF_MUST_CHECK int32_t cf_ip_port_to_binary(cf_ip_port port, uint8_t *binary, size_t size); +void cf_ip_port_from_node_id(cf_node id, cf_ip_port *port); + +CF_MUST_CHECK int32_t cf_sock_addr_from_string(const char *string, cf_sock_addr *addr); +CF_MUST_CHECK int32_t cf_sock_addr_to_string(const cf_sock_addr *addr, char *string, size_t size); +void cf_sock_addr_to_string_safe(const cf_sock_addr *addr, char *string, size_t size); +CF_MUST_CHECK int32_t cf_sock_addr_from_binary(const uint8_t *binary, size_t size, cf_sock_addr *addr); +CF_MUST_CHECK int32_t cf_sock_addr_to_binary(const cf_sock_addr *addr, uint8_t *binary, size_t size); + +CF_MUST_CHECK int32_t cf_sock_addr_from_host_port(const char *host, cf_ip_port port, cf_sock_addr *addr); +void 
cf_sock_addr_from_addr_port(const cf_ip_addr *ip_addr, cf_ip_port port, cf_sock_addr *addr); + +CF_MUST_CHECK int32_t cf_sock_addr_compare(const cf_sock_addr *lhs, const cf_sock_addr *rhs); +void cf_sock_addr_copy(const cf_sock_addr *from, cf_sock_addr *to); + +void cf_sock_addr_from_native(const struct sockaddr *native, cf_sock_addr *addr); +void cf_sock_addr_to_native(const cf_sock_addr *addr, struct sockaddr *native); + +void cf_sock_addr_set_any(cf_sock_addr *addr); +CF_MUST_CHECK bool cf_sock_addr_is_any(const cf_sock_addr *addr); + +void cf_sock_cfg_init(cf_sock_cfg *cfg, cf_sock_owner owner); +void cf_sock_cfg_copy(const cf_sock_cfg *from, cf_sock_cfg *to); + +void cf_serv_cfg_init(cf_serv_cfg *cfg); +CF_MUST_CHECK int32_t cf_serv_cfg_add_sock_cfg(cf_serv_cfg *serv_cfg, const cf_sock_cfg *sock_cfg); + +void cf_sockets_init(cf_sockets *socks); +CF_MUST_CHECK bool cf_sockets_has_socket(const cf_sockets *socks, const cf_socket *sock); +void cf_sockets_close(cf_sockets *socks); + +void cf_fd_disable_blocking(int32_t fd); + +void cf_socket_disable_blocking(cf_socket *sock); +void cf_socket_enable_blocking(cf_socket *sock); +void cf_socket_disable_nagle(cf_socket *sock); +void cf_socket_enable_nagle(cf_socket *sock); +void cf_socket_keep_alive(cf_socket *sock, int32_t idle, int32_t interval, int32_t count); +void cf_socket_set_send_buffer(cf_socket *sock, int32_t size); +void cf_socket_set_receive_buffer(cf_socket *sock, int32_t size); +void cf_socket_set_window(cf_socket *sock, int32_t size); + +void cf_socket_init(cf_socket *sock); +bool cf_socket_exists(cf_socket *sock); + +static inline void cf_socket_copy(const cf_socket *from, cf_socket *to) +{ + to->fd = from->fd; + to->state = from->state; + to->cfg = from->cfg; + to->ssl = from->ssl; +} + +CF_MUST_CHECK int32_t cf_socket_init_server(cf_serv_cfg *cfg, cf_sockets *socks); +void cf_socket_show_server(cf_fault_context cont, const char *tag, const cf_sockets *socks); +CF_MUST_CHECK int32_t 
cf_socket_init_client(cf_sock_cfg *cfg, int32_t timeout, cf_socket *sock); + +CF_MUST_CHECK int32_t cf_socket_accept(cf_socket *lsock, cf_socket *sock, cf_sock_addr *addr); +CF_MUST_CHECK int32_t cf_socket_remote_name(const cf_socket *sock, cf_sock_addr *addr); +CF_MUST_CHECK int32_t cf_socket_local_name(const cf_socket *sock, cf_sock_addr *addr); +CF_MUST_CHECK int32_t cf_socket_available(cf_socket *sock); + +CF_MUST_CHECK int32_t cf_socket_recv_from(cf_socket *sock, void *buff, size_t size, int32_t flags, cf_sock_addr *addr); +CF_MUST_CHECK int32_t cf_socket_recv(cf_socket *sock, void *buff, size_t size, int32_t flags); +CF_MUST_CHECK int32_t cf_socket_send_to(cf_socket *sock, const void *buff, size_t size, int32_t flags, const cf_sock_addr *addr); +CF_MUST_CHECK int32_t cf_socket_send(cf_socket *sock, const void *buff, size_t size, int32_t flags); + +CF_MUST_CHECK int32_t cf_socket_recv_from_all(cf_socket *sock, void *buff, size_t size, int32_t flags, cf_sock_addr *addr, int32_t timeout); +CF_MUST_CHECK int32_t cf_socket_recv_all(cf_socket *sock, void *buff, size_t size, int32_t flags, int32_t timeout); +CF_MUST_CHECK int32_t cf_socket_send_to_all(cf_socket *sock, const void *buff, size_t size, int32_t flags, const cf_sock_addr *addr, int32_t timeout); +CF_MUST_CHECK int32_t cf_socket_send_all(cf_socket *sock, const void *buff, size_t size, int32_t flags, int32_t timeout); + +void cf_socket_write_shutdown(cf_socket *sock); +void cf_socket_shutdown(cf_socket *sock); +void cf_socket_close(cf_socket *sock); +void cf_socket_drain_close(cf_socket *sock); +void cf_socket_term(cf_socket *sock); + +void cf_msock_cfg_init(cf_msock_cfg *cfg, cf_sock_owner owner); +void cf_msock_cfg_copy(const cf_msock_cfg *from, cf_msock_cfg *to); + +void cf_mserv_cfg_init(cf_mserv_cfg *cfg); +CF_MUST_CHECK int32_t cf_mserv_cfg_add_msock_cfg(cf_mserv_cfg *serv_cfg, const cf_msock_cfg *sock_cfg); +CF_MUST_CHECK int32_t cf_mserv_cfg_add_combo(cf_mserv_cfg *serv_cfg, cf_sock_owner owner, 
cf_ip_port port, cf_ip_addr *addr, cf_ip_addr *if_addr, uint8_t ttl); + +CF_MUST_CHECK int32_t cf_socket_mcast_init(cf_mserv_cfg *cfg, cf_sockets *socks); +void cf_socket_mcast_show(cf_fault_context cont, const char *tag, const cf_sockets *socks); +CF_MUST_CHECK int32_t cf_socket_mcast_set_inter(cf_socket *sock, const cf_ip_addr *iaddr); +CF_MUST_CHECK int32_t cf_socket_mcast_set_ttl(cf_socket *sock, int32_t ttl); +CF_MUST_CHECK int32_t cf_socket_mcast_join_group(cf_socket *sock, const cf_ip_addr *iaddr, const cf_ip_addr *gaddr); + +void cf_poll_create(cf_poll *poll); +void cf_poll_add_fd(cf_poll poll, int32_t fd, uint32_t events, void *data); +void cf_poll_add_socket(cf_poll poll, const cf_socket *sock, uint32_t events, void *data); +CF_MUST_CHECK int32_t cf_poll_modify_socket_forgiving(cf_poll poll, const cf_socket *sock, uint32_t events, void *data, uint32_t n_err_ok, int32_t *err_ok); +CF_MUST_CHECK int32_t cf_poll_delete_socket_forgiving(cf_poll poll, const cf_socket *sock, uint32_t n_err_ok, int32_t *err_ok); +void cf_poll_add_sockets(cf_poll poll, cf_sockets *socks, uint32_t events); +void cf_poll_delete_sockets(cf_poll poll, cf_sockets *socks); +CF_MUST_CHECK int32_t cf_poll_wait(cf_poll poll, cf_poll_event *events, int32_t limit, int32_t timeout); +void cf_poll_destroy(cf_poll poll); + +static inline void cf_poll_modify_socket(cf_poll poll, const cf_socket *sock, uint32_t events, void *data) +{ + CF_IGNORE_ERROR(cf_poll_modify_socket_forgiving(poll, sock, events, data, 0, NULL)); +} + +static inline void cf_poll_delete_socket(cf_poll poll, const cf_socket *sock) +{ + CF_IGNORE_ERROR(cf_poll_delete_socket_forgiving(poll, sock, 0, NULL)); +} + +CF_MUST_CHECK int32_t cf_inter_get_addr_all(cf_ip_addr *addrs, uint32_t *n_addrs); +CF_MUST_CHECK int32_t cf_inter_get_addr_all_legacy(cf_ip_addr *addrs, uint32_t *n_addrs); +CF_MUST_CHECK int32_t cf_inter_get_addr_def(cf_ip_addr *addrs, uint32_t *n_addrs); +CF_MUST_CHECK int32_t 
cf_inter_get_addr_def_legacy(cf_ip_addr *addrs, uint32_t *n_addrs); +CF_MUST_CHECK int32_t cf_inter_get_addr_name(cf_ip_addr *addrs, uint32_t *n_addrs, const char *if_name); +bool cf_inter_is_inter_name(const char *if_name); +CF_MUST_CHECK int32_t cf_inter_addr_to_index_and_name(const cf_ip_addr *addr, int32_t *index, char **name); +void cf_inter_expand_bond(const char *if_name, char **out_names, uint32_t *n_out); +CF_MUST_CHECK int32_t cf_inter_mtu(const cf_ip_addr *inter_addr); +CF_MUST_CHECK int32_t cf_inter_min_mtu(void); +bool cf_inter_detect_changes(cf_ip_addr *addrs, uint32_t *n_addrs, uint32_t limit); +bool cf_inter_detect_changes_legacy(cf_ip_addr *addrs, uint32_t *n_addrs, uint32_t limit); + +CF_MUST_CHECK int32_t cf_node_id_get(cf_ip_port port, const char *if_hint, cf_node *id); + +#if defined CF_SOCKET_PRIVATE +CF_MUST_CHECK size_t cf_socket_addr_len(const struct sockaddr* sa); +CF_MUST_CHECK int32_t cf_socket_parse_netlink(bool allow_v6, uint32_t family, uint32_t flags, + const void *data, size_t len, cf_ip_addr *addr); +void cf_socket_fix_client(cf_socket *sock); +void cf_socket_fix_bind(cf_serv_cfg *serv_cfg); +void cf_socket_fix_server(cf_socket *sock); +#endif diff --git a/cf/include/tls.h b/cf/include/tls.h new file mode 100644 index 00000000..2bd77ca9 --- /dev/null +++ b/cf/include/tls.h @@ -0,0 +1,75 @@ +/* + * tls.h + * + * Copyright (C) 2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. 
See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +#include "socket.h" + +struct cf_tls_info_s; +typedef struct cf_tls_info_s cf_tls_info; + +typedef struct cf_tls_spec_s { + char *ca_file; + char *ca_path; + char *cert_blacklist; + char *cert_file; + char *cipher_suite; + char *key_file; + char *name; + char *protocols; +} cf_tls_spec; + +void tls_check_init(); + +void tls_cleanup(); +void tls_thread_cleanup(); + +void tls_socket_init(cf_socket *sock); +void tls_socket_term(cf_socket *sock); +int tls_socket_shutdown(cf_socket *sock); +void tls_socket_close(cf_socket *sock); + +cf_tls_info *tls_config_server_context(cf_tls_spec *tspec, bool auth_client, uint32_t n_peer_names, char **peer_names); +cf_tls_info *tls_config_intra_context(cf_tls_spec *tspec, const char *which); + +void tls_socket_prepare_server(cf_tls_info *info, cf_socket *sock); +void tls_socket_prepare_client(cf_tls_info *info, cf_socket *sock); + +static inline bool tls_socket_needs_handshake(cf_socket *sock) +{ + return sock->state == CF_SOCKET_STATE_TLS_HANDSHAKE; +} + +void tls_socket_must_not_have_data(cf_socket *sock, const char *caller); + +int tls_socket_accept(cf_socket *sock); +int tls_socket_connect(cf_socket *sock); +int tls_socket_accept_block(cf_socket *sock); +int tls_socket_connect_block(cf_socket *sock); + +int tls_socket_recv(cf_socket *sock, void *buf, size_t sz, int32_t flags, + uint64_t timeout_msec); + +int tls_socket_send(cf_socket *sock, void const *buf, size_t sz, int32_t flags, + uint64_t timeout_msec); + +int tls_socket_pending(cf_socket *sock); diff --git a/cf/include/vmapx.h b/cf/include/vmapx.h new file mode 100644 index 00000000..93b3c50d --- /dev/null +++ b/cf/include/vmapx.h @@ -0,0 +1,100 @@ +/* + * vmapx.h + * + * Copyright (C) 2012-2016 Aerospike, Inc. 
+ * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#pragma once + +//========================================================== +// Includes. +// + +#include +#include +#include + + +//========================================================== +// Typedefs & constants. +// + +typedef struct vhash_s vhash; + +// DO NOT access this member data directly - use the API! +// Caution - changing this struct could break warm or cool restart. +typedef struct cf_vmapx_s { + // Vector-related. + uint32_t value_size; + uint32_t max_count; + volatile uint32_t count; + + // Hash-related. + uint32_t key_size; + vhash* hash; + + // Generic. + pthread_mutex_t write_lock; + + //<><><><><><><><><><><> 64 bytes <><><><><><><><><><><> + + // Vector data. + uint8_t values[]; +} cf_vmapx; + +typedef enum { + CF_VMAPX_OK = 0, + CF_VMAPX_ERR_BAD_PARAM, + CF_VMAPX_ERR_FULL, + CF_VMAPX_ERR_NAME_EXISTS, + CF_VMAPX_ERR_NAME_NOT_FOUND, + CF_VMAPX_ERR_UNKNOWN +} cf_vmapx_err; + + +//========================================================== +// Public API. 
+// + +size_t cf_vmapx_sizeof(uint32_t value_size, uint32_t max_count); + +void cf_vmapx_init(cf_vmapx* vmap, uint32_t value_size, uint32_t max_count, uint32_t hash_size, uint32_t max_name_size); +void cf_vmapx_release(cf_vmapx* vmap); + +uint32_t cf_vmapx_count(const cf_vmapx* vmap); + +cf_vmapx_err cf_vmapx_get_by_index(const cf_vmapx* vmap, uint32_t index, void** pp_value); +cf_vmapx_err cf_vmapx_get_by_name(const cf_vmapx* vmap, const char* name, void** pp_value); + +cf_vmapx_err cf_vmapx_get_index(const cf_vmapx* vmap, const char* name, uint32_t* p_index); +cf_vmapx_err cf_vmapx_get_index_w_len(const cf_vmapx* vmap, const char* name, size_t name_len, uint32_t* p_index); + +cf_vmapx_err cf_vmapx_put_unique(cf_vmapx* vmap, const char* name, uint32_t* p_index); +cf_vmapx_err cf_vmapx_put_unique_w_len(cf_vmapx* vmap, const char* name, size_t name_len, uint32_t* p_index); + + +//========================================================== +// Private API - for enterprise separation only. +// + +void* vmapx_value_ptr(const cf_vmapx* vmap, uint32_t index); + +vhash* vhash_create(uint32_t key_size, uint32_t n_rows); +void vhash_destroy(vhash* h); +void vhash_put(vhash* h, const char* key, size_t key_len, uint32_t value); diff --git a/cf/include/warnings.h b/cf/include/warnings.h new file mode 100644 index 00000000..d17ca25b --- /dev/null +++ b/cf/include/warnings.h @@ -0,0 +1,28 @@ +/* + * warnings.h + * + * Copyright (C) 2017 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. 
+ * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#pragma GCC diagnostic warning "-Wall" +#pragma GCC diagnostic warning "-Wextra" +#pragma GCC diagnostic warning "-Wconversion" +#pragma GCC diagnostic warning "-Wsign-conversion" +#pragma GCC diagnostic warning "-Wshadow" +#pragma GCC diagnostic warning "-Wmissing-declarations" diff --git a/cf/src/Makefile b/cf/src/Makefile new file mode 100644 index 00000000..7cca26ed --- /dev/null +++ b/cf/src/Makefile @@ -0,0 +1,41 @@ +# Citrusleaf Foundation +# Makefile + +DEPTH = ../.. +include $(DEPTH)/make_in/Makefile.in + +ifeq ($(USE_EE),1) + include $(EEREPO)/cf/make_in/Makefile.vars +endif + +HEADERS += arenax.h bits.h cf_mutex.h cf_str.h compare.h daemon.h dynbuf.h +HEADERS += enhanced_alloc.h fault.h hist.h hist_track.h linear_hist.h mem_count.h +HEADERS += meminfo.h msg.h node.h olock.h shash.h socket.h tls.h +HEADERS += vmapx.h + +SOURCES += alloc.c arenax.c cf_mutex.c cf_str.c daemon.c dynbuf.c fault.c hardware.c +SOURCES += hist.c hist_track.c linear_hist.c meminfo.c msg.c node.c olock.c +SOURCES += shash.c socket.c vmapx.c +ifneq ($(USE_EE),1) + SOURCES += arenax_ce.c socket_ce.c tls_ce.c +endif + +LIBRARY = $(LIBRARY_DIR)/libcf.a + +INCLUDES += $(INCLUDE_DIR:%=-I%) -I$(COMMON)/src/include + +OBJECTS = $(SOURCES:%.c=$(OBJECT_DIR)/%.o) +DEPENDENCIES = $(OBJECTS:%.o=%.d) + +.PHONY: all +all: $(LIBRARY) + +.PHONY: clean +clean: + $(RM) $(OBJECTS) $(LIBRARY) + $(RM) $(DEPENDENCIES) + +$(LIBRARY): $(OBJECTS) + $(AR) rs $(LIBRARY) $(OBJECTS) + +include $(DEPTH)/make_in/Makefile.targets diff --git a/cf/src/alloc.c b/cf/src/alloc.c new file mode 100644 index 
00000000..159d3269 --- /dev/null +++ b/cf/src/alloc.c @@ -0,0 +1,1075 @@ +/* + * alloc.c + * + * Copyright (C) 2008-2017 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +// Make sure that stdlib.h gives us aligned_alloc(). +#define _ISOC11_SOURCE + +#include "enhanced_alloc.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +#include "fault.h" +#include "mem_count.h" + +#include "aerospike/ck/ck_pr.h" +#include "citrusleaf/cf_atomic.h" + +#include "warnings.h" + +#undef strdup +#undef strndup + +#define N_ARENAS 150 +#define PAGE_SZ 4096 + +#define MAX_SITES 4096 +#define MAX_THREADS 256 + +#define MULT 3486784401u +#define MULT_INV 3396732273u + +#define STR_(x) #x +#define STR(x) STR_(x) + +typedef struct site_info_s { + uint32_t site_id; + pid_t thread_id; + size_t size_lo; + size_t size_hi; +} site_info; + +// Old glibc versions don't provide this; work around compiler warning. 
+void *aligned_alloc(size_t align, size_t sz);
+
+// Option string "narenas:<N_ARENAS>". Presumably picked up by JEMalloc through
+// its jem_-prefixed malloc_conf symbol - confirm against the JEMalloc build.
+const char *jem_malloc_conf = "narenas:" STR(N_ARENAS);
+
+// JEMalloc internals that hook_get_arena() relies on.
+extern size_t je_chunksize_mask;
+extern void *je_huge_aalloc(const void *p);
+
+__thread int32_t g_ns_arena = -1;
+static __thread int32_t g_ns_tcache = -1;
+
+// Registered call sites (return addresses), indexed by site ID, and how many
+// of the slots are in use.
+static const void *g_site_ras[MAX_SITES];
+static uint32_t g_n_site_ras;
+
+// Pool of per-(site, thread) accounting records.
+static site_info g_site_infos[MAX_SITES * MAX_THREADS];
+// Info IDs are handed out starting at 1, so that an info ID of 0 can mean
+// "no site_info record yet".
+static uint32_t g_n_site_infos = 1;
+
+// Per-thread map: site ID -> info ID of this thread's site_info record.
+static __thread uint32_t g_thread_site_infos[MAX_SITES];
+
+// Per-thread cache for hook_gettid(); 0 means "not looked up yet".
+static __thread pid_t g_tid;
+// Start with *_ALL; see cf_alloc_set_debug() for details.
+static cf_alloc_debug g_debug = CF_ALLOC_DEBUG_ALL;
+
+// The hook_*() helpers below run inside the hooks that intercept malloc() and
+// friends for memory accounting.
+//
+// Consequently nothing is known about the caller - in particular not which
+// locks it holds - so be careful when calling back into asd code from here.
+
+// Return the arena value recorded for the allocation at p, i.e., the first
+// int32_t of the arena object that JEMalloc associates with p.
+
+static int32_t
+hook_get_arena(const void *p)
+{
+	int32_t **chunk = (int32_t **)((uint64_t)p & ~je_chunksize_mask);
+
+	if (chunk == p) {
+		// Huge allocation: p itself is the chunk start, so ask JEMalloc.
+		int32_t *huge_arena = je_huge_aalloc(p);
+
+		return huge_arena[0];
+	}
+
+	// Small or large allocation: the arena pointer is the first word of the
+	// chunk that contains p.
+	int32_t *chunk_arena = chunk[0];
+
+	return chunk_arena[0];
+}
+
+// Crash if the allocation at p lives in an arena that is incompatible with the
+// requested "arena". Does nothing when debugging is CF_ALLOC_DEBUG_NONE.
+//
+// Accepted combinations:
+//   - arena < 0 and p's arena < N_ARENAS
+//   - arena >= N_ARENAS and p's arena >= N_ARENAS
+//
+// Original note: the "arena" parameter is never < N_ARENAS - presumably meaning
+// that a non-negative "arena" is never in [0, N_ARENAS); this is a caller
+// contract that is not checked here.
+
+static void
+hook_check_arena(const void *p, int32_t arena)
+{
+	if (g_debug == CF_ALLOC_DEBUG_NONE) {
+		return;
+	}
+
+	int32_t actual = hook_get_arena(p);
+
+	if ((arena < 0 && actual < N_ARENAS) ||
+			(arena >= N_ARENAS && actual >= N_ARENAS)) {
+		return;
+	}
+
+	size_t jem_sz = jem_sallocx(p, 0);
+
+	cf_crash(CF_ALLOC, "arena change for %zu@%p: %d -> %d", jem_sz, p, actual, arena);
+}
+
+// Return the calling thread's kernel thread ID. The SYS_gettid system call is
+// made only while the thread-local cache g_tid is still 0.
+
+static pid_t
+hook_gettid(void)
+{
+	pid_t tid = g_tid;
+
+	if (tid == 0) {
+		tid = (pid_t)syscall(SYS_gettid);
+		g_tid = tid;
+	}
+
+	return tid;
+}
+
+// Map a 64-bit address to a 12-bit site ID.
+ +static uint32_t +hook_get_site_id(const void *ra) +{ + uint32_t site_id = (uint32_t)(uint64_t)ra & (MAX_SITES - 1); + + for (uint32_t i = 0; i < MAX_SITES; ++i) { + const void *site_ra = ck_pr_load_ptr(g_site_ras + site_id); + + // The allocation site is already registered and we found its + // slot. Return the slot index. + + if (site_ra == ra) { + return site_id; + } + + // We reached an empty slot, i.e., the allocation site isn't yet + // registered. Try to register it. If somebody else managed to grab + // this slot in the meantime, keep looping. Otherwise return the + // slot index. + + if (site_ra == NULL && ck_pr_cas_ptr(g_site_ras + site_id, NULL, (void *)ra)) { + ck_pr_inc_32(&g_n_site_ras); + return site_id; + } + + site_id = (site_id + 1) & (MAX_SITES - 1); + } + + // More than MAX_SITES call sites. + cf_crash(CF_ALLOC, "too many call sites"); + // Not reached. + return 0; +} + +static uint32_t +hook_new_site_info_id(void) +{ + uint32_t info_id = ck_pr_faa_32(&g_n_site_infos, 1); + + if (info_id >= g_n_site_infos) { + cf_crash(CF_ALLOC, "site info pool exhausted"); + } + + return info_id; +} + +// Get the info ID of the site_info record for the given site ID and the current +// thread. In case the current thread doesn't yet have a site_info record for the +// given site ID, a new site_info record is allocated. + +static uint32_t +hook_get_site_info_id(uint32_t site_id) +{ + uint32_t info_id = g_thread_site_infos[site_id]; + + // This thread encountered this allocation site before. We already + // have a site info record. + + if (info_id != 0) { + return info_id; + } + + // This is the first time that this thread encounters this allocation + // site. We need to allocate a site_info record. 
// Account for an allocation by the current thread and stamp the block with
// the debug trailer that hook_handle_free() validates later.
//
// ra - return address that identifies the allocation site.
// p  - the freshly allocated block; NULL (failed allocation) is ignored.
// sz - the size the caller asked for. The callers in this file allocate
//      sz + sizeof(uint32_t) bytes, so that the trailer always fits.
static void
hook_handle_alloc(const void *ra, void *p, size_t sz)
{
	if (p == NULL) {
		return;
	}

	// Size of the block as reported by jem_sallocx() - presumably JEMalloc's
	// real (rounded-up) allocation size; it's what gets accounted below.
	size_t jem_sz = jem_sallocx(p, 0);

	uint32_t site_id = hook_get_site_id(ra);
	uint32_t info_id = hook_get_site_info_id(site_id);
	site_info *info = g_site_infos + info_id;

	// size_hi:size_lo together form one double-width byte counter.
	size_t size_lo = info->size_lo;
	info->size_lo += jem_sz;

	// Carry?

	if (info->size_lo < size_lo) {
		++info->size_hi;
	}

	// The trailer occupies the very last 4 bytes of the block.
	uint8_t *data = (uint8_t *)p + jem_sz - sizeof(uint32_t);
	uint32_t *data32 = (uint32_t *)data;

	// The mark starts right behind the caller's data; delta is the distance
	// from the mark to the trailer.
	uint8_t *mark = (uint8_t *)p + sz;
	size_t delta = (size_t)(data - mark);

	// Keep 0xffff as a marker for double free detection.

	// A delta that doesn't fit is stored as 0, which disables the mark check
	// for this block (both loops below and in hook_handle_free() run i < delta).
	if (delta > 0xfffe) {
		delta = 0;
	}

	// Trailer = site ID in the upper 16 bits, delta in the lower 16 bits,
	// scrambled by "* MULT + 1". hook_handle_free() undoes this with
	// "(x - 1) * MULT_INV".
	*data32 = ((site_id << 16) | (uint32_t)delta) * MULT + 1;

	// Copy up to 4 trailer bytes to the mark position. A write past the end
	// of the caller's data will hit the mark and be caught on free.
	for (uint32_t i = 0; i < 4 && i < delta; ++i) {
		mark[i] = data[i];
	}
}
// Account for a deallocation by the current thread and validate the debug
// trailer that hook_handle_alloc() wrote into the block.
//
// ra     - return address that identifies the deallocation site.
// p      - the block about to be freed.
// jem_sz - size of the block, as obtained by the caller from jem_sallocx().
//
// Any inconsistency (invalid site ID, double-free marker, impossible delta,
// damaged mark) ends in cf_crash().
static void
hook_handle_free(const void *ra, void *p, size_t jem_sz)
{
	// The trailer occupies the very last 4 bytes of the block.
	uint8_t *data = (uint8_t *)p + jem_sz - sizeof(uint32_t);
	uint32_t *data32 = (uint32_t *)data;

	// Undo the "* MULT + 1" scrambling applied when the trailer was written
	// (presumably MULT_INV is the inverse of MULT modulo 2^32 - verify against
	// the definitions).
	uint32_t val = (*data32 - 1) * MULT_INV;
	uint32_t site_id = val >> 16;
	uint32_t delta = val & 0xffff;

	if (site_id >= MAX_SITES) {
		cf_crash(CF_ALLOC, "corruption %zu@%p RA %p, invalid site ID", jem_sz, p, ra);
	}

	// Return address registered for the site ID found in the trailer; only
	// used to make the crash messages below more helpful.
	const void *data_ra = ck_pr_load_ptr(g_site_ras + site_id);

	// 0xffff is what this function itself stores as delta once a block has
	// been freed (see the end of the function).
	if (delta == 0xffff) {
		cf_crash(CF_ALLOC, "corruption %zu@%p RA %p, potential double free, possibly freed before with RA %p",
				jem_sz, p, ra, data_ra);
	}

	// The mark can't start before the beginning of the block.
	if (delta > jem_sz - sizeof(uint32_t)) {
		cf_crash(CF_ALLOC, "corruption %zu@%p RA %p, invalid delta length, possibly allocated with RA %p",
				jem_sz, p, ra, data_ra);
	}

	uint8_t *mark = data - delta;

	// The mark has to be an intact copy of the first (up to 4) trailer bytes.
	for (uint32_t i = 0; i < 4 && i < delta; ++i) {
		if (mark[i] != data[i]) {
			cf_crash(CF_ALLOC, "corruption %zu@%p RA %p, invalid mark, possibly allocated with RA %p",
					jem_sz, p, ra, data_ra);
		}
	}

	// The size is subtracted from the *current* thread's record for the
	// *allocating* site, i.e., not necessarily from the record that was
	// incremented on allocation.
	uint32_t info_id = hook_get_site_info_id(site_id);
	site_info *info = g_site_infos + info_id;

	// size_hi:size_lo together form one double-width byte counter.
	size_t size_lo = info->size_lo;
	info->size_lo -= jem_sz;

	// Borrow?

	if (info->size_lo > size_lo) {
		--info->size_hi;
	}

	// Replace the allocation site with the deallocation site to facilitate
	// double-free debugging.

	site_id = hook_get_site_id(ra);

	// Also invalidate the delta length, so that we are more likely to detect
	// double frees.

	*data32 = ((site_id << 16) | 0xffff) * MULT + 1;

	// Keep the mark consistent with the rewritten trailer; "delta" is still
	// the value decoded from the old trailer.
	for (uint32_t i = 0; i < 4 && i < delta; ++i) {
		mark[i] = data[i];
	}
}
It does not know about, and thus doesn't redirect, our + // non-standard functions, e.g., cf_alloc_malloc_arena(). + // + // As we use both, standard and non-standard functions, to allocate memory, + // we would end up with an inconsistent mix of allocations, some allocated + // by JEMalloc and some by glibc's allocator. + // + // Sooner or later, we will thus end up passing a memory block allocated by + // JEMalloc to free(), which Valgrind has redirected to glibc's allocator. + + void *p1 = malloc(1); + free(p1); + + void *p2 = jem_malloc(1); + jem_free(p2); + + // If both of the above allocations are handled by JEMalloc, then they will + // be located in the same memory page. If, however, the first allocation is + // handled by glibc, then the memory blocks will come from two different + // memory pages. + + uint64_t page1 = (uint64_t)p1 >> 12; + uint64_t page2 = (uint64_t)p2 >> 12; + + if (page1 != page2) { + cf_crash_nostack(CF_ALLOC, "Valgrind redirected malloc() to glibc; please run Valgrind with --soname-synonyms=somalloc=nouserintercepts"); + } +} + +void +cf_alloc_init(void) +{ + valgrind_check(); + + // Turn off libstdc++'s memory caching, as it just duplicates JEMalloc's. + + if (setenv("GLIBCXX_FORCE_NEW", "1", 1) < 0) { + cf_crash(CF_ALLOC, "setenv() failed: %d (%s)", errno, cf_strerror(errno)); + } + + // Double-check that hook_get_arena() works, as it depends on JEMalloc's + // internal data structures. + + int32_t err = jem_mallctl("thread.tcache.flush", NULL, NULL, NULL, 0); + + if (err != 0) { + cf_crash(CF_ALLOC, "error while flushing thread cache: %d (%s)", err, cf_strerror(err)); + } + + for (size_t sz = 1; sz <= 16 * 1024 * 1024; sz *= 2) { + void *p = cf_alloc_malloc_arena(sz, N_ARENAS / 2); + int32_t arena = hook_get_arena(p); + + if (arena != N_ARENAS / 2) { + cf_crash(CF_ALLOC, "arena mismatch: %d vs. %d", arena, N_ARENAS / 2); + } + + free(p); + } +} + +// Restrict memory debugging. 
+// +// We always start out with memory debugging fully enabled (*_ALL). Then, +// once we have parsed the configuration file, we restrict it to what the +// configuration file says (e.g., *_TRANSIENT). +// +// The reason is that we can safely go from "on" to "off", but not vice +// versa. +// +// When "off", we don't add accounting info to an allocation. Now, if we +// deallocated such an allocation when "on", then we'd erroneously detect +// a corruption, because we'd try to validate accounting info that isn't +// there. + +void +cf_alloc_set_debug(cf_alloc_debug debug) +{ + g_debug = debug; +} + +int32_t +cf_alloc_create_arena(void) +{ + int32_t arena; + size_t arena_len = sizeof(arena); + + int32_t err = jem_mallctl("arenas.extend", &arena, &arena_len, NULL, 0); + + if (err != 0) { + cf_crash(CF_ALLOC, "failed to create new arena: %d (%s)", err, cf_strerror(err)); + } + + cf_debug(CF_ALLOC, "created new arena %d", arena); + return arena; +} + +void +cf_alloc_heap_stats(size_t *allocated_kbytes, size_t *active_kbytes, size_t *mapped_kbytes, + double *efficiency_pct, uint32_t *site_count) +{ + uint64_t epoch = 1; + size_t len = sizeof(epoch); + + int32_t err = jem_mallctl("epoch", &epoch, &len, &epoch, len); + + if (err != 0) { + cf_crash(CF_ALLOC, "failed to retrieve epoch: %d (%s)", err, cf_strerror(err)); + } + + size_t allocated; + len = sizeof(allocated); + + err = jem_mallctl("stats.allocated", &allocated, &len, NULL, 0); + + if (err != 0) { + cf_crash(CF_ALLOC, "failed to retrieve stats.allocated: %d (%s)", err, cf_strerror(err)); + } + + size_t active; + len = sizeof(active); + + err = jem_mallctl("stats.active", &active, &len, NULL, 0); + + if (err != 0) { + cf_crash(CF_ALLOC, "failed to retrieve stats.active: %d (%s)", err, cf_strerror(err)); + } + + size_t mapped; + len = sizeof(mapped); + + err = jem_mallctl("stats.mapped", &mapped, &len, NULL, 0); + + if (err != 0) { + cf_crash(CF_ALLOC, "failed to retrieve stats.mapped: %d (%s)", err, 
cf_strerror(err)); + } + + if (allocated_kbytes) { + *allocated_kbytes = allocated / 1024; + } + + if (active_kbytes) { + *active_kbytes = active / 1024; + } + + if (mapped_kbytes) { + *mapped_kbytes = mapped / 1024; + } + + if (efficiency_pct) { + *efficiency_pct = mapped != 0 ? + (double)allocated * 100.0 / (double)mapped : 0.0; + } + + if (site_count) { + *site_count = ck_pr_load_32(&g_n_site_ras); + } +} + +static void +line_to_log(void *data, const char *line) +{ + (void)data; + + char buff[1000]; + size_t i; + + for (i = 0; i < sizeof(buff) - 1 && line[i] != 0 && line[i] != '\n'; ++i) { + buff[i] = line[i]; + } + + buff[i] = 0; + cf_info(CF_ALLOC, "%s", buff); +} + +static void +line_to_file(void *data, const char *line) +{ + fprintf((FILE *)data, "%s", line); +} + +static void +time_to_file(FILE *fh) +{ + time_t now = time(NULL); + + if (now == (time_t)-1) { + cf_crash(CF_ALLOC, "time() failed: %d (%s)", errno, cf_strerror(errno)); + } + + struct tm gmt; + + if (gmtime_r(&now, &gmt) == NULL) { + cf_crash(CF_ALLOC, "gmtime_r() failed"); + } + + char text[250]; + + if (strftime(text, sizeof(text), "%b %d %Y %T %Z", &gmt) == 0) { + cf_crash(CF_ALLOC, "strftime() failed"); + } + + fprintf(fh, "---------- %s ----------\n", text); +} + +void +cf_alloc_log_stats(const char *file, const char *opts) +{ + if (file == NULL) { + jem_malloc_stats_print(line_to_log, NULL, opts); + return; + } + + FILE *fh = fopen(file, "a"); + + if (fh == NULL) { + cf_warning(CF_ALLOC, "failed to open allocation stats file %s: %d (%s)", + file, errno, cf_strerror(errno)); + return; + } + + time_to_file(fh); + jem_malloc_stats_print(line_to_file, fh, opts); + fclose(fh); +} + +void +cf_alloc_log_site_infos(const char *file) +{ + FILE *fh = fopen(file, "a"); + + if (fh == NULL) { + cf_warning(CF_ALLOC, "failed to open site info file %s: %d (%s)", + file, errno, cf_strerror(errno)); + return; + } + + time_to_file(fh); + uint32_t n_site_infos = ck_pr_load_32(&g_n_site_infos); + + for 
(uint32_t i = 1; i < n_site_infos; ++i) { + site_info *info = g_site_infos + i; + const void *ra = ck_pr_load_ptr(g_site_ras + info->site_id); + fprintf(fh, "0x%016" PRIx64 " %9d 0x%016zx 0x%016zx\n", (uint64_t)ra, info->thread_id, + info->size_hi, info->size_lo); + } + + fclose(fh); +} + +static bool +is_transient(int32_t arena) +{ + // Note that this also considers -1 (i.e., the default thread arena) + // to be transient, in addition to arenas 0 .. (N_ARENAS - 1). + + return arena < N_ARENAS; +} + +static bool +want_debug(int32_t arena) +{ + switch (g_debug) { + case CF_ALLOC_DEBUG_NONE: + return false; + + case CF_ALLOC_DEBUG_TRANSIENT: + return is_transient(arena); + + case CF_ALLOC_DEBUG_PERSISTENT: + return !is_transient(arena); + + case CF_ALLOC_DEBUG_ALL: + return true; + } + + // Not reached. + return false; +} + +static int32_t +calc_free_flags(int32_t arena) +{ + // If it's a transient allocation, then simply use the default + // thread-local cache. No flags needed. Same, if we don't debug + // at all; then we can save ourselves the second cache. + + if (is_transient(arena) || g_debug == CF_ALLOC_DEBUG_NONE) { + return 0; + } + + // If it's a persistent allocation, then use the second per-thread + // cache. Add it to the flags. See calc_alloc_flags() for more on + // this second cache. + + return MALLOCX_TCACHE(g_ns_tcache); +} + +static void +do_free(void *p, const void *ra) +{ + if (p == NULL) { + return; + } + + int32_t arena = hook_get_arena(p); + int32_t flags = calc_free_flags(arena); + + if (!want_debug(arena)) { + jem_dallocx(p, flags); + return; + } + + size_t jem_sz = jem_sallocx(p, 0); + hook_handle_free(ra, p, jem_sz); + jem_sdallocx(p, jem_sz, flags); +} + +void +__attribute__ ((noinline)) +free(void *p) +{ + do_free(p, __builtin_return_address(0)); +} + +static int32_t +calc_alloc_flags(int32_t flags, int32_t arena) +{ + // Default arena and default thread-local cache. No additional flags + // needed. 
// Common implementation behind malloc(), cf_alloc_try_malloc() and
// cf_alloc_malloc_arena().
//
// sz    - requested size; a request for 0 bytes still yields a valid block.
// arena - arena to allocate from; negative selects the default thread arena.
// ra    - return address of the public entry point's caller, used as the
//         allocation site for accounting.
//
// Returns the new block, or NULL on failure. It's up to the callers to
// decide whether NULL is fatal.
static void *
do_mallocx(size_t sz, int32_t arena, const void *ra)
{
	int32_t flags = calc_alloc_flags(0, arena);

	if (!want_debug(arena)) {
		return jem_mallocx(sz == 0 ? 1 : sz, flags);
	}

	// Adding the trailer must not wrap around. Otherwise we'd get a tiny
	// block and hook_handle_alloc() would place the mark outside of it.

	if (sz > SIZE_MAX - sizeof(uint32_t)) {
		return NULL;
	}

	// Make room for the accounting trailer.
	size_t ext_sz = sz + sizeof(uint32_t);

	void *p = jem_mallocx(ext_sz, flags);
	hook_handle_alloc(ra, p, sz);

	return p;
}
+ return do_mallocx(sz, -1, __builtin_return_address(0)); +} + +void * +cf_alloc_malloc_arena(size_t sz, int32_t arena) +{ + void *p = do_mallocx(sz, arena, __builtin_return_address(0)); + cf_assert(p, CF_ALLOC, "malloc_ns failed sz %zu arena %d", sz, arena); + return p; +} + +void * +__attribute__ ((noinline)) +malloc(size_t sz) +{ + void *p = do_mallocx(sz, -1, __builtin_return_address(0)); + cf_assert(p, CF_ALLOC, "malloc failed sz %zu", sz); + return p; +} + +static void * +do_callocx(size_t n, size_t sz, int32_t arena, const void *ra) +{ + int32_t flags = calc_alloc_flags(MALLOCX_ZERO, arena); + size_t tot_sz = n * sz; + + if (!want_debug(arena)) { + return jem_mallocx(tot_sz == 0 ? 1 : tot_sz, flags); + } + + size_t ext_sz = tot_sz + sizeof(uint32_t); + + void *p = jem_mallocx(ext_sz, flags); + hook_handle_alloc(ra, p, tot_sz); + + return p; +} + +void * +cf_alloc_calloc_arena(size_t n, size_t sz, int32_t arena) +{ + void *p = do_callocx(n, sz, arena, __builtin_return_address(0)); + cf_assert(p, CF_ALLOC, "calloc_ns failed n %zu sz %zu arena %d", n, sz, arena); + return p; +} + +void * +calloc(size_t n, size_t sz) +{ + void *p = do_callocx(n, sz, -1, __builtin_return_address(0)); + cf_assert(p, CF_ALLOC, "calloc failed n %zu sz %zu", n, sz); + return p; +} + +static void * +do_rallocx(void *p, size_t sz, int32_t arena, const void *ra) +{ + if (p == NULL) { + return do_mallocx(sz, arena, ra); + } + + hook_check_arena(p, arena); + + if (sz == 0) { + do_free(p, ra); + return NULL; + } + + int32_t flags = calc_alloc_flags(0, arena); + + if (!want_debug(arena)) { + return jem_rallocx(p, sz, flags); + } + + size_t jem_sz = jem_sallocx(p, 0); + hook_handle_free(ra, p, jem_sz); + + size_t ext_sz = sz + sizeof(uint32_t); + + void *p2 = jem_rallocx(p, ext_sz, flags); + hook_handle_alloc(ra, p2, sz); + + return p2; +} + +void * +cf_alloc_realloc_arena(void *p, size_t sz, int32_t arena) +{ + void *p2 = do_rallocx(p, sz, arena, __builtin_return_address(0)); + cf_assert(p2 
|| sz == 0, CF_ALLOC, "realloc_ns failed sz %zu arena %d", sz, arena); + return p2; +} + +void * +realloc(void *p, size_t sz) +{ + void *p2 = do_rallocx(p, sz, -1, __builtin_return_address(0)); + cf_assert(p2 || sz == 0, CF_ALLOC, "realloc failed sz %zu", sz); + return p2; +} + +static char * +do_strdup(const char *s, size_t n, const void *ra) +{ + size_t sz = n + 1; + size_t ext_sz = want_debug(-1) ? sz + sizeof(uint32_t) : sz; + + char *s2 = jem_mallocx(ext_sz, 0); + cf_assert(s2, CF_ALLOC, "strdup failed len %zu", n); + + if (want_debug(-1)) { + hook_handle_alloc(ra, s2, sz); + } + + memcpy(s2, s, sz); + return s2; +} + +char * +strdup(const char *s) +{ + return do_strdup(s, strlen(s), __builtin_return_address(0)); +} + +char * +strndup(const char *s, size_t n) +{ + size_t n2 = 0; + + while (n2 < n && s[n2] != 0) { + ++n2; + } + + size_t sz = n2 + 1; + size_t ext_sz = want_debug(-1) ? sz + sizeof(uint32_t) : sz; + + char *s2 = jem_mallocx(ext_sz, 0); + cf_assert(s2, CF_ALLOC, "strndup failed limit %zu", n); + + if (want_debug(-1)) { + hook_handle_alloc(__builtin_return_address(0), s2, sz); + } + + memcpy(s2, s, n2); + s2[n2] = 0; + + return s2; +} + +int32_t +asprintf(char **res, const char *form, ...) +{ + char buff[25000]; + + va_list va; + va_start(va, form); + + int32_t n = vsnprintf(buff, sizeof(buff), form, va); + + va_end(va); + + if ((size_t)n >= sizeof(buff)) { + cf_crash(CF_ALLOC, "asprintf overflow len %d", n); + } + + *res = do_strdup(buff, (size_t)n, __builtin_return_address(0)); + return n; +} + +int32_t +posix_memalign(void **p, size_t align, size_t sz) +{ + if (!want_debug(-1)) { + return jem_posix_memalign(p, align, sz == 0 ? 
1 : sz); + } + + size_t ext_sz = sz + sizeof(uint32_t); + int32_t err = jem_posix_memalign(p, align, ext_sz); + + if (err != 0) { + return err; + } + + hook_handle_alloc(__builtin_return_address(0), *p, sz); + return 0; +} + +void * +aligned_alloc(size_t align, size_t sz) +{ + if (!want_debug(-1)) { + return jem_aligned_alloc(align, sz == 0 ? 1 : sz); + } + + size_t ext_sz = sz + sizeof(uint32_t); + + void *p = jem_aligned_alloc(align, ext_sz); + hook_handle_alloc(__builtin_return_address(0), p, sz); + + return p; +} + +static void * +do_valloc(size_t sz) +{ + if (!want_debug(-1)) { + return jem_aligned_alloc(PAGE_SZ, sz == 0 ? 1 : sz); + } + + size_t ext_sz = sz + sizeof(uint32_t); + + void *p = jem_aligned_alloc(PAGE_SZ, ext_sz); + hook_handle_alloc(__builtin_return_address(0), p, sz); + + return p; +} + +void * +valloc(size_t sz) +{ + void *p = do_valloc(sz); + cf_assert(p, CF_ALLOC, "valloc failed sz %zu", sz); + return p; +} + +void * +memalign(size_t align, size_t sz) +{ + if (!want_debug(-1)) { + return jem_aligned_alloc(align, sz == 0 ? 1 : sz); + } + + size_t ext_sz = sz + sizeof(uint32_t); + + void *p = jem_aligned_alloc(align, ext_sz); + hook_handle_alloc(__builtin_return_address(0), p, sz); + + return p; +} + +void * +pvalloc(size_t sz) +{ + (void)sz; + cf_crash(CF_ALLOC, "obsolete pvalloc() called"); + // Not reached. + return NULL; +} + +void * +cf_rc_alloc(size_t sz) +{ + size_t tot_sz = sizeof(cf_rc_header) + sz; + size_t ext_sz = want_debug(-1) ? 
tot_sz + sizeof(uint32_t) : tot_sz; + + cf_rc_header *head = jem_malloc(ext_sz); + cf_assert(head, CF_ALLOC, "rc_alloc failed sz %zu", sz); + + if (want_debug(-1)) { + hook_handle_alloc(__builtin_return_address(0), head, tot_sz); + } + + head->rc = 1; + head->sz = (uint32_t)sz; + + return head + 1; +} + +void +cf_rc_free(void *p) +{ + if (p == NULL) { + cf_crash(CF_ALLOC, "trying to cf_rc_free() null pointer"); + } + + cf_rc_header *head = (cf_rc_header *)p - 1; + + if (!want_debug(-1)) { + jem_dallocx(head, 0); + return; + } + + size_t jem_sz = jem_sallocx(head, 0); + hook_handle_free(__builtin_return_address(0), head, jem_sz); + jem_sdallocx(head, jem_sz, 0); +} + +int32_t +cf_rc_reserve(void *p) +{ + cf_rc_header *head = (cf_rc_header *)p - 1; + return cf_atomic32_incr(&head->rc); +} + +int32_t +cf_rc_release(void *p) +{ + cf_rc_header *head = (cf_rc_header *)p - 1; + int32_t rc = cf_atomic32_decr(&head->rc); + cf_assert(rc >= 0, CF_ALLOC, "reference count underflow"); + return rc; +} + +int32_t +cf_rc_releaseandfree(void *p) +{ + cf_rc_header *head = (cf_rc_header *)p - 1; + int32_t rc = cf_atomic32_decr(&head->rc); + cf_assert(rc >= 0, CF_ALLOC, "reference count underflow"); + + if (rc > 0) { + return rc; + } + + if (!want_debug(-1)) { + jem_dallocx(head, 0); + return 0; + } + + size_t jem_sz = jem_sallocx(head, 0); + hook_handle_free(__builtin_return_address(0), head, jem_sz); + jem_sdallocx(head, jem_sz, 0); + return 0; +} + +int32_t +cf_rc_count(const void *p) +{ + const cf_rc_header *head = (const cf_rc_header *)p - 1; + return (int32_t)head->rc; +} diff --git a/cf/src/arenax.c b/cf/src/arenax.c new file mode 100644 index 00000000..4bcf8131 --- /dev/null +++ b/cf/src/arenax.c @@ -0,0 +1,201 @@ +/* + * arenax.c + * + * Copyright (C) 2012-2014 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. 
+ * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +//========================================================== +// Includes. +// + +#include "arenax.h" + +#include +#include +#include +#include +#include +#include + +#include "fault.h" + + +//========================================================== +// Typedefs & constants. +// + +// Must be in-sync with cf_arenax_err: +const char* ARENAX_ERR_STRINGS[] = { + "ok", + "bad parameter", + "error creating stage", + "error attaching stage", + "error detaching stage", + "unknown error" +}; + + +//========================================================== +// Public API. +// + +// Return persistent memory size needed. Excludes stages, which cf_arenax +// handles internally. +size_t +cf_arenax_sizeof() +{ + return sizeof(cf_arenax); +} + +// Convert cf_arenax_err to meaningful string. +const char* +cf_arenax_errstr(cf_arenax_err err) +{ + if (err < 0 || err > CF_ARENAX_ERR_UNKNOWN) { + err = CF_ARENAX_ERR_UNKNOWN; + } + + return ARENAX_ERR_STRINGS[err]; +} + +// Create a cf_arenax object in persistent memory. Also create and attach the +// first arena stage in persistent memory. 
+void +cf_arenax_init(cf_arenax* arena, key_t key_base, uint32_t element_size, + uint32_t stage_capacity, uint32_t max_stages, uint32_t flags) +{ + if (stage_capacity == 0) { + stage_capacity = MAX_STAGE_CAPACITY; + } + else if (stage_capacity > MAX_STAGE_CAPACITY) { + cf_crash(CF_ARENAX, "stage capacity %u too large", stage_capacity); + } + + if (max_stages == 0) { + max_stages = CF_ARENAX_MAX_STAGES; + } + else if (max_stages > CF_ARENAX_MAX_STAGES) { + cf_crash(CF_ARENAX, "max stages %u too large", max_stages); + } + + arena->key_base = key_base; + arena->element_size = element_size; + arena->stage_capacity = stage_capacity; + arena->max_stages = max_stages; + arena->flags = flags; + + arena->stage_size = (size_t)stage_capacity * element_size; + + arena->free_h = 0; + + // Skip 0:0 so null handle is never used. + arena->at_stage_id = 0; + arena->at_element_id = 1; + + if ((flags & CF_ARENAX_BIGLOCK) != 0) { + pthread_mutex_init(&arena->lock, NULL); + } + + arena->stage_count = 0; + memset(arena->stages, 0, sizeof(arena->stages)); + + // Add first stage. + if (cf_arenax_add_stage(arena) != CF_ARENAX_OK) { + cf_crash(CF_ARENAX, "failed to add first stage"); + } + + // Clear the null element - allocation bypasses it, but it may be read. + memset(cf_arenax_resolve(arena, 0), 0, element_size); +} + +// Allocate an element within the arena. +cf_arenax_handle +cf_arenax_alloc(cf_arenax* arena) +{ + if ((arena->flags & CF_ARENAX_BIGLOCK) != 0) { + pthread_mutex_lock(&arena->lock); + } + + cf_arenax_handle h; + + // Check free list first. + if (arena->free_h != 0) { + h = arena->free_h; + + free_element* p_free_element = cf_arenax_resolve(arena, h); + + arena->free_h = p_free_element->next_h; + } + // Otherwise keep end-allocating. 
+ else { + if (arena->at_element_id >= arena->stage_capacity) { + if (cf_arenax_add_stage(arena) != CF_ARENAX_OK) { + if ((arena->flags & CF_ARENAX_BIGLOCK) != 0) { + pthread_mutex_unlock(&arena->lock); + } + + return 0; + } + + arena->at_stage_id++; + arena->at_element_id = 0; + } + + cf_arenax_set_handle(&h, arena->at_stage_id, arena->at_element_id); + + arena->at_element_id++; + } + + if ((arena->flags & CF_ARENAX_BIGLOCK) != 0) { + pthread_mutex_unlock(&arena->lock); + } + + if ((arena->flags & CF_ARENAX_CALLOC) != 0) { + memset(cf_arenax_resolve(arena, h), 0, arena->element_size); + } + + return h; +} + +// Free an element. +void +cf_arenax_free(cf_arenax* arena, cf_arenax_handle h) +{ + free_element* p_free_element = cf_arenax_resolve(arena, h); + + if ((arena->flags & CF_ARENAX_BIGLOCK) != 0) { + pthread_mutex_lock(&arena->lock); + } + + p_free_element->magic = FREE_MAGIC; + p_free_element->next_h = arena->free_h; + arena->free_h = h; + + if ((arena->flags & CF_ARENAX_BIGLOCK) != 0) { + pthread_mutex_unlock(&arena->lock); + } +} + +// Convert cf_arenax_handle to memory address. +void* +cf_arenax_resolve(cf_arenax* arena, cf_arenax_handle h) +{ + return arena->stages[h >> ELEMENT_ID_NUM_BITS] + + ((h & ELEMENT_ID_MASK) * arena->element_size); +} diff --git a/cf/src/arenax_ce.c b/cf/src/arenax_ce.c new file mode 100644 index 00000000..fd6d4571 --- /dev/null +++ b/cf/src/arenax_ce.c @@ -0,0 +1,59 @@ +/* + * arenax_cold.c + * + * Copyright (C) 2014 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. 
+ * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +//========================================================== +// Includes. +// + +#include "arenax.h" + +#include +#include "citrusleaf/alloc.h" +#include "fault.h" + + +//========================================================== +// Private API - for enterprise separation only. +// + +// Allocate an arena stage, and store its pointer in the stages array. +cf_arenax_err +cf_arenax_add_stage(cf_arenax* arena) +{ + if (arena->stage_count >= arena->max_stages) { + cf_warning(CF_ARENAX, "can't allocate more than %u arena stages", + arena->max_stages); + return CF_ARENAX_ERR_STAGE_CREATE; + } + + uint8_t* p_stage = (uint8_t*)cf_try_malloc(arena->stage_size); + + if (! p_stage) { + cf_warning(CF_ARENAX, "could not allocate %zu-byte arena stage %u", + arena->stage_size, arena->stage_count); + return CF_ARENAX_ERR_STAGE_CREATE; + } + + arena->stages[arena->stage_count++] = p_stage; + + return CF_ARENAX_OK; +} diff --git a/cf/src/cf_mutex.c b/cf/src/cf_mutex.c new file mode 100644 index 00000000..84777eb2 --- /dev/null +++ b/cf/src/cf_mutex.c @@ -0,0 +1,175 @@ +/* + * cf_mutex.c + * + * Copyright (C) 2017 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. 
// Acquire the mutex, sleeping in the kernel (futex) for as long as somebody
// else holds it.
//
// As used by the functions in this file, the 32-bit lock word has three
// states: 0 - unlocked, 1 - locked, 2 - locked, and a thread may be sleeping
// on the futex. cf_mutex_unlock() only issues a wake-up when it finds 2.
void
cf_mutex_lock(cf_mutex *m)
{
	// Fast path: uncontended, 0 -> 1.
	if (likely(cmpxchg((uint32_t *)m, 0, 1) == 0)) {
		return; // was not locked
	}

	// Already marked as contended - go to sleep right away. FUTEX_WAIT only
	// blocks while the lock word still equals the expected value (2).
	if (m->u32 == 2) {
		sys_futex(m, FUTEX_WAIT_PRIVATE, 2);
	}

	// Unconditionally set the word to 2. If the previous value was 0, the
	// lock was free and is now ours - in state 2, so that our own unlock
	// wakes a possible other sleeper. Otherwise sleep and retry.
	while (xchg((uint32_t *)m, 2) != 0) {
		sys_futex(m, FUTEX_WAIT_PRIVATE, 2);
	}
}
+bool +cf_mutex_trylock(cf_mutex *m) +{ + if (cmpxchg((uint32_t *)m, 0, 1) == 0) { + return true; // was not locked + } + + return false; +} + +void +cf_mutex_lock_spin(cf_mutex *m) +{ + for (int i = 0; i < FUTEX_SPIN_MAX; i++) { + if (cmpxchg((uint32_t *)m, 0, 1) == 0) { + return; // was not locked + } + + cpu_relax(); + } + + if (m->u32 == 2) { + sys_futex(m, FUTEX_WAIT_PRIVATE, 2); + } + + while (xchg((uint32_t *)m, 2) != 0) { + sys_futex(m, FUTEX_WAIT_PRIVATE, 2); + } +} + +void +cf_mutex_unlock_spin(cf_mutex *m) +{ + uint32_t check = xchg((uint32_t *)m, 0); + + if (unlikely(check == 2)) { + // Spin and hope someone takes the lock. + for (int i = 0; i < FUTEX_SPIN_MAX; i++) { + if (m->u32 != 0) { + if (cmpxchg((uint32_t *)m, 1, 2) == 0) { + break; + } + + return; // someone else took the lock + } + + cpu_relax(); + } + + sys_futex(m, FUTEX_WAKE_PRIVATE, 1); + } + else if (unlikely(check == 0)) { + cf_crash(CF_MISC, "cf_mutex_unlock_spin() on already unlocked mutex"); + } +} + + +//========================================================== +// Public API - cf_condition. +// + +void +cf_condition_wait(cf_condition *c, cf_mutex *m) +{ + uint32_t seq = c->seq; + + cf_mutex_unlock(m); + sys_futex(&c->seq, FUTEX_WAIT_PRIVATE, seq); + cf_mutex_lock(m); +} + +void +cf_condition_signal(cf_condition *c) +{ + __sync_fetch_and_add(&c->seq, 1); + sys_futex(&c->seq, FUTEX_WAKE_PRIVATE, 1); +} diff --git a/cf/src/cf_str.c b/cf/src/cf_str.c new file mode 100644 index 00000000..57a465c8 --- /dev/null +++ b/cf/src/cf_str.c @@ -0,0 +1,419 @@ +/* + * cf_str.c + * + * Copyright (C) 2008-2014 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. 
+ * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +/* + * String helper functions + * + */ + +#include "cf_str.h" + +#include +#include +#include +#include + +#include + + +static char itoa_table[] = { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N' }; + +// return 0 on success, -1 on fail +int cf_str_atoi(char *s, int *value) +{ + int i = 0; + bool neg = false; + + if (*s == '-') { neg = true; s++; } + + while (*s >= '0' && *s <= '9') { + i *= 10; + i += *s - '0'; + s++; + } + switch (*s) { + case 'k': + case 'K': + i *= 1024L; + s++; + break; + case 'M': + case 'm': + i *= (1024L * 1024L); + s++; + break; + case 'G': + case 'g': + i *= (1024L * 1024L * 1024L); + s++; + break; + default: + break; + } + if (*s != 0) { + return(-1); // reached a non-num before EOL + } + *value = neg ? 
-i : i; + return(0); +} + +// return 0 on success, -1 on fail +int cf_str_atoi_u32(char *s, unsigned int *value) +{ + unsigned int i = 0; + + while (*s >= '0' && *s <= '9') { + i *= 10; + i += *s - '0'; + s++; + } + switch (*s) { + case 'k': + case 'K': + i *= 1024L; + s++; + break; + case 'M': + case 'm': + i *= (1024L * 1024L); + s++; + break; + case 'G': + case 'g': + i *= (1024L * 1024L * 1024L); + s++; + break; + default: + break; + } + if (*s != 0) { + return(-1); // reached a non-num before EOL + } + *value = i; + return(0); +} + +int cf_str_atoi_64(char *s, int64_t *value) +{ + int64_t i = 0; + bool neg = false; + + if (*s == '-') { neg = true; s++; } + + while (*s >= '0' && *s <= '9') { + i *= 10; + i += *s - '0'; + s++; + } + switch (*s) { + case 'k': + case 'K': + i *= 1024L; + s++; + break; + case 'M': + case 'm': + i *= (1024L * 1024L); + s++; + break; + case 'G': + case 'g': + i *= (1024L * 1024L * 1024L); + s++; + break; + case 'T': + case 't': + i *= (1024L * 1024L * 1024L * 1024L); + s++; + break; + case 'P': + case 'p': + i *= (1024L * 1024L * 1024L * 1024L * 1024L); + s++; + break; + default: + break; + } + if (*s != 0) { + return(-1); // reached a non-num before EOL + } + *value = neg ? -i : i; + return(0); +} + +int cf_str_atoi_u64(char *s, uint64_t *value) +{ + uint64_t i = 0; + + while (*s >= '0' && *s <= '9') { + i *= 10; + i += *s - '0'; + s++; + } + switch (*s) { + case 'k': + case 'K': + i *= 1024L; + s++; + break; + case 'M': + case 'm': + i *= (1024L * 1024L); + s++; + break; + case 'G': + case 'g': + i *= (1024L * 1024L * 1024L); + s++; + break; + case 'T': + case 't': + i *= (1024L * 1024L * 1024L * 1024L); + s++; + break; + case 'P': + case 'p': + i *= (1024L * 1024L * 1024L * 1024L * 1024L); + s++; + break; + default: + break; + } + if (*s != 0) { + return(-1); // reached a non-num before EOL + } + *value = i; + return(0); +} + +int cf_str_atoi_x64(const char *s, uint64_t *value) +{ + if (! 
((*s >= '0' && *s <= '9') || + (*s >= 'a' && *s <= 'f') || + (*s >= 'A' && *s <= 'F'))) { + return -1; + } + + // The strto*() functions never clear errno, so reset it here - otherwise a + // stale ERANGE left by an earlier call would be misread as overflow below. + errno = 0; + + char* tail = NULL; + // strtoull() so the full 64-bit range parses regardless of the width of long. + uint64_t i = strtoull(s, &tail, 16); + + // Check for overflow. + if (errno == ERANGE) { + return -1; + } + + // Don't allow trailing non-hex characters. + if (tail && *tail != 0) { + return -1; + } + + *value = i; + + return 0; +} + +// Parse a decimal number with an optional one-letter suffix into *value: +// S/s leaves it as is, M/m multiplies by 60, H/h by 60 * 60, D/d by +// 60 * 60 * 24. Input starting with "-1" yields 0. +// Returns 0 on success, -1 if anything follows the number and suffix. +int cf_str_atoi_seconds(char *s, uint64_t *value) +{ + // Special case: treat -1 the same as 0. + if (*s == '-' && *(s + 1) == '1') { + *value = 0; + return 0; + } + + uint64_t i = 0; + + while (*s >= '0' && *s <= '9') { + i *= 10; + i += *s - '0'; + s++; + } + switch (*s) { + case 'S': + case 's': + s++; + break; + case 'M': + case 'm': + i *= 60; + s++; + break; + case 'H': + case 'h': + i *= (60 * 60); + s++; + break; + case 'D': + case 'd': + i *= (60 * 60 * 24); + s++; + break; + default: + break; + } + if (*s != 0) { + return(-1); // reached a non-num before EOL + } + *value = i; + return(0); +} + + +// Write _value in base _radix to _s as a null-terminated string, with a leading +// '-' if negative. Returns the string length, excluding the terminator. +// NOTE(review): no buffer size is passed in, and neither INT_MIN nor a _radix +// larger than itoa_table is guarded against - callers must stay within both. +unsigned int +cf_str_itoa(int _value, char *_s, int _radix) +{ + // special case is the easy way + if (_value == 0) { + _s[0] = itoa_table[0]; + _s[1] = 0; + return(1); + } + + // Account for negatives + unsigned int sign_len = 0; + if (_value < 0) { + *_s++ = '-'; + _value = - _value; + sign_len = 1; + } + int _v = _value; + unsigned int _nd = 0; + while (_v) { + _nd++; + _v /= _radix; + } + + unsigned int rv = sign_len + _nd; + _s[_nd] = 0; + while (_nd) { + _nd --; + _s[_nd ] = itoa_table [ _value % _radix ]; + _value = _value / _radix; + } + return(rv); +} + +// Unsigned 64-bit variant of cf_str_itoa(): writes _value in base _radix to _s, +// null-terminated, and returns the number of digits written. +unsigned int +cf_str_itoa_u64(uint64_t _value, char *_s, int _radix) +{ + // special case is the easy way + if (_value == 0) { + _s[0] = itoa_table[0]; + _s[1] = 0; + return(1); + } + + uint64_t _v = _value; + unsigned int _nd = 0; + while (_v) { + _nd++; + _v /= _radix; + } + + unsigned int rv = _nd; + _s[_nd] = 0; + while (_nd) { + _nd --; + _s[_nd ] = itoa_table [ _value % _radix ]; + _value = _value / _radix; + } + return(rv); +} + +unsigned
int +cf_str_itoa_u32(uint32_t _value, char *_s, int _radix) +{ + // special case is the easy way + if (_value == 0) { + _s[0] = itoa_table[0]; + _s[1] = 0; + return(1); + } + + uint32_t _v = _value; + unsigned int _nd = 0; + while (_v) { + _nd++; + _v /= _radix; + } + + unsigned int rv = _nd; + _s[_nd] = 0; + while (_nd) { + _nd --; + _s[_nd ] = itoa_table [ _value % _radix ]; + _value = _value / _radix; + } + return(rv); +} + +#define ATOI_ILLEGAL -1 + + +static int8_t atoi_table[] = { +/* 00 01 02 03 04 05 06 07 08 09 0A 0B 0C 0D 0E 0F */ +/* 00 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, +/* 10 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, +/* 20 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, +/* 30 */ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1, +/* 40 */ -1, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, +/* 50 */ 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, -1, -1, -1, -1, +/* 60 */ -1, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, +/* 70 */ 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, -1, -1, -1, -1 }; + + +int +cf_str_atoi_u64_x(char *s, uint64_t *value, int radix) +{ + uint64_t i = 0; + while (*s) { + if (*s < 0) return(-1); + int8_t cv = atoi_table[(uint8_t)*s]; + if (cv < 0 || cv >= radix) return(-1); + i *= radix; + i += cv; + s++; + } + *value = i; + return(0); +} + + + +void +cf_str_split(char *fmt, char *str, cf_vector *v) +{ + char c; + char *prev = str; + while ((c = *str)) { + for (uint32_t j = 0; fmt[j]; j++) { + if (fmt[j] == c) { + *str = 0; + cf_vector_append(v, &prev); + prev = str+1; + break; + } + } + str++; + } + if (prev != str) + cf_vector_append(v, &prev); +} diff --git a/cf/src/daemon.c b/cf/src/daemon.c new file mode 100644 index 00000000..bd59d9ab --- /dev/null +++ b/cf/src/daemon.c @@ -0,0 +1,167 @@ +/* + * daemon.c + * + * Copyright (C) 2008-2017 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. 
under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +/* + * process utilities + */ + +#include "daemon.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "fault.h" + +extern int capset(cap_user_header_t header, cap_user_data_t data); + + +static bool g_hold_caps = false; +static bool g_clear_caps = false; + + +void +cf_process_privsep(uid_t uid, gid_t gid) +{ + if (0 != getuid() || (uid == getuid() && gid == getgid())) { + return; + } + + // If appropriate, make all capabilities survive the UID/GID switch. + if (g_hold_caps) { + if (0 > prctl(PR_SET_KEEPCAPS, 1, 0, 0, 0)) { + cf_crash(CF_MISC, "prctl: %s", cf_strerror(errno)); + } + + g_clear_caps = true; + } + + // Drop all auxiliary groups. + if (0 > setgroups(0, (const gid_t *)0)) { + cf_crash(CF_MISC, "setgroups: %s", cf_strerror(errno)); + } + + // Change privileges. + if (0 > setgid(gid)) { + cf_crash(CF_MISC, "setgid: %s", cf_strerror(errno)); + } + + if (0 > setuid(uid)) { + cf_crash(CF_MISC, "setuid: %s", cf_strerror(errno)); + } +} + + +// TODO - if we get more customers of this API, we could switch to either using +// a 'hold counter', or a more involved scheme where individual capabilities can +// be kept and revoked. 
+ +// Request that capabilities be kept across the UID/GID switch performed by +// cf_process_privsep() - just sets the g_hold_caps flag. +void +cf_process_holdcap(void) +{ + g_hold_caps = true; +} + + +// Drop all capabilities via capset() with zeroed data - a no-op unless +// g_clear_caps was set. +void +cf_process_clearcap(void) +{ + if (! g_clear_caps) { + return; + } + + struct __user_cap_header_struct cap_head = { + .version = _LINUX_CAPABILITY_VERSION_2 + }; + + struct __user_cap_data_struct cap_data[2] = { { 0 } }; + + if (0 > capset(&cap_head, cap_data)) { + cf_crash(CF_MISC, "capset: %s", cf_strerror(errno)); + } +} + + +// Daemonize the server - fork a new child process and exit the parent process. +// Close all the file descriptors opened except the ones specified in the +// fd_ignore_list. Redirect console messages to a file. +// Any failure along the way is reported through cf_crash(). +void +cf_process_daemonize(int *fd_ignore_list, int list_size) +{ + int FD, j; + char cfile[128]; + pid_t p; + + // Fork ourselves, then let the parent expire. + if (-1 == (p = fork())) { + cf_crash(CF_MISC, "couldn't fork: %s", cf_strerror(errno)); + } + + if (0 != p) { + // Prefer _exit() over exit(), as we don't want the parent to + // do any cleanups. + _exit(0); + } + + // Get a new session. + if (-1 == setsid()) { + cf_crash(CF_MISC, "couldn't set session: %s", cf_strerror(errno)); + } + + // Drop all the file descriptors except the ones in fd_ignore_list. + for (int i = getdtablesize(); i > 2; i--) { + for (j = 0; j < list_size; j++) { + if (fd_ignore_list[j] == i) { + break; + } + } + + if (j == list_size) { + close(i); + } + } + + // Open a temporary file for console message redirection. + snprintf(cfile, 128, "/tmp/aerospike-console.%d", getpid()); + + if (-1 == (FD = open(cfile, O_WRONLY|O_CREAT|O_APPEND, S_IRUSR|S_IWUSR))) { + cf_crash(CF_MISC, "couldn't open console redirection file %s: %s", cfile, cf_strerror(errno)); + } + + // Set the mode through the open descriptor rather than the path, so the + // change always lands on the file that was just opened. + if (-1 == fchmod(FD, S_IRUSR|S_IWUSR|S_IRGRP|S_IROTH)) { + cf_crash(CF_MISC, "couldn't set mode on console redirection file %s: %s", cfile, cf_strerror(errno)); + } + + // Redirect stdout, stderr, and stdin to the console file.
+ for (int i = 0; i < 3; i++) { + if (-1 == dup2(FD, i)) { + cf_crash(CF_MISC, "couldn't duplicate FD: %s", cf_strerror(errno)); + } + } + + // Descriptors 0-2 now refer to the console file - close the original so it + // doesn't stay open for the life of the process. + if (FD > 2) { + close(FD); + } +} diff --git a/cf/src/dynbuf.c b/cf/src/dynbuf.c new file mode 100644 index 00000000..18b69c52 --- /dev/null +++ b/cf/src/dynbuf.c @@ -0,0 +1,534 @@ +/* + * dynbuf.c + * + * Copyright (C) 2008-2014 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program.
If not, see http://www.gnu.org/licenses/ + */ + +#include "dynbuf.h" + +#include +#include +#include +#include +#include +#include + +#include + +#include "cf_str.h" + + +#define MAX_BACKOFF (1024 * 256) + +size_t +get_new_size(int alloc, int used, int requested) +{ + if (alloc - used > requested) { + return alloc; + } + + size_t new_sz = alloc + requested + sizeof(cf_buf_builder); + int backoff; + + if (new_sz < 1024 * 8) { + backoff = 1024; + } + else if (new_sz < 1024 * 32) { + backoff = 1024 * 4; + } + else if (new_sz < 1024 * 128) { + backoff = 1024 * 32; + } + else { + backoff = MAX_BACKOFF; + } + + return new_sz + (backoff - (new_sz % backoff)); +} + +void +cf_dyn_buf_reserve_internal(cf_dyn_buf *db, size_t sz) +{ + size_t new_sz = get_new_size(db->alloc_sz, db->used_sz, sz); + + if (new_sz > db->alloc_sz) { + uint8_t *_t; + + if (db->is_stack) { + _t = cf_malloc(new_sz); + memcpy(_t, db->buf, db->used_sz); + db->is_stack = false; + } + else { + _t = cf_realloc(db->buf, new_sz); + } + + db->buf = _t; + db->alloc_sz = new_sz; + } +} + +#define DB_RESERVE(_n) \ + if (db->alloc_sz - db->used_sz < _n) { \ + cf_dyn_buf_reserve_internal(db, _n); \ + } + +void +cf_dyn_buf_init_heap(cf_dyn_buf *db, size_t sz) +{ + db->buf = cf_malloc(sz); + db->is_stack = false; + db->alloc_sz = sz; + db->used_sz = 0; +} + +void +cf_dyn_buf_reserve(cf_dyn_buf *db, size_t sz, uint8_t **from) +{ + DB_RESERVE(sz); + + if (from) { + *from = &db->buf[db->used_sz]; + } + + db->used_sz += sz; +} + +void +cf_dyn_buf_append_buf(cf_dyn_buf *db, uint8_t *buf, size_t sz) +{ + DB_RESERVE(sz); + memcpy(&db->buf[db->used_sz], buf, sz); + db->used_sz += sz; +} + +void +cf_dyn_buf_append_string(cf_dyn_buf *db, const char *s) +{ + size_t len = strlen(s); + + DB_RESERVE(len); + memcpy(&db->buf[db->used_sz], s, len); + db->used_sz += len; +} + +void +cf_dyn_buf_append_char(cf_dyn_buf *db, char c) +{ + DB_RESERVE(1); + db->buf[db->used_sz] = (uint8_t)c; + db->used_sz++; +} + +void 
+cf_dyn_buf_append_bool(cf_dyn_buf *db, bool b) +{ + if (b) { + DB_RESERVE(4); + memcpy(&db->buf[db->used_sz], "true", 4); + db->used_sz += 4; + } + else { + DB_RESERVE(5); + memcpy(&db->buf[db->used_sz], "false", 5); + db->used_sz += 5; + } +} + +void +cf_dyn_buf_append_int(cf_dyn_buf *db, int i) +{ + DB_RESERVE(12); + db->used_sz += cf_str_itoa(i, (char *)&db->buf[db->used_sz], 10); +} + +void +cf_dyn_buf_append_uint64_x(cf_dyn_buf *db, uint64_t i) +{ + DB_RESERVE(18); + db->used_sz += cf_str_itoa_u64(i, (char *)&db->buf[db->used_sz], 16); +} + +void +cf_dyn_buf_append_uint64(cf_dyn_buf *db, uint64_t i) +{ + DB_RESERVE(22); + db->used_sz += cf_str_itoa_u64(i, (char *)&db->buf[db->used_sz], 10); +} + +void +cf_dyn_buf_append_uint32(cf_dyn_buf *db, uint32_t i) +{ + DB_RESERVE(12); + db->used_sz += cf_str_itoa_u32(i, (char *)&db->buf[db->used_sz], 10); +} + +void +cf_dyn_buf_chomp(cf_dyn_buf *db) +{ + if (db->used_sz > 0) { + db->used_sz--; + } +} + +char * +cf_dyn_buf_strdup(cf_dyn_buf *db) +{ + if (db->used_sz == 0) { + return NULL; + } + + char *s = cf_malloc(db->used_sz + 1); + + memcpy(s, db->buf, db->used_sz); + s[db->used_sz] = 0; + + return s; +} + +void +cf_dyn_buf_free(cf_dyn_buf *db) +{ + if (! 
db->is_stack && db->buf) { + cf_free(db->buf); + } +} + +// Helpers to append name value pairs to a cf_dyn_buf in pattern: name=value; + +void +info_append_bool(cf_dyn_buf *db, const char *name, bool value) +{ + cf_dyn_buf_append_string(db, name); + cf_dyn_buf_append_char(db, '='); + cf_dyn_buf_append_bool(db, value); + cf_dyn_buf_append_char(db, ';'); +} + +void +info_append_int(cf_dyn_buf *db, const char *name, int value) +{ + cf_dyn_buf_append_string(db, name); + cf_dyn_buf_append_char(db, '='); + cf_dyn_buf_append_int(db, value); + cf_dyn_buf_append_char(db, ';'); +} + +void +info_append_string(cf_dyn_buf *db, const char *name, const char *value) +{ + cf_dyn_buf_append_string(db, name); + cf_dyn_buf_append_char(db, '='); + cf_dyn_buf_append_string(db, value); + cf_dyn_buf_append_char(db, ';'); +} + +void +info_append_string_safe(cf_dyn_buf *db, const char *name, const char *value) +{ + cf_dyn_buf_append_string(db, name); + cf_dyn_buf_append_char(db, '='); + cf_dyn_buf_append_string(db, value ? 
value : "null"); + cf_dyn_buf_append_char(db, ';'); +} + +void +info_append_uint32(cf_dyn_buf *db, const char *name, uint32_t value) +{ + cf_dyn_buf_append_string(db, name); + cf_dyn_buf_append_char(db, '='); + cf_dyn_buf_append_uint32(db, value); + cf_dyn_buf_append_char(db, ';'); +} + +void +info_append_uint64(cf_dyn_buf *db, const char *name, uint64_t value) +{ + cf_dyn_buf_append_string(db, name); + cf_dyn_buf_append_char(db, '='); + cf_dyn_buf_append_uint64(db, value); + cf_dyn_buf_append_char(db, ';'); +} + +void +info_append_uint64_x(cf_dyn_buf *db, const char *name, uint64_t value) +{ + cf_dyn_buf_append_string(db, name); + cf_dyn_buf_append_char(db, '='); + cf_dyn_buf_append_uint64_x(db, value); + cf_dyn_buf_append_char(db, ';'); +} + + + +void +cf_buf_builder_reserve_internal(cf_buf_builder **bb_r, size_t sz) +{ + cf_buf_builder *bb = *bb_r; + size_t new_sz = get_new_size(bb->alloc_sz, bb->used_sz, sz); + + if (new_sz > bb->alloc_sz) { + if (bb->alloc_sz - bb->used_sz < MAX_BACKOFF) { + bb = cf_realloc(bb, new_sz); + } + else { + // Only possible if buffer was reset. Avoids potential expensive + // copy within realloc. 
+ cf_buf_builder *_t = cf_malloc(new_sz); + + memcpy(_t->buf, bb->buf, bb->used_sz); + _t->used_sz = bb->used_sz; + cf_free(bb); + bb = _t; + } + + bb->alloc_sz = new_sz - sizeof(cf_buf_builder); + *bb_r = bb; + } +} + +// Grow *bb_r (which may move it) unless at least _n bytes are still unused. +#define BB_RESERVE(_n) \ + if ((*bb_r)->alloc_sz - (*bb_r)->used_sz < _n) { \ + cf_buf_builder_reserve_internal(bb_r, _n); \ + } + +// Append sz raw bytes from buf. +void +cf_buf_builder_append_buf(cf_buf_builder **bb_r, uint8_t *buf, size_t sz) +{ + BB_RESERVE(sz); + cf_buf_builder *bb = *bb_r; + memcpy(&bb->buf[bb->used_sz], buf, sz); + bb->used_sz += sz; +} + +// Append the characters of s, without its null terminator. +void +cf_buf_builder_append_string(cf_buf_builder **bb_r, const char *s) +{ + size_t len = strlen(s); + BB_RESERVE(len); + cf_buf_builder *bb = *bb_r; + memcpy(&bb->buf[bb->used_sz], s, len); + bb->used_sz += len; +} + +// Append a single character. +void +cf_buf_builder_append_char(cf_buf_builder **bb_r, char c) +{ + BB_RESERVE(1); + cf_buf_builder *bb = *bb_r; + bb->buf[bb->used_sz] = (uint8_t)c; + bb->used_sz++; +} + +// Append i as decimal text. used_sz advances by the digit count only, so the +// null written by cf_str_itoa() is overwritten by the next append. +void +cf_buf_builder_append_ascii_int(cf_buf_builder **bb_r, int i) +{ + BB_RESERVE(12); + cf_buf_builder *bb = *bb_r; + bb->used_sz += cf_str_itoa(i, (char *)&bb->buf[bb->used_sz], 10); +} + +// Append i as hexadecimal text - at most 16 digits plus the null. +void +cf_buf_builder_append_ascii_uint64_x(cf_buf_builder **bb_r, uint64_t i) +{ + BB_RESERVE(18); + cf_buf_builder *bb = *bb_r; + bb->used_sz += cf_str_itoa_u64(i, (char *)&bb->buf[bb->used_sz], 16); +} + +// Append i as decimal text. +void +cf_buf_builder_append_ascii_uint64(cf_buf_builder **bb_r, uint64_t i) +{ + // A uint64_t takes up to 20 decimal digits, plus the terminating null that + // cf_str_itoa_u64() writes - reserving only 12 bytes could overrun the + // buffer. 22 matches cf_dyn_buf_append_uint64(). + BB_RESERVE(22); + cf_buf_builder *bb = *bb_r; + bb->used_sz += cf_str_itoa_u64(i, (char *)&bb->buf[bb->used_sz], 10); +} + +// Append i as decimal text - at most 10 digits plus the null. +void +cf_buf_builder_append_ascii_uint32(cf_buf_builder **bb_r, uint32_t i) +{ + BB_RESERVE(12); + cf_buf_builder *bb = *bb_r; + bb->used_sz += cf_str_itoa_u32(i, (char *)&bb->buf[bb->used_sz], 10); +} + +// Append i as 8 binary bytes, byte-swapped with __swab64(). +void +cf_buf_builder_append_uint64(cf_buf_builder **bb_r, uint64_t i) +{ + BB_RESERVE(8); + cf_buf_builder *bb = *bb_r; + uint64_t *i_p = (uint64_t *)&bb->buf[bb->used_sz]; + *i_p = __swab64(i); + bb->used_sz += 8; +} + +void
+cf_buf_builder_append_uint32(cf_buf_builder **bb_r, uint32_t i) +{ + BB_RESERVE(4); + cf_buf_builder *bb = *bb_r; + uint32_t *i_p = (uint32_t *)&bb->buf[bb->used_sz]; + *i_p = htonl(i); + bb->used_sz += 4; +} + +void +cf_buf_builder_append_uint16(cf_buf_builder **bb_r, uint16_t i) +{ + BB_RESERVE(2); + cf_buf_builder *bb = *bb_r; + uint16_t *i_p = (uint16_t *)&bb->buf[bb->used_sz]; + *i_p = htons(i); + bb->used_sz += 2; +} + +void +cf_buf_builder_append_uint8(cf_buf_builder **bb_r, uint8_t i) +{ + BB_RESERVE(1); + cf_buf_builder *bb = *bb_r; + bb->buf[bb->used_sz] = i; + bb->used_sz ++; +} + +void +cf_buf_builder_reserve(cf_buf_builder **bb_r, int sz, uint8_t **buf) +{ + BB_RESERVE(sz); + cf_buf_builder *bb = *bb_r; + + if (buf) { + *buf = &bb->buf[bb->used_sz]; + } + + bb->used_sz += sz; +} + +int +cf_buf_builder_size(cf_buf_builder *bb) +{ + return bb->alloc_sz + sizeof(cf_buf_builder); +} + +void +cf_buf_builder_chomp(cf_buf_builder *bb) +{ + if (bb->used_sz > 0) { + bb->used_sz--; + } +} + +char * +cf_buf_builder_strdup(cf_buf_builder *bb) +{ + if (bb->used_sz == 0) { + return NULL; + } + + char *s = cf_malloc(bb->used_sz+1); + + memcpy(s, bb->buf, bb->used_sz); + s[bb->used_sz] = 0; + + return s; +} + +cf_buf_builder * +cf_buf_builder_create() +{ + cf_buf_builder *bb = cf_malloc(1024); + + bb->alloc_sz = 1024 - sizeof(cf_buf_builder); + bb->used_sz = 0; + + return bb; +} + +cf_buf_builder * +cf_buf_builder_create_size(size_t sz) +{ + size_t malloc_sz = (sz < 1024) ? 1024 : sz; + cf_buf_builder *bb = cf_malloc(malloc_sz); + + bb->alloc_sz = malloc_sz - sizeof(cf_buf_builder); + bb->used_sz = 0; + + return bb; +} + +void +cf_buf_builder_free(cf_buf_builder *bb) +{ + cf_free(bb); +} + +void +cf_buf_builder_reset(cf_buf_builder *bb) +{ + bb->used_sz = 0; +} + + + +// TODO - We've only implemented a few cf_ll_buf methods for now. We'll add more +// functionality if and when it's needed. 
+ +void +cf_ll_buf_grow(cf_ll_buf *llb, size_t sz) +{ + size_t buf_sz = sz > llb->head->buf_sz ? sz : llb->head->buf_sz; + cf_ll_buf_stage *new_tail = cf_malloc(sizeof(cf_ll_buf_stage) + buf_sz); + + new_tail->next = NULL; + new_tail->buf_sz = buf_sz; + new_tail->used_sz = 0; + + llb->tail->next = new_tail; + llb->tail = new_tail; +} + +#define LLB_RESERVE(_n) \ + if (_n > llb->tail->buf_sz - llb->tail->used_sz) { \ + cf_ll_buf_grow(llb, _n); \ + } + +void +cf_ll_buf_reserve(cf_ll_buf *llb, size_t sz, uint8_t **from) +{ + LLB_RESERVE(sz); + + if (from) { + *from = llb->tail->buf + llb->tail->used_sz; + } + + llb->tail->used_sz += sz; +} + +void +cf_ll_buf_free(cf_ll_buf *llb) +{ + cf_ll_buf_stage *cur = llb->head_is_stack ? llb->head->next : llb->head; + + while (cur) { + cf_ll_buf_stage *temp = cur; + + cur = cur->next; + cf_free(temp); + } +} diff --git a/cf/src/fault.c b/cf/src/fault.c new file mode 100644 index 00000000..793a5dc0 --- /dev/null +++ b/cf/src/fault.c @@ -0,0 +1,1138 @@ +/* + * fault.c + * + * Copyright (C) 2008-2014 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. 
If not, see http://www.gnu.org/licenses/ + */ + +#include "fault.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "aerospike/as_log.h" +#include "citrusleaf/alloc.h" +#include "citrusleaf/cf_b64.h" + +#include "shash.h" + + +/* + * Maximum length for logging binary (i.e., hexadecimal or bit string) data. + */ +#define MAX_BINARY_BUF_SZ (64 * 1024) + +#define SINK_OPEN_FLAGS (O_WRONLY | O_CREAT | O_NONBLOCK | O_APPEND) +#define SINK_OPEN_MODE (S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH) + +/* cf_fault_context_strings, cf_fault_severity_strings, cf_fault_scope_strings + * Strings describing fault states */ + +/* MUST BE KEPT IN SYNC WITH FAULT.H */ + +char *cf_fault_context_strings[] = { + "misc", + + "alloc", + "arenax", + "hardware", + "msg", + "rbuffer", + "socket", + "tls", + "vmapx", + + "aggr", + "appeal", + "as", + "batch", + "bin", + "config", + "clustering", + "compression", + "demarshal", + "drv_ssd", + "exchange", + "fabric", + "geo", + "hb", + "hlc", + "index", + "info", + "info-port", + "job", + "migrate", + "mon", + "namespace", + "nsup", + "particle", + "partition", + "paxos", + "predexp", + "proto", + "proxy", + "proxy-divert", + "query", + "record", + "roster", + "rw", + "rw-client", + "scan", + "security", + "sindex", + "skew", + "smd", + "storage", + "truncate", + "tsvc", + "udf", + "xdr" +}; + +COMPILER_ASSERT(sizeof(cf_fault_context_strings) / sizeof(char*) == CF_FAULT_CONTEXT_UNDEF); + +static const char *cf_fault_severity_strings[] = { + "CRITICAL", + "WARNING", + "INFO", + "DEBUG", + "DETAIL" +}; + +COMPILER_ASSERT(sizeof(cf_fault_severity_strings) / sizeof(const char*) == CF_FAULT_SEVERITY_UNDEF); + +cf_fault_sink cf_fault_sinks[CF_FAULT_SINKS_MAX]; +cf_fault_severity cf_fault_filter[CF_FAULT_CONTEXT_UNDEF]; +int cf_fault_sinks_inuse = 0; +int num_held_fault_sinks = 0; + +cf_shash *g_ticker_hash = NULL; +#define CACHE_MSG_MAX_SIZE 128 + 
+typedef struct cf_fault_cache_hkey_s { + // Members most likely to be unique come first: + int line; + cf_fault_context context; + const char *file_name; + cf_fault_severity severity; + char msg[CACHE_MSG_MAX_SIZE]; +} __attribute__((__packed__)) cf_fault_cache_hkey; + +bool g_use_local_time = false; + +static bool g_log_millis = false; + +// Filter stderr logging at this level when there are no sinks: +#define NO_SINKS_LIMIT CF_WARNING + +static inline const char* +severity_tag(cf_fault_severity severity) +{ + return severity == CF_CRITICAL ? + "FAILED ASSERTION" : cf_fault_severity_strings[severity]; +} + +/* cf_context_at_severity + * Return whether the given context is set to this severity level or higher. */ +bool +cf_context_at_severity(const cf_fault_context context, const cf_fault_severity severity) +{ + return (severity <= cf_fault_filter[context]); +} + +static inline void +cf_fault_set_severity(const cf_fault_context context, const cf_fault_severity severity) +{ + cf_fault_filter[context] = severity; + + // UDF logging relies on the common as_log facility. + // Set as_log_level whenever AS_UDF severity changes. + if (context == AS_UDF && severity < CF_FAULT_SEVERITY_UNDEF) { + as_log_set_level((as_log_level)severity); + } +} + +static inline uint32_t +cache_hash_fn(const void *key) +{ + return (uint32_t)((const cf_fault_cache_hkey*)key)->line + + *(uint32_t*)((const cf_fault_cache_hkey*)key)->msg; +} + +/* cf_fault_init + * This code MUST be the first thing executed by main(). */ +void +cf_fault_init() +{ + // Initialize the fault filter. + for (int j = 0; j < CF_FAULT_CONTEXT_UNDEF; j++) { + // We start with no sinks, so let's be in-sync with that. + cf_fault_set_severity(j, NO_SINKS_LIMIT); + } + + // Create the ticker hash. 
+ g_ticker_hash = cf_shash_create(cache_hash_fn, sizeof(cf_fault_cache_hkey), + sizeof(uint32_t), 256, CF_SHASH_MANY_LOCK); +} + + +/* cf_fault_sink_add + * Register a sink for faults - return sink object pointer on success, NULL if + * the sink table is full or the file can't be opened. A path starting with + * "stderr" is mapped to fd 2. Every context's limit starts at CF_INFO. */ +cf_fault_sink * +cf_fault_sink_add(char *path) +{ + cf_fault_sink *s; + + // Use >= rather than ==: cf_fault_sink_activate_all_held() can leave + // cf_fault_sinks_inuse at CF_FAULT_SINKS_MAX, in which case an equality + // test would pass and the slot taken below would be past the array's end. + if (cf_fault_sinks_inuse >= (CF_FAULT_SINKS_MAX - 1)) + return(NULL); + + s = &cf_fault_sinks[cf_fault_sinks_inuse++]; + s->path = cf_strdup(path); + if (0 == strncmp(path, "stderr", 6)) + s->fd = 2; + else { + if (-1 == (s->fd = open(path, SINK_OPEN_FLAGS, SINK_OPEN_MODE))) { + // Give the slot back, including the path copy made above. + cf_free(s->path); + s->path = NULL; + cf_fault_sinks_inuse--; + return(NULL); + } + } + + for (int i = 0; i < CF_FAULT_CONTEXT_UNDEF; i++) + s->limit[i] = CF_INFO; + + return(s); +} + + +/* cf_fault_sink_hold + * Register but don't activate a sink for faults - return sink object pointer on + * success, NULL on failure. Only use at startup when parsing config file. After + * all sinks are registered, activate via cf_fault_sink_activate_all_held(). */ +cf_fault_sink * +cf_fault_sink_hold(char *path) +{ + if (num_held_fault_sinks >= CF_FAULT_SINKS_MAX) { + cf_warning(CF_MISC, "too many fault sinks"); + return NULL; + } + + cf_fault_sink *s = &cf_fault_sinks[num_held_fault_sinks]; + + s->path = cf_strdup(path); + + // If a context is not added, its runtime default will be CF_INFO. + for (int i = 0; i < CF_FAULT_CONTEXT_UNDEF; i++) { + s->limit[i] = CF_INFO; + } + + num_held_fault_sinks++; + + return s; +} + + +/* cf_fault_console_is_held + * Return whether the console is held. + */ +bool +cf_fault_console_is_held() +{ + for (int i = 0; i < num_held_fault_sinks; i++) { + cf_fault_sink *s = &cf_fault_sinks[i]; + if (!strcmp(s->path, "stderr")) { + return true; + } + } + + return false; +} + + +static void +fault_filter_adjust(cf_fault_sink *s, cf_fault_context ctx) +{ + // Don't adjust filter while adding contexts during config file parsing. + if (cf_fault_sinks_inuse == 0) { + return; + } + + // Fault filter must allow logs at a less critical severity.
+ if (s->limit[ctx] > cf_fault_filter[ctx]) { + cf_fault_set_severity(ctx, s->limit[ctx]); + } + // Fault filter might be able to become stricter - check all sinks. + else if (s->limit[ctx] < cf_fault_filter[ctx]) { + cf_fault_severity severity = CF_CRITICAL; + + for (int i = 0; i < cf_fault_sinks_inuse; i++) { + cf_fault_sink *t = &cf_fault_sinks[i]; + + if (t->limit[ctx] > severity) { + severity = t->limit[ctx]; + } + } + + cf_fault_set_severity(ctx, severity); + } +} + + +/* cf_fault_sink_activate_all_held + * Activate all sinks on hold - return 0 on success, -1 on failure. Only use + * once at startup, after parsing config file. On failure there's no cleanup, + * assumes caller will stop the process. */ +int +cf_fault_sink_activate_all_held() +{ + for (int i = 0; i < num_held_fault_sinks; i++) { + if (cf_fault_sinks_inuse >= CF_FAULT_SINKS_MAX) { + // In case this isn't first sink, force logging as if no sinks: + cf_fault_sinks_inuse = 0; + cf_warning(CF_MISC, "too many fault sinks"); + return -1; + } + + cf_fault_sink *s = &cf_fault_sinks[i]; + + // "Activate" the sink. + if (0 == strncmp(s->path, "stderr", 6)) { + s->fd = 2; + } + else if (-1 == (s->fd = open(s->path, SINK_OPEN_FLAGS, SINK_OPEN_MODE))) { + // In case this isn't first sink, force logging as if no sinks: + cf_fault_sinks_inuse = 0; + cf_warning(CF_MISC, "can't open %s: %s", s->path, cf_strerror(errno)); + return -1; + } + + cf_fault_sinks_inuse++; + + // Adjust the fault filter to the runtime levels. + for (int j = 0; j < CF_FAULT_CONTEXT_UNDEF; j++) { + fault_filter_adjust(s, (cf_fault_context)j); + } + } + + return 0; +} + + +/* cf_fault_sink_get_fd_list + * Fill list with all active sink fds, excluding stderr - return list count. */ +int +cf_fault_sink_get_fd_list(int *fds) +{ + int num_open_fds = 0; + + for (int i = 0; i < cf_fault_sinks_inuse; i++) { + cf_fault_sink *s = &cf_fault_sinks[i]; + + // Exclude stderr. 
+ if (s->fd > 2 && 0 != strncmp(s->path, "stderr", 6)) { + fds[num_open_fds++] = s->fd; + } + } + + return num_open_fds; +} + + +static int +cf_fault_sink_addcontext_all(char *context, char *severity) +{ + for (int i = 0; i < cf_fault_sinks_inuse; i++) { + cf_fault_sink *s = &cf_fault_sinks[i]; + int rv = cf_fault_sink_addcontext(s, context, severity); + if (rv != 0) return(rv); + } + return(0); +} + + +int +cf_fault_sink_addcontext(cf_fault_sink *s, char *context, char *severity) +{ + if (s == 0) return(cf_fault_sink_addcontext_all(context, severity)); + + cf_fault_context ctx = CF_FAULT_CONTEXT_UNDEF; + cf_fault_severity sev = CF_FAULT_SEVERITY_UNDEF; + + for (int i = 0; i < CF_FAULT_SEVERITY_UNDEF; i++) { + if (0 == strncasecmp(cf_fault_severity_strings[i], severity, strlen(severity))) + sev = (cf_fault_severity)i; + } + if (CF_FAULT_SEVERITY_UNDEF == sev) + return(-1); + + if (0 == strncasecmp(context, "any", 3)) { + for (int i = 0; i < CF_FAULT_CONTEXT_UNDEF; i++) { + s->limit[i] = sev; + fault_filter_adjust(s, (cf_fault_context)i); + } + } else { + for (int i = 0; i < CF_FAULT_CONTEXT_UNDEF; i++) { + //strncasecmp only compared the length of context passed in the 3rd argument and as cf_fault_context_strings has info and info port, + //So when you try to set info to debug it will set info-port to debug . 
Just forcing it to check the length from cf_fault_context_strings + if (0 == strncasecmp(cf_fault_context_strings[i], context, strlen(cf_fault_context_strings[i]))) + ctx = (cf_fault_context)i; + } + if (CF_FAULT_CONTEXT_UNDEF == ctx) + return(-1); + + s->limit[ctx] = sev; + fault_filter_adjust(s, ctx); + } + + return(0); +} + + +void +cf_fault_use_local_time(bool val) +{ + g_use_local_time = val; +} + +bool +cf_fault_is_using_local_time() +{ + return g_use_local_time; +} + +void +cf_fault_log_millis(bool log_millis) +{ + g_log_millis = log_millis; +} + +bool +cf_fault_is_logging_millis() +{ + return g_log_millis; +} + +int +cf_sprintf_now(char* mbuf, size_t limit) +{ + struct tm nowtm; + + if (cf_fault_is_logging_millis()) { + // Logging milli seconds as well. + struct timeval curTime; + gettimeofday(&curTime, NULL); + int millis = curTime.tv_usec / 1000; + int pos = 0; + if (g_use_local_time) { + localtime_r(&curTime.tv_sec, &nowtm); + pos = strftime(mbuf, limit, "%b %d %Y %T.", &nowtm); + pos += + snprintf(mbuf + pos, limit - pos, "%03d", millis); + pos += + strftime(mbuf + pos, limit - pos, " GMT%z: ", &nowtm); + return pos; + } else { + gmtime_r(&curTime.tv_sec, &nowtm); + pos = strftime(mbuf, limit, "%b %d %Y %T.", &nowtm); + pos += + snprintf(mbuf + pos, limit - pos, "%03d", millis); + pos += + strftime(mbuf + pos, limit - pos, " %Z: ", &nowtm); + return pos; + } + } + + // Logging only seconds. + time_t now = time(NULL); + + if (g_use_local_time) { + localtime_r(&now, &nowtm); + return strftime(mbuf, limit, "%b %d %Y %T GMT%z: ", &nowtm); + } else { + gmtime_r(&now, &nowtm); + return strftime(mbuf, limit, "%b %d %Y %T %Z: ", &nowtm); + } +} + +/* cf_fault_event + * Respond to a fault */ +void +cf_fault_event(const cf_fault_context context, const cf_fault_severity severity, + const char *file_name, const int line, const char *msg, ...) +{ + va_list argp; + char mbuf[1024]; + size_t pos; + + + /* Make sure there's always enough space for the \n\0. 
*/ + size_t limit = sizeof(mbuf) - 2; + + /* Set the timestamp */ + pos = cf_sprintf_now(mbuf, limit); + + /* Set the context/scope/severity tag */ + pos += snprintf(mbuf + pos, limit - pos, "%s (%s): ", severity_tag(severity), cf_fault_context_strings[context]); + + /* + * snprintf() and vsnprintf() will not write more than the size specified, + * but they return the size that would have been written without truncation. + * These checks make sure there's enough space for the final \n\0. + */ + if (pos > limit) { + pos = limit; + } + + /* Set the location: filename and line number */ + if (file_name) { + pos += snprintf(mbuf + pos, limit - pos, "(%s:%d) ", file_name, line); + } + + if (pos > limit) { + pos = limit; + } + + /* Append the message */ + va_start(argp, msg); + pos += vsnprintf(mbuf + pos, limit - pos, msg, argp); + va_end(argp); + + if (pos > limit) { + pos = limit; + } + + pos += snprintf(mbuf + pos, 2, "\n"); + + /* Route the message to the correct destinations */ + if (0 == cf_fault_sinks_inuse) { + /* If no fault sinks are defined, use stderr for important messages */ + if (severity <= NO_SINKS_LIMIT) + fprintf(stderr, "%s", mbuf); + } else { + for (int i = 0; i < cf_fault_sinks_inuse; i++) { + if ((severity <= cf_fault_sinks[i].limit[context]) || (CF_CRITICAL == severity)) { + if (0 >= write(cf_fault_sinks[i].fd, mbuf, pos)) { + // this is OK for a bit in case of a HUP. It's even better to queue the buffers and apply them + // after the hup. TODO. + fprintf(stderr, "internal failure in fault message write: %s\n", cf_strerror(errno)); + } + } + } + } + + /* Critical errors */ + if (CF_CRITICAL == severity) { + fflush(NULL); + + // Our signal handler will log a stack trace. + raise(SIGUSR1); + } +} // end cf_fault_event() + + +/** + * Generate a Packed Hex String Representation of the binary string. + * e.g. 
0xfc86e83a6d6d3024659e6fe48c351aaaf6e964a5 + * The value is preceeded by a "0x" to denote Hex (which allows it to be + * used in other contexts as a hex number). + */ +int +generate_packed_hex_string(const void *mem_ptr, uint32_t len, char* output) +{ + uint8_t *d = (uint8_t *) mem_ptr; + char* p = output; + char* startp = p; // Remember where we started. + + *p++ = '0'; + *p++ = 'x'; + + for (uint32_t i = 0; i < len; i++) { + sprintf(p, "%02x", d[i]); + p += 2; + } + *p++ = 0; // Null terminate the output buffer. + return (int) (p - startp); // show how much space we used. +} // end generate_packed_hex_string() + + +/** + * Generate a Spaced Hex String Representation of the binary string. + * e.g. fc 86 e8 3a 6d 6d 30 24 65 9e 6f e4 8c 35 1a aa f6 e9 64 a5 + */ +int +generate_spaced_hex_string(const void *mem_ptr, uint32_t len, char* output) +{ + uint8_t *d = (uint8_t *) mem_ptr; + char* p = output; + char* startp = p; // Remember where we started. + + for (uint32_t i = 0; i < len; i++) { + sprintf(p, "%02x ", d[i]); // Notice the space after the 02x. + p += 3; + } + *p++ = 0; // Null terminate the output buffer. + return (int) (p - startp); // show how much space we used. +} // end generate_spaced_hex_string() + + +/** + * Generate a Column Hex String Representation of the binary string. + * The Columns will be four two-byte values, with spaces between the bytes: + * fc86 e83a 6d6d 3024 + * 659e 6fe4 8c35 1aaa + * f6e9 64a5 + */ +int +generate_column_hex_string(const void *mem_ptr, uint32_t len, char* output) +{ + uint8_t *d = (uint8_t *) mem_ptr; + char* p = output; + uint32_t i; + char* startp = p; // Remember where we started. + + *p++ = '\n'; // Start out on a new line + + for (i = 0; i < len; i++) { + sprintf(p, "%02x ", d[i]); // Two chars and a space + p += 3; + if ((i+1) % 8 == 0 && i != 0) { + *p++ = '\n'; // add a line return + } + } + *p++ = '\n'; // Finish with a new line + *p++ = 0; // Null terminate the output buffer. 
+ return (int) (p - startp); // show how much space we used. +} // end generate_column_hex_string() + + +/** + * Generate a Base64 String Representation of the binary string. + * Base64 encoding converts three octets into four 6-bit encoded characters. + * So, the string 8-bit bytes are broken down into 6 bit values, each of which + * is then converted into a base64 value. + * So, for example, the string "Man" :: M[77: 0x4d)] a[97(0x61)] n[110(0x6e)] + * Bits: (4)0100 (d)1101 (6)0110 (1)0001 (6)0110 (e)1110 + * Base 64 bits: 010011 010110 000101 101110 + * Base 64 Rep: 010011(19) 010110(22) 000101(5) 101110(46) + * Base 64 Chars: T(19) W(22) F(5) u(46) + * and so this string is converted into the Base 64 string: "TWFu" + */ +int generate_base64_string(const void *mem_ptr, uint32_t len, char output_buf[]) +{ + uint32_t encoded_len = cf_b64_encoded_len(len); + // TODO - check that output_buf is big enough, and/or truncate. + + cf_b64_encode((const uint8_t*)mem_ptr, len, output_buf); + + output_buf[encoded_len] = 0; // null-terminate + + return (int)(encoded_len + 1); // bytes we used, including null-terminator +} // end generate_base64_hex_string() + + +/** + * Generate a BIT representation with spaces between the four bit groups. + * Print the bits left to right (big to small). + * This is assuming BIG ENDIAN representation (most significant bit is left). + */ +int generate_4spaced_bits_string(const void *mem_ptr, uint32_t len, char* output) +{ + uint8_t *d = (uint8_t *) mem_ptr; + char* p = output; + uint8_t uint_val; + uint8_t mask = 0x80; // largest single bit value in a byte + char* startp = p; // Remember where we started. + + // For each byte in the string + for (uint32_t i = 0; i < len; i++) { + uint_val = d[i]; + for (int j = 0; j < 8; j++) { + sprintf(p, "%1d", ((uint_val << j) & mask)); + p++; + // Add a space after every 4th bit + if ( (j+1) % 4 == 0 ) *p++ = ' '; + } + } + *p++ = 0; // Null terminate the output buffer. 
+ return (int) (p - startp); // show how much space we used. +} // end generate_4spaced_bits_string() + +/** + * Generate a BIT representation of columns with spaces between the + * four bit groups. Columns will be 8 columns of 4 bits. + * (1 32 bit word per row) + */ +int generate_column_bits_string(const void *mem_ptr, uint32_t len, char* output) +{ + uint8_t *d = (uint8_t *) mem_ptr; + char* p = output; + uint8_t uint_val; + uint8_t mask = 0x80; // largest single bit value in a byte + char* startp = p; // Remember where we started. + + // Start on a new line + *p++ = '\n'; + + // For each byte in the string + for (uint32_t i = 0; i < len; i++) { + uint_val = d[i]; + for (int j = 0; j < 8; j++) { + sprintf(p, "%1d", ((uint_val << j) & mask)); + p++; + // Add a space after every 4th bit + if ((j + 1) % 4 == 0) *p++ = ' '; + } + // Add a line return after every 4th byte + if ((i + 1) % 4 == 0) *p++ = '\n'; + } + *p++ = 0; // Null terminate the output buffer. + return (int) (p - startp); // show how much space we used. +} // end generate_column_bits_string() + + +/* cf_fault_event -- TWO: Expand on the LOG ability by being able to + * print the contents of a BINARY array if we're passed a valid ptr (not NULL). + * We will print the array according to "format". + * Parms: + * (*) scope: The module family (e.g. AS_RW, AS_UDF...) + * (*) severify: The scope severity (e.g. INFO, DEBUG, DETAIL) + * (*) file_name: Ptr to the FILE generating the call + * (*) line: The function (really, the FILE) line number of the source call + * (*) mem_ptr: Ptr to memory location of binary array (or NULL) + * (*) len: Length of the binary string + * (*) format: The single char showing the format (e.g. 'D', 'B', etc) + * (*) msg: The format msg string + * (*) ... : The variable set of parameters the correspond to the msg string. 
+ * + * NOTE: We will eventually merge this function with the original cf_fault_event() + **/ +void +cf_fault_event2(const cf_fault_context context, + const cf_fault_severity severity, const char *file_name, const int line, + const void *mem_ptr, size_t len, cf_display_type dt, const char *msg, ...) +{ + va_list argp; + char mbuf[MAX_BINARY_BUF_SZ]; + size_t pos; + + char binary_buf[MAX_BINARY_BUF_SZ]; + + // Arbitrarily limit output to a fixed maximum length. + if (len > MAX_BINARY_BUF_SZ) { + len = MAX_BINARY_BUF_SZ; + } + char * labelp = NULL; // initialize to quiet build warning + + /* Make sure there's always enough space for the \n\0. */ + size_t limit = sizeof(mbuf) - 2; + + /* Set the timestamp */ + pos = cf_sprintf_now(mbuf, limit); + + // If we're given a valid MEMORY POINTER for a binary value, then + // compute the string that corresponds to the bytes. + if (mem_ptr) { + switch (dt) { + case CF_DISPLAY_HEX_DIGEST: + labelp = "Digest"; + generate_packed_hex_string(mem_ptr, len, binary_buf); + break; + case CF_DISPLAY_HEX_SPACED: + labelp = "HexSpaced"; + generate_spaced_hex_string(mem_ptr, len, binary_buf); + break; + case CF_DISPLAY_HEX_PACKED: + labelp = "HexPacked"; + generate_packed_hex_string(mem_ptr, len, binary_buf); + break; + case CF_DISPLAY_HEX_COLUMNS: + labelp = "HexColumns"; + generate_column_hex_string(mem_ptr, len, binary_buf); + break; + case CF_DISPLAY_BASE64: + labelp = "Base64"; + generate_base64_string(mem_ptr, len, binary_buf); + break; + case CF_DISPLAY_BITS_SPACED: + labelp = "BitsSpaced"; + generate_4spaced_bits_string(mem_ptr, len, binary_buf); + break; + case CF_DISPLAY_BITS_COLUMNS: + labelp = "BitsColumns"; + generate_column_bits_string(mem_ptr, len, binary_buf); + break; + default: + labelp = "Unknown Format"; + binary_buf[0] = 0; // make sure it's null terminated. 
+ break; + + } // end switch + } // if binary data is present + + /* Set the context/scope/severity tag */ + pos += snprintf(mbuf + pos, limit - pos, "%s (%s): ", + severity_tag(severity), + cf_fault_context_strings[context]); + + /* + * snprintf() and vsnprintf() will not write more than the size specified, + * but they return the size that would have been written without truncation. + * These checks make sure there's enough space for the final \n\0. + */ + if (pos > limit) { + pos = limit; + } + + /* Set the location: filename and line number */ + if (file_name) { + pos += snprintf(mbuf + pos, limit - pos, "(%s:%d) ", file_name, line); + } + + // Check for overflow (see above). + if (pos > limit) { + pos = limit; + } + + /* Append the message */ + va_start(argp, msg); + pos += vsnprintf(mbuf + pos, limit - pos, msg, argp); + va_end(argp); + + // Check for overflow (see above). + if (pos > limit) { + pos = limit; + } + + // Append our final BINARY string, if present (some might pass in NULL). + if ( mem_ptr ) { + pos += snprintf(mbuf + pos, limit - pos, "<%s>:%s", labelp, binary_buf); + } + + // Check for overflow (see above). + if (pos > limit) { + pos = limit; + } + + pos += snprintf(mbuf + pos, 2, "\n"); + + /* Route the message to the correct destinations */ + if (0 == cf_fault_sinks_inuse) { + /* If no fault sinks are defined, use stderr for critical messages */ + if (CF_CRITICAL == severity) + fprintf(stderr, "%s", mbuf); + } else { + for (int i = 0; i < cf_fault_sinks_inuse; i++) { + if ((severity <= cf_fault_sinks[i].limit[context]) || (CF_CRITICAL == severity)) { + if (0 >= write(cf_fault_sinks[i].fd, mbuf, pos)) { + // this is OK for a bit in case of a HUP. It's even better to queue the buffers and apply them + // after the hup. TODO. 
+ fprintf(stderr, "internal failure in fault message write: %s\n", cf_strerror(errno)); + } + } + } + } + + /* Critical errors */ + if (CF_CRITICAL == severity) { + fflush(NULL); + + // Our signal handler will log a stack trace. + raise(SIGUSR1); + } +} + + +void +cf_fault_event_nostack(const cf_fault_context context, + const cf_fault_severity severity, const char *fn, const int line, + const char *msg, ...) +{ + va_list argp; + char mbuf[1024]; + time_t now; + struct tm nowtm; + size_t pos; + + /* Make sure there's always enough space for the \n\0. */ + size_t limit = sizeof(mbuf) - 2; + + /* Set the timestamp */ + now = time(NULL); + + if (g_use_local_time) { + localtime_r(&now, &nowtm); + pos = strftime(mbuf, limit, "%b %d %Y %T GMT%z: ", &nowtm); + } + else { + gmtime_r(&now, &nowtm); + pos = strftime(mbuf, limit, "%b %d %Y %T %Z: ", &nowtm); + } + + /* Set the context/scope/severity tag */ + pos += snprintf(mbuf + pos, limit - pos, "%s (%s): ", severity_tag(severity), cf_fault_context_strings[context]); + + /* + * snprintf() and vsnprintf() will not write more than the size specified, + * but they return the size that would have been written without truncation. + * These checks make sure there's enough space for the final \n\0. 
+ */ + if (pos > limit) { + pos = limit; + } + + /* Set the location */ + if (fn) + pos += snprintf(mbuf + pos, limit - pos, "(%s:%d) ", fn, line); + + if (pos > limit) { + pos = limit; + } + + /* Append the message */ + va_start(argp, msg); + pos += vsnprintf(mbuf + pos, limit - pos, msg, argp); + va_end(argp); + + if (pos > limit) { + pos = limit; + } + + pos += snprintf(mbuf + pos, 2, "\n"); + + /* Route the message to the correct destinations */ + if (0 == cf_fault_sinks_inuse) { + /* If no fault sinks are defined, use stderr for important messages */ + if (severity <= NO_SINKS_LIMIT) + fprintf(stderr, "%s", mbuf); + } else { + for (int i = 0; i < cf_fault_sinks_inuse; i++) { + if ((severity <= cf_fault_sinks[i].limit[context]) || (CF_CRITICAL == severity)) { + if (0 >= write(cf_fault_sinks[i].fd, mbuf, pos)) { + // this is OK for a bit in case of a HUP. It's even better to queue the buffers and apply them + // after the hup. TODO. + fprintf(stderr, "internal failure in fault message write: %s\n", cf_strerror(errno)); + } + } + } + } + + /* Critical errors */ + if (CF_CRITICAL == severity) { + fflush(NULL); + + // these signals don't throw stack traces in our system + raise(SIGINT); + } +} + + +int +cf_fault_sink_strlist(cf_dyn_buf *db) +{ + for (int i = 0; i < cf_fault_sinks_inuse; i++) { + cf_dyn_buf_append_int(db, i); + cf_dyn_buf_append_char(db, ':'); + cf_dyn_buf_append_string(db,cf_fault_sinks[i].path); + cf_dyn_buf_append_char(db, ';'); + } + cf_dyn_buf_chomp(db); + return(0); +} + + +extern void +cf_fault_sink_logroll(void) +{ + fprintf(stderr, "cf_fault: rolling log files\n"); + for (int i = 0; i < cf_fault_sinks_inuse; i++) { + cf_fault_sink *s = &cf_fault_sinks[i]; + if ((0 != strncmp(s->path, "stderr", 6)) && (s->fd > 2)) { + int fd = s->fd; + s->fd = -1; + usleep(1); + + // hopefully, the file has been relinked elsewhere - or you're OK losing it + unlink(s->path); + close(fd); + + fd = open(s->path, SINK_OPEN_FLAGS, SINK_OPEN_MODE); + s->fd = fd; + 
} + } +} + + +cf_fault_sink *cf_fault_sink_get_id(int id) +{ + if (id > cf_fault_sinks_inuse) return(0); + return ( &cf_fault_sinks[id] ); +} + +int +cf_fault_sink_context_all_strlist(int sink_id, cf_dyn_buf *db) +{ + // get the sink + if (sink_id > cf_fault_sinks_inuse) return(-1); + cf_fault_sink *s = &cf_fault_sinks[sink_id]; + + for (int i = 0; i < CF_FAULT_CONTEXT_UNDEF; i++) { + cf_dyn_buf_append_string(db, cf_fault_context_strings[i]); + cf_dyn_buf_append_char(db, ':'); + cf_dyn_buf_append_string(db, cf_fault_severity_strings[s->limit[i]]); + cf_dyn_buf_append_char(db, ';'); + } + cf_dyn_buf_chomp(db); + return(0); +} + +int +cf_fault_sink_context_strlist(int sink_id, char *context, cf_dyn_buf *db) +{ + // get the sink + if (sink_id > cf_fault_sinks_inuse) return(-1); + cf_fault_sink *s = &cf_fault_sinks[sink_id]; + + // get the severity + int i; + for (i = 0; i < CF_FAULT_CONTEXT_UNDEF; i++) { + if (0 == strcmp(cf_fault_context_strings[i],context)) + break; + } + if (i == CF_FAULT_CONTEXT_UNDEF) { + cf_dyn_buf_append_string(db, context); + cf_dyn_buf_append_string(db, ":unknown"); + return(0); + } + + // get the string + cf_dyn_buf_append_string(db, context); + cf_dyn_buf_append_char(db, ':'); + cf_dyn_buf_append_string(db, cf_fault_severity_strings[s->limit[i]]); + return(0); +} + + +static int +cf_fault_cache_reduce_fn(const void *key, void *data, void *udata) +{ + uint32_t *count = (uint32_t*)data; + + if (*count == 0) { + return CF_SHASH_REDUCE_DELETE; + } + + const cf_fault_cache_hkey *hkey = (const cf_fault_cache_hkey*)key; + + cf_fault_event(hkey->context, hkey->severity, hkey->file_name, hkey->line, + "(repeated:%u) %s", *count, hkey->msg); + + *count = 0; + + return CF_SHASH_OK; +} + + +// For now there's only one cache, dumped by the ticker. +void +cf_fault_dump_cache() +{ + cf_shash_reduce(g_ticker_hash, cf_fault_cache_reduce_fn, NULL); +} + + +// For now there's only one cache, dumped by the ticker. 
+void +cf_fault_cache_event(cf_fault_context context, cf_fault_severity severity, + const char *file_name, int line, char *msg, ...) +{ + cf_fault_cache_hkey key = { + .line = line, + .context = context, + .file_name = file_name, + .severity = severity, + .msg = { 0 } // must pad hash keys + }; + + size_t limit = sizeof(key.msg) - 1; // truncate leaving null-terminator + + va_list argp; + + va_start(argp, msg); + vsnprintf(key.msg, limit, msg, argp); + va_end(argp); + + while (true) { + uint32_t *valp = NULL; + pthread_mutex_t *lockp = NULL; + + if (cf_shash_get_vlock(g_ticker_hash, &key, (void**)&valp, &lockp) == + CF_SHASH_OK) { + // Already in hash - increment count and don't log it. + (*valp)++; + pthread_mutex_unlock(lockp); + break; + } + // else - not found, add it to hash and log it. + + uint32_t initv = 1; + + if (cf_shash_put_unique(g_ticker_hash, &key, &initv) == + CF_SHASH_ERR_FOUND) { + continue; // other thread beat us to it - loop around and get it + } + + cf_fault_event(context, severity, file_name, line, "%s", key.msg); + break; + } +} + +void +cf_fault_hex_dump(const char *title, const void *data, size_t len) +{ + const uint8_t *data8 = data; + char line[8 + 3 * 16 + 17]; + size_t k; + + cf_info(CF_MISC, "hex dump - %s", title); + + for (size_t i = 0; i < len; i += k) { + sprintf(line, "%06zx: ", i); + + for (k = 0; i + k < len && k < 16; ++k) { + char num[3]; + uint8_t d = data8[i + k]; + sprintf(num, "%02x", d); + line[8 + 3 * k + 0] = num[0]; + line[8 + 3 * k + 1] = num[1]; + line[8 + 3 * 16 + k] = d >= 32 && d <= 126 ? d : '.'; + } + + cf_info(CF_MISC, "%s", line); + } +} diff --git a/cf/src/hardware.c b/cf/src/hardware.c new file mode 100644 index 00000000..e0358436 --- /dev/null +++ b/cf/src/hardware.c @@ -0,0 +1,1791 @@ +/* + * hardware.c + * + * Copyright (C) 2016-2017 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. 
+ * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#include "hardware.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "daemon.h" +#include "fault.h" +#include "socket.h" + +#include "citrusleaf/alloc.h" + +#include "warnings.h" + +// Only available in Linux kernel version 3.19 and later; but we'd like to +// allow compilation with older kernel headers. 
+#if !defined SO_INCOMING_CPU +#define SO_INCOMING_CPU 49 +#endif + +#define INVALID_INDEX ((uint16_t)-1) +#define POLICY_SCRIPT "/etc/aerospike/irqbalance-ban.sh" + +#define MEM_PAGE_SIZE (4096L) + +typedef enum { + FILE_RES_OK, + FILE_RES_NOT_FOUND, + FILE_RES_ERROR +} file_res; + +typedef enum { + CHECK_PROC_PRESENT, + CHECK_PROC_PRESENT_NO_ARG, + CHECK_PROC_ABSENT +} check_proc_res; + +typedef uint16_t os_numa_node_index; +typedef uint16_t os_package_index; +typedef uint16_t os_core_index; + +typedef uint16_t irq_number; + +typedef struct { + uint16_t n_irqs; + irq_number irqs[CPU_SETSIZE]; + uint16_t per_cpu; +} irq_list; + +static cpu_set_t g_os_cpus_online; +static cpu_set_t g_numa_node_os_cpus_online[CPU_SETSIZE]; + +static uint16_t g_n_numa_nodes; +static uint16_t g_n_cores; +static uint16_t g_n_os_cpus; +static uint16_t g_n_cpus; +static uint16_t g_n_irq_cpus; + +static os_numa_node_index g_numa_node_index_to_os_numa_node_index[CPU_SETSIZE]; +static cf_topo_os_cpu_index g_core_index_to_os_cpu_index[CPU_SETSIZE]; +static cf_topo_os_cpu_index g_cpu_index_to_os_cpu_index[CPU_SETSIZE]; +static cf_topo_cpu_index g_os_cpu_index_to_cpu_index[CPU_SETSIZE]; + +static cf_topo_numa_node_index g_i_numa_node; + +static file_res +read_file(const char *path, void *buff, size_t *limit) +{ + cf_detail(CF_HARDWARE, "reading file %s with buffer size %zu", path, *limit); + int32_t fd = open(path, O_RDONLY); + + if (fd < 0) { + if (errno == ENOENT) { + cf_detail(CF_HARDWARE, "file %s not found", path); + return FILE_RES_NOT_FOUND; + } + + cf_warning(CF_HARDWARE, "error while opening file %s for reading: %d (%s)", + path, errno, cf_strerror(errno)); + return FILE_RES_ERROR; + } + + size_t total = 0; + + while (total < *limit) { + cf_detail(CF_HARDWARE, "reading %zd byte(s) at offset %zu", *limit - total, total); + ssize_t len = read(fd, (uint8_t *)buff + total, *limit - total); + CF_NEVER_FAILS(len); + + if (len == 0) { + cf_detail(CF_HARDWARE, "EOF"); + break; + } + + total 
+= (size_t)len; + } + + cf_detail(CF_HARDWARE, "read %zu byte(s) from file %s", total, path); + file_res res; + + if (total == *limit) { + cf_warning(CF_HARDWARE, "read buffer too small for file %s", path); + res = FILE_RES_ERROR; + } + else { + res = FILE_RES_OK; + *limit = total; + } + + CF_NEVER_FAILS(close(fd)); + return res; +} + +static file_res +write_file(const char *path, const void *buff, size_t limit) +{ + cf_detail(CF_HARDWARE, "writing file %s with buffer size %zu", path, limit); + int32_t fd = open(path, O_WRONLY | O_CREAT | O_TRUNC, 0600); + + if (fd < 0) { + if (errno == ENOENT) { + cf_detail(CF_HARDWARE, "file %s not found", path); + return FILE_RES_NOT_FOUND; + } + + cf_warning(CF_HARDWARE, "error while opening file %s for writing: %d (%s)", + path, errno, cf_strerror(errno)); + return FILE_RES_ERROR; + } + + size_t total = 0; + + while (total < limit) { + cf_detail(CF_HARDWARE, "writing %zd byte(s) at offset %zu", limit - total, total); + ssize_t len = write(fd, (uint8_t *)buff + total, limit - total); + CF_NEVER_FAILS(len); + total += (size_t)len; + } + + cf_detail(CF_HARDWARE, "done writing"); + CF_NEVER_FAILS(close(fd)); + return FILE_RES_OK; +} + +static void +write_file_safe(const char *path, const void *buff, size_t limit) +{ + if (write_file(path, buff, limit) != FILE_RES_OK) { + cf_crash(CF_HARDWARE, "write failed unexpectedly"); + } +} + +static DIR * +opendir_safe(const char *path) +{ + DIR *dir = opendir(path); + + if (dir == NULL) { + cf_crash(CF_HARDWARE, "error while opening directory: %d (%s)", + errno, cf_strerror(errno)); + } + + return dir; +} + +static int32_t +readdir_safe(DIR *dir, struct dirent *ent) +{ + while (true) { + errno = 0; + struct dirent *tmp = readdir(dir); + + if (tmp == NULL) { + if (errno != 0) { + cf_crash(CF_HARDWARE, "error while reading directory: %d (%s)", + errno, cf_strerror(errno)); + } + + return -1; + } + + if (strcmp(tmp->d_name, ".") == 0 || strcmp(tmp->d_name, "..") == 0) { + continue; + } + + 
memcpy(ent, tmp, sizeof(struct dirent)); + return 0; + } +} + +static void +closedir_safe(DIR *dir) +{ + if (closedir(dir) < 0) { + cf_crash(CF_HARDWARE, "error while closing PCI device directory: %d (%s)", + errno, cf_strerror(errno)); + } +} + +static bool +path_exists(const char *path) +{ + struct stat stat_info; + + if (stat(path, &stat_info) < 0) { + if (errno == ENOENT) { + return false; + } + + cf_crash(CF_HARDWARE, "error while checking for path %s: %d (%s)", + path, errno, cf_strerror(errno)); + } + + return true; +} + +static void +set_mempolicy_safe(uint32_t mode, uint64_t *node_mask, size_t max_node) +{ + if (syscall(__NR_set_mempolicy, mode, node_mask, max_node) < 0) { + cf_crash(CF_HARDWARE, "set_mempolicy() system call failed: %d (%s)", + errno, cf_strerror(errno)); + } +} + +static void +migrate_pages_safe(pid_t pid, size_t max_node, uint64_t *from_mask, uint64_t *to_mask) +{ + int64_t res = syscall(__NR_migrate_pages, pid, max_node, from_mask, to_mask); + + if (res < 0) { + cf_crash(CF_HARDWARE, "migrate_pages() syscall failed: %d (%s)", + errno, cf_strerror(errno)); + } + + if (res > 0) { + cf_warning(CF_HARDWARE, "could not NUMA-migrate %" PRId64 " page(s)", res); + } +} + +static void +mask_to_string(cpu_set_t *mask, char *buff, size_t limit) +{ + cf_topo_os_cpu_index max; + + for (max = CPU_SETSIZE - 1; max > 0; --max) { + if (CPU_ISSET(max, mask)) { + break; + } + } + + int32_t words = max / 32 + 1; + size_t size = (size_t)words * 9; + + if (size > limit) { + cf_crash(CF_HARDWARE, "CPU mask buffer overflow: %zu vs. 
%zu", size, limit); + } + + for (int32_t i = words - 1; i >= 0; --i) { + uint32_t val = 0; + + for (int32_t k = 0; k < 32; ++k) { + if (CPU_ISSET((size_t)(i * 32 + k), mask)) { + val |= 1u << k; + } + } + + snprintf(buff, limit, "%08x", val); + + if (i > 0) { + buff[8] = ','; + } + + buff += 9; + limit -= 9; + } +} + +static file_res +read_index(const char *path, uint16_t *val) +{ + cf_detail(CF_HARDWARE, "reading index from file %s", path); + char buff[100]; + size_t limit = sizeof(buff); + file_res res = read_file(path, buff, &limit); + + if (res != FILE_RES_OK) { + return res; + } + + buff[limit - 1] = '\0'; + cf_detail(CF_HARDWARE, "parsing index \"%s\"", buff); + + char *end; + uint64_t x = strtoul(buff, &end, 10); + + if (*end != '\0' || x >= CPU_SETSIZE) { + cf_warning(CF_HARDWARE, "invalid index \"%s\" in %s", buff, path); + return FILE_RES_ERROR; + } + + *val = (uint16_t)x; + return FILE_RES_OK; +} + +static file_res +read_list(const char *path, cpu_set_t *mask) +{ + cf_detail(CF_HARDWARE, "reading list from file %s", path); + char buff[1000]; + size_t limit = sizeof(buff); + file_res res = read_file(path, buff, &limit); + + if (res != FILE_RES_OK) { + return res; + } + + buff[limit - 1] = '\0'; + cf_detail(CF_HARDWARE, "parsing list \"%s\"", buff); + + CPU_ZERO(mask); + char *walker = buff; + + while (true) { + char *delim; + uint64_t from = strtoul(walker, &delim, 10); + uint64_t thru; + + if (*delim == ',' || *delim == '\0'){ + thru = from; + } + else if (*delim == '-') { + walker = delim + 1; + thru = strtoul(walker, &delim, 10); + } + else { + cf_warning(CF_HARDWARE, "invalid list \"%s\" in %s", buff, path); + return FILE_RES_ERROR; + } + + if (from >= CPU_SETSIZE || thru >= CPU_SETSIZE || from > thru) { + cf_warning(CF_HARDWARE, "invalid list \"%s\" in %s", buff, path); + return FILE_RES_ERROR; + } + + cf_detail(CF_HARDWARE, "marking %d through %d", (int32_t)from, (int32_t)thru); + + for (size_t i = from; i <= thru; ++i) { + CPU_SET(i, mask); + } + + 
if (*delim == '\0') { + break; + } + + walker = delim + 1; + } + + char buff2[1000]; + mask_to_string(mask, buff2, sizeof(buff2)); + cf_detail(CF_HARDWARE, "list \"%s\" -> mask %s", buff, buff2); + + return FILE_RES_OK; +} + +static void +detect(cf_topo_numa_node_index a_numa_node) +{ + if (a_numa_node == INVALID_INDEX) { + cf_detail(CF_HARDWARE, "detecting online CPUs"); + } + else { + cf_detail(CF_HARDWARE, "detecting online CPUs on NUMA node %hu", a_numa_node); + } + + if (read_list("/sys/devices/system/cpu/online", &g_os_cpus_online) != FILE_RES_OK) { + cf_crash(CF_HARDWARE, "error while reading list of online CPUs"); + } + + cf_detail(CF_HARDWARE, "learning CPU topology"); + + cf_topo_numa_node_index os_numa_node_index_to_numa_node_index[CPU_SETSIZE]; + + for (int32_t i = 0; i < CPU_SETSIZE; ++i) { + CPU_ZERO(&g_numa_node_os_cpus_online[i]); + + g_core_index_to_os_cpu_index[i] = INVALID_INDEX; + g_cpu_index_to_os_cpu_index[i] = INVALID_INDEX; + g_os_cpu_index_to_cpu_index[i] = INVALID_INDEX; + + os_numa_node_index_to_numa_node_index[i] = INVALID_INDEX; + g_numa_node_index_to_os_numa_node_index[i] = INVALID_INDEX; + } + + cpu_set_t covered_numa_nodes; + cpu_set_t covered_cores[CPU_SETSIZE]; // One mask per package. + + CPU_ZERO(&covered_numa_nodes); + + for (int32_t i = 0; i < CPU_SETSIZE; ++i) { + CPU_ZERO(&covered_cores[i]); + } + + g_n_numa_nodes = 0; + g_n_cores = 0; + g_n_os_cpus = 0; + g_n_cpus = 0; + char path[1000]; + bool no_numa = false; + + // Loop through all CPUs in the system by looping through OS CPU indexes. + + for (g_n_os_cpus = 0; g_n_os_cpus < CPU_SETSIZE; ++g_n_os_cpus) { + cf_detail(CF_HARDWARE, "querying OS CPU index %hu", g_n_os_cpus); + + // Let's look at the CPU's package. + + snprintf(path, sizeof(path), + "/sys/devices/system/cpu/cpu%hu/topology/physical_package_id", + g_n_os_cpus); + os_package_index i_os_package; + file_res res = read_index(path, &i_os_package); + + // The entry doesn't exist. We've processed all available CPUs. 
Stop + // looping through the CPUs. + + if (res == FILE_RES_NOT_FOUND) { + break; + } + + if (res != FILE_RES_OK) { + cf_crash(CF_HARDWARE, "error while reading OS package index from %s", path); + } + + cf_detail(CF_HARDWARE, "OS package index is %hu", i_os_package); + + // Only consider CPUs that are actually in use. + + if (!CPU_ISSET(g_n_os_cpus, &g_os_cpus_online)) { + cf_detail(CF_HARDWARE, "OS CPU index %hu is offline", g_n_os_cpus); + continue; + } + + // Let's look at the CPU's underlying core. In Hyper Threading systems, + // two (logical) CPUs share one (physical) core. + + snprintf(path, sizeof(path), + "/sys/devices/system/cpu/cpu%hu/topology/core_id", + g_n_os_cpus); + os_core_index i_os_core; + res = read_index(path, &i_os_core); + + if (res != FILE_RES_OK) { + cf_crash(CF_HARDWARE, "error while reading OS core index from %s", path); + } + + cf_detail(CF_HARDWARE, "OS core index is %hu", i_os_core); + + // Consider a core when we see it for the first time. In other words, we + // consider the first Hyper Threading peer of each core to be that core. + + bool new_core; + + if (CPU_ISSET(i_os_core, &covered_cores[i_os_package])) { + cf_detail(CF_HARDWARE, "core (%hu, %hu) already covered", i_os_core, i_os_package); + new_core = false; + } + else { + cf_detail(CF_HARDWARE, "core (%hu, %hu) is new", i_os_core, i_os_package); + new_core = true; + CPU_SET(i_os_core, &covered_cores[i_os_package]); + } + + // Identify the NUMA node of the current CPU. We simply look for the + // current CPU's topology info subtree in each NUMA node's subtree. + // Specifically, we look for the current CPU's "core_id" entry. 
+ + os_numa_node_index i_os_numa_node; + + for (i_os_numa_node = 0; i_os_numa_node < CPU_SETSIZE; ++i_os_numa_node) { + snprintf(path, sizeof(path), + "/sys/devices/system/cpu/cpu%hu/node%hu/cpu%hu/topology/core_id", + g_n_os_cpus, i_os_numa_node, g_n_os_cpus); + uint16_t dummy; + res = read_index(path, &dummy); + + // We found the NUMA node that has the current CPU in its subtree. + + if (res == FILE_RES_OK) { + break; + } + + if (res != FILE_RES_NOT_FOUND) { + cf_crash(CF_HARDWARE, "error while reading core number from %s", path); + } + } + + // Some Docker installations seem to not have any NUMA information + // in /sys. In this case, assume a system with a single NUMA node. + + if (i_os_numa_node == CPU_SETSIZE) { + cf_detail(CF_HARDWARE, "OS CPU index %hu does not have a NUMA node", g_n_os_cpus); + no_numa = true; + i_os_numa_node = 0; + } + + cf_detail(CF_HARDWARE, "OS NUMA node index is %hu", i_os_numa_node); + + // Again, just like with cores, we consider a NUMA node when we encounter + // it for the first time. + + bool new_numa_node; + + if (CPU_ISSET(i_os_numa_node, &covered_numa_nodes)) { + cf_detail(CF_HARDWARE, "OS NUMA node index %hu already covered", i_os_numa_node); + new_numa_node = false; + } + else { + cf_detail(CF_HARDWARE, "OS NUMA node index %hu is new", i_os_numa_node); + new_numa_node = true; + CPU_SET(i_os_numa_node, &covered_numa_nodes); + + // For now, we only support a 64-bit bitmask (= one uint64_t). + + if (i_os_numa_node >= 64) { + cf_crash(CF_HARDWARE, "OS NUMA node index %hu too high", i_os_numa_node); + } + } + + // Now we know that the CPU is online and we know, whether it is in a newly + // seen core (new_core) and/or a newly seen NUMA node (new_numa_node). 
+ + cf_topo_numa_node_index i_numa_node; + + if (new_numa_node) { + i_numa_node = g_n_numa_nodes; + ++g_n_numa_nodes; + os_numa_node_index_to_numa_node_index[i_os_numa_node] = i_numa_node; + g_numa_node_index_to_os_numa_node_index[i_numa_node] = i_os_numa_node; + cf_detail(CF_HARDWARE, "OS NUMA node index %hu -> new NUMA node index %hu", + i_os_numa_node, i_numa_node); + } + else { + i_numa_node = os_numa_node_index_to_numa_node_index[i_os_numa_node]; + cf_detail(CF_HARDWARE, "OS NUMA node index %hu -> existing NUMA node index %hu", + i_os_numa_node, i_numa_node); + } + + cf_detail(CF_HARDWARE, "OS CPU index %hu on NUMA node index %hu", g_n_os_cpus, i_numa_node); + CPU_SET(g_n_os_cpus, &g_numa_node_os_cpus_online[i_numa_node]); + + // If we're in NUMA mode and the CPU isn't on the NUMA mode that we're + // running on, then ignore the CPU. + + if (a_numa_node != INVALID_INDEX && a_numa_node != i_numa_node) { + cf_detail(CF_HARDWARE, "skipping unwanted NUMA node index %hu", i_numa_node); + continue; + } + + // If the CPU is a new core, then map a new core index to the OS CPU index. + + if (new_core) { + g_core_index_to_os_cpu_index[g_n_cores] = g_n_os_cpus; + cf_detail(CF_HARDWARE, "core index %hu -> OS CPU index %hu", g_n_cores, g_n_os_cpus); + ++g_n_cores; + } + + // Map the OS CPU index to a new CPU index and vice versa. 
+ + g_os_cpu_index_to_cpu_index[g_n_os_cpus] = g_n_cpus; + g_cpu_index_to_os_cpu_index[g_n_cpus] = g_n_os_cpus; + + cf_detail(CF_HARDWARE, "OS CPU index %hu <-> CPU index %hu", g_n_os_cpus, g_n_cpus); + ++g_n_cpus; + } + + if (g_n_os_cpus == CPU_SETSIZE) { + cf_crash(CF_HARDWARE, "too many CPUs"); + } + + if (a_numa_node != INVALID_INDEX && no_numa) { + cf_warning(CF_HARDWARE, "no NUMA information found in /sys"); + } + + g_i_numa_node = a_numa_node; +} + +static void +pin_to_numa_node(cf_topo_numa_node_index a_numa_node) +{ + cf_info(CF_HARDWARE, "pinning to NUMA node %hu", a_numa_node); + + // Move the current thread (and all of its future descendants) to the CPUs + // on the selected NUMA node. + + cpu_set_t cpu_set; + CPU_ZERO(&cpu_set); + + for (cf_topo_cpu_index i_cpu = 0; i_cpu < g_n_cpus; ++i_cpu) { + cf_topo_os_cpu_index i_os_cpu = g_cpu_index_to_os_cpu_index[i_cpu]; + CPU_SET(i_os_cpu, &cpu_set); + } + + char buff[1000]; + mask_to_string(&cpu_set, buff, sizeof(buff)); + cf_detail(CF_HARDWARE, "NUMA node %hu CPU mask: %s", a_numa_node, buff); + + if (sched_setaffinity(0, sizeof(cpu_set), &cpu_set) < 0) { + cf_crash(CF_HARDWARE, "error while pinning thread to NUMA node %hu: %d (%s)", + a_numa_node, errno, cf_strerror(errno)); + } + + // Force future memory allocations to the selected NUMA node. + + os_numa_node_index i_os_numa_node = g_numa_node_index_to_os_numa_node_index[a_numa_node]; + uint64_t to_mask = 1UL << i_os_numa_node; + cf_detail(CF_HARDWARE, "NUMA node mask (to): %016" PRIx64, to_mask); + + // Unlike select(), we have to pass "number of valid bits + 1". + set_mempolicy_safe(MPOL_BIND, &to_mask, 65); + + // Make sure we can migrate shared memory that we later attach and map. 
+ cf_process_holdcap(); +} + +static uint32_t +pick_random(uint32_t limit) +{ + static __thread uint64_t state = 0; + + if (state == 0) { + state = (uint64_t)syscall(SYS_gettid); + } + + state = state * 6364136223846793005 + 1; + + if (state == 0) { + state = 1; + } + + return (uint32_t)((state >> 32) % limit); +} + +uint16_t +cf_topo_count_cores(void) +{ + return g_n_cores; +} + +uint16_t +cf_topo_count_cpus(void) +{ + return g_n_cpus; +} + +static cf_topo_cpu_index +os_cpu_index_to_cpu_index(cf_topo_os_cpu_index i_os_cpu) +{ + cf_detail(CF_HARDWARE, "translating OS CPU index %hu", i_os_cpu); + + if (i_os_cpu >= g_n_os_cpus) { + cf_crash(CF_HARDWARE, "invalid OS CPU index %hu", i_os_cpu); + } + + cf_topo_cpu_index i_cpu = g_os_cpu_index_to_cpu_index[i_os_cpu]; + + if (i_cpu == INVALID_INDEX) { + cf_detail(CF_HARDWARE, "foreign OS CPU index %hu", i_os_cpu); + } + else { + cf_detail(CF_HARDWARE, "CPU index is %hu", i_cpu); + } + + return i_cpu; +} + +cf_topo_cpu_index +cf_topo_current_cpu(void) +{ + cf_detail(CF_HARDWARE, "getting current OS CPU index"); + int32_t os = sched_getcpu(); + + if (os < 0) { + cf_crash(CF_HARDWARE, "error while getting OS CPU index: %d (%s)", + errno, cf_strerror(errno)); + } + + return os_cpu_index_to_cpu_index((cf_topo_os_cpu_index)os); +} + +cf_topo_cpu_index +cf_topo_socket_cpu(const cf_socket *sock) +{ + cf_detail(CF_HARDWARE, "determining CPU index for socket FD %d", CSFD(sock)); + + int32_t os; + socklen_t len = sizeof(os); + + if (getsockopt(sock->fd, SOL_SOCKET, SO_INCOMING_CPU, &os, &len) < 0) { + cf_crash(CF_SOCKET, "error while determining incoming OS CPU index: %d (%s)", + errno, cf_strerror(errno)); + } + + cf_detail(CF_HARDWARE, "OS CPU index is %d", os); + cf_topo_cpu_index i_cpu = os_cpu_index_to_cpu_index((cf_topo_os_cpu_index)os); + + // 1. The incoming connection was handled on the wrong NUMA node. In this case, + // pick a random CPU on the correct NUMA node. 
+ + if (i_cpu == INVALID_INDEX) { + i_cpu = (cf_topo_cpu_index)pick_random(g_n_cpus); + cf_detail(CF_HARDWARE, "picking random CPU index %hu", i_cpu); + return i_cpu; + } + + // 2. The incoming connection was handled on a CPU that doesn't get any NIC + // interrupts. This should not happen for connections from other machines, but + // it does happen for connections from the local machine, because they don't + // go through the NIC hardware. In this case, pick a random CPU. + + if (i_cpu >= g_n_irq_cpus) { + i_cpu = (cf_topo_cpu_index)pick_random(g_n_cpus); + cf_detail(CF_HARDWARE, "randomizing unexpected CPU index >%hu to %hu", + g_n_irq_cpus - 1, i_cpu); + return i_cpu; + } + + // 3. Otherwise, redistribute. The first g_n_irq_cpus CPUs out of a total of + // g_n_cpus CPUs get NIC interrupts. Suppose we have 2 NIC queues and 8 CPUs, + // i.e., that g_n_irq_cpus == 2 and g_n_cpus == 8. We want to redistribute + // evenly across the 8 CPUs, i.e., each CPU should be picked with a probability + // of 0.125. + + // We're currently running on one of the 2 CPUs that get NIC interrupts, on + // either with a probability of p1 = 0.5. We want to stay on the current CPU + // with a probability of p2 = g_n_irq_cpus / g_n_cpus == 2 / 8 == 0.25, which + // yields the desired total probability of p1 * p2 = 0.5 * 0.25 = 0.125. + + if (pick_random(100000) < g_n_irq_cpus * (uint32_t)100000 / g_n_cpus) { + cf_detail(CF_HARDWARE, "staying on CPU index %hu", i_cpu); + return i_cpu; + } + + // 4. Otherwise, if we switch CPUs, then we jump to a CPU that doesn't receive + // NIC interrupts, i.e., one of the remaining 6 CPUs [2 .. 8] in our example. + // This reaches each CPU with a probability of (1 - p2) / 6 = 0.125. 
+ + i_cpu = (cf_topo_cpu_index)(g_n_irq_cpus + + pick_random((uint32_t)g_n_cpus - (uint32_t)g_n_irq_cpus)); + cf_detail(CF_HARDWARE, "redirecting to CPU index %hu", i_cpu); + return i_cpu; +} + +static void +pin_to_os_cpu(cf_topo_os_cpu_index i_os_cpu) +{ + cf_detail(CF_HARDWARE, "pinning to OS CPU index %hu", i_os_cpu); + + cpu_set_t cpu_set; + CPU_ZERO(&cpu_set); + CPU_SET(i_os_cpu, &cpu_set); + + if (sched_setaffinity(0, sizeof(cpu_set), &cpu_set) < 0) { + cf_crash(CF_HARDWARE, "error while pinning thread to OS CPU %hu: %d (%s)", + i_os_cpu, errno, cf_strerror(errno)); + } +} + +void +cf_topo_pin_to_core(cf_topo_core_index i_core) +{ + cf_detail(CF_HARDWARE, "pinning to core index %hu", i_core); + + if (i_core >= g_n_cores) { + cf_crash(CF_HARDWARE, "invalid core index %hu", i_core); + } + + pin_to_os_cpu(g_core_index_to_os_cpu_index[i_core]); +} + +void +cf_topo_pin_to_cpu(cf_topo_cpu_index i_cpu) +{ + cf_detail(CF_HARDWARE, "pinning to CPU index %hu", i_cpu); + + if (i_cpu >= g_n_cpus) { + cf_crash(CF_HARDWARE, "invalid CPU index %hu", i_cpu); + } + + pin_to_os_cpu(g_cpu_index_to_os_cpu_index[i_cpu]); +} + +static check_proc_res +check_proc(const char *name, int32_t argc, const char *argv[]) +{ + cf_detail(CF_HARDWARE, "looking for process %s", name); + + for (int32_t i = 0; i < argc; ++i) { + cf_detail(CF_HARDWARE, "argv[%d]: %s", i, argv[i]); + } + + DIR *dir = opendir_safe("/proc"); + struct dirent ent; + char cmd[10000]; + size_t limit; + bool found = false; + + while (readdir_safe(dir, &ent) >= 0) { + bool numeric = true; + + for (int32_t i = 0; ent.d_name[i] != 0; ++i) { + if (!isascii(ent.d_name[i]) || !isdigit(ent.d_name[i])) { + numeric = false; + break; + } + } + + if (!numeric) { + continue; + } + + char path[100]; + snprintf(path, sizeof(path), "/proc/%s/cmdline", ent.d_name); + + limit = sizeof(cmd) - 1; + file_res rfr = read_file(path, cmd, &limit); + + // Can legitimately happen, if the process has exited in the meantime. 
+ if (rfr == FILE_RES_NOT_FOUND) { + continue; + } + + if (rfr == FILE_RES_ERROR) { + cf_crash(CF_HARDWARE, "error while reading file %s", path); + } + + if (limit > 0 && cmd[limit - 1] != 0) { + cmd[limit] = 0; + } + + const char *name2 = strrchr(cmd, '/'); + + if (name2 != NULL) { + ++name2; + } + else { + name2 = cmd; + } + + if (strcmp(name2, name) == 0) { + found = true; + break; + } + } + + closedir_safe(dir); + + if (!found) { + cf_detail(CF_HARDWARE, "process %s absent", name); + return CHECK_PROC_ABSENT; + } + + cf_detail(CF_HARDWARE, "process %s is %s", name, cmd); + + if (argc > 0) { + int32_t i_arg = 0; + + for (size_t off = strlen(cmd) + 1; off < limit; off += strlen(cmd + off) + 1) { + cf_detail(CF_HARDWARE, "checking argument %s against %s", cmd + off, argv[i_arg]); + + if (strcmp(cmd + off, argv[i_arg]) == 0) { + ++i_arg; + + if (i_arg >= argc) { + break; + } + } + else { + i_arg = 0; + } + } + + if (i_arg >= argc) { + cf_detail(CF_HARDWARE, "process %s present with argument", name); + return CHECK_PROC_PRESENT; + } + } + + cf_detail(CF_HARDWARE, "process %s present", name); + return CHECK_PROC_PRESENT_NO_ARG; +} + +static uint16_t +interface_queues(const char *if_name, const char *format) +{ + uint16_t n_queues = 0; + + while (true) { + char path[1000]; + snprintf(path, sizeof(path), format, if_name, n_queues); + cf_detail(CF_HARDWARE, "checking for path %s", path); + + if (!path_exists(path)) { + cf_detail(CF_HARDWARE, "path not found"); + break; + } + + ++n_queues; + } + + cf_assert(n_queues != 0, CF_HARDWARE, "interface %s has no queues", if_name); + + return n_queues; +} + +static uint16_t +interface_rx_queues(const char *if_name) +{ + cf_detail(CF_HARDWARE, "getting receive queues for interface %s", if_name); + return interface_queues(if_name, "/sys/class/net/%s/queues/rx-%hu"); +} + +static uint16_t +interface_tx_queues(const char *if_name) +{ + cf_detail(CF_HARDWARE, "getting transmit queues for interface %s", if_name); + return 
interface_queues(if_name, "/sys/class/net/%s/queues/tx-%hu"); +} + +static int +comp_irq_number(const void *lhs, const void *rhs) +{ + return *(irq_number *)lhs - *(irq_number *)rhs; +} + +static void +interface_irqs(const char *if_name, irq_list *irqs) +{ + cf_detail(CF_HARDWARE, "getting IRQs for interface %s", if_name); + + DIR *dir = opendir_safe("/sys/bus/pci/devices"); + struct dirent ent; + char path[PATH_MAX]; + bool found = false; + + while (readdir_safe(dir, &ent) >= 0) { + snprintf(path, sizeof(path), "/sys/bus/pci/devices/%s/net/%s/ifindex", + ent.d_name, if_name); + bool exists = path_exists(path); + + if (!exists) { + for (int32_t i = 0; i < 100; ++i) { + snprintf(path, sizeof(path), "/sys/bus/pci/devices/%s/virtio%d/net/%s/ifindex", + ent.d_name, i, if_name); + exists = path_exists(path); + + if (exists) { + break; + } + } + } + + if (!exists) { + continue; + } + + snprintf(path, sizeof(path), "/sys/bus/pci/devices/%s/msi_irqs", ent.d_name); + + if (!path_exists(path)) { + cf_crash(CF_HARDWARE, "interface %s does not support MSIs", if_name); + } + + cf_detail(CF_HARDWARE, "interface %s is %s", if_name, ent.d_name); + found = true; + break; + } + + closedir_safe(dir); + + if (!found) { + cf_crash(CF_HARDWARE, "interface %s does not have a PCI device entry", if_name); + } + + dir = opendir_safe(path); + int32_t count = 0; + irq_number irq_nums[CPU_SETSIZE]; + + while (readdir_safe(dir, &ent) >= 0) { + char *end; + uint64_t tmp = strtoul(ent.d_name, &end, 10); + + if (*end != 0 || tmp > 65535) { + cf_crash(CF_HARDWARE, "invalid IRQ number %s in %s", ent.d_name, path); + } + + if (count >= CPU_SETSIZE) { + cf_crash(CF_HARDWARE, "too many IRQs in %s", path); + } + + cf_detail(CF_HARDWARE, "interface %s has IRQ %hu", if_name, (irq_number)tmp); + irq_nums[count] = (irq_number)tmp; + ++count; + } + + closedir_safe(dir); + + // Sort IRQ numbers, so that RX and TX interrupts pair up nicely when + // populating irqs->irqs[]. 
+ qsort(irq_nums, (size_t)count, sizeof(irq_number), comp_irq_number); + + char actions[count][100]; + memset(actions, 0, sizeof(actions)); + + FILE *fh = fopen("/proc/interrupts", "r"); + + if (fh == NULL) { + cf_crash(CF_HARDWARE, "error while opening /proc/interrupts"); + } + + int32_t line_no = 0; + char line[25000]; + + while (fgets(line, sizeof(line), fh) != NULL) { + ++line_no; + + if (line_no == 1) { + continue; + } + + int32_t i = 0; + + while (line[i] == ' ') { + ++i; + } + + irq_number irq_num = 0; + + while (line[i] >= '0' && line[i] <= '9') { + irq_num = (irq_number)(irq_num * 10 + line[i] - '0'); + ++i; + } + + if (line[i] != ':') { + continue; + } + + while (line[i] != 0 && line[i] != '\n') { + ++i; + } + + line[i] = 0; + + while (i >= 0 && line[i] != ' ') { + --i; + } + + char *action = line + i + 1; + + if (strlen(action) >= sizeof(actions[0])) { + cf_crash(CF_HARDWARE, "oversize action in line %d in /proc/interrupts: %s", + line_no, action); + } + + cf_detail(CF_HARDWARE, "IRQ %hu has action %s", irq_num, action); + + for (i = 0; i < count; ++i) { + if (irq_nums[i] == irq_num) { + int32_t m = 0; + + // Remove any digits, so that the queue index goes away and all queues + // look alike. Also, normalize to lower case. For example: + // + // "i40e-em1-TxRx-0" -> "ie-em-txrx-" + // "i40e-em1-TxRx-1" -> "ie-em-txrx-" + // ... + + for (int32_t k = 0; action[k] != 0; ++k) { + if (action[k] < '0' || action[k] > '9') { + actions[i][m] = (char)tolower((uint8_t)action[k]); + ++m; + } + } + + actions[i][m] = 0; + cf_detail(CF_HARDWARE, "action pattern is %s", actions[i]); + break; + } + } + } + + fclose(fh); + + int32_t n_groups = 0; + int32_t group_sizes[count]; + int32_t group_extra[count]; + int32_t action_groups[count]; + int32_t inactive_group = -1; + + for (int32_t i = 0; i < count; ++i) { + group_sizes[i] = 0; + group_extra[i] = 0; + action_groups[i] = -1; + } + + // Group by action pattern. 
+ + for (int32_t i = 0; i < count; ++i) { + if (action_groups[i] >= 0) { + continue; + } + + action_groups[i] = n_groups; + ++group_sizes[n_groups]; + + if (actions[i][0] == 0) { + inactive_group = n_groups; + cf_detail(CF_HARDWARE, "inactive IRQs in new group %d", n_groups); + } + else { + cf_detail(CF_HARDWARE, "new group %d: %s", n_groups, actions[i]); + } + + for (int32_t k = i + 1; k < count; ++k) { + if (strcmp(actions[i], actions[k]) == 0) { + action_groups[k] = n_groups; + ++group_sizes[n_groups]; + } + } + + cf_detail(CF_HARDWARE, "group %d has %d member(s)", n_groups, group_sizes[n_groups]); + + // Prefer groups whose action patterns have "rx", "tx", "input", or "output" in them. + + if (strstr(actions[i], "rx") != NULL || strstr(actions[i], "tx") != NULL || + strstr(actions[i], "input") != NULL || strstr(actions[i], "output") != NULL) { + cf_detail(CF_HARDWARE, "preferring group %d", n_groups); + group_extra[n_groups] = 1; + } + + ++n_groups; + } + + // Find the two largest groups. + + int32_t a = -1; + int32_t b = -1; + + for (int32_t i = 0; i < n_groups; ++i) { + if (i != inactive_group && + (a < 0 || group_sizes[i] + group_extra[i] > group_sizes[a] + group_extra[a])) { + a = i; + } + } + + if (a < 0) { + cf_crash(CF_HARDWARE, "no active interrupts for interface %s", if_name); + } + + for (int32_t i = 0; i < n_groups; ++i) { + if (i != inactive_group && i != a && + (b < 0 || group_sizes[i] + group_extra[i] > group_sizes[b] + group_extra[b])) { + b = i; + } + } + + cf_detail(CF_HARDWARE, "largest groups: %d, %d", a, b); + + // If the two largest groups have an equal number of members, then we assume + // that it's a NIC with separate RX and TX queue IRQs. + + if (b >= 0 && group_sizes[a] == group_sizes[b]) { + cf_detail(CF_HARDWARE, "assuming %d separate RX and TX queue IRQ(s)", + group_sizes[a] + group_sizes[b]); + int32_t ia = 0; + int32_t ib = 0; + + // Make RX and TX queue IRQs take turns in the IRQ list. 
+ + for (int32_t k = 0; k < count; ++k) { + if (action_groups[k] == a) { + irqs->irqs[ia * 2] = irq_nums[k]; + cf_detail(CF_HARDWARE, "irqs[%d] = %hu", ia * 2, irq_nums[k]); + ++ia; + } + else if (action_groups[k] == b) { + irqs->irqs[ib * 2 + 1] = irq_nums[k]; + cf_detail(CF_HARDWARE, "irqs[%d] = %hu", ib * 2 + 1, irq_nums[k]); + ++ib; + } + } + + irqs->n_irqs = (uint16_t)(group_sizes[a] + group_sizes[b]); + + // Send pairs of two consecutive IRQs in the IRQ list (= the RX and the + // TX queue IRQ of a given NIC queue pair) to the same CPU. + + irqs->per_cpu = 2; + return; + } + + // Otherwise, we assume that it's a NIC with combined RX and TX queue IRQs + // and that the largest group contains these IRQs. + + cf_detail(CF_HARDWARE, "assuming %d combined RX and TX queue IRQ(s)", group_sizes[a]); + int32_t ia = 0; + + for (int32_t k = 0; k < count; ++k) { + if (action_groups[k] == a) { + irqs->irqs[ia] = irq_nums[k]; + cf_detail(CF_HARDWARE, "irqs[%d] = %hu", ia, irq_nums[k]); + ++ia; + } + } + + irqs->n_irqs = (uint16_t)group_sizes[a]; + + // Send each IRQ in the IRQ list to a different CPU. 
+ + irqs->per_cpu = 1; +} + +static void +pin_irq(irq_number i_irq, cf_topo_os_cpu_index i_os_cpu) +{ + cf_detail(CF_HARDWARE, "pinning IRQ number %hu to OS CPU index %hu", i_irq, i_os_cpu); + + cpu_set_t mask; + CPU_ZERO(&mask); + CPU_SET(i_os_cpu, &mask); + + char mask_str[200]; + mask_to_string(&mask, mask_str, sizeof(mask_str)); + cf_detail(CF_HARDWARE, "CPU mask is %s", mask_str); + + char path[1000]; + snprintf(path, sizeof(path), "/proc/irq/%hu/smp_affinity", i_irq); + + if (write_file(path, mask_str, strlen(mask_str)) != FILE_RES_OK) { + cf_crash(CF_HARDWARE, "error while pinning IRQ, path %s", path); + } +} + +static cf_topo_os_cpu_index +fix_os_cpu_index(cf_topo_os_cpu_index i_os_cpu, const cpu_set_t *online) +{ + while (true) { + if (i_os_cpu >= g_n_os_cpus) { + i_os_cpu = 0; + } + + if (CPU_ISSET(i_os_cpu, online)) { + return i_os_cpu; + } + + ++i_os_cpu; + } +} + +static void +config_steering(const char *format, const char *if_name, uint16_t n_queues, bool enable) +{ + uint16_t i_queue; + cpu_set_t masks[n_queues]; + + for (i_queue = 0; i_queue < n_queues; ++i_queue) { + CPU_ZERO(&masks[i_queue]); + } + + if (enable) { + i_queue = 0; + + for (cf_topo_os_cpu_index i_os_cpu = 0; i_os_cpu < g_n_os_cpus; ++i_os_cpu) { + if (CPU_ISSET(i_os_cpu, &g_os_cpus_online)) { + CPU_SET(i_os_cpu, &masks[i_queue % n_queues]); + ++i_queue; + } + } + } + + for (i_queue = 0; i_queue < n_queues; ++i_queue) { + char path[1000]; + snprintf(path, sizeof(path), format, if_name, i_queue); + cf_detail(CF_HARDWARE, "path is %s", path); + + char mask_str[200]; + mask_to_string(&masks[i_queue], mask_str, sizeof(mask_str)); + cf_detail(CF_HARDWARE, "CPU mask is %s", mask_str); + + write_file_safe(path, mask_str, strlen(mask_str)); + } +} + +static void +enable_xps(const char *if_name) +{ + cf_detail(CF_HARDWARE, "enabling XPS for interface %s", if_name); + uint16_t n_queues = interface_tx_queues(if_name); + config_steering("/sys/class/net/%s/queues/tx-%hu/xps_cpus", if_name, 
n_queues, true); +} + +static void +disable_rps(const char *if_name) +{ + cf_detail(CF_HARDWARE, "disabling RPS for interface %s", if_name); + uint16_t n_queues = interface_rx_queues(if_name); + config_steering("/sys/class/net/%s/queues/rx-%hu/rps_cpus", if_name, n_queues, false); +} + +static void +config_rfs(const char *if_name, bool enable) +{ + cf_detail(CF_HARDWARE, "%s RFS for interface %s", enable ? "enabling" : "disabling", if_name); + + uint16_t n_queues = interface_rx_queues(if_name); + uint32_t sz_glob = enable ? 1000000 : 0; + uint32_t sz_queue = sz_glob / n_queues; + + cf_detail(CF_HARDWARE, "global size is %u, per-queue size is %u", sz_glob, sz_queue); + + char string[50]; + snprintf(string, sizeof(string), "%u", sz_glob); + write_file_safe("/proc/sys/net/core/rps_sock_flow_entries", string, strlen(string)); + + snprintf(string, sizeof(string), "%u", sz_queue); + + for (uint16_t i_queue = 0; i_queue < n_queues; ++i_queue) { + char path[1000]; + snprintf(path, sizeof(path), "/sys/class/net/%s/queues/rx-%hu/rps_flow_cnt", + if_name, i_queue); + write_file_safe(path, string, strlen(string)); + } +} + +static void +enable_coalescing(const char *if_name) +{ + cf_detail(CF_HARDWARE, "enabling interrupt coalescing for interface %s", if_name); + int32_t sock = socket(AF_INET, SOCK_DGRAM, 0); + + if (sock < 0) { + cf_crash(CF_HARDWARE, "error while create ethtool socket: %d (%s)", errno, cf_strerror(errno)); + } + + struct ifreq req; + memset(&req, 0, sizeof(req)); + + if (strlen(if_name) > IFNAMSIZ - 1) { + cf_crash(CF_HARDWARE, "invalid interface name %s", if_name); + } + + strcpy(req.ifr_name, if_name); + struct ethtool_coalesce coal = { .cmd = ETHTOOL_GCOALESCE }; + req.ifr_data = &coal; + + if (ioctl(sock, SIOCETHTOOL, &req) < 0) { + if (errno == EOPNOTSUPP) { + cf_detail(CF_HARDWARE, "interface %s does not support ETHTOOL_GCOALESCE", if_name); + goto cleanup1; + } + + cf_crash(CF_HARDWARE, "error while getting interface settings: %d (%s)", + errno, 
cf_strerror(errno)); + } + + cf_detail(CF_HARDWARE, "current interface settings: adaptive = %u, usecs = %u", + coal.use_adaptive_rx_coalesce, coal.rx_coalesce_usecs); + + if (coal.use_adaptive_rx_coalesce != 0 || coal.rx_coalesce_usecs >= 100) { + cf_detail(CF_HARDWARE, "leaving interface settings untouched"); + goto cleanup1; + } + + cf_detail(CF_HARDWARE, "adjusting interface settings"); + coal = (struct ethtool_coalesce){ + .cmd = ETHTOOL_SCOALESCE, + .rx_coalesce_usecs = 100 // .1 ms for now, which adds .05 ms to a request on average. + }; + + if (ioctl(sock, SIOCETHTOOL, &req) < 0) { + if (errno == EOPNOTSUPP) { + cf_detail(CF_HARDWARE, "interface %s does not support ETHTOOL_SCOALESCE", if_name); + goto cleanup1; + } + + cf_crash(CF_HARDWARE, "error while adjusting interface settings: %d (%s)", + errno, cf_strerror(errno)); + } + +cleanup1: + CF_NEVER_FAILS(close(sock)); +} + +static void +check_irqbalance(void) +{ + cf_detail(CF_HARDWARE, "checking irqbalance"); + + check_proc_res res = check_proc("irqbalance", 1, (const char *[]){ + "--policyscript=" POLICY_SCRIPT + }); + + if (res == CHECK_PROC_PRESENT_NO_ARG) { + res = check_proc("irqbalance", 2, (const char *[]){ + "--policyscript", + POLICY_SCRIPT + }); + } + + if (res == CHECK_PROC_PRESENT_NO_ARG) { + res = check_proc("irqbalance", 1, (const char *[]){ + "-l" POLICY_SCRIPT + }); + } + + if (res == CHECK_PROC_PRESENT_NO_ARG) { + res = check_proc("irqbalance", 2, (const char *[]){ + "-l", + POLICY_SCRIPT + }); + } + + if (res == CHECK_PROC_PRESENT_NO_ARG) { + cf_crash_nostack(CF_HARDWARE, "please disable irqbalance or run it with the Aerospike policy script, /etc/aerospike/irqbalance-ban.sh"); + } +} + +static void +config_interface(const char *if_name, bool rfs, irq_list *irqs) +{ + uint16_t n_irq_cpus = 0; + cf_topo_os_cpu_index i_os_cpu = fix_os_cpu_index(0, &g_os_cpus_online); + + for (uint16_t i = 0; i < irqs->n_irqs; ++i) { + pin_irq(irqs->irqs[i], i_os_cpu); + + if (i % irqs->per_cpu == 
irqs->per_cpu - 1) { + ++n_irq_cpus; + i_os_cpu = fix_os_cpu_index((cf_topo_os_cpu_index)(i_os_cpu + 1), &g_os_cpus_online); + } + } + + cf_detail(CF_HARDWARE, "interface %s with %hu RX interrupt(s)", if_name, n_irq_cpus); + + if (g_n_irq_cpus == 0) { + g_n_irq_cpus = n_irq_cpus; + } + else if (n_irq_cpus != g_n_irq_cpus) { + cf_crash(CF_HARDWARE, "interface %s with inconsistent number of RX interrupts: %hu vs. %hu", + if_name, n_irq_cpus, g_n_irq_cpus); + } + + disable_rps(if_name); + config_rfs(if_name, rfs); + enable_xps(if_name); + + // Redistributing packets with RFS causes inter-CPU interrupts, which increases + // the interrupt load on the machine. For low-end systems, make sure that + // interrupt coalescing is enabled. + // + // We consider a machine low-end, if we handle interrupts on 25% or less of the + // available CPUs (i.e., if the number of NIC queues is 25% or less of the number + // of available CPUs) and it has fewer than 4 NIC queues. + // + // Better (i.e., NUMA) machines typically come with adaptive interrupt coalescing + // enabled by default. That's why we only do this here and not in the NUMA case. + + if (rfs && n_irq_cpus <= g_n_cpus / 4 && n_irq_cpus < 4) { + enable_coalescing(if_name); + } +} + +static void +config_interface_numa(const char *if_name, irq_list *irqs) +{ + uint16_t n_irq_cpus = 0; + cf_topo_os_cpu_index i_os_cpu[g_n_numa_nodes]; + uint16_t i_numa_node; + + for (i_numa_node = 0; i_numa_node < g_n_numa_nodes; ++i_numa_node) { + i_os_cpu[i_numa_node] = fix_os_cpu_index(0, &g_numa_node_os_cpus_online[i_numa_node]); + } + + i_numa_node = 0; + + // This configures the IRQs for all NUMA nodes. If multiple asd processes are + // running, each process does this, but each does it identically. Hence there + // isn't any conflict. 
+ + for (uint16_t i = 0; i < irqs->n_irqs; ++i) { + char mask_str[200]; + mask_to_string(&g_numa_node_os_cpus_online[i_numa_node], mask_str, sizeof(mask_str)); + cf_detail(CF_HARDWARE, "NUMA node index %hu CPU mask is %s", i_numa_node, mask_str); + + pin_irq(irqs->irqs[i], i_os_cpu[i_numa_node]); + + if (i % irqs->per_cpu == irqs->per_cpu - 1) { + // Only count CPUs on our NUMA node. + + if (i_numa_node == g_i_numa_node) { + ++n_irq_cpus; + } + + i_os_cpu[i_numa_node] = + fix_os_cpu_index((cf_topo_os_cpu_index)(i_os_cpu[i_numa_node] + 1), + &g_numa_node_os_cpus_online[i_numa_node]); + i_numa_node = (uint16_t)((i_numa_node + 1) % g_n_numa_nodes); + } + } + + cf_detail(CF_HARDWARE, "interface %s with %hu RX interrupt(s) on NUMA node %hu", + if_name, n_irq_cpus, g_i_numa_node); + + if (g_n_irq_cpus == 0) { + g_n_irq_cpus = n_irq_cpus; + } + else if (n_irq_cpus != g_n_irq_cpus) { + cf_crash(CF_HARDWARE, "interface %s with inconsistent number of RX interrupts: %hu vs. %hu", + if_name, n_irq_cpus, g_n_irq_cpus); + } + + disable_rps(if_name); + config_rfs(if_name, true); + enable_xps(if_name); +} + +static void +optimize_interface(const char *if_name) +{ + cf_detail(CF_HARDWARE, "optimizing interface %s", if_name); + uint16_t n_queues = interface_rx_queues(if_name); + irq_list irqs; + interface_irqs(if_name, &irqs); + + cf_info(CF_HARDWARE, "detected %hu NIC receive queue(s), %hu interrupt(s) for %s", + n_queues, irqs.n_irqs, if_name); + + // We either expect one interrupt per RX queue (shared with TX) or two + // interrupts per RX queue (one RX, one TX). 
+ + uint16_t n_irq_cpus = irqs.n_irqs / irqs.per_cpu; + + if (n_irq_cpus != n_queues) { + cf_crash(CF_HARDWARE, "suspicious NIC interrupt count %hu with %hu NIC receive queue(s)", + irqs.n_irqs, n_queues); + } + + if (n_irq_cpus == g_n_cpus) { + if (g_i_numa_node != INVALID_INDEX) { + cf_detail(CF_HARDWARE, "setting up for a fancy interface with NUMA"); + config_interface_numa(if_name, &irqs); + } + else { + cf_detail(CF_HARDWARE, "setting up for a fancy interface, no NUMA"); + config_interface(if_name, false, &irqs); + } + } + else { + if (n_irq_cpus <= g_n_cpus / 4) { + cf_warning(CF_HARDWARE, "%s has very few NIC queues; only %hu out of %hu CPUs handle(s) NIC interrupts", + if_name, n_irq_cpus, g_n_cpus); + } + + if (g_i_numa_node != INVALID_INDEX) { + cf_detail(CF_HARDWARE, "setting up for a lame interface with NUMA"); + config_interface_numa(if_name, &irqs); + } + else { + cf_detail(CF_HARDWARE, "setting up for a lame interface, no NUMA"); + config_interface(if_name, true, &irqs); + } + } +} + +static void +check_socket_cpu(void) +{ + int32_t fd = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP); + + if (fd < 0) { + cf_crash(CF_SOCKET, "error while creating UDP test socket: %d (%s)", + errno, cf_strerror(errno)); + } + + int32_t val = -1; + + if (setsockopt(fd, SOL_SOCKET, SO_INCOMING_CPU, &val, sizeof(val)) < 0) { + if (errno == ENOPROTOOPT) { + cf_crash_nostack(CF_SOCKET, "CPU pinning requires Linux kernel 3.19 or later"); + } + + cf_crash(CF_SOCKET, "error while testing for SO_INCOMING_CPU: %d (%s)", + errno, cf_strerror(errno)); + } + + CF_NEVER_FAILS(close(fd)); +} + +void +cf_topo_config(cf_topo_auto_pin auto_pin, cf_topo_numa_node_index a_numa_node, + const cf_addr_list *addrs) +{ + // Detect the NUMA topology. + + switch (auto_pin) { + case CF_TOPO_AUTO_PIN_NONE: + case CF_TOPO_AUTO_PIN_CPU: + detect(INVALID_INDEX); + break; + + case CF_TOPO_AUTO_PIN_NUMA: + detect(a_numa_node); + + // Clamp the given NUMA node index to the valid range. 
We can only do this + // after we know what g_n_numa_nodes is, which is initialized by the above + // call to detect(). + + if (a_numa_node >= g_n_numa_nodes) { + cf_topo_numa_node_index orig = a_numa_node; + a_numa_node = (cf_topo_numa_node_index)(a_numa_node % g_n_numa_nodes); + cf_detail(CF_HARDWARE, "invalid NUMA node index: %hu, clamping to %hu", orig, a_numa_node); + detect(a_numa_node); + } + + break; + } + + // If we don't do any pinning, then we're done after NUMA topology detection. + + if (auto_pin == CF_TOPO_AUTO_PIN_NONE) { + return; + } + + // Make sure that we are running on Linux 3.19 or later. + + check_socket_cpu(); + + // Reconfigure the client-facing network interface(s). + + check_irqbalance(); + + if (addrs->n_addrs == 0) { + cf_crash_nostack(CF_HARDWARE, "auto-pinning requires binding the service to one or more network interfaces"); + } + + for (uint32_t i = 0; i < addrs->n_addrs; ++i) { + const char *if_name = addrs->addrs[i]; + + if (!cf_inter_is_inter_name(if_name)) { + cf_crash_nostack(CF_HARDWARE, "auto-pinning requires binding the service to network interfaces; \"%s\" isn't a network interface", + if_name); + } + + char *exp_names[100]; + uint32_t n_exp = sizeof(exp_names) / sizeof(exp_names[0]); + cf_inter_expand_bond(if_name, exp_names, &n_exp); + + for (uint32_t k = 0; k < n_exp; ++k) { + optimize_interface(exp_names[k]); + cf_free(exp_names[k]); + } + } + + // If we don't do NUMA pinning, then we're done after setting up the + // client-facing network interface(s). + + if (auto_pin == CF_TOPO_AUTO_PIN_CPU) { + return; + } + + // NUMA pinning. + + pin_to_numa_node(a_numa_node); +} + +void +cf_topo_force_map_memory(const uint8_t *from, size_t size) +{ + if (g_i_numa_node == INVALID_INDEX || size == 0) { + return; + } + + cf_assert(from, CF_HARDWARE, "invalid cf_topo_force_map_memory() call"); + + // Read one byte per memory page to force otherwise lazy mapping. 
+ + const uint8_t *start = (const uint8_t *) + (((int64_t)from + (MEM_PAGE_SIZE - 1)) & -MEM_PAGE_SIZE); + const uint8_t *end = from + size; + const volatile uint8_t *p_byte; + + // In case 'from' was not page-aligned, take care of the partial page. + if (start > from) { + p_byte = from; + p_byte[0]; + } + + for (p_byte = start; p_byte < end; p_byte += MEM_PAGE_SIZE) { + p_byte[0]; + } +} + +void +cf_topo_migrate_memory(void) +{ + if (g_i_numa_node == INVALID_INDEX) { + return; + } + + // Migrate existing memory allocations to the selected NUMA node. + + os_numa_node_index i_os_numa_node = g_numa_node_index_to_os_numa_node_index[g_i_numa_node]; + uint64_t to_mask = 1UL << i_os_numa_node; + cf_detail(CF_HARDWARE, "NUMA node mask (to): %016" PRIx64, to_mask); + + uint64_t from_mask = 0; + + for (cf_topo_numa_node_index i_numa_node = 0; i_numa_node < g_n_numa_nodes; ++i_numa_node) { + i_os_numa_node = g_numa_node_index_to_os_numa_node_index[i_numa_node]; + from_mask |= 1u << i_os_numa_node; + } + + from_mask &= ~to_mask; + cf_detail(CF_HARDWARE, "NUMA node mask (from): %016" PRIx64, from_mask); + + if (from_mask != 0) { + cf_info(CF_HARDWARE, "migrating shared memory to local NUMA node - this may take a bit"); + // Unlike select(), we have to pass "number of valid bits + 1". + migrate_pages_safe(0, 65, &from_mask, &to_mask); + } + + // We had kept capabilities so we could do this migrate - revoke them now. 
+ cf_process_clearcap(); +} + +void +cf_topo_info(void) +{ + if (g_i_numa_node == INVALID_INDEX) { + cf_info(CF_HARDWARE, "detected %hu CPU(s), %hu core(s), %hu NUMA node(s)", + g_n_cpus, g_n_cores, g_n_numa_nodes); + } + else { + cf_info(CF_HARDWARE, "detected %hu CPU(s), %hu core(s) on NUMA node %hu of %hu", + g_n_cpus, g_n_cores, g_i_numa_node, g_n_numa_nodes); + } +} diff --git a/cf/src/hist.c b/cf/src/hist.c new file mode 100644 index 00000000..6e03563c --- /dev/null +++ b/cf/src/hist.c @@ -0,0 +1,305 @@ +/* + * hist.c + * + * Copyright (C) 2008-2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#include "hist.h" + +#include +#include +#include +#include +#include + +#include "citrusleaf/alloc.h" +#include "citrusleaf/cf_atomic.h" +#include "citrusleaf/cf_clock.h" + +#include "dynbuf.h" +#include "fault.h" + + +//========================================================== +// Histogram with logarithmic buckets. +// + +//------------------------------------------------ +// Create a histogram. There's no destroy(), but +// you can just cf_free() the histogram. 
+// +histogram* +histogram_create(const char *name, histogram_scale scale) +{ + cf_assert(name, AS_INFO, "null histogram name"); + cf_assert(strlen(name) < HISTOGRAM_NAME_SIZE, AS_INFO, + "bad histogram name %s", name); + cf_assert(scale >= 0 && scale < HIST_SCALE_MAX_PLUS_1, AS_INFO, + "bad histogram scale %d", scale); + + histogram *h = cf_malloc(sizeof(histogram)); + + strcpy(h->name, name); + memset((void *)&h->counts, 0, sizeof(h->counts)); + + // If histogram_insert_data_point() is called for a size or count histogram, + // the divide by 0 will crash - consider that a high-performance assert. + + switch (scale) { + case HIST_MILLISECONDS: + h->scale_tag = HIST_TAG_MILLISECONDS; + h->time_div = 1000 * 1000; + break; + case HIST_MICROSECONDS: + h->scale_tag = HIST_TAG_MICROSECONDS; + h->time_div = 1000; + break; + case HIST_SIZE: + h->scale_tag = HIST_TAG_SIZE; + h->time_div = 0; + break; + case HIST_COUNT: + h->scale_tag = HIST_TAG_COUNT; + h->time_div = 0; + break; + default: + cf_crash(AS_INFO, "%s: unrecognized histogram scale %d", name, scale); + break; + } + + return h; +} + +//------------------------------------------------ +// Clear a histogram. +// +void +histogram_clear(histogram *h) +{ + for (int i = 0; i < N_BUCKETS; i++) { + cf_atomic64_set(&h->counts[i], 0); + } +} + +//------------------------------------------------ +// Dump a histogram to log. +// +// Note - DO NOT change the log output format in +// this method - tools such as as_log_latency +// assume this format. 
+// +void +histogram_dump(histogram *h) +{ + int b; + uint64_t counts[N_BUCKETS]; + + for (b = 0; b < N_BUCKETS; b++) { + counts[b] = cf_atomic64_get(h->counts[b]); + } + + int i = N_BUCKETS; + int j = 0; + uint64_t total_count = 0; + + for (b = 0; b < N_BUCKETS; b++) { + if (counts[b] != 0) { + if (i > b) { + i = b; + } + + j = b; + total_count += counts[b]; + } + } + + char buf[100]; + int pos = 0; + int k = 0; + + buf[0] = '\0'; + + cf_info(AS_INFO, "histogram dump: %s (%lu total) %s", h->name, total_count, + h->scale_tag); + + for ( ; i <= j; i++) { + if (counts[i] == 0) { // print only non-zero columns + continue; + } + + int bytes = sprintf(buf + pos, " (%02d: %010lu)", i, counts[i]); + + if (bytes <= 0) { + cf_info(AS_INFO, "histogram dump error"); + return; + } + + pos += bytes; + + if ((k & 3) == 3) { // maximum of 4 printed columns per log line + cf_info(AS_INFO, "%s", buf); + pos = 0; + buf[0] = '\0'; + } + + k++; + } + + if (pos > 0) { + cf_info(AS_INFO, "%s", buf); + } +} + +//------------------------------------------------ +// BYTE_MSB[n] returns the position of the most +// significant bit. If no bits are set (n = 0) it +// returns 0. Otherwise the positions are 1 ... 8 +// from low to high, so e.g. 
// n = 13 returns 4:
//
//		bits:		0  0  0  0  1  1  0  1
//		position:	8  7  6  5 [4] 3  2  1
//
static const char BYTE_MSB[] = {
		0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
		5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
		6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
		6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,

		7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
		7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
		7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
		7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

		8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
		8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
		8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
		8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,

		8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
		8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
		8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
		8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8
};

//------------------------------------------------
// Returns the position of the most significant
// bit of n. Positions are 1 ... 64 from low to
// high, so:
//
//		n			msb(n)
//		--------	------
//		0			0
//		1			1
//		2 ... 3		2
//		4 ... 7		3
//		8 ... 15	4
//		etc.
//
static int
msb(uint64_t n)
{
	int shift = 0;

	// Drop whole low-order bytes until only the highest non-zero byte (or 0)
	// is left - every dropped byte accounts for 8 bit positions.
	while ((n >> 8) != 0) {
		n >>= 8;
		shift += 8;
	}

	// n is now below 256, so it indexes the per-byte table directly.
	return shift + (int)BYTE_MSB[n];
}

//------------------------------------------------
// Insert a time interval data point. The interval
// is time elapsed since start_ns, converted to
// milliseconds or microseconds as appropriate.
// Assumes start_ns was obtained via cf_getns()
// some time ago.
Generates a histogram with +// either: +// +// bucket millisecond range +// ------ ----------------- +// 0 0 to 1 (more exactly, 0.999999) +// 1 1 to 2 (more exactly, 1.999999) +// 2 2 to 4 (more exactly, 3.999999) +// 3 4 to 8 (more exactly, 7.999999) +// 4 8 to 16 (more exactly, 15.999999) +// etc. +// +// or: +// +// bucket microsecond range +// ------ ----------------- +// 0 0 to 1 (more exactly, 0.999) +// 1 1 to 2 (more exactly, 1.999) +// 2 2 to 4 (more exactly, 3.999) +// 3 4 to 8 (more exactly, 7.999) +// 4 8 to 16 (more exactly, 15.999) +// etc. +// +uint64_t +histogram_insert_data_point(histogram *h, uint64_t start_ns) +{ + uint64_t end_ns = cf_getns(); + uint64_t delta_t = (end_ns - start_ns) / h->time_div; + + int bucket = 0; + + if (delta_t != 0) { + bucket = msb(delta_t); + + if (start_ns > end_ns) { + // Either the clock went backwards, or wrapped. (Assume the former, + // since it takes ~580 years from 0 to wrap.) + cf_warning(AS_INFO, "%s - clock went backwards: start %lu end %lu", + h->name, start_ns, end_ns); + bucket = 0; + } + } + + cf_atomic64_incr(&h->counts[bucket]); + + return end_ns; +} + +//------------------------------------------------ +// Insert a raw data point. Generates a histogram +// with: +// +// bucket value range +// ------ ----------- +// 0 0 +// 1 1 +// 2 2, 3 +// 3 4 to 7 +// 4 8 to 15 +// etc. +// +void +histogram_insert_raw(histogram *h, uint64_t value) +{ + cf_atomic64_incr(&h->counts[msb(value)]); +} diff --git a/cf/src/hist_track.c b/cf/src/hist_track.c new file mode 100644 index 00000000..16177fd8 --- /dev/null +++ b/cf/src/hist_track.c @@ -0,0 +1,732 @@ +/* + * hist_track.c + * + * Copyright (C) 2012-2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. 
+ * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +/* + * A histogram with cached data. + */ + + +//========================================================== +// Includes +// + +#include "hist_track.h" + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "dynbuf.h" +#include "fault.h" +#include "hist.h" + + +//========================================================== +// Private "Class Members" +// + +//------------------------------------------------ +// Constants +// + +// More than one day of 10 second slices uses too much memory. +const uint32_t MAX_NUM_ROWS = (24 * 60 * 60) / 10; + +// Caching this few is legal but silly. +const uint32_t MIN_NUM_ROWS = 2; + +// Don't track/report thresholds with a larger bucket index than this. +// This corresponds to the 32 second threshold - that should be big enough. +#define MAX_BUCKET 15 + +// Don't track/report more than this many thresholds. +// This could in principle be less than (MAX_BUCKET + 1), e.g. it could be +// 4, and we could track buckets 0, 5, 10, 15. +#define MAX_NUM_COLS (MAX_BUCKET + 1) + +#define DEFAULT_NUM_COLS 3 +const uint32_t default_buckets[DEFAULT_NUM_COLS] = { 0, 3, 6 }; +// For our standard latency histograms, 0: >1ms, 3: >8ms, 6: >64ms. + +// No output line can be longer than this. 
// Size of the stack buffers used to format a header or slice line.
#define MAX_FORMATTED_ROW_SIZE 512
// Size of the stack buffer used to format the settings string.
#define MAX_FORMATTED_SETTINGS_SIZE 512

//------------------------------------------------
// Data
//

// One cached snapshot. Rows are variable-sized - overs[] has one element per
// tracked threshold column, so rows are addressed via get_row(), not rows[i].
typedef struct row_s {
	uint32_t	timestamp;	// time(NULL) when the row was saved
	uint64_t	total;		// sum of all bucket counts at that time
	uint64_t	overs[];	// per column: count above that column's bucket
} row;

struct cf_hist_track_s {
	// Base Histogram (must be first)
	histogram		hist;

	// Tracking-related
	row*			rows;			// row cache - NULL while not tracking
	size_t			row_size;		// bytes per row, including overs[]
	uint32_t		num_rows;		// capacity of the row cache
	uint32_t		write_row_n;	// rows saved so far - wraps via get_row()
	uint32_t		oldest_row_n;	// row number of the oldest cached row
	pthread_mutex_t	rows_lock;		// protects all tracking-related members
	uint32_t		slice_sec;		// minimum seconds between saved rows
	uint32_t		buckets[MAX_NUM_COLS];	// tracked bucket index per column
	uint32_t		num_cols;		// number of valid entries in buckets[]
};

//------------------------------------------------
// Function Declarations
//

static inline row* get_row(cf_hist_track* this, uint32_t row_n);
static uint32_t get_start_row_n(cf_hist_track* this, uint32_t back_sec);
static void output_header(cf_hist_track* this, uint32_t start_ts,
		uint32_t num_cols, cf_hist_track_info_format info_fmt,
		cf_dyn_buf* db_p);
static void output_slice(cf_hist_track* this, row* prev_row_p, row* row_p,
		uint32_t diff_sec, uint32_t num_cols,
		cf_hist_track_info_format info_fmt, cf_dyn_buf* db_p);
static int threshold_to_bucket(int threshold);
static int thresholds_to_buckets(const char* thresholds, uint32_t buckets[]);


//==========================================================
// Public API
//

//------------------------------------------------
// Create a cf_hist_track object.
+// +cf_hist_track* +cf_hist_track_create(const char* name, histogram_scale scale) +{ + cf_assert(name, AS_INFO, "null histogram name"); + cf_assert(strlen(name) < HISTOGRAM_NAME_SIZE, AS_INFO, + "bad histogram name %s", name); + cf_assert(scale >= 0 && scale < HIST_SCALE_MAX_PLUS_1, AS_INFO, + "bad histogram scale %d", scale); + + cf_hist_track* this = (cf_hist_track*)cf_malloc(sizeof(cf_hist_track)); + + pthread_mutex_init(&this->rows_lock, NULL); + + // Base histogram setup, same as in histogram_create(): + strcpy(this->hist.name, name); + memset((void*)this->hist.counts, 0, sizeof(this->hist.counts)); + + // If cf_hist_track_insert_data_point() is called for a size or count + // histogram, the divide by 0 will crash - consider that a high-performance + // assert. + + switch (scale) { + case HIST_MILLISECONDS: + this->hist.scale_tag = HIST_TAG_MILLISECONDS; + this->hist.time_div = 1000 * 1000; + break; + case HIST_MICROSECONDS: + this->hist.scale_tag = HIST_TAG_MICROSECONDS; + this->hist.time_div = 1000; + break; + case HIST_SIZE: + this->hist.scale_tag = HIST_TAG_SIZE; + this->hist.time_div = 0; + break; + case HIST_COUNT: + this->hist.scale_tag = HIST_TAG_COUNT; + this->hist.time_div = 0; + break; + default: + cf_crash(AS_INFO, "%s: unrecognized histogram scale %d", name, scale); + break; + } + + // Start with tracking off. + this->rows = NULL; + + return this; +} + +//------------------------------------------------ +// Destroy a cf_hist_track object. +// +void +cf_hist_track_destroy(cf_hist_track* this) +{ + cf_hist_track_stop(this); + pthread_mutex_destroy(&this->rows_lock); + cf_free(this); +} + +//------------------------------------------------ +// Start tracking. May call this again without +// first calling cf_hist_track_disable() to use +// different caching parameters, but previous +// cache is lost. +// +// TODO - resolve errors ??? 
+bool +cf_hist_track_start(cf_hist_track* this, uint32_t back_sec, uint32_t slice_sec, + const char* thresholds) +{ + if (slice_sec == 0) { + return false; + } + + uint32_t num_rows = back_sec / slice_sec; + + // Check basic sanity of row-related parameters. + if (num_rows > MAX_NUM_ROWS || num_rows < MIN_NUM_ROWS) { + return false; + } + + // If thresholds aren't specified, use defaults. + uint32_t* buckets = (uint32_t*)default_buckets; + int num_cols = DEFAULT_NUM_COLS; + + // Parse non-default thresholds and check resulting buckets. + uint32_t parsed_buckets[MAX_NUM_COLS]; + + if (thresholds) { + buckets = parsed_buckets; + num_cols = thresholds_to_buckets(thresholds, buckets); + + if (num_cols < 0) { + return false; + } + } + + pthread_mutex_lock(&this->rows_lock); + + if (this->rows) { + cf_free(this->rows); + } + + this->row_size = sizeof(row) + (num_cols * sizeof(uint64_t)); + this->rows = (row*)cf_malloc(num_rows * this->row_size); + this->num_rows = num_rows; + this->write_row_n = 0; + this->oldest_row_n = 0; + this->slice_sec = slice_sec; + + for (int i = 0; i < num_cols; i++) { + this->buckets[i] = buckets[i]; + } + + this->num_cols = (uint32_t)num_cols; + + pthread_mutex_unlock(&this->rows_lock); + + return true; +} + +//------------------------------------------------ +// Stop tracking, freeing cache. +// +void +cf_hist_track_stop(cf_hist_track* this) +{ + pthread_mutex_lock(&this->rows_lock); + + if (this->rows) { + cf_free(this->rows); + this->rows = NULL; + } + + pthread_mutex_unlock(&this->rows_lock); +} + +//------------------------------------------------ +// Clear histogram buckets, and if tracking, stop. +// Must call cf_hist_track_enable() to start +// tracking again. 
+// +void +cf_hist_track_clear(cf_hist_track* this) +{ + cf_hist_track_stop(this); + histogram_clear((histogram*)this); +} + +//------------------------------------------------ +// Print all non-zero histogram buckets, and if +// tracking, cache timestamp, total data points, +// and threshold data. +// +void +cf_hist_track_dump(cf_hist_track* this) +{ + // Always print the histogram. + histogram_dump((histogram*)this); + + // If tracking is enabled, save a row in the cache. + pthread_mutex_lock(&this->rows_lock); + + if (! this->rows) { + pthread_mutex_unlock(&this->rows_lock); + return; + } + + uint32_t now_ts = (uint32_t)time(NULL); + + // But don't save row if slice_sec hasn't elapsed since last saved row. + if (this->write_row_n != 0 && + now_ts - get_row(this, this->write_row_n - 1)->timestamp < + this->slice_sec) { + pthread_mutex_unlock(&this->rows_lock); + return; + } + + row* row_p = get_row(this, this->write_row_n); + + // "Freeze" the histogram for consistency of total. + uint64_t counts[N_BUCKETS]; + uint64_t total_count = 0; + + for (int j = 0; j < N_BUCKETS; j++) { + counts[j] = cf_atomic64_get(this->hist.counts[j]); + total_count += counts[j]; + } + + uint64_t subtotal = 0; + + // b's "over" is total minus sum of values in all buckets 0 thru b. + for (int i = 0, b = 0; i < this->num_cols; b++) { + subtotal += counts[b]; + + if (this->buckets[i] == b) { + row_p->overs[i++] = total_count - subtotal; + } + } + + row_p->total = total_count; + row_p->timestamp = now_ts; + + // Increment the current and oldest row indexes. + this->write_row_n++; + + if (this->write_row_n > this->num_rows) { + this->oldest_row_n++; + } + + pthread_mutex_unlock(&this->rows_lock); +} + +//------------------------------------------------ +// Pass-through to base histogram. 
+// +uint64_t +cf_hist_track_insert_data_point(cf_hist_track* this, uint64_t start_ns) +{ + return histogram_insert_data_point((histogram*)this, start_ns); +} + +//------------------------------------------------ +// Pass-through to base histogram. +// +void +cf_hist_track_insert_raw(cf_hist_track* this, uint64_t value) +{ + histogram_insert_raw((histogram*)this, value); +} + +//------------------------------------------------ +// Get time-sliced info from cache. +// +void +cf_hist_track_get_info(cf_hist_track* this, uint32_t back_sec, + uint32_t duration_sec, uint32_t slice_sec, bool throughput_only, + cf_hist_track_info_format info_fmt, cf_dyn_buf* db_p) +{ + pthread_mutex_lock(&this->rows_lock); + + if (! this->rows) { + cf_dyn_buf_append_string(db_p, "error-not-tracking;"); + pthread_mutex_unlock(&this->rows_lock); + return; + } + + uint32_t start_row_n = get_start_row_n(this, back_sec); + + if (start_row_n == -1) { + cf_dyn_buf_append_string(db_p, "error-no-data-yet-or-back-too-small;"); + pthread_mutex_unlock(&this->rows_lock); + return; + } + + uint32_t num_cols = throughput_only ? 
0 : this->num_cols; + row* prev_row_p = get_row(this, start_row_n); + + output_header(this, prev_row_p->timestamp, num_cols, info_fmt, db_p); + + if (slice_sec == 0) { + row* row_p = get_row(this, this->write_row_n - 1); + uint32_t diff_sec = row_p->timestamp - prev_row_p->timestamp; + + output_slice(this, prev_row_p, row_p, diff_sec, num_cols, info_fmt, + db_p); + + pthread_mutex_unlock(&this->rows_lock); + return; + } + + uint32_t start_ts = prev_row_p->timestamp; + bool no_slices = true; + + for (uint32_t row_n = start_row_n + 1; row_n < this->write_row_n; row_n++) { + row* row_p = get_row(this, row_n); + + uint32_t diff_sec = row_p->timestamp - prev_row_p->timestamp; + + if (diff_sec < slice_sec) { + continue; + } + + output_slice(this, prev_row_p, row_p, diff_sec, num_cols, info_fmt, + db_p); + no_slices = false; + + // Doing this at the end guarantees we get at least one slice. + if (duration_sec != 0 && row_p->timestamp - start_ts > duration_sec) { + break; + } + + prev_row_p = row_p; + } + + if (no_slices) { + cf_dyn_buf_append_string(db_p, + "error-slice-too-big-or-back-too-small;"); + } + + pthread_mutex_unlock(&this->rows_lock); +} + +//------------------------------------------------ +// Get current settings which were passed into +// cf_hist_track_start(), in format suitable for +// info_command_config_get(). +// +void +cf_hist_track_get_settings(cf_hist_track* this, cf_dyn_buf* db_p) +{ + pthread_mutex_lock(&this->rows_lock); + + if (! 
this->rows) { + pthread_mutex_unlock(&this->rows_lock); + return; + } + + const char* name = ((histogram*)this)->name; + char output[MAX_FORMATTED_SETTINGS_SIZE]; + char* write_p = output; + char* end_p = output + MAX_FORMATTED_SETTINGS_SIZE - 2; + + write_p += snprintf(output, MAX_FORMATTED_SETTINGS_SIZE - 2, + "%s-hist-track-back=%u;" + "%s-hist-track-slice=%u;" + "%s-hist-track-thresholds=", + name, this->num_rows * this->slice_sec, + name, this->slice_sec, + name); + + for (int i = 0; i < this->num_cols; i++) { + write_p += snprintf(write_p, end_p - write_p, "%u,", + (uint32_t)1 << this->buckets[i]); + } + + if (this->num_cols > 0) { + write_p--; + } + + *write_p++ = ';'; + *write_p = 0; + + cf_dyn_buf_append_string(db_p, output); + + pthread_mutex_unlock(&this->rows_lock); +} + + +//========================================================== +// Private Functions +// + +//------------------------------------------------ +// Get row pointer for specified row count. Note +// that row_size is determined dynamically, so we +// can't just do rows[i]. +// +static inline row* +get_row(cf_hist_track* this, uint32_t row_n) +{ + return (row*)((uint8_t*)this->rows + + ((row_n % this->num_rows) * this->row_size)); +} + +//------------------------------------------------ +// Find row at or after timestamp specified by +// back_sec. +// +static uint32_t +get_start_row_n(cf_hist_track* this, uint32_t back_sec) +{ + // Must be at least two rows to get a slice. + if (this->write_row_n < 2) { + return -1; + } + + uint32_t now_ts = (uint32_t)time(NULL); + + // In case we call this with default back_sec (0) or back_sec more than UTC + // epoch to now - start from the beginning. + if (back_sec == 0 || back_sec >= now_ts) { + return this->oldest_row_n; + } + + uint32_t start_ts = now_ts - back_sec; + + // Find the most recent slice interval. 
+ uint32_t last_row_n = this->write_row_n - 1; + uint32_t slice_sec = get_row(this, last_row_n)->timestamp - + get_row(this, last_row_n - 1)->timestamp; + + // Use recent slice interval to guess how many rows back to look. + uint32_t back_row_n = back_sec / slice_sec; + uint32_t guess_row_n = last_row_n > back_row_n ? + last_row_n - back_row_n : 0; + + if (guess_row_n < this->oldest_row_n) { + guess_row_n = this->oldest_row_n; + } + + // Begin at guessed row, and iterate to find exact row to start at. + uint32_t guess_ts = get_row(this, guess_row_n)->timestamp; + uint32_t start_row_n; + + if (guess_ts < start_ts) { + for (start_row_n = guess_row_n + 1; start_row_n < last_row_n; + start_row_n++) { + if (get_row(this, start_row_n)->timestamp >= start_ts) { + break; + } + } + } + else if (guess_ts > start_ts) { + for (start_row_n = guess_row_n; start_row_n > this->oldest_row_n; + start_row_n--) { + if (get_row(this, start_row_n - 1)->timestamp < start_ts) { + break; + } + } + } + else { + start_row_n = guess_row_n; + } + + // Make sure when default query is run (e.g. latency:), we return at least + // valid last data instead of returning an error. This case happens when the + // query is timed such that it's right when histogram is being dumped. + if (start_row_n == last_row_n) { + start_row_n = last_row_n - 1; + } + + // Can't get a slice if there isn't at least one row after the start row. + return start_row_n < last_row_n ? start_row_n : -1; +} + +//------------------------------------------------ +// Make info "header" and append it to db_p. 
+// +static void +output_header(cf_hist_track* this, uint32_t start_ts, uint32_t num_cols, + cf_hist_track_info_format info_fmt, cf_dyn_buf* db_p) +{ + cf_dyn_buf_append_string(db_p, ((histogram*)this)->name); + + const char* time_fmt; + const char* rate_fmt; + const char* pcts_fmt; + char line_sep; + + switch (info_fmt) { + case CF_HIST_TRACK_FMT_PACKED: + default: + time_fmt = ":%T-GMT"; + rate_fmt = ",ops/sec"; + pcts_fmt = ",>%ums"; + line_sep = ';'; + break; + case CF_HIST_TRACK_FMT_TABLE: + time_fmt = ":\n%T GMT % > (ms)"; + rate_fmt = "\n to ops/sec"; + pcts_fmt = " %6u"; + line_sep = '\n'; + break; + } + + char output[MAX_FORMATTED_ROW_SIZE]; + char* write_p = output; + char* end_p = output + MAX_FORMATTED_ROW_SIZE - 2; + time_t start_ts_time_t = (time_t)start_ts; + struct tm start_tm; + + gmtime_r(&start_ts_time_t, &start_tm); + write_p += strftime(output, MAX_FORMATTED_ROW_SIZE - 2, time_fmt, &start_tm); + write_p += snprintf(write_p, end_p - write_p, "%s", rate_fmt); + + for (int i = 0; i < num_cols; i++) { + write_p += snprintf(write_p, end_p - write_p, pcts_fmt, + (uint32_t)(1 << this->buckets[i])); + } + + *write_p++ = line_sep; + *write_p = 0; + + cf_dyn_buf_append_string(db_p, output); +} + +//------------------------------------------------ +// Calculate output info for slice defined by two +// rows, and append to db_p. 
+// +static void +output_slice(cf_hist_track* this, row* prev_row_p, row* row_p, + uint32_t diff_sec, uint32_t num_cols, + cf_hist_track_info_format info_fmt, cf_dyn_buf* db_p) +{ + const char* time_fmt; + const char* rate_fmt; + const char* pcts_fmt; + char line_sep; + + switch (info_fmt) { + case CF_HIST_TRACK_FMT_PACKED: + default: + time_fmt = "%T"; + rate_fmt = ",%.1f"; + pcts_fmt = ",%.2f"; + line_sep = ';'; + break; + case CF_HIST_TRACK_FMT_TABLE: + time_fmt = "%T"; + rate_fmt = " %9.1f"; + pcts_fmt = " %6.2f"; + line_sep = '\n'; + break; + } + + char output[MAX_FORMATTED_ROW_SIZE]; + char* write_p = output; + char* end_p = output + MAX_FORMATTED_ROW_SIZE - 2; + time_t row_ts_time_t = (time_t)row_p->timestamp; + struct tm row_tm; + + gmtime_r(&row_ts_time_t, &row_tm); + write_p += strftime(output, MAX_FORMATTED_ROW_SIZE - 2, time_fmt, &row_tm); + + uint64_t diff_total = row_p->total - prev_row_p->total; + double ops_per_sec = (double)(diff_total) / diff_sec; + + write_p += snprintf(write_p, end_p - write_p, rate_fmt, ops_per_sec); + + for (int i = 0; i < num_cols; i++) { + // We "freeze" the histogram to calculate "overs", so it shouldn't be + // possible for an "over" to be less than the one in the previous row. + uint64_t diff_overs = row_p->overs[i] - prev_row_p->overs[i]; + double pcts_over_i = diff_total != 0 ? + (double)(diff_overs * 100) / diff_total : 0; + + write_p += snprintf(write_p, end_p - write_p, pcts_fmt, pcts_over_i); + } + + *write_p++ = line_sep; + *write_p = 0; + + cf_dyn_buf_append_string(db_p, output); +} + +//------------------------------------------------ +// Convert threshold milliseconds to bucket index. +// +static int +threshold_to_bucket(int threshold) +{ + if (threshold < 1) { + return -1; + } + + int n = threshold; + int b = 0; + + while (n > 1) { + n >>= 1; + b++; + } + + // Check that threshold is an exact power of 2. + return (1 << b) == threshold ? 
b : -1; +} + +//------------------------------------------------ +// Convert thresholds string to buckets array. +// +static int +thresholds_to_buckets(const char* thresholds, uint32_t buckets[]) +{ + // Copy since strtok() is destructive. + char toks[strlen(thresholds) + 1]; + + strcpy(toks, thresholds); + + int i = 0; + char* tok = strtok(toks, ","); + + while (tok) { + if (i == MAX_NUM_COLS) { + return -1; + } + + int b = threshold_to_bucket(atoi(tok)); + + // Make sure it's a rising sequence of valid bucket indexes. + if (b < 0 || b > MAX_BUCKET || (i > 0 && b <= buckets[i - 1])) { + return -1; + } + + buckets[i++] = (uint32_t)b; + + tok = strtok(NULL, ","); + } + + return i; +} diff --git a/cf/src/linear_hist.c b/cf/src/linear_hist.c new file mode 100644 index 00000000..14f233bb --- /dev/null +++ b/cf/src/linear_hist.c @@ -0,0 +1,366 @@ +/* + * linear_hist.c + * + * Copyright (C) 2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + + +//========================================================== +// Includes. 
+// + +#include "linear_hist.h" + +#include +#include +#include +#include +#include + +#include "citrusleaf/alloc.h" + +#include "dynbuf.h" +#include "fault.h" + + +//========================================================== +// Private class data. +// + +#define LINEAR_HIST_NAME_SIZE 512 +#define INFO_SNAPSHOT_SIZE 2048 + +struct linear_hist_s { + char name[LINEAR_HIST_NAME_SIZE]; + + pthread_mutex_t info_lock; + char info_snapshot[INFO_SNAPSHOT_SIZE]; + + uint32_t num_buckets; + uint64_t *counts; + + uint32_t start; + uint32_t bucket_width; +}; + + +//========================================================== +// Public API. +// + +//------------------------------------------------ +// Create a linear histogram. +// +linear_hist* +linear_hist_create(const char *name, uint32_t start, uint32_t max_offset, + uint32_t num_buckets) +{ + if (! (name && strlen(name) < LINEAR_HIST_NAME_SIZE)) { + cf_crash(AS_INFO, "linear_hist_create - bad name %s", + name ? name : ""); + } + + if (start + max_offset < start) { + cf_crash(AS_INFO, "linear_hist_create - max_offset overflow"); + } + + if (num_buckets == 0) { + cf_crash(AS_INFO, "linear_hist_create - 0 num_buckets"); + } + + linear_hist *h = cf_malloc(sizeof(linear_hist)); + + strcpy(h->name, name); + + if (0 != pthread_mutex_init(&h->info_lock, NULL)) { + cf_crash(AS_INFO, "linear_hist_create - mutex init failed"); + } + + h->info_snapshot[0] = 0; + + h->num_buckets = num_buckets; + h->counts = cf_malloc(sizeof(uint64_t) * num_buckets); + + linear_hist_clear(h, start, max_offset); + + return h; +} + +//------------------------------------------------ +// Destroy a linear histogram. +// +void +linear_hist_destroy(linear_hist *h) +{ + pthread_mutex_destroy(&h->info_lock); + cf_free(h->counts); + cf_free(h); +} + +//------------------------------------------------ +// Clear, re-scale/re-size a linear histogram. 
+// +void +linear_hist_reset(linear_hist *h, uint32_t start, uint32_t max_offset, + uint32_t num_buckets) +{ + if (h->num_buckets == num_buckets) { + linear_hist_clear(h, start, max_offset); + return; + } + + h->num_buckets = num_buckets; + h->counts = cf_realloc(h->counts, sizeof(uint64_t) * num_buckets); + linear_hist_clear(h, start, max_offset); +} + +//------------------------------------------------ +// Clear and (re-)scale a linear histogram. +// +void +linear_hist_clear(linear_hist *h, uint32_t start, uint32_t max_offset) +{ + h->start = start; + h->bucket_width = (max_offset + (h->num_buckets - 1)) / h->num_buckets; + + // Only needed to protect against max_offset 0. + if (h->bucket_width == 0) { + h->bucket_width = 1; + } + + memset((void *)h->counts, 0, sizeof(uint64_t) * h->num_buckets); +} + +//------------------------------------------------ +// Access method for total count. +// +uint64_t +linear_hist_get_total(linear_hist *h) +{ + uint64_t total_count = 0; + + for (uint32_t i = 0; i < h->num_buckets; i++) { + total_count += h->counts[i]; + } + + return total_count; +} + +//------------------------------------------------ +// Merge h2 into h1. +// +void +linear_hist_merge(linear_hist *h1, linear_hist *h2) +{ + if (! (h1->num_buckets == h2->num_buckets && h1->start == h2->start && + h1->bucket_width == h2->bucket_width)) { + cf_crash(AS_INFO, "linear_hist_merge - dissimilar histograms"); + } + + for (uint32_t i = 0; i < h1->num_buckets; i++) { + h1->counts[i] += h2->counts[i]; + } +} + +//------------------------------------------------ +// Insert a data point. Points out of range will +// end up in the bucket at the appropriate end. 
+// +void +linear_hist_insert_data_point(linear_hist *h, uint32_t point) +{ + int32_t offset = (int32_t)(point - h->start); + int32_t bucket = 0; + + if (offset > 0) { + bucket = offset / h->bucket_width; + + if (bucket >= (int32_t)h->num_buckets) { + bucket = h->num_buckets - 1; + } + } + + h->counts[bucket]++; +} + +//------------------------------------------------ +// Get the low edge of the "threshold" bucket - +// the bucket in which the specified percentage of +// total count is exceeded (accumulating from low +// bucket). +// +uint64_t +linear_hist_get_threshold_for_fraction(linear_hist *h, uint32_t tenths_pct, + linear_hist_threshold *p_threshold) +{ + return linear_hist_get_threshold_for_subtotal(h, + (linear_hist_get_total(h) * (uint64_t)tenths_pct) / 1000, + p_threshold); +} + +//------------------------------------------------ +// Get the low edge of the "threshold" bucket - +// the bucket in which the specified subtotal +// count is exceeded (accumulating from low +// bucket). +// +uint64_t +linear_hist_get_threshold_for_subtotal(linear_hist *h, uint64_t subtotal, + linear_hist_threshold *p_threshold) +{ + p_threshold->bucket_width = h->bucket_width; + p_threshold->target_count = subtotal; + + uint64_t count = 0; + uint32_t i; + + for (i = 0; i < h->num_buckets; i++) { + count += h->counts[i]; + + if (count > subtotal) { + break; + } + } + + if (i == h->num_buckets) { + // This means subtotal >= h->total_count. + p_threshold->value = 0xFFFFffff; + p_threshold->bucket_index = 0; // irrelevant + p_threshold->bucket_count = 0; // irrelevant + return count; + } + + p_threshold->value = h->start + (i * h->bucket_width); + p_threshold->bucket_index = i; + p_threshold->bucket_count = h->counts[i]; + + // Return subtotal of everything below "threshold" bucket. + return count - h->counts[i]; +} + +//------------------------------------------------ +// Dump a linear histogram to log. 
+// +// Note - DO NOT change the log output format in +// this method - public documentation assumes this +// format. +// +void +linear_hist_dump(linear_hist *h) +{ + uint32_t i = h->num_buckets; + uint32_t j = 0; + uint32_t k = 0; + uint64_t total_count = 0; + + for (uint32_t b = 0; b < h->num_buckets; b++) { + if (h->counts[b] != 0) { + if (i > b) { + i = b; + } + + j = b; + k++; + total_count += h->counts[b]; + } + } + + char buf[100]; + int pos = 0; + int n = 0; + + buf[0] = '\0'; + + cf_debug(AS_NSUP, "linear histogram dump: %s [%u %u]/[%u] (%lu total)", + h->name, h->start, h->start + (h->num_buckets * h->bucket_width), + h->bucket_width, total_count); + + if (k > 100) { + // For now, just don't bother if there's too much to dump. + cf_debug(AS_NSUP, "... (%u buckets with non-zero count)", k); + return; + } + + for ( ; i <= j; i++) { + if (h->counts[i] == 0) { // print only non-zero columns + continue; + } + + int bytes = sprintf(buf + pos, " (%02u: %010lu)", i, h->counts[i]); + + if (bytes <= 0) { + cf_debug(AS_NSUP, "linear histogram dump error"); + return; + } + + pos += bytes; + + if ((n & 3) == 3) { // maximum of 4 printed columns per log line + cf_debug(AS_NSUP, "%s", buf); + pos = 0; + buf[0] = '\0'; + } + + n++; + } + + if (pos > 0) { + cf_debug(AS_NSUP, "%s", buf); + } +} + +//------------------------------------------------ +// Save a linear histogram "snapshot". +// +void +linear_hist_save_info(linear_hist *h) +{ + pthread_mutex_lock(&h->info_lock); + + if (h->num_buckets > 100) { + // For now, just don't bother if there's too much to save. + sprintf(h->info_snapshot, "%u,%u ...", h->num_buckets, h->bucket_width); + + pthread_mutex_unlock(&h->info_lock); + return; + } + + // Write num_buckets, the bucket width, and the first bucket's count. 
+ int i = 0; + int pos = snprintf(h->info_snapshot, INFO_SNAPSHOT_SIZE, "%u,%u,%lu", + h->num_buckets, h->bucket_width, h->counts[i++]); + + while (pos < INFO_SNAPSHOT_SIZE && i < h->num_buckets) { + pos += snprintf(h->info_snapshot + pos, INFO_SNAPSHOT_SIZE - pos, + ",%lu", h->counts[i++]); + } + + pthread_mutex_unlock(&h->info_lock); +} + +//------------------------------------------------ +// Append a linear histogram "snapshot" to db. +// +void +linear_hist_get_info(linear_hist *h, cf_dyn_buf *db) +{ + pthread_mutex_lock(&h->info_lock); + cf_dyn_buf_append_string(db, h->info_snapshot); + pthread_mutex_unlock(&h->info_lock); +} diff --git a/cf/src/meminfo.c b/cf/src/meminfo.c new file mode 100644 index 00000000..12f1fdb4 --- /dev/null +++ b/cf/src/meminfo.c @@ -0,0 +1,152 @@ +/* + * meminfo.c + * + * Copyright (C) 2008 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#include "meminfo.h" + +#include +#include +#include +#include +#include +#include +#include +#include + + +int +cf_meminfo(uint64_t *physmem, uint64_t *freemem, int *freepct, bool *swapping) +{ + // do this without a malloc, because we might be in trouble, malloc-wise + char buf[4096]; + memset(buf, 0, sizeof(buf)); // makes valgrind happy? 
+ + // be a little oversafe + if (physmem) *physmem = 0; + if (freemem) *freemem = 0; + if (freepct) *freepct = 0; + if (swapping) *swapping = 0; + + // open /proc/meminfo + int fd = open("/proc/meminfo", O_RDONLY , 0 /*mask not used if not creating*/ ); + if (fd < 0) { + fprintf(stderr, "meminfo failed: can't open proc file\n"); + return(-1); + } + + // this loop is overkill. proc read won't block, realistically + int pos = 0, lim = sizeof(buf); + int rv = 0; + do { + + rv = read(fd, &buf[pos], lim - pos); + if (rv > 0) + pos += rv; + else if (rv < 0) { + fprintf(stderr, "meminfo failed: read returned %d errno %d pos %d\n",rv,errno,pos); + close(fd); + return(-1); + } + + } while ((rv > 0) && (pos < lim)); + + close(fd); + + char *physMemStr = "MemTotal"; uint64_t physMem = 0; + char *freeMemStr = "MemFree"; uint64_t freeMem = 0; + char *activeMemStr = "Active"; uint64_t activeMem = 0; + char *inactiveMemStr = "Inactive"; uint64_t inactiveMem = 0; + char *cachedMemStr = "Cached"; uint64_t cachedMem = 0; + char *buffersMemStr = "Buffers"; uint64_t buffersMem = 0; + char *swapTotalStr = "SwapTotal"; uint64_t swapTotal = 0; + char *swapFreeStr = "SwapFree"; uint64_t swapFree = 0; + char *sharedMemStr = "Shmem"; uint64_t sharedMem = 0; + + // parse each line - always three tokens, the name, the integer, and 'kb' + char *cur = buf; + char *saveptr = 0, *tok1, *tok2, *tok3; + do { + tok1 = tok2 = tok3 = 0; + tok1 = strtok_r(cur,": \r\n" , &saveptr); + cur = 0; + tok2 = strtok_r(cur,": \r\n" , &saveptr); + tok3 = strtok_r(cur,": \r\n" , &saveptr); + + if (tok1 && tok3) { + if (strcmp(tok1, physMemStr) == 0) + physMem = atoi(tok2); + else if (strcmp(tok1, freeMemStr) == 0) + freeMem = atoi(tok2); + else if (strcmp(tok1, swapTotalStr) == 0) + swapTotal = atoi(tok2); + else if (strcmp(tok1, swapFreeStr) == 0) + swapFree = atoi(tok2); + else if (strcmp(tok1, activeMemStr) == 0) + activeMem = atoi(tok2); + else if (strcmp(tok1, inactiveMemStr) == 0) + inactiveMem = 
atoi(tok2); + else if (strcmp(tok1, cachedMemStr) == 0) + cachedMem = atoi(tok2); + else if (strcmp(tok1, buffersMemStr) == 0) + buffersMem = atoi(tok2); + else if (strcmp(tok1, sharedMemStr) == 0) + sharedMem = atoi(tok2); + } + + } while(tok1 && tok2 && tok3); + + // + // Calculate available memory: + // Start with the total physical memory in the system. + // Next, subtract out the total of the active and inactive VM. + // Finally, add back in the cached memory and buffers, which are effectively available if & when needed. + // Caution: Subtract the shared memory, which is included in the cached memory, but is not available. + // + uint64_t availableMem = physMem - activeMem - inactiveMem + cachedMem + buffersMem - sharedMem; + + if (physmem) *physmem = physMem * 1024L; + if (freemem) *freemem = availableMem * 1024L; + + // just easier to do this kind of thing in one place + if (freepct) *freepct = (100L * availableMem) / physMem; + + if (swapping) { + *swapping = false; +#if 0 + uint64_t swapUsedPct = ((swapTotal - swapFree)*100)/swapTotal; + if (swapUsedPct > 10) { + *swapping = true; + fprintf(stderr, " SWAPPING: %"PRIu64" %"PRIu64" %"PRIu64, + swapUsedPct, swapTotal, swapFree); + } +#else + // Silence compiler warnings. + (void) swapFree; + (void) swapTotal; + (void) freeMem; +#endif + } + +// fprintf(stderr, "%u swapTotal %u swapFree %u swapFreePct ::: swapping %d\n", +// (unsigned int) swapTotal,(unsigned int)swapFree,(int)swapUsedPct,(int) *swapping); + + return(0); +} diff --git a/cf/src/msg.c b/cf/src/msg.c new file mode 100644 index 00000000..e68dd808 --- /dev/null +++ b/cf/src/msg.c @@ -0,0 +1,1205 @@ +/* + * msg.c + * + * Copyright (C) 2008-2017 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. 
+ * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + + +//========================================================== +// Includes. +// + +#include "msg.h" + +#include +#include +#include +#include +#include + +#include "aerospike/as_msgpack.h" +#include "citrusleaf/alloc.h" +#include "citrusleaf/cf_atomic.h" +#include "citrusleaf/cf_byte_order.h" +#include "citrusleaf/cf_vector.h" + +#include "dynbuf.h" +#include "fault.h" + + +//========================================================== +// Typedefs & constants. +// + +typedef struct msg_type_entry_s { + const msg_template *mt; + uint16_t entry_count; + uint32_t scratch_sz; +} msg_type_entry; + +// msg field header on wire. +typedef struct msg_field_hdr_s { + uint16_t id; + uint8_t type; + uint8_t content[]; +} __attribute__ ((__packed__)) msg_field_hdr; + + +//========================================================== +// Globals. +// + +// Total number of "msg" objects allocated: +cf_atomic_int g_num_msgs = 0; + +// Total number of "msg" objects allocated per type: +cf_atomic_int g_num_msgs_by_type[M_TYPE_MAX] = { 0 }; + +static msg_type_entry g_mte[M_TYPE_MAX]; + + +//========================================================== +// Forward declarations. 
+// + +static size_t msg_get_field_wire_size(msg_field_type type, size_t field_sz); +static uint32_t msg_field_stamp(const msg_field *mf, msg_type mtype, uint8_t *buf); +static void msg_field_save(msg *m, msg_field *mf); + + +//========================================================== +// Inlines. +// + +static inline msg_field_type +mf_type(const msg_field *mf, msg_type type) +{ + return g_mte[type].mt[mf->id].type; +} + +static inline void +mf_destroy(msg_field *mf) +{ + if (mf->is_set) { + if (mf->is_free) { + cf_free(mf->u.any_buf); + mf->is_free = false; + } + + mf->is_set = false; + } +} + + +//========================================================== +// Public API - object accounting. +// + +// Call this instead of freeing msg directly, to keep track of all msgs. +void +msg_put(msg *m) +{ + cf_atomic_int_decr(&g_num_msgs); + cf_atomic_int_decr(&g_num_msgs_by_type[m->type]); + cf_rc_free(m); +} + + +//========================================================== +// Public API - lifecycle. +// + +void +msg_type_register(msg_type type, const msg_template *mt, size_t mt_sz, + size_t scratch_sz) +{ + cf_assert(type >= 0 && type < M_TYPE_MAX, CF_MSG, "invalid type %d", type); + + msg_type_entry *mte = &g_mte[type]; + uint16_t mt_count = (uint16_t)(mt_sz / sizeof(msg_template)); + + if (mte->mt) { + // This happens on the heartbeat version jump - handle gently for now. 
+ cf_info(CF_MSG, "msg_type_register() type %d already registered", type); + return; + } + + cf_assert(mt_count != 0, CF_MSG, "msg_type_register() empty template"); + + uint16_t max_id = 0; + + for (uint16_t i = 0; i < mt_count; i++) { + if (mt[i].id >= max_id) { + max_id = mt[i].id; + } + } + + mte->entry_count = max_id + 1; + + msg_template *table = cf_calloc(mte->entry_count, sizeof(msg_template)); + + for (uint16_t i = 0; i < mt_count; i++) { + table[mt[i].id] = mt[i]; + } + + mte->mt = table; + mte->scratch_sz = (uint32_t)scratch_sz; +} + +msg * +msg_create(msg_type type) +{ + // Caller validates type is in range - this validates it's not unused. + if (! g_mte[type].mt) { + return NULL; + } + + const msg_type_entry *mte = &g_mte[type]; + uint16_t mt_count = mte->entry_count; + size_t u_sz = sizeof(msg) + (sizeof(msg_field) * mt_count); + size_t a_sz = u_sz + (size_t)mte->scratch_sz; + msg *m = cf_rc_alloc(a_sz); + + m->n_fields = mt_count; + m->bytes_used = (uint32_t)u_sz; + m->bytes_alloc = (uint32_t)a_sz; + m->just_parsed = false; + m->type = type; + + for (uint16_t i = 0; i < mt_count; i++) { + msg_field *mf = &m->f[i]; + + mf->id = i; + mf->is_set = false; + mf->is_free = false; + } + + // Keep track of allocated msgs. + cf_atomic_int_incr(&g_num_msgs); + cf_atomic_int_incr(&g_num_msgs_by_type[type]); + + return m; +} + +void +msg_destroy(msg *m) +{ + int cnt = cf_rc_release(m); + + if (cnt == 0) { + for (uint32_t i = 0; i < m->n_fields; i++) { + mf_destroy(&m->f[i]); + } + + msg_put(m); + } + else { + cf_assert(cnt > 0, CF_MSG, "msg_destroy(%p) extra call", m); + } +} + +void +msg_incr_ref(msg *m) +{ + cf_rc_reserve(m); +} + + +//========================================================== +// Public API - pack messages into flattened data. 
+// + +size_t +msg_get_wire_size(const msg *m) +{ + size_t sz = sizeof(msg_hdr); + + for (uint16_t i = 0; i < m->n_fields; i++) { + const msg_field *mf = &m->f[i]; + + if (mf->is_set) { + sz += msg_get_field_wire_size(mf_type(mf, m->type), mf->field_sz); + } + } + + return sz; +} + +size_t +msg_get_template_fixed_sz(const msg_template *mt, size_t mt_count) +{ + size_t sz = sizeof(msg_hdr); + + for (size_t i = 0; i < mt_count; i++) { + sz += msg_get_field_wire_size(mt[i].type, 0); + } + + return sz; +} + +size_t +msg_to_wire(const msg *m, uint8_t *buf) +{ + msg_hdr *hdr = (msg_hdr *)buf; + + hdr->type = cf_swap_to_be16(m->type); + + buf += sizeof(msg_hdr); + + const uint8_t *body = buf; + + for (uint16_t i = 0; i < m->n_fields; i++) { + const msg_field *mf = &m->f[i]; + + if (mf->is_set) { + buf += msg_field_stamp(mf, m->type, buf); + } + } + + uint32_t body_sz = (uint32_t)(buf - body); + + hdr->size = cf_swap_to_be32(body_sz); + + return sizeof(msg_hdr) + body_sz; +} + + +//========================================================== +// Public API - parse flattened data into messages. 
+// + +int +msg_parse(msg *m, const uint8_t *buf, size_t bufsz) +{ + if (bufsz < sizeof(msg_hdr)) { + return -1; + } + + const msg_hdr *hdr = (const msg_hdr *)buf; + buf += sizeof(msg_hdr); + + uint32_t sz = cf_swap_from_be32(hdr->size); + uint16_t type = cf_swap_from_be16(hdr->type); + + if (bufsz < sz + sizeof(msg_hdr)) { + return -2; + } + + if (m->type != type) { + cf_ticker_warning(CF_MSG, "parsed type %d for msg type %d", type, m->type); + return -3; + } + + const uint8_t *eob = buf + sz; + size_t left = sz; + + while (left != 0) { + if (left < sizeof(msg_field_hdr) + sizeof(uint32_t)) { + return -4; + } + + const msg_field_hdr *fhdr = (const msg_field_hdr *)buf; + buf += sizeof(msg_field_hdr); + + uint32_t id = (uint32_t)cf_swap_from_be16(fhdr->id); + msg_field_type ft = (msg_field_type)fhdr->type; + size_t fsz; + uint32_t size = 0; + + switch (ft) { + case M_FT_UINT32: + fsz = sizeof(uint32_t); + break; + case M_FT_UINT64: + fsz = sizeof(uint64_t); + break; + default: + size = sizeof(uint32_t); + fsz = cf_swap_from_be32(*(const uint32_t *)buf); + buf += sizeof(uint32_t); + break; + } + + if (left < sizeof(msg_field_hdr) + size + fsz) { + return -5; + } + + msg_field *mf; + + if (id >= m->n_fields) { + mf = NULL; + } + else { + mf = &m->f[id]; + } + + if (mf && ft != mf_type(mf, m->type)) { + cf_ticker_warning(CF_MSG, "msg type %d: parsed type %d for field type %d", m->type, ft, mf_type(mf, m->type)); + mf = NULL; + } + + if (mf) { + mf->is_set = true; + + switch (mf_type(mf, m->type)) { + case M_FT_UINT32: + mf->u.ui32 = cf_swap_from_be32(*(uint32_t *)buf); + break; + case M_FT_UINT64: + mf->u.ui64 = cf_swap_from_be64(*(uint64_t *)buf); + break; + case M_FT_STR: + case M_FT_BUF: + case M_FT_ARRAY_UINT32: + case M_FT_ARRAY_UINT64: + case M_FT_ARRAY_STR: + case M_FT_ARRAY_BUF: + case M_FT_MSGPACK: + mf->field_sz = (uint32_t)fsz; + mf->u.any_buf = (void *)buf; + mf->is_free = false; + break; + default: + cf_ticker_detail(CF_MSG, "msg_parse: field type %d not 
supported - skipping", mf_type(mf, m->type)); + mf->is_set = false; + break; + } + } + + if (eob < buf) { + break; + } + + buf += fsz; + left = (size_t)(eob - buf); + } + + m->just_parsed = true; + + return 0; +} + +int +msg_get_initial(uint32_t *size_r, msg_type *type_r, const uint8_t *buf, + uint32_t bufsz) +{ + if (bufsz < sizeof(msg_hdr)) { + return -1; + } + + const msg_hdr *hdr = (const msg_hdr *)buf; + + *size_r = cf_swap_from_be32(hdr->size) + (uint32_t)sizeof(msg_hdr); + *type_r = (msg_type)cf_swap_from_be16(hdr->type); + + return 0; +} + +void +msg_reset(msg *m) +{ + m->bytes_used = (uint32_t)((m->n_fields * sizeof(msg_field)) + sizeof(msg)); + m->just_parsed = false; + + for (uint16_t i = 0; i < m->n_fields; i++) { + mf_destroy(&m->f[i]); + } +} + +void +msg_preserve_fields(msg *m, uint32_t n_field_ids, ...) +{ + bool reflect[m->n_fields]; + + for (uint16_t i = 0; i < m->n_fields; i++) { + reflect[i] = false; + } + + va_list argp; + va_start(argp, n_field_ids); + + for (uint32_t n = 0; n < n_field_ids; n++) { + reflect[va_arg(argp, int)] = true; + } + + va_end(argp); + + for (uint32_t i = 0; i < m->n_fields; i++) { + msg_field *mf = &m->f[i]; + + if (mf->is_set) { + if (reflect[i]) { + if (m->just_parsed) { + msg_field_save(m, mf); + } + } + else { + mf->is_set = false; + } + } + } + + m->just_parsed = false; +} + +void +msg_preserve_all_fields(msg *m) +{ + if (! m->just_parsed) { + return; + } + + for (uint32_t i = 0; i < m->n_fields; i++) { + msg_field *mf = &m->f[i]; + + if (mf->is_set) { + msg_field_save(m, mf); + } + } + + m->just_parsed = false; +} + + +//========================================================== +// Public API - set fields in messages. 
+// + +int +msg_set_uint32(msg *m, int field_id, uint32_t v) +{ + m->f[field_id].is_set = true; + m->f[field_id].u.ui32 = v; + + return 0; +} + +int +msg_set_uint64(msg *m, int field_id, uint64_t v) +{ + m->f[field_id].is_set = true; + m->f[field_id].u.ui64 = v; + + return 0; +} + +int +msg_set_str(msg *m, int field_id, const char *v, msg_set_type type) +{ + msg_field *mf = &m->f[field_id]; + + mf_destroy(mf); + + mf->field_sz = (uint32_t)strlen(v) + 1; + + if (type == MSG_SET_COPY) { + uint32_t fsz = mf->field_sz; + + if (m->bytes_alloc - m->bytes_used >= fsz) { + mf->u.str = (char *)m + m->bytes_used; + m->bytes_used += fsz; + mf->is_free = false; + memcpy(mf->u.str, v, fsz); + } + else { + mf->u.str = cf_strdup(v); + mf->is_free = true; + } + } + else if (type == MSG_SET_HANDOFF_MALLOC) { + mf->u.str = (char *)v; + mf->is_free = (v != NULL); + + if (! v) { + cf_warning(CF_MSG, "handoff malloc with null pointer"); + } + } + + mf->is_set = true; + + return 0; +} + +int +msg_set_buf(msg *m, int field_id, const uint8_t *v, size_t sz, + msg_set_type type) +{ + msg_field *mf = &m->f[field_id]; + + mf_destroy(mf); + + mf->field_sz = (uint32_t)sz; + + if (type == MSG_SET_COPY) { + if (m->bytes_alloc - m->bytes_used >= sz) { + mf->u.buf = (uint8_t *)m + m->bytes_used; + m->bytes_used += (uint32_t)sz; + mf->is_free = false; + } + else { + mf->u.buf = cf_malloc(sz); + mf->is_free = true; + } + + memcpy(mf->u.buf, v, sz); + + } + else if (type == MSG_SET_HANDOFF_MALLOC) { + mf->u.buf = (void *)v; + mf->is_free = (v != NULL); + + if (! v) { + cf_warning(CF_MSG, "handoff malloc with null pointer"); + } + } + + mf->is_set = true; + + return 0; +} + +int +msg_set_uint32_array_size(msg *m, int field_id, uint32_t count) +{ + msg_field *mf = &m->f[field_id]; + + cf_assert(! 
mf->is_set, CF_MSG, "msg_set_uint32_array_size() field already set"); + + mf->field_sz = (uint32_t)(count * sizeof(uint32_t)); + mf->u.ui32_a = cf_malloc(mf->field_sz); + mf->is_set = true; + mf->is_free = true; + + return 0; +} + +int +msg_set_uint32_array(msg *m, int field_id, uint32_t idx, uint32_t v) +{ + msg_field *mf = &m->f[field_id]; + + cf_assert(mf->is_set, CF_MSG, "msg_set_uint32_array() field not set"); + cf_assert(idx < (mf->field_sz >> 2), CF_MSG, "msg_set_uint32_array() idx out of bounds"); + + mf->u.ui32_a[idx] = cf_swap_to_be32(v); + + return 0; +} + +int +msg_set_uint64_array_size(msg *m, int field_id, uint32_t count) +{ + msg_field *mf = &m->f[field_id]; + + cf_assert(! mf->is_set, CF_MSG, "msg_set_uint64_array_size() field already set"); + + mf->field_sz = (uint32_t)(count * sizeof(uint64_t)); + mf->u.ui64_a = cf_malloc(mf->field_sz); + mf->is_set = true; + mf->is_free = true; + + return 0; +} + +int +msg_set_uint64_array(msg *m, int field_id, uint32_t idx, uint64_t v) +{ + msg_field *mf = &m->f[field_id]; + + cf_assert(mf->is_set, CF_MSG, "msg_set_uint64_array() field not set"); + cf_assert(idx < (mf->field_sz >> 3), CF_MSG, "msg_set_uint64_array() idx out of bounds"); + + mf->u.ui64_a[idx] = cf_swap_to_be64(v); + + return 0; +} + +void +msg_msgpack_list_set_uint32(msg *m, int field_id, const uint32_t *buf, + uint32_t count) +{ + msg_field *mf = &m->f[field_id]; + uint32_t a_sz = as_pack_list_header_get_size(count); + + mf_destroy(mf); + + for (uint32_t i = 0; i < count; i++) { + a_sz += as_pack_uint64_size((uint64_t)buf[i]); + } + + mf->field_sz = a_sz; + mf->u.any_buf = cf_malloc(a_sz); + + as_packer pk = { + .buffer = mf->u.any_buf, + .offset = 0, + .capacity = (int)a_sz, + }; + + int e = as_pack_list_header(&pk, count); + + cf_assert(e == 0, CF_MSG, "as_pack_list_header failed"); + + for (uint32_t i = 0; i < count; i++) { + e = as_pack_uint64(&pk, (uint64_t)buf[i]); + cf_assert(e == 0, CF_MSG, "as_pack_str failed"); + } + + mf->is_free = 
true; + mf->is_set = true; +} + +void +msg_msgpack_list_set_buf(msg *m, int field_id, const cf_vector *v) +{ + msg_field *mf = &m->f[field_id]; + uint32_t count = cf_vector_size(v); + uint32_t a_sz = as_pack_list_header_get_size(count); + + mf_destroy(mf); + + for (uint32_t i = 0; i < count; i++) { + const msg_buf_ele *ele = cf_vector_getp((cf_vector *)v, i); + + if (! ele->ptr) { + a_sz++; // TODO - add to common later + } + else { + a_sz += as_pack_str_size(ele->sz); + } + } + + mf->field_sz = a_sz; + mf->u.any_buf = cf_malloc(a_sz); + + as_packer pk = { + .buffer = mf->u.any_buf, + .offset = 0, + .capacity = (int)a_sz, + }; + + int e = as_pack_list_header(&pk, count); + + cf_assert(e == 0, CF_MSG, "as_pack_list_header failed"); + + for (uint32_t i = 0; i < count; i++) { + const msg_buf_ele *ele = cf_vector_getp((cf_vector *)v, i); + + if (! ele->ptr) { + pk.buffer[pk.offset++] = 0xc0; // TODO - add to common later + } + else { + e = as_pack_str(&pk, ele->ptr, ele->sz); + cf_assert(e == 0, CF_MSG, "as_pack_str failed"); + } + } + + mf->is_free = true; + mf->is_set = true; +} + + +//========================================================== +// Public API - get fields from messages. +// + +msg_field_type +msg_field_get_type(const msg *m, int field_id) +{ + return mf_type(&m->f[field_id], m->type); +} + +bool +msg_is_set(const msg *m, int field_id) +{ + cf_assert(field_id >= 0 && field_id < (int)m->n_fields, CF_MSG, "invalid field_id %d", field_id); + + return m->f[field_id].is_set; +} + +int +msg_get_uint32(const msg *m, int field_id, uint32_t *val_r) +{ + if (! m->f[field_id].is_set) { + return -1; + } + + *val_r = m->f[field_id].u.ui32; + + return 0; +} + +int +msg_get_uint64(const msg *m, int field_id, uint64_t *val_r) +{ + if (! m->f[field_id].is_set) { + return -1; + } + + *val_r = m->f[field_id].u.ui64; + + return 0; +} + +int +msg_get_str(const msg *m, int field_id, char **str_r, size_t *sz_r, + msg_get_type type) +{ + if (! 
m->f[field_id].is_set) { + return -1; + } + + if (type == MSG_GET_DIRECT) { + *str_r = m->f[field_id].u.str; + } + else if (type == MSG_GET_COPY_MALLOC) { + *str_r = cf_strdup(m->f[field_id].u.str); + } + else { + cf_crash(CF_MSG, "msg_get_str: illegal msg_get_type"); + } + + if (sz_r) { + *sz_r = m->f[field_id].field_sz; + } + + return 0; +} + +int +msg_get_buf(const msg *m, int field_id, uint8_t **buf_r, size_t *sz_r, + msg_get_type type) +{ + if (! m->f[field_id].is_set) { + return -1; + } + + if (type == MSG_GET_DIRECT) { + *buf_r = m->f[field_id].u.buf; + } + else if (type == MSG_GET_COPY_MALLOC) { + *buf_r = cf_malloc(m->f[field_id].field_sz); + memcpy(*buf_r, m->f[field_id].u.buf, m->f[field_id].field_sz); + } + else { + cf_crash(CF_MSG, "msg_get_buf: illegal msg_get_type"); + } + + if (sz_r) { + *sz_r = m->f[field_id].field_sz; + } + + return 0; +} + +int +msg_get_uint32_array(const msg *m, int field_id, uint32_t index, + uint32_t *val_r) +{ + const msg_field *mf = &m->f[field_id]; + + if (! mf->is_set) { + return -1; + } + + *val_r = cf_swap_from_be32(mf->u.ui32_a[index]); + + return 0; +} + +int +msg_get_uint64_array_count(const msg *m, int field_id, uint32_t *count_r) +{ + const msg_field *mf = &m->f[field_id]; + + if (! mf->is_set) { + return -1; + } + + *count_r = mf->field_sz >> 3; + + return 0; +} + +int +msg_get_uint64_array(const msg *m, int field_id, uint32_t index, + uint64_t *val_r) +{ + const msg_field *mf = &m->f[field_id]; + + if (! mf->is_set) { + return -1; + } + + *val_r = cf_swap_from_be64(mf->u.ui64_a[index]); + + return 0; +} + +bool +msg_msgpack_container_get_count(const msg *m, int field_id, uint32_t *count_r) +{ + const msg_field *mf = &m->f[field_id]; + + if (! 
mf->is_set) { + return false; + } + + as_unpacker pk = { + .buffer = (const uint8_t *)mf->u.any_buf, + .offset = 0, + .length = (int)mf->field_sz + }; + + as_val_t type = as_unpack_peek_type(&pk); + int64_t count; + + switch (type) { + case AS_LIST: + count = as_unpack_list_header_element_count(&pk); + break; + case AS_MAP: + count = as_unpack_map_header_element_count(&pk); + break; + default: + cf_ticker_warning(CF_MSG, "type %d not a packed container", type); + return false; + } + + if (count < 0) { + cf_ticker_warning(CF_MSG, "invalid packed container type %d", type); + return false; + } + + *count_r = (uint32_t)count; + + return true; +} + +bool +msg_msgpack_list_get_uint32_array(const msg *m, int field_id, uint32_t *buf_r, + uint32_t *count_r) +{ + cf_assert(buf_r, CF_MSG, "buf_r is null"); + + const msg_field *mf = &m->f[field_id]; + + if (! mf->is_set) { + return false; + } + + as_unpacker pk = { + .buffer = (const uint8_t *)mf->u.any_buf, + .offset = 0, + .length = (int)mf->field_sz + }; + + as_val_t type = as_unpack_peek_type(&pk); + int64_t count; + + switch (type) { + case AS_LIST: + count = as_unpack_list_header_element_count(&pk); + break; + default: + cf_ticker_warning(CF_MSG, "msg_msgpack_array_get_uint32_array() type %d but expected list", type); + return false; + } + + if (count < 0) { + cf_ticker_warning(CF_MSG, "invalid packed list type %d", type); + return false; + } + + if (*count_r < (uint32_t)count) { + cf_warning(CF_MSG, "count_r %u < %ld too small", *count_r, count); + return false; + } + + for (int64_t i = 0; i < count; i++) { + uint64_t val; + int ret = as_unpack_uint64(&pk, &val); + + if (ret != 0 || (val & (0xFFFFffffUL << 32)) != 0) { + cf_warning(CF_MSG, "i %ld/%ld invalid packed uint32 ret %d val 0x%lx", i, count, ret, val); + return false; + } + + buf_r[i] = (uint32_t)val; + } + + *count_r = (uint32_t)count; + + return true; +} + +bool +msg_msgpack_list_get_buf_array(const msg *m, int field_id, cf_vector *v_r, + bool init_vec) +{ + 
const msg_field *mf = &m->f[field_id]; + + if (! mf->is_set) { + return false; + } + + as_unpacker pk = { + .buffer = (const uint8_t *)mf->u.any_buf, + .offset = 0, + .length = (int)mf->field_sz + }; + + as_val_t type = as_unpack_peek_type(&pk); + int64_t count; + + switch (type) { + case AS_LIST: + count = as_unpack_list_header_element_count(&pk); + break; + default: + cf_ticker_warning(CF_MSG, "msg_msgpack_array_get_buf_vec_with_init() type %d but expected list", type); + return false; + } + + if (count < 0) { + cf_ticker_warning(CF_MSG, "invalid packed list type %d", type); + return false; + } + + if (init_vec) { + if (cf_vector_init(v_r, sizeof(msg_buf_ele), (uint32_t)count, 0) != 0) { + cf_warning(CF_MSG, "vector malloc failed - count %ld", count); + return false; + } + } + else if ((uint32_t)count > v_r->capacity) { // TODO - wrap to avoid access of private members? + cf_warning(CF_MSG, "count %ld > vector cap %u", count, v_r->capacity); + return false; + } + + for (int64_t i = 0; i < count; i++) { + msg_buf_ele ele; + int saved_offset = pk.offset; + + ele.ptr = (uint8_t *)as_unpack_str(&pk, &ele.sz); + + if (! ele.ptr) { + pk.offset = saved_offset; + ele.sz = 0; + + if (as_unpack_size(&pk) <= 0) { + if (init_vec) { + cf_vector_destroy(v_r); + } + + cf_warning(CF_MSG, "i %ld/%ld invalid msgpack element with type %d", i, count, type); + + return false; + } + } + + cf_vector_append(v_r, &ele); + } + + return true; +} + + +//========================================================== +// Public API - debugging only. 
+// + +void +msg_dump(const msg *m, const char *info) +{ + cf_info(CF_MSG, "msg_dump: %s: msg %p rc %d n-fields %u bytes-used %u bytes-alloc'd %u type %d", + info, m, (int)cf_rc_count((void*)m), m->n_fields, m->bytes_used, + m->bytes_alloc, m->type); + + for (uint32_t i = 0; i < m->n_fields; i++) { + const msg_field *mf = &m->f[i]; + + cf_info(CF_MSG, "mf %02u: id %u is-set %d", i, mf->id, mf->is_set); + + if (mf->is_set) { + switch (mf_type(mf, m->type)) { + case M_FT_UINT32: + cf_info(CF_MSG, " type UINT32 value %u", mf->u.ui32); + break; + case M_FT_UINT64: + cf_info(CF_MSG, " type UINT64 value %lu", mf->u.ui64); + break; + case M_FT_STR: + cf_info(CF_MSG, " type STR sz %u free %c value %s", + mf->field_sz, mf->is_free ? 't' : 'f', mf->u.str); + break; + case M_FT_BUF: + cf_info_binary(CF_MSG, mf->u.buf, mf->field_sz, + CF_DISPLAY_HEX_COLUMNS, + " type BUF sz %u free %c value ", + mf->field_sz, mf->is_free ? 't' : 'f'); + break; + case M_FT_ARRAY_UINT32: + cf_info(CF_MSG, " type ARRAY_UINT32: count %u n-uint32 %u free %c", + mf->field_sz, mf->field_sz >> 2, + mf->is_free ? 't' : 'f'); + { + uint32_t n_ints = mf->field_sz >> 2; + for (uint32_t j = 0; j < n_ints; j++) { + cf_info(CF_MSG, " idx %u value %u", + j, ntohl(mf->u.ui32_a[j])); + } + } + break; + case M_FT_ARRAY_UINT64: + cf_info(CF_MSG, " type ARRAY_UINT64: count %u n-uint64 %u free %c", + mf->field_sz, mf->field_sz >> 3, + mf->is_free ? 't' : 'f'); + { + uint32_t n_ints = mf->field_sz >> 3; + for (uint32_t j = 0; j < n_ints; j++) { + cf_info(CF_MSG, " idx %u value %lu", + j, __bswap_64(mf->u.ui64_a[j])); + } + } + break; + default: + cf_info(CF_MSG, " type %d unknown", mf_type(mf, m->type)); + break; + } + } + } +} + + +//========================================================== +// Local helpers. 
+// + +static size_t +msg_get_field_wire_size(msg_field_type type, size_t field_sz) +{ + switch (type) { + case M_FT_UINT32: + return sizeof(msg_field_hdr) + sizeof(uint32_t); + case M_FT_UINT64: + return sizeof(msg_field_hdr) + sizeof(uint64_t); + case M_FT_STR: + case M_FT_BUF: + case M_FT_ARRAY_UINT32: + case M_FT_ARRAY_UINT64: + case M_FT_ARRAY_STR: + case M_FT_ARRAY_BUF: + case M_FT_MSGPACK: + break; + default: + cf_crash(CF_MSG, "unexpected field type %d", type); + break; + } + + return sizeof(msg_field_hdr) + sizeof(uint32_t) + field_sz; +} + +// Returns the number of bytes written. +static uint32_t +msg_field_stamp(const msg_field *mf, msg_type mtype, uint8_t *buf) +{ + msg_field_hdr *hdr = (msg_field_hdr *)buf; + msg_field_type type = mf_type(mf, mtype); + + buf += sizeof(msg_field_hdr); + + hdr->id = cf_swap_to_be16((uint16_t)mf->id); + hdr->type = (uint8_t)type; + + switch (type) { + case M_FT_UINT32: + *(uint32_t *)buf = cf_swap_to_be32(mf->u.ui32); + return sizeof(msg_field_hdr) + sizeof(uint32_t); + case M_FT_UINT64: + *(uint64_t *)buf = cf_swap_to_be64(mf->u.ui64); + return sizeof(msg_field_hdr) + sizeof(uint64_t); + default: + break; + } + + uint32_t fsz; + uint32_t *p_fsz = (uint32_t *)buf; + + buf += sizeof(uint32_t); + + switch (type) { + case M_FT_STR: + case M_FT_BUF: + case M_FT_ARRAY_UINT32: + case M_FT_ARRAY_UINT64: + case M_FT_ARRAY_STR: + case M_FT_ARRAY_BUF: + case M_FT_MSGPACK: + fsz = mf->field_sz; + memcpy(buf, mf->u.any_buf, fsz); + break; + default: + cf_crash(CF_MSG, "unexpected field type %d", type); + return 0; + } + + *p_fsz = cf_swap_to_be32(fsz); + + return (uint32_t)(sizeof(msg_field_hdr) + sizeof(uint32_t) + fsz); +} + +static void +msg_field_save(msg *m, msg_field *mf) +{ + switch (mf_type(mf, m->type)) { + case M_FT_UINT32: + case M_FT_UINT64: + break; + case M_FT_STR: + case M_FT_BUF: + case M_FT_ARRAY_UINT32: + case M_FT_ARRAY_UINT64: + case M_FT_ARRAY_STR: + case M_FT_ARRAY_BUF: + case M_FT_MSGPACK: + // Should only 
preserve received messages where buffer pointers point + // directly into a fabric buffer. + cf_assert(! mf->is_free, CF_MSG, "invalid msg preserve"); + + if (m->bytes_alloc - m->bytes_used >= mf->field_sz) { + void *buf = ((uint8_t *)m) + m->bytes_used; + + memcpy(buf, mf->u.any_buf, mf->field_sz); + mf->u.any_buf = buf; + m->bytes_used += mf->field_sz; + mf->is_free = false; + } + else { + void *buf = cf_malloc(mf->field_sz); + + memcpy(buf, mf->u.any_buf, mf->field_sz); + mf->u.any_buf = buf; + mf->is_free = true; + } + break; + default: + break; + } +} diff --git a/cf/src/node.c b/cf/src/node.c new file mode 100644 index 00000000..5dedc489 --- /dev/null +++ b/cf/src/node.c @@ -0,0 +1,67 @@ +/* + * node.c + * + * Copyright (C) 2017 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. 
If not, see http://www.gnu.org/licenses/ + */ + +#include "node.h" + +#include +#include +#include + +#include "citrusleaf/alloc.h" + +#include "fault.h" + + +uint32_t +cf_nodeid_shash_fn(const void *key) +{ + cf_node id = *(const cf_node *)key; + + return (uint32_t)(id >> 32) | (uint32_t)id; +} + +uint32_t +cf_nodeid_rchash_fn(const void *key, uint32_t key_size) +{ + (void)key_size; + + return cf_nodeid_shash_fn(key); +} + +char * +cf_node_name() +{ + char buffer[1024]; + int res = gethostname(buffer, sizeof(buffer)); + + if (res == (int)sizeof(buffer) || (res < 0 && errno == ENAMETOOLONG)) { + cf_crash(CF_MISC, "host name too long"); + } + + if (res < 0) { + cf_warning(CF_MISC, "error while determining host name: %d (%s)", + errno, cf_strerror(errno)); + buffer[0] = 0; + } + + return cf_strdup(buffer); +} diff --git a/cf/src/olock.c b/cf/src/olock.c new file mode 100644 index 00000000..65ea3282 --- /dev/null +++ b/cf/src/olock.c @@ -0,0 +1,114 @@ +/* + * olock.c + * + * Copyright (C) 2008-2014 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. 
If not, see http://www.gnu.org/licenses/ + */ + +/* + * The object lock system gives a list + * + */ + +#include "olock.h" + +#include +#include +#include +#include + +#include +#include + +#include + + +// This ruins the notion that olocks are a generic class, but... +// (Perhaps better in index.c or record.c, if we ever make a record.h?) +olock *g_record_locks; + + +// an interesting detail: since this digest is used to choose among +// servers, you must use different bits to choose which OLOCK + +// +// ASSUMES d is DIGEST and ol is OLOCK * +// + +#define OLOCK_HASH(__ol, __d) ( ( (__d->digest[2] << 8) | (__d->digest[3]) ) & __ol->mask ) + +void +olock_lock(olock *ol, cf_digest *d) +{ + uint32_t n = OLOCK_HASH(ol, d); + + cf_mutex_lock(&ol->locks[n]); +} + +void +olock_vlock(olock *ol, cf_digest *d, cf_mutex **vlock) +{ + uint32_t n = OLOCK_HASH(ol, d); + + *vlock = &ol->locks[n]; + + cf_mutex_lock(*vlock); +} + +void +olock_unlock(olock *ol, cf_digest *d) +{ + uint32_t n = OLOCK_HASH(ol, d); + + cf_mutex_unlock(&ol->locks[n]); +} + +olock * +olock_create(uint32_t n_locks, bool mutex) +{ + olock *ol = cf_malloc(sizeof(olock) + (sizeof(cf_mutex) * n_locks)); + + uint32_t mask = n_locks - 1; + + if ((mask & n_locks) != 0) { + fprintf(stderr, "olock: make sure your number of locks is a power of 2, n_locks aint\n"); + return 0; + } + + ol->n_locks = n_locks; + ol->mask = mask; + + if (mutex) { + memset(ol->locks, 0, sizeof(cf_mutex) * n_locks); + } + else { + fprintf(stderr, "olock: todo add reader writer locks\n"); + } + + return ol; +} + +void +olock_destroy(olock *ol) +{ + for (int i = 0; i < ol->n_locks; i++) { + cf_mutex_destroy(&ol->locks[i]); + } + + cf_free(ol); +} diff --git a/cf/src/shash.c b/cf/src/shash.c new file mode 100644 index 00000000..df5cbf62 --- /dev/null +++ b/cf/src/shash.c @@ -0,0 +1,712 @@ +/* + * shash.c + * + * Copyright (C) 2017 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. 
under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +//========================================================== +// Includes. +// + +#include "shash.h" + +#include +#include +#include +#include +#include + +#include "citrusleaf/alloc.h" +#include "citrusleaf/cf_hash_math.h" + +#include "fault.h" + + +//========================================================== +// Typedefs & constants. +// + +// TODO - in_use is wasteful, especially when not first in bucket. +typedef struct cf_shash_ele_s { + struct cf_shash_ele_s *next; + bool in_use; + uint8_t data[]; +} cf_shash_ele; + + +//========================================================== +// Forward declarations. 
+// + +static inline void cf_shash_clear_table(cf_shash *h); +static inline void cf_shash_destroy_elements(cf_shash *h); +static inline uint32_t cf_shash_calculate_hash(cf_shash *h, const void *key); +static inline pthread_mutex_t *cf_shash_lock(cf_shash *h, uint32_t i); +static inline void cf_shash_unlock(pthread_mutex_t *l); +static inline cf_shash_ele *cf_shash_get_bucket(cf_shash *h, uint32_t i); +static inline void cf_shash_fill_element(cf_shash_ele *e, cf_shash *h, const void *key, const void *value); +static inline void cf_shash_size_incr(cf_shash *h); +static inline void cf_shash_size_decr(cf_shash *h); +int cf_shash_delete_or_pop(cf_shash *h, const void *key, void *value); + + +//========================================================== +// Inlines & macros. +// + +#define ELE_KEY(_h, _e) ((void *)_e->data) +#define ELE_VALUE(_h, _e) ((void *)(_e->data + _h->key_size)) + + +//========================================================== +// Public API - useful hash functions. +// + +// Interpret first 4 bytes of key as (host-ordered) uint32_t. (Note - caller +// is responsible for ensuring key size is at least 4 bytes.) +uint32_t +cf_shash_fn_u32(const void *key) +{ + return *(const uint32_t *)key; +} + +// Useful if key is a pointer. +uint32_t +cf_shash_fn_ptr(const void *key) +{ + return cf_hash_ptr32(key); +} + +// Useful if key is a null-terminated string. (Note - using fixed-size keys, so +// key must still be padded to correctly compare keys in a bucket.) +uint32_t +cf_shash_fn_zstr(const void *key) +{ + return cf_hash_fnv32((const uint8_t *)key, strlen(key)); +} + + +//========================================================== +// Public API. +// + +cf_shash * +cf_shash_create(cf_shash_hash_fn h_fn, uint32_t key_size, uint32_t value_size, + uint32_t n_buckets, uint32_t flags) +{ + cf_assert(h_fn && key_size != 0 && n_buckets != 0, CF_MISC, "bad param"); + // Note - value_size 0 works, and is used. 
+ + cf_shash *h = cf_malloc(sizeof(cf_shash)); + + h->h_fn = h_fn; + h->key_size = key_size; + h->value_size = value_size; + h->ele_size = sizeof(cf_shash_ele) + key_size + value_size; + h->n_buckets = n_buckets; + h->flags = flags; + h->n_elements = 0; + + // Can't have both lock options, but can opt for no locks at all. + cf_assert((flags & CF_SHASH_BIG_LOCK) == 0 || + (flags & CF_SHASH_MANY_LOCK) == 0, CF_MISC, "bad flags param"); + + h->table = (cf_shash_ele *)cf_malloc(n_buckets * h->ele_size); + + cf_shash_clear_table(h); + + if ((flags & CF_SHASH_BIG_LOCK) != 0) { + pthread_mutex_init(&h->big_lock, NULL); + } + else if ((flags & CF_SHASH_MANY_LOCK) != 0) { + h->bucket_locks = cf_malloc(sizeof(pthread_mutex_t) * n_buckets); + + for (uint32_t i = 0; i < n_buckets; i++) { + pthread_mutex_init(&h->bucket_locks[i], NULL); + } + } + + return h; +} + +void +cf_shash_destroy(cf_shash *h) +{ + if (! h) { + return; + } + + cf_shash_destroy_elements(h); + + if ((h->flags & CF_SHASH_BIG_LOCK) != 0) { + pthread_mutex_destroy(&h->big_lock); + } + else if ((h->flags & CF_SHASH_MANY_LOCK) != 0) { + for (uint32_t i = 0; i < h->n_buckets; i++) { + pthread_mutex_destroy(&h->bucket_locks[i]); + } + + cf_free(h->bucket_locks); + } + + cf_free(h->table); + cf_free(h); +} + +uint32_t +cf_shash_get_size(cf_shash *h) +{ + cf_assert(h, CF_MISC, "bad param"); + + // For now, not bothering with different methods per lock mode. + return cf_atomic32_get(h->n_elements); +} + +void +cf_shash_put(cf_shash *h, const void *key, const void *value) +{ + cf_assert(h && key && value, CF_MISC, "bad param"); + + uint32_t hash = cf_shash_calculate_hash(h, key); + pthread_mutex_t *l = cf_shash_lock(h, hash); + cf_shash_ele *e = cf_shash_get_bucket(h, hash); + + // Most common case should be insert into empty bucket. + if (! 
e->in_use) { + cf_shash_fill_element(e, h, key, value); + cf_shash_unlock(l); + return; + } + + cf_shash_ele *e_head = e; + + while (e) { + if (memcmp(ELE_KEY(h, e), key, h->key_size) == 0) { + // Replace the previous value with the new value. + memcpy(ELE_VALUE(h, e), value, h->value_size); + cf_shash_unlock(l); + return; + } + + e = e->next; + } + + e = (cf_shash_ele *)cf_malloc(h->ele_size); + + cf_shash_fill_element(e, h, key, value); + + // Insert just after head. + e->next = e_head->next; + e_head->next = e; + + cf_shash_unlock(l); +} + +int +cf_shash_put_unique(cf_shash *h, const void *key, const void *value) +{ + cf_assert(h && key && value, CF_MISC, "bad param"); + + uint32_t hash = cf_shash_calculate_hash(h, key); + pthread_mutex_t *l = cf_shash_lock(h, hash); + cf_shash_ele *e = cf_shash_get_bucket(h, hash); + + // Most common case should be insert into empty bucket. + if (! e->in_use) { + cf_shash_fill_element(e, h, key, value); + cf_shash_unlock(l); + return CF_SHASH_OK; + } + + cf_shash_ele *e_head = e; + + while (e) { + if (memcmp(ELE_KEY(h, e), key, h->key_size) == 0) { + cf_shash_unlock(l); + return CF_SHASH_ERR_FOUND; + } + + e = e->next; + } + + e = (cf_shash_ele *)cf_malloc(h->ele_size); + + cf_shash_fill_element(e, h, key, value); + + // Insert just after head. + e->next = e_head->next; + e_head->next = e; + + cf_shash_unlock(l); + + return CF_SHASH_OK; +} + +// FIXME - replace with cf_shash_put_unique_or_get_vlock()? +void +cf_shash_update(cf_shash *h, const void *key, void *value_old, void *value_new, + cf_shash_update_fn update_fn, void *udata) +{ + cf_assert(h && key && update_fn, CF_MISC, "bad param"); + + uint32_t hash = cf_shash_calculate_hash(h, key); + pthread_mutex_t *l = cf_shash_lock(h, hash); + cf_shash_ele *e = cf_shash_get_bucket(h, hash); + + // Insert new value into empty bucket. + if (! 
e->in_use) { + (update_fn)(key, NULL, value_new, udata); + cf_shash_fill_element(e, h, key, value_new); + cf_shash_unlock(l); + return; + } + + cf_shash_ele *e_head = e; + + while (e) { + if (memcmp(ELE_KEY(h, e), key, h->key_size) == 0) { + if (value_old) { + memcpy(value_old, ELE_VALUE(h, e), h->value_size); + } + + (update_fn)(key, value_old, value_new, udata); + + memcpy(ELE_VALUE(h, e), value_new, h->value_size); + cf_shash_unlock(l); + + return; + } + + e = e->next; + } + + (update_fn)(key, NULL, value_new, udata); + + e = (cf_shash_ele *)cf_malloc(h->ele_size); + + cf_shash_fill_element(e, h, key, value_new); + + // Insert just after head. + e->next = e_head->next; + e_head->next = e; + + cf_shash_unlock(l); +} + +int +cf_shash_get(cf_shash *h, const void *key, void *value) +{ + cf_assert(h && key, CF_MISC, "bad param"); + + uint32_t hash = cf_shash_calculate_hash(h, key); + pthread_mutex_t *l = cf_shash_lock(h, hash); + cf_shash_ele *e = cf_shash_get_bucket(h, hash); + + while (e && e->in_use) { + if (memcmp(ELE_KEY(h, e), key, h->key_size) == 0) { + if (value) { + memcpy(value, ELE_VALUE(h, e), h->value_size); + } + + cf_shash_unlock(l); + return CF_SHASH_OK; + } + + e = e->next; + } + + cf_shash_unlock(l); + + return CF_SHASH_ERR_NOT_FOUND; +} + +int +cf_shash_get_vlock(cf_shash *h, const void *key, void **value_r, + pthread_mutex_t **vlock_r) +{ + cf_assert(h && key && value_r && vlock_r, CF_MISC, "bad param"); + + uint32_t hash = cf_shash_calculate_hash(h, key); + pthread_mutex_t *l = cf_shash_lock(h, hash); + cf_shash_ele *e = cf_shash_get_bucket(h, hash); + + while (e && e->in_use) { + if (memcmp(ELE_KEY(h, e), key, h->key_size) == 0) { + *value_r = ELE_VALUE(h, e); + *vlock_r = l; + return CF_SHASH_OK; + } + + e = e->next; + } + + cf_shash_unlock(l); + + return CF_SHASH_ERR_NOT_FOUND; +} + +int +cf_shash_delete(cf_shash *h, const void *key) +{ + return cf_shash_delete_or_pop(h, key, NULL); +} + +int +cf_shash_delete_lockfree(cf_shash *h, const void 
*key) +{ + cf_assert(h && key, CF_MISC, "bad param"); + + uint32_t hash = cf_shash_calculate_hash(h, key); + cf_shash_ele *e = cf_shash_get_bucket(h, hash); + + cf_shash_ele *e_prev = NULL; + + // Look for the element, remove and release if found. + while (e && e->in_use) { + if (memcmp(ELE_KEY(h, e), key, h->key_size) != 0) { + e_prev = e; + e = e->next; + continue; + } + // else - found it, remove from hash, free (if needed). + + // If not at head, patch pointers and free element. + if (e_prev) { + e_prev->next = e->next; + cf_free(e); + } + // If at head with no next, empty head. + else if (! e->next) { + e->in_use = false; + } + // If at head with a next, copy next into head and free next. + else { + cf_shash_ele *free_e = e->next; + + memcpy(e, e->next, h->ele_size); + cf_free(free_e); + } + + cf_shash_size_decr(h); + + return CF_SHASH_OK; + } + + return CF_SHASH_ERR_NOT_FOUND; +} + +// TODO - Rename to cf_shash_pop()? +int +cf_shash_get_and_delete(cf_shash *h, const void *key, void *value) +{ + cf_assert(value, CF_MISC, "bad param"); + + return cf_shash_delete_or_pop(h, key, value); +} + +void +cf_shash_delete_all(cf_shash *h) +{ + cf_assert(h, CF_MISC, "bad param"); + + if ((h->flags & CF_SHASH_BIG_LOCK) != 0) { + pthread_mutex_lock(&h->big_lock); + } + + uint8_t *bucket = (uint8_t*)h->table; + + for (uint32_t i = 0; i < h->n_buckets; i++) { + pthread_mutex_t *bucket_lock = NULL; + + if ((h->flags & CF_SHASH_MANY_LOCK) != 0) { + bucket_lock = &h->bucket_locks[i]; + pthread_mutex_lock(bucket_lock); + } + + cf_shash_ele *e = ((cf_shash_ele *)bucket)->next; + + while (e) { + cf_shash_ele *temp = e->next; + + cf_free(e); + e = temp; + + cf_shash_size_decr(h); + } + + if (((cf_shash_ele *)bucket)->in_use) { + ((cf_shash_ele *)bucket)->in_use = false; + ((cf_shash_ele *)bucket)->next = NULL; + + cf_shash_size_decr(h); + } + + if (bucket_lock) { + pthread_mutex_unlock(bucket_lock); + } + + bucket += h->ele_size; + } + + if ((h->flags & CF_SHASH_BIG_LOCK) != 0) { + 
pthread_mutex_unlock(&h->big_lock); + } +} + +int +cf_shash_reduce(cf_shash *h, cf_shash_reduce_fn reduce_fn, void *udata) +{ + cf_assert(h && reduce_fn, CF_MISC, "bad param"); + + if ((h->flags & CF_SHASH_BIG_LOCK) != 0) { + pthread_mutex_lock(&h->big_lock); + } + + uint8_t *bucket = (uint8_t*)h->table; + + for (uint32_t i = 0; i < h->n_buckets; i++) { + pthread_mutex_t *bucket_lock = NULL; + + if ((h->flags & CF_SHASH_MANY_LOCK) != 0) { + bucket_lock = &h->bucket_locks[i]; + pthread_mutex_lock(bucket_lock); + } + + cf_shash_ele *e = (cf_shash_ele *)bucket; + cf_shash_ele *e_prev = NULL; + + while (e && e->in_use) { + int rv = reduce_fn(ELE_KEY(h, e), ELE_VALUE(h, e), udata); + + if (rv == CF_SHASH_OK) { + // Caller says keep going - most common case. + + e_prev = e; + e = e->next; + } + else if (rv == CF_SHASH_REDUCE_DELETE) { + // Caller says delete this element and keep going. + + // If not at head, patch pointers and free element. + if (e_prev) { + e_prev->next = e->next; + cf_free(e); + e = e_prev->next; + } + // If at head with no next, empty head. + else if (! e->next) { + e->in_use = false; + } + // If at head with a next, copy next into head and free next. + else { + cf_shash_ele *free_e = e->next; + + memcpy(e, e->next, h->ele_size); + cf_free(free_e); + } + + cf_shash_size_decr(h); + } + else { + // Caller says stop iterating. + + if (bucket_lock) { + pthread_mutex_unlock(bucket_lock); + } + + if ((h->flags & CF_SHASH_BIG_LOCK) != 0) { + pthread_mutex_unlock(&h->big_lock); + } + + return rv; + } + } + + if (bucket_lock) { + pthread_mutex_unlock(bucket_lock); + } + + bucket += h->ele_size; + } + + if ((h->flags & CF_SHASH_BIG_LOCK) != 0) { + pthread_mutex_unlock(&h->big_lock); + } + + return CF_SHASH_OK; +} + + +//========================================================== +// Local helpers. 
+// + +static inline void +cf_shash_clear_table(cf_shash *h) +{ + uint8_t *bucket = (uint8_t*)h->table; + uint8_t *end = bucket + (h->n_buckets * h->ele_size); + + while (bucket < end) { + ((cf_shash_ele *)bucket)->next = NULL; + ((cf_shash_ele *)bucket)->in_use = false; + bucket += h->ele_size; + } +} + +static inline void +cf_shash_destroy_elements(cf_shash *h) +{ + uint8_t *bucket = (uint8_t*)h->table; + uint8_t *end = bucket + (h->n_buckets * h->ele_size); + + while (bucket < end) { + cf_shash_ele *e = ((cf_shash_ele *)bucket)->next; + + while (e) { + cf_shash_ele *temp = e->next; + + cf_free(e); + e = temp; + } + + bucket += h->ele_size; + } +} + +static inline uint32_t +cf_shash_calculate_hash(cf_shash *h, const void *key) +{ + return h->h_fn(key) % h->n_buckets; +} + +static inline pthread_mutex_t * +cf_shash_lock(cf_shash *h, uint32_t i) +{ + pthread_mutex_t *l = NULL; + + if ((h->flags & CF_SHASH_BIG_LOCK) != 0) { + l = &h->big_lock; + } + else if ((h->flags & CF_SHASH_MANY_LOCK) != 0) { + l = &h->bucket_locks[i]; + } + + if (l) { + pthread_mutex_lock(l); + } + + return l; +} + +static inline void +cf_shash_unlock(pthread_mutex_t *l) +{ + if (l) { + pthread_mutex_unlock(l); + } +} + +static inline cf_shash_ele * +cf_shash_get_bucket(cf_shash *h, uint32_t i) +{ + return (cf_shash_ele *)((uint8_t *)h->table + (h->ele_size * i)); +} + +static inline void +cf_shash_fill_element(cf_shash_ele *e, cf_shash *h, const void *key, + const void *value) +{ + memcpy(ELE_KEY(h, e), key, h->key_size); + memcpy(ELE_VALUE(h, e), value, h->value_size); + e->in_use = true; + cf_shash_size_incr(h); +} + +static inline void +cf_shash_size_incr(cf_shash *h) +{ + // For now, not bothering with different methods per lock mode. + cf_atomic32_incr(&h->n_elements); +} + +static inline void +cf_shash_size_decr(cf_shash *h) +{ + // For now, not bothering with different methods per lock mode. 
+ cf_atomic32_decr(&h->n_elements); +} + +int +cf_shash_delete_or_pop(cf_shash *h, const void *key, void *value) +{ + cf_assert(h && key, CF_MISC, "bad param"); + + uint32_t hash = cf_shash_calculate_hash(h, key); + pthread_mutex_t *l = cf_shash_lock(h, hash); + cf_shash_ele *e = cf_shash_get_bucket(h, hash); + + cf_shash_ele *e_prev = NULL; + + // Look for the element, remove and release if found. + while (e && e->in_use) { + if (memcmp(ELE_KEY(h, e), key, h->key_size) != 0) { + e_prev = e; + e = e->next; + continue; + } + // else - found it, remove from hash, free (if needed) outside lock. + + // Return value. + if (value) { + memcpy(value, ELE_VALUE(h, e), h->value_size); + } + + // Save pointer to free. + cf_shash_ele *free_e = NULL; + + // If not at head, patch pointers and free element. + if (e_prev) { + e_prev->next = e->next; + free_e = e; + } + // If at head with no next, empty head. + else if (! e->next) { + e->in_use = false; + } + // If at head with a next, copy next into head and free next. + else { + free_e = e->next; + memcpy(e, e->next, h->ele_size); + } + + cf_shash_size_decr(h); + cf_shash_unlock(l); + + if (free_e) { + cf_free(free_e); + } + + return CF_SHASH_OK; + } + + cf_shash_unlock(l); + + return CF_SHASH_ERR_NOT_FOUND; +} diff --git a/cf/src/socket.c b/cf/src/socket.c new file mode 100644 index 00000000..b48564f8 --- /dev/null +++ b/cf/src/socket.c @@ -0,0 +1,2551 @@ +/* + * socket.c + * + * Copyright (C) 2008-2017 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. 
+ * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +#define CF_SOCKET_PRIVATE +#include "socket.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "fault.h" +#include "tls.h" + +#include "citrusleaf/alloc.h" +#include "citrusleaf/cf_digest.h" + +void +cf_ip_addr_to_string_safe(const cf_ip_addr *addr, char *string, size_t size) +{ + if (cf_ip_addr_to_string(addr, string, size) < 0) { + cf_crash(CF_SOCKET, "String buffer overflow"); + } +} + +int32_t +cf_ip_addr_to_string_multi(const cf_ip_addr *addrs, uint32_t n_addrs, char *string, size_t size) +{ + size_t off = 0; + + for (uint32_t i = 0; i < n_addrs; ++i) { + if (i > 0) { + if (off >= size) { + cf_warning(CF_SOCKET, "Output buffer overflow"); + return -1; + } + + string[off] = ','; + ++off; + } + + int32_t len = cf_ip_addr_to_string(&addrs[i], string + off, size - off); + + if (len < 0) { + return -1; + } + + off += len; + } + + if (off >= size) { + cf_warning(CF_SOCKET, "Output buffer overflow"); + return -1; + } + + string[off] = 0; + return off; +} + +void +cf_ip_addr_to_string_multi_safe(const cf_ip_addr *addrs, uint32_t n_addrs, char *string, + size_t size) +{ + if (cf_ip_addr_to_string_multi(addrs, n_addrs, string, size) < 0) { + cf_crash(CF_SOCKET, "String buffer overflow"); + } +} + +int32_t +cf_ip_addr_from_string(const char *string, cf_ip_addr *addr) +{ + cf_ip_addr addrs[CF_SOCK_CFG_MAX]; + uint32_t n_addrs = CF_SOCK_CFG_MAX; + + if 
(cf_ip_addr_from_string_multi(string, addrs, &n_addrs) < 0) { + return -1; + } + + cf_ip_addr_copy(&addrs[0], addr); + return 0; +} + +void +cf_ip_addr_sort(cf_ip_addr *addrs, uint32_t n_addrs) +{ + int32_t n = n_addrs; + bool swapped; + + do { + swapped = false; + + for (int32_t i = 0; i < n - 1; ++i) { + if (cf_ip_addr_compare(&addrs[i], &addrs[i + 1]) < 0) { + cf_ip_addr tmp; + cf_ip_addr_copy(&addrs[i], &tmp); + cf_ip_addr_copy(&addrs[i + 1], &addrs[i]); + cf_ip_addr_copy(&tmp, &addrs[i + 1]); + swapped = true; + } + } + + --n; + } + while (swapped); +} + +static int32_t +validate_dns_label(const char *label) +{ + int32_t i; + + for (i = 0; label[i] != 0 && label[i] != '.'; ++i) { + bool ok = (label[i] >= '0' && label[i] <= '9') || + (label[i] >= 'a' && label[i] <= 'z') || + (label[i] >= 'A' && label[i] <= 'Z') || + label[i] == '-'; + + if (!ok) { + return -1; + } + } + + if (i == 0) { + return -1; + } + + return i; +} + +bool +cf_ip_addr_is_dns_name(const char *string) +{ + if (cf_inter_is_inter_name(string)) { + return false; + } + + if (string[0] >= '0' && string[0] <= '9') { + return false; + } + + int32_t n_labels = 0; + int32_t i = 0; + + while (string[i] != 0) { + int32_t len = validate_dns_label(string + i); + + if (len < 0) { + return false; + } + + i += len; + ++n_labels; + + if (string[i] == '.') { + ++i; + } + } + + return n_labels > 1; +} + +int32_t +cf_ip_port_from_string(const char *string, cf_ip_port *port) +{ + char *end; + uint64_t tmp = strtoul(string, &end, 10); + + if (*end != 0 || tmp > 65535) { + cf_warning(CF_SOCKET, "Invalid port '%s'", string); + return -1; + } + + *port = (cf_ip_port)tmp; + return 0; +} + +int32_t +cf_ip_port_to_string(cf_ip_port port, char *string, size_t size) +{ + int32_t count = snprintf(string, size, "%hu", port); + + if ((size_t)count >= size) { + cf_warning(CF_SOCKET, "Output buffer overflow"); + return -1; + } + + return count; +} + +void +cf_ip_port_to_string_safe(cf_ip_port port, char *string, size_t size) 
+{ + if (cf_ip_port_to_string(port, string, size) < 0) { + cf_crash(CF_SOCKET, "String buffer overflow"); + } +} + +int32_t +cf_ip_port_from_binary(const uint8_t *binary, size_t size, cf_ip_port *port) +{ + if (size < 2) { + cf_warning(CF_SOCKET, "Input buffer underflow"); + return -1; + } + + *port = (binary[0] << 8) | binary[1]; + return 2; +} + +int32_t +cf_ip_port_to_binary(cf_ip_port port, uint8_t *binary, size_t size) +{ + if (size < 2) { + cf_warning(CF_SOCKET, "Output buffer overflow"); + return -1; + } + + binary[0] = port >> 8; + binary[1] = port & 255; + return 2; +} + +void +cf_ip_port_from_node_id(cf_node id, cf_ip_port *port) +{ + uint8_t *buff = (uint8_t *)&id; + memcpy(port, buff + 6, 2); +} + +void +cf_sock_addr_to_string_safe(const cf_sock_addr *addr, char *string, size_t size) +{ + if (cf_sock_addr_to_string(addr, string, size) < 0) { + cf_crash(CF_SOCKET, "String buffer overflow"); + } +} + +int32_t +cf_sock_addr_from_binary(const uint8_t *binary, size_t size, cf_sock_addr *addr) +{ + int32_t total = 0; + int32_t count = cf_ip_addr_from_binary(binary, size, &addr->addr); + + if (count < 0) { + return -1; + } + + total += count; + count = cf_ip_port_from_binary(binary + total, size - total, &addr->port); + + if (count < 0) { + return -1; + } + + total += count; + return total; +} + +int32_t +cf_sock_addr_to_binary(const cf_sock_addr *addr, uint8_t *binary, size_t size) +{ + int32_t total = 0; + int32_t count = cf_ip_addr_to_binary(&addr->addr, binary, size); + + if (count < 0) { + return -1; + } + + total += count; + count = cf_ip_port_to_binary(addr->port, binary + total, size - total); + + if (count < 0) { + return -1; + } + + total += count; + return total; +} + +int32_t +cf_sock_addr_from_host_port(const char *host, cf_ip_port port, cf_sock_addr *addr) +{ + if (cf_ip_addr_from_string(host, &addr->addr) < 0) { + cf_warning(CF_SOCKET, "Invalid host address '%s'", host); + return -1; + } + + addr->port = port; + return 0; +} + +void 
+cf_sock_addr_from_addr_port(const cf_ip_addr *ip_addr, cf_ip_port port, cf_sock_addr *addr) +{ + cf_ip_addr_copy(ip_addr, &addr->addr); + addr->port = port; +} + +int32_t +cf_sock_addr_compare(const cf_sock_addr *lhs, const cf_sock_addr *rhs) +{ + int32_t res = cf_ip_addr_compare(&lhs->addr, &rhs->addr); + + if (res != 0) { + return res; + } + + if (lhs->port == rhs->port) { + return 0; + } + + return (int32_t)lhs->port - (int32_t)rhs->port; +} + +void +cf_sock_addr_copy(const cf_sock_addr *from, cf_sock_addr *to) +{ + cf_ip_addr_copy(&from->addr, &to->addr); + to->port = from->port; +} + +void +cf_sock_addr_set_any(cf_sock_addr *addr) +{ + cf_ip_addr_set_any(&addr->addr); + addr->port = 0; +} + +bool +cf_sock_addr_is_any(const cf_sock_addr *addr) +{ + return cf_ip_addr_is_any(&addr->addr) && addr->port == 0; +} + +void +cf_sock_cfg_init(cf_sock_cfg *cfg, cf_sock_owner owner) +{ + cfg->owner = owner; + cfg->port = 0; + cf_ip_addr_set_any(&cfg->addr); +} + +void +cf_sock_cfg_copy(const cf_sock_cfg *from, cf_sock_cfg *to) +{ + to->owner = from->owner; + to->port = from->port; + cf_ip_addr_copy(&from->addr, &to->addr); +} + +void +cf_serv_cfg_init(cf_serv_cfg *cfg) +{ + cfg->n_cfgs = 0; +} + +int32_t +cf_serv_cfg_add_sock_cfg(cf_serv_cfg *serv_cfg, const cf_sock_cfg *sock_cfg) +{ + if (serv_cfg->n_cfgs >= CF_SOCK_CFG_MAX) { + cf_warning(CF_SOCKET, "Too many socket configurations in server configuration"); + return -1; + } + + uint32_t n = serv_cfg->n_cfgs; + + for (uint32_t i = 0; i < n; ++i) { + cf_sock_cfg *walker = &serv_cfg->cfgs[i]; + + if (walker->owner == sock_cfg->owner && walker->port == sock_cfg->port && + cf_ip_addr_compare(&walker->addr, &sock_cfg->addr) == 0) { + return 0; + } + } + + cf_sock_cfg_copy(sock_cfg, &serv_cfg->cfgs[n]); + serv_cfg->n_cfgs = ++n; + return 0; +} + +void +cf_sockets_init(cf_sockets *socks) +{ + socks->n_socks = 0; +} + +bool +cf_sockets_has_socket(const cf_sockets *socks, const cf_socket *sock) +{ + return socks != NULL && sock 
>= &socks->socks[0] && sock < &socks->socks[socks->n_socks]; +} + +void +cf_sockets_close(cf_sockets *socks) +{ + for (uint32_t i = 0; i < socks->n_socks; ++i) { + cf_socket_close(&socks->socks[i]); + cf_socket_term(&socks->socks[i]); + } +} + +static int32_t +safe_fcntl(int32_t fd, int32_t cmd, int32_t arg) +{ + int32_t res = fcntl(fd, cmd, arg); + + if (res < 0) { + cf_crash(CF_SOCKET, "fcntl(%d) failed on FD %d: %d (%s)", + cmd, fd, errno, cf_strerror(errno)); + } + + return res; +} + +static int32_t +safe_ioctl(int32_t fd, int32_t req, int32_t *arg) +{ + int32_t res = ioctl(fd, req, arg); + + if (res < 0) { + cf_crash(CF_SOCKET, "ioctl(%d) failed on FD %d: %d (%s)", + req, fd, errno, cf_strerror(errno)); + } + + return res; +} + +static void +safe_setsockopt(int32_t fd, int32_t level, int32_t name, const void *val, socklen_t len) +{ + if (setsockopt(fd, level, name, val, len) < 0) { + cf_crash(CF_SOCKET, "setsockopt(%d¸ %d) failed on FD %d: %d (%s)", + level, name, fd, errno, cf_strerror(errno)); + } +} + +static void +safe_getsockopt(int32_t fd, int32_t level, int32_t name, void *val, socklen_t *len) +{ + if (getsockopt(fd, level, name, val, len) < 0) { + cf_crash(CF_SOCKET, "getsockopt(%d, %d) failed on FD %d: %d (%s)", + level, name, fd, errno, cf_strerror(errno)); + } +} + +static int32_t +safe_wait(int32_t efd, struct epoll_event *events, int32_t max, int32_t timeout) +{ + while (true) { + cf_debug(CF_SOCKET, "Waiting on epoll FD %d", efd); + int32_t count = epoll_wait(efd, events, max, timeout); + + if (count < 0) { + if (errno == EINTR) { + cf_debug(CF_SOCKET, "Interrupted"); + continue; + } + + cf_crash(CF_SOCKET, "epoll_wait() failed on epoll FD %d: %d (%s)", + efd, errno, cf_strerror(errno)); + } + + return count; + } +} + +static void +safe_close(int32_t fd) +{ + if (close(fd) < 0) { + cf_crash(CF_SOCKET, "Error while closing FD %d: %d (%s)", + fd, errno, cf_strerror(errno)); + } +} + +void +cf_fd_disable_blocking(int32_t fd) +{ + int32_t flags = 
// Creates, binds and starts listening on one TCP socket per socket
// configuration in cfg.
//
// cfg   - the server configuration; needs at least one socket configuration,
//         each with a non-zero port.
// socks - receives the listening sockets; socks->n_socks is only set on
//         success.
//
// Returns 0 on success and -1 on failure. On failure, all sockets that were
// opened by this call are closed again.
int32_t
cf_socket_init_server(cf_serv_cfg *cfg, cf_sockets *socks)
{
	int32_t res = -1;

	if (cfg->n_cfgs < 1) {
		cf_warning(CF_SOCKET, "Missing service socket configuration");
		goto cleanup0;
	}

	// NOTE(review): defined elsewhere; presumably adjusts the bind addresses in
	// cfg before they are used below - confirm.
	cf_socket_fix_bind(cfg);

	cf_debug(CF_SOCKET, "Initializing %u server socket(s)", cfg->n_cfgs);
	// Declared outside of the loop, because the cleanup code below needs them:
	// n counts the fully initialized sockets, sock is the one being set up.
	uint32_t n;
	cf_socket *sock;

	for (n = 0; n < cfg->n_cfgs; ++n) {
		sock = &socks->socks[n];

		if (cfg->cfgs[n].port == 0) {
			cf_warning(CF_SOCKET, "Missing service port");
			goto cleanup1;
		}

		cf_sock_addr addr;
		cf_sock_addr_from_addr_port(&cfg->cfgs[n].addr, cfg->cfgs[n].port, &addr);

		struct sockaddr_storage sas;
		cf_sock_addr_to_native(&addr, (struct sockaddr *)&sas);

		cf_debug(CF_SOCKET, "Initializing server for %s", cf_sock_addr_print(&addr));
		int32_t fd = socket(sas.ss_family, SOCK_STREAM, 0);

		if (fd < 0) {
			cf_warning(CF_SOCKET, "Error while creating socket for %s: %d (%s)",
					cf_sock_addr_print(&addr), errno, cf_strerror(errno));
			// Nothing was opened for slot n, so only slots 0 .. n - 1 need cleanup.
			goto cleanup1;
		}

		// From here on, the FD is owned by sock.
		cf_socket_init(sock);
		sock->fd = fd;
		fd = -1;

		cf_socket_fix_server(sock);
		cf_socket_disable_blocking(sock);

		// No Nagle here. It will be disabled for the accepted connections.

		static const int32_t flag = 1;
		safe_setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR, &flag, sizeof(flag));

		// Retry every 5 seconds for as long as the address is in use. Any other
		// bind() error is a failure.
		while (bind(sock->fd, (struct sockaddr *)&sas,
				cf_socket_addr_len((struct sockaddr *)&sas)) < 0) {
			if (errno != EADDRINUSE) {
				cf_warning(CF_SOCKET, "Error while binding to %s: %d (%s)",
						cf_sock_addr_print(&addr), errno, cf_strerror(errno));
				goto cleanup2;
			}

			cf_warning(CF_SOCKET, "Socket %s in use, waiting", cf_sock_addr_print(&addr));
			usleep(5 * 1000 * 1000);
		}

		if (listen(sock->fd, 512) < 0) {
			cf_warning(CF_SOCKET, "Error while listening on %s: %d (%s)",
					cf_sock_addr_print(&addr), errno, cf_strerror(errno));
			goto cleanup2;
		}

		// Remember which configuration this socket belongs to.
		sock->cfg = &cfg->cfgs[n];
	}

	socks->n_socks = n;
	res = 0;
	goto cleanup0;

// Closes the socket in slot n, which was opened, but not completely set up.
cleanup2:
	cf_socket_close(sock);
	cf_socket_term(sock);

// Closes the completely set up sockets in slots 0 .. n - 1.
cleanup1:
	for (uint32_t i = 0; i < n; ++i) {
		cf_socket_close(&socks->socks[i]);
		cf_socket_term(&socks->socks[i]);
	}

cleanup0:
	return res;
}
(efd < 0) { + cf_crash(CF_SOCKET, "epoll_create() failed: %d (%s)", errno, cf_strerror(errno)); + } + + struct epoll_event event = { .data.fd = sock->fd, .events = EPOLLOUT }; + + if (epoll_ctl(efd, EPOLL_CTL_ADD, sock->fd, &event) < 0) { + cf_crash(CF_SOCKET, "epoll_ctl() failed for FD %d: %d (%s)", + sock->fd, errno, cf_strerror(errno)); + } + + int32_t count = safe_wait(efd, &event, 1, timeout); + + if (count == 0) { + cf_ticker_warning(CF_SOCKET, "Timeout while connecting"); + goto cleanup1; + } + + int32_t err; + socklen_t err_len = sizeof(err); + safe_getsockopt(sock->fd, SOL_SOCKET, SO_ERROR, &err, &err_len); + + if (err != 0) { + cf_ticker_warning(CF_SOCKET, "Error while connecting: %d (%s)", err, cf_strerror(err)); + goto cleanup1; + } + + cf_debug(CF_SOCKET, "FD %d connected [2]", sock->fd); + res = 0; + +cleanup1: + if (epoll_ctl(efd, EPOLL_CTL_DEL, sock->fd, NULL) < 0) { + cf_crash(CF_SOCKET, "epoll_ctl() failed for FD %d: %d (%s)", + sock->fd, errno, cf_strerror(errno)); + } + + safe_close(efd); + +cleanup0: + return res; +} + +int32_t +cf_socket_init_client(cf_sock_cfg *cfg, int32_t timeout, cf_socket *sock) +{ + int32_t res = -1; + + if (cf_ip_addr_is_any(&cfg->addr)) { + cf_warning(CF_SOCKET, "Missing IP address"); + goto cleanup0; + } + + if (cfg->port == 0) { + cf_warning(CF_SOCKET, "Missing port"); + goto cleanup0; + } + + cf_sock_addr addr; + cf_sock_addr_from_addr_port(&cfg->addr, cfg->port, &addr); + + struct sockaddr_storage sas; + cf_sock_addr_to_native(&addr, (struct sockaddr *)&sas); + + cf_debug(CF_SOCKET, "Initializing client for %s", cf_sock_addr_print(&addr)); + int32_t fd = socket(sas.ss_family, SOCK_STREAM, 0); + + if (fd < 0) { + cf_warning(CF_SOCKET, "Error while creating socket for %s: %d (%s)", + cf_sock_addr_print(&addr), errno, cf_strerror(errno)); + goto cleanup0; + } + + cf_socket_init(sock); + sock->fd = fd; + fd = -1; + + cf_socket_fix_client(sock); + cf_socket_disable_blocking(sock); + cf_socket_disable_nagle(sock); + + if 
(connect_socket(sock, (struct sockaddr *)&sas, timeout) < 0) { + cf_ticker_warning(CF_SOCKET, "Error while connecting socket to %s", + cf_sock_addr_print(&addr)); + goto cleanup1; + } + + sock->cfg = cfg; + res = 0; + goto cleanup0; + +cleanup1: + cf_socket_close(sock); + cf_socket_term(sock); + +cleanup0: + return res; +} + +int32_t +cf_socket_accept(cf_socket *lsock, cf_socket *sock, cf_sock_addr *addr) +{ + int32_t res = -1; + + struct sockaddr_storage sas; + struct sockaddr *sa = NULL; + socklen_t sa_len = 0; + + if (addr != NULL) { + sa = (struct sockaddr *)&sas; + sa_len = sizeof(sas); + } + + int32_t fd = accept(lsock->fd, sa, &sa_len); + + if (fd < 0) { + cf_debug(CF_SOCKET, "Error while accepting from FD %d: %d (%s)", + lsock->fd, errno, cf_strerror(errno)); + goto cleanup0; + } + + if (addr != NULL) { + cf_sock_addr_from_native(sa, addr); + } + + cf_socket_init(sock); + sock->fd = fd; + fd = -1; + + cf_socket_disable_blocking(sock); + cf_socket_disable_nagle(sock); + + sock->cfg = lsock->cfg; + res = 0; + +cleanup0: + return res; +} + +typedef int32_t (*name_func)(int32_t fd, struct sockaddr *sa, socklen_t *sa_len); + +static int32_t +x_name(name_func func, const char *which, int32_t fd, cf_sock_addr *addr) +{ + struct sockaddr_storage sas; + socklen_t sas_len = sizeof(sas); + + if (func(fd, (struct sockaddr *)&sas, &sas_len) < 0) { + cf_warning(CF_SOCKET, "Error while getting %s name: %d (%s)", + which, errno, cf_strerror(errno)); + return -1; + } + + cf_sock_addr_from_native((struct sockaddr *)&sas, addr); + return 0; +} + +int32_t +cf_socket_remote_name(const cf_socket *sock, cf_sock_addr *addr) +{ + return x_name(getpeername, "remote", sock->fd, addr); +} + +int32_t +cf_socket_local_name(const cf_socket *sock, cf_sock_addr *addr) +{ + return x_name(getsockname, "local", sock->fd, addr); +} + +int32_t +cf_socket_available(cf_socket *sock) +{ + int32_t size; + safe_ioctl(sock->fd, FIONREAD, &size); + + size += tls_socket_pending(sock); + + return size; 
+} + +int32_t +cf_socket_send_to(cf_socket *sock, const void *buff, size_t size, int32_t flags, const cf_sock_addr *addr) +{ + cf_assert(sock->ssl == NULL, CF_SOCKET, "cannot use cf_socket_send_to() with TLS"); + + struct sockaddr_storage sas; + struct sockaddr *sa = NULL; + socklen_t sa_len = 0; + + if (addr != NULL) { + cf_sock_addr_to_native(addr, (struct sockaddr *)&sas); + sa = (struct sockaddr *)&sas; + sa_len = cf_socket_addr_len((struct sockaddr *)&sas); + } + + int32_t res = sendto(sock->fd, buff, size, flags | MSG_NOSIGNAL, sa, sa_len); + + if (res < 0) { + cf_debug(CF_SOCKET, "Error while sending on FD %d: %d (%s)", + sock->fd, errno, cf_strerror(errno)); + } + + return res; +} + +int32_t +cf_socket_send(cf_socket *sock, const void *buff, size_t size, int32_t flags) +{ + if (sock->ssl) { + ssize_t rv = tls_socket_send(sock, buff, size, flags, 0); + if (rv < 0) { + // errno is set by tls_socket_send. + if (errno == ETIMEDOUT) { + errno = EAGAIN; + } + return -1; + } + else { + // This might be a partial return. 
// Receives data from a socket, with or without TLS.
//
// sock  - the socket to receive from.
// buff  - receives the data.
// size  - capacity of buff in bytes.
// flags - receive flags, passed through to the underlying receive function.
//
// With TLS, returns the result of tls_socket_recv(), called with a timeout
// argument of 0, or -1 on error; in the error case, an errno of ETIMEDOUT is
// mapped to EAGAIN. Without TLS, returns the result of cf_socket_recv_from().
int32_t
cf_socket_recv(cf_socket *sock, void *buff, size_t size, int32_t flags)
{
	if (sock->ssl) {
		ssize_t rv = tls_socket_recv(sock, buff, size, flags, 0);
		if (rv < 0) {
			// errno is set by tls_socket_recv.
			if (errno == ETIMEDOUT) {
				errno = EAGAIN;
			}
			return -1;
		}
		else {
			// This might be a partial return.
			return rv;
		}
	}
	else {
		return cf_socket_recv_from(sock, buff, size, flags, NULL);
	}
}
+ cf_warning(CF_SOCKET, "Sent 0 bytes on FD %d", sock->fd); + errno = ENOTCONN; + return -1; + } + + off += count; + } + + cf_detail(CF_SOCKET, "Blocking send on FD %d complete", sock->fd); + return 0; +} + +int32_t +cf_socket_send_all(cf_socket *sock, const void *buff, size_t size, int32_t flags, + int32_t timeout) +{ + if (sock->ssl) { + return tls_socket_send(sock, buff, size, flags, timeout); + } + else { + return cf_socket_send_to_all(sock, buff, size, flags, NULL, timeout); + } +} + +int32_t +cf_socket_recv_from_all(cf_socket *sock, void *buffp, size_t size, int32_t flags, + cf_sock_addr *addr, int32_t timeout) +{ + cf_assert(sock->ssl == NULL, CF_SOCKET, "cannot use cf_socket_recv_from_all() with TLS"); + + uint8_t *buff = (uint8_t *) buffp; + cf_detail(CF_SOCKET, "Blocking receive on FD %d, size = %zu", sock->fd, size); + size_t off = 0; + + while (off < size) { + ssize_t count = cf_socket_recv_from(sock, buff + off, size - off, flags, addr); + + if (count < 0) { + if (errno == EAGAIN) { + cf_debug(CF_SOCKET, "FD %d is blocking", sock->fd); + + if (socket_wait(sock, POLLIN, timeout)) { + continue; + } + + cf_debug(CF_SOCKET, "Timeout during blocking receive on FD %d", sock->fd); + errno = ETIMEDOUT; + return -1; + } + + return -1; + } + + if (count == 0) { + errno = ENOTCONN; + return -1; + } + + off += count; + } + + cf_detail(CF_SOCKET, "Blocking receive on FD %d complete", sock->fd); + return 0; +} + +int32_t +cf_socket_recv_all(cf_socket *sock, void *buff, size_t size, int32_t flags, int32_t timeout) +{ + if (sock->ssl) { + return tls_socket_recv(sock, buff, size, flags, timeout); + } + else { + return cf_socket_recv_from_all(sock, buff, size, flags, NULL, timeout); + } +} + +static void +x_shutdown(cf_socket *sock, int32_t how) +{ + if (sock->ssl) { + tls_socket_shutdown(sock); + } + + if (shutdown(sock->fd, how) < 0) { + if (errno != ENOTCONN) { + cf_crash(CF_SOCKET, "shutdown() failed on FD %d: %d (%s)", + sock->fd, errno, cf_strerror(errno)); + } + 
else { + cf_debug(CF_SOCKET, "shutdown() on disconnected FD %d: %d (%s)", + sock->fd, errno, cf_strerror(errno)); + } + } +} + +void +cf_socket_write_shutdown(cf_socket *sock) +{ + cf_debug(CF_SOCKET, "Shutting down write channel of FD %d", sock->fd); + x_shutdown(sock, SHUT_WR); +} + +void +cf_socket_shutdown(cf_socket *sock) +{ + cf_debug(CF_SOCKET, "Shutting down FD %d", sock->fd); + x_shutdown(sock, SHUT_RDWR); +} + +void +cf_socket_close(cf_socket *sock) +{ + cf_debug(CF_SOCKET, "Closing FD %d", sock->fd); + tls_socket_close(sock); + safe_close(sock->fd); + sock->fd = -1; +} + +void +cf_socket_drain_close(cf_socket *sock) +{ + cf_debug(CF_SOCKET, "Draining and closing FD %d", sock->fd); + int32_t efd = epoll_create(1); + + if (efd < 0) { + cf_crash(CF_SOCKET, "epoll_create() failed: %d (%s)", errno, cf_strerror(errno)); + } + + struct epoll_event event = { .data.fd = sock->fd, .events = EPOLLRDHUP }; + + if (epoll_ctl(efd, EPOLL_CTL_ADD, sock->fd, &event) < 0) { + cf_crash(CF_SOCKET, "epoll_ctl() failed for FD %d: %d (%s)", + sock->fd, errno, cf_strerror(errno)); + } + + cf_socket_shutdown(sock); + int32_t count = safe_wait(efd, &event, 1, 5000); + + if (count == 0) { + cf_warning(CF_SOCKET, "Timeout while waiting for FD %d to drain", sock->fd); + goto cleanup1; + } + + cf_debug(CF_SOCKET, "FD %d drained", sock->fd); + +cleanup1: + if (epoll_ctl(efd, EPOLL_CTL_DEL, sock->fd, NULL) < 0) { + cf_crash(CF_SOCKET, "epoll_ctl() failed for FD %d: %d (%s)", + sock->fd, errno, cf_strerror(errno)); + } + + safe_close(efd); + cf_socket_close(sock); + cf_socket_term(sock); +} + +void +cf_socket_term(cf_socket *sock) +{ + tls_socket_term(sock); + sock->fd = -1; +} + +void +cf_msock_cfg_init(cf_msock_cfg *cfg, cf_sock_owner owner) +{ + cfg->owner = owner; + cfg->port = 0; + cf_ip_addr_set_any(&cfg->addr); + cf_ip_addr_set_any(&cfg->if_addr); + cfg->ttl = 0; +} + +void +cf_msock_cfg_copy(const cf_msock_cfg *from, cf_msock_cfg *to) +{ + to->owner = from->owner; + to->port = 
from->port; + cf_ip_addr_copy(&from->addr, &to->addr); + cf_ip_addr_copy(&from->if_addr, &to->if_addr); + to->ttl = from->ttl; +} + +void +cf_mserv_cfg_init(cf_mserv_cfg *cfg) +{ + cfg->n_cfgs = 0; +} + +int32_t +cf_mserv_cfg_add_msock_cfg(cf_mserv_cfg *serv_cfg, const cf_msock_cfg *sock_cfg) +{ + if (serv_cfg->n_cfgs >= CF_SOCK_CFG_MAX) { + cf_warning(CF_SOCKET, "Too many socket configurations in server configuration"); + return -1; + } + + uint32_t n = serv_cfg->n_cfgs; + + for (uint32_t i = 0; i < n; ++i) { + cf_msock_cfg *walker = &serv_cfg->cfgs[i]; + + if (walker->owner == sock_cfg->owner && walker->port == sock_cfg->port && + cf_ip_addr_compare(&walker->addr, &sock_cfg->addr) == 0 && + cf_ip_addr_compare(&walker->if_addr, &sock_cfg->if_addr) == 0 && + walker->ttl == sock_cfg->ttl) { + return 0; + } + } + + cf_msock_cfg_copy(sock_cfg, &serv_cfg->cfgs[n]); + serv_cfg->n_cfgs = ++n; + return 0; +} + +int32_t +cf_socket_mcast_init(cf_mserv_cfg *cfg, cf_sockets *socks) +{ + int32_t res = -1; + + if (cfg->n_cfgs < 1) { + cf_warning(CF_SOCKET, "Missing multicast socket configuration"); + goto cleanup0; + } + + cf_debug(CF_SOCKET, "Initializing %u multicast socket(s)", cfg->n_cfgs); + uint32_t n; + cf_socket *sock; + + for (n = 0; n < cfg->n_cfgs; ++n) { + sock = &socks->socks[n]; + + if (cfg->cfgs[n].port == 0) { + cf_warning(CF_SOCKET, "Missing multicast port"); + goto cleanup1; + } + + cf_sock_addr addr; + cf_sock_addr_from_addr_port(&cfg->cfgs[n].addr, cfg->cfgs[n].port, &addr); + + struct sockaddr_storage sas; + cf_sock_addr_to_native(&addr, (struct sockaddr *)&sas); + + cf_debug(CF_SOCKET, "Initializing multicast socket for %s", cf_sock_addr_print(&addr)); + int32_t fd = socket(sas.ss_family, SOCK_DGRAM, 0); + + if (fd < 0) { + cf_warning(CF_SOCKET, "Error while creating socket for %s: %d (%s)", + cf_sock_addr_print(&addr), errno, cf_strerror(errno)); + goto cleanup1; + } + + cf_socket_init(sock); + sock->fd = fd; + fd = -1; + + cf_socket_fix_client(sock); + 
cf_socket_fix_server(sock); + + static const int32_t yes = 1; + safe_setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)); + + if (!cf_ip_addr_is_any(&cfg->cfgs[n].if_addr)) { + cf_info(CF_SOCKET, "Setting multicast interface address: %s", + cf_ip_addr_print(&cfg->cfgs[n].if_addr)); + + if (cf_socket_mcast_set_inter(sock, &cfg->cfgs[n].if_addr) < 0) { + cf_warning(CF_SOCKET, "Error while binding to interface %s", + cf_ip_addr_print(&cfg->cfgs[n].if_addr)); + goto cleanup2; + } + } + + uint8_t ttl = cfg->cfgs[n].ttl; + + if (ttl > 0) { + cf_info(CF_SOCKET, "Setting multicast TTL: %d", ttl); + + if (cf_socket_mcast_set_ttl(sock, ttl) < 0) { + cf_warning(CF_SOCKET, "Error while setting multicast TTL"); + goto cleanup2; + } + } + + while (bind(sock->fd, (struct sockaddr *)&sas, + cf_socket_addr_len((struct sockaddr *)&sas)) < 0) { + if (errno != EADDRINUSE) { + cf_warning(CF_SOCKET, "Error while binding to %s: %d (%s)", + cf_sock_addr_print(&addr), errno, cf_strerror(errno)); + goto cleanup2; + } + + cf_warning(CF_SOCKET, "Socket %s in use, waiting", cf_sock_addr_print(&addr)); + usleep(5 * 1000 * 1000); + } + + cf_info(CF_SOCKET, "Joining multicast group: %s", cf_ip_addr_print(&addr.addr)); + + if (cf_socket_mcast_join_group(sock, &cfg->cfgs[n].if_addr, &addr.addr) < 0) { + cf_warning(CF_SOCKET, "Error while joining multicast group %s", + cf_ip_addr_print(&addr.addr)); + goto cleanup2; + } + + sock->cfg = &cfg->cfgs[n]; + } + + socks->n_socks = n; + res = 0; + goto cleanup0; + +cleanup2: + cf_socket_close(sock); + cf_socket_term(sock); + +cleanup1: + for (uint32_t i = 0; i < n; ++i) { + cf_socket_close(&socks->socks[i]); + cf_socket_term(&socks->socks[i]); + } + +cleanup0: + return res; +} + +void +cf_socket_mcast_show(cf_fault_context cont, const char *tag, const cf_sockets *socks) +{ + for (uint32_t i = 0; i < socks->n_socks; ++i) { + cf_msock_cfg *cfg = socks->socks[i].cfg; + cf_sock_addr addr; + cf_sock_addr_from_addr_port(&cfg->if_addr, cfg->port, 
&addr); + cf_info(cont, "Started %s endpoint %s", tag, cf_sock_addr_print(&addr)); + } +} + +// #define VERY_CHATTY + +void +cf_poll_create(cf_poll *poll) +{ + int32_t fd = epoll_create(1); + + if (fd < 0) { + cf_crash(CF_SOCKET, "Error while creating epoll instance: %d (%s)", + errno, cf_strerror(errno)); + } + + *poll = (cf_poll){ .fd = fd }; + cf_debug(CF_SOCKET, "Created new epoll instance with FD %d", fd); +} + +void +cf_poll_add_fd(cf_poll poll, int32_t fd, uint32_t events, void *data) +{ + cf_debug(CF_SOCKET, + "Adding FD %d to epoll instance with FD %d, events = 0x%x", + fd, poll.fd, events); + struct epoll_event ev = { .events = events, .data.ptr = data }; + + if (epoll_ctl(poll.fd, EPOLL_CTL_ADD, fd, &ev) < 0) { + cf_crash(CF_SOCKET, + "Error while adding FD %d to epoll instance %d: %d (%s)", + fd, poll.fd, errno, cf_strerror(errno)); + } +} + +void +cf_poll_add_socket(cf_poll poll, const cf_socket *sock, uint32_t events, void *data) +{ + cf_poll_add_fd(poll, sock->fd, events, data); +} + +int32_t +cf_poll_modify_socket_forgiving(cf_poll poll, const cf_socket *sock, uint32_t events, void *data, + uint32_t n_err_ok, int32_t *err_ok) +{ +#if defined VERY_CHATTY + cf_detail(CF_SOCKET, "Modifying FD %d in epoll instance with FD %d, events = 0x%x", + sock->fd, poll.fd, events); +#endif + + struct epoll_event ev = { .events = events, .data.ptr = data }; + + if (epoll_ctl(poll.fd, EPOLL_CTL_MOD, sock->fd, &ev) < 0) { + for (uint32_t i = 0; i < n_err_ok; ++i) { + if (errno == err_ok[i]) { + return errno; + } + } + + cf_crash(CF_SOCKET, "Error while modifying FD %d in epoll instance %d: %d (%s)", + sock->fd, poll.fd, errno, cf_strerror(errno)); + } + + return 0; +} + +int32_t +cf_poll_delete_socket_forgiving(cf_poll poll, const cf_socket *sock, uint32_t n_err_ok, + int32_t *err_ok) +{ + cf_detail(CF_SOCKET, "Deleting FD %d from epoll instance with FD %d", sock->fd, poll.fd); + + if (epoll_ctl(poll.fd, EPOLL_CTL_DEL, sock->fd, NULL) < 0) { + for (uint32_t i = 0; i 
< n_err_ok; ++i) { + if (errno == err_ok[i]) { + return errno; + } + } + + cf_crash(CF_SOCKET, "Error while deleting FD %d from epoll instance %d: %d (%s)", + sock->fd, poll.fd, errno, cf_strerror(errno)); + } + + return 0; +} + +void +cf_poll_add_sockets(cf_poll poll, cf_sockets *socks, uint32_t events) +{ + for (uint32_t i = 0; i < socks->n_socks; ++i) { + cf_poll_add_socket(poll, &socks->socks[i], events, &socks->socks[i]); + } +} + +void +cf_poll_delete_sockets(cf_poll poll, cf_sockets *socks) +{ + for (uint32_t i = 0; i < socks->n_socks; ++i) { + cf_poll_delete_socket(poll, &socks->socks[i]); + } +} + +int32_t +cf_poll_wait(cf_poll poll, cf_poll_event *events, int32_t limit, int32_t timeout) +{ +#if defined VERY_CHATTY + cf_detail(CF_SOCKET, "Waiting on epoll instance with FD %d", poll.fd); +#endif + + while (true) { + int32_t res = epoll_wait(poll.fd, (struct epoll_event *)events, limit, timeout); + + if (res >= 0) { +#if defined VERY_CHATTY + if (cf_fault_filter[CF_SOCKET] >= CF_DETAIL) { + cf_detail(CF_SOCKET, "Epoll instance with FD %d reports %d event(s)", poll.fd, res); + + for (int32_t i = 0; i < res; ++i) { + cf_detail(CF_SOCKET, "Event #%d: 0x%x, %p", + i, events[i].events, events[i].data); + } + } +#endif + + return res; + } + + if (errno != EINTR) { + cf_crash(CF_SOCKET, "Error while waiting for events on epoll instance %d: %d (%s)", + poll.fd, errno, cf_strerror(errno)); + } + } +} + +void +cf_poll_destroy(cf_poll poll) +{ + cf_debug(CF_SOCKET, "Destroying epoll instance with FD %d", poll.fd); + + if (close(poll.fd) < 0) { + cf_crash(CF_SOCKET, "Error while closing epoll instance: %d (%s)", + errno, cf_strerror(errno)); + } +} + +#define RESP_SIZE (2 * 1024 * 1024) +#define MAX_INTERS 500 +#define MAX_ADDRS 20 + +typedef struct inter_entry_s { + uint32_t index; + char name[50]; + bool def_route; + bool up; + uint32_t mtu; + uint32_t mac_addr_len; + uint8_t mac_addr[50]; + uint32_t n_addrs; + cf_ip_addr addrs[MAX_ADDRS]; + + union { + struct 
// Sends a dump request to the kernel via a NETLINK_ROUTE socket and feeds the
// selected attributes of the matching response messages to callbacks.
//
// type       - netlink message type of the request.
// filter1    - only response messages of this type are processed.
// filter2a-d - only attributes of one of these four types are handed to
//              data_fn.
// size       - size of the type-specific header that precedes the attributes
//              in a response message.
// reset_fn   - if not NULL, invoked before the attributes of each matching
//              message are processed.
// data_fn    - invoked for each selected attribute with the message's header,
//              the attribute type, and the attribute payload and its length.
// post_fn    - if not NULL, invoked after the attributes of each matching
//              message were processed.
// cont       - opaque context that is passed to all three callbacks.
//
// Returns 0 on success and -1 on failure; failures are logged as warnings.
static int32_t
netlink_dump(int32_t type, int32_t filter1, int32_t filter2a, int32_t filter2b, int32_t filter2c,
		int32_t filter2d, size_t size, reset_cb reset_fn, data_cb data_fn, post_cb post_fn,
		cb_context *cont)
{
	int32_t res = -1;
	int32_t nls = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);

	if (nls < 0) {
		cf_warning(CF_SOCKET, "Error while creating netlink socket: %d (%s)",
				errno, cf_strerror(errno));
		goto cleanup0;
	}

	// Local netlink address; everything except for the family stays zero.
	struct sockaddr_nl loc;
	memset(&loc, 0, sizeof(loc));
	loc.nl_family = AF_NETLINK;

	if (bind(nls, (struct sockaddr *)&loc, sizeof(loc)) < 0) {
		cf_warning(CF_SOCKET, "Error while binding netlink socket: %d (%s)",
				errno, cf_strerror(errno));
		goto cleanup1;
	}

	// Sequence counter shared by all calls; each request bumps it by one.
	static cf_atomic32 seq = 0;
	struct {
		struct nlmsghdr h;
		struct rtgenmsg m;
	} req;

	// Build the request: netlink header followed by a generic rtnetlink
	// message that does not restrict the address family.
	memset(&req, 0, sizeof(req));
	req.h.nlmsg_len = NLMSG_LENGTH(sizeof(req.m));
	req.h.nlmsg_type = type;
	req.h.nlmsg_flags = NLM_F_REQUEST | NLM_F_ROOT;
	req.h.nlmsg_seq = cf_atomic32_add(&seq, 1);
	req.m.rtgen_family = PF_UNSPEC;

	// Remote netlink address; again, only the family is set.
	struct sockaddr_nl rem;
	memset(&rem, 0, sizeof(rem));
	rem.nl_family = AF_NETLINK;

	struct iovec iov;
	memset(&iov, 0, sizeof(iov));
	iov.iov_base = &req;
	iov.iov_len = req.h.nlmsg_len;

	struct msghdr msg;
	memset(&msg, 0, sizeof(msg));
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	msg.msg_name = &rem;
	msg.msg_namelen = sizeof(rem);

	if (sendmsg(nls, &msg, 0) < 0) {
		cf_warning(CF_SOCKET, "Error while sending netlink request: %d (%s)",
				errno, cf_strerror(errno));
		goto cleanup1;
	}

	// NOTE(review): the result of cf_malloc() is used without a NULL check;
	// presumably cf_malloc() does not return on failure - confirm.
	uint8_t *resp = cf_malloc(RESP_SIZE);

	memset(resp, 0, RESP_SIZE);
	bool done = false;

	// Each iteration receives one datagram, which can hold multiple netlink
	// messages.
	while (!done) {
		// Re-initialize the receive structures for every datagram.
		memset(&rem, 0, sizeof(rem));
		memset(&iov, 0, sizeof(iov));
		iov.iov_base = resp;
		iov.iov_len = RESP_SIZE;

		memset(&msg, 0, sizeof(msg));
		msg.msg_iov = &iov;
		msg.msg_iovlen = 1;
		msg.msg_name = &rem;
		msg.msg_namelen = sizeof(rem);

		ssize_t len = recvmsg(nls, &msg, 0);

		if (len < 0) {
			cf_warning(CF_SOCKET, "Error while receiving netlink response: %d (%s)",
					errno, cf_strerror(errno));
			goto cleanup2;
		}

		// The datagram did not fit into the RESP_SIZE bytes of the buffer.
		if ((msg.msg_flags & MSG_TRUNC) != 0) {
			cf_warning(CF_SOCKET, "Received truncated netlink message");
			goto cleanup2;
		}

		struct nlmsghdr *h = (struct nlmsghdr *)resp;

		// Walk the netlink messages in the datagram; NLMSG_NEXT() also updates
		// len, i.e., the number of remaining bytes.
		while (NLMSG_OK(h, len)) {
			if (h->nlmsg_type == NLMSG_NOOP) {
				h = NLMSG_NEXT(h, len);
				continue;
			}

			if (h->nlmsg_type == NLMSG_ERROR) {
				// The payload starts with a negated errno value.
				int32_t *err = NLMSG_DATA(h);
				cf_warning(CF_SOCKET, "Received netlink error message: %d (%s)",
						-*err, cf_strerror(-*err));
				goto cleanup2;
			}

			// End of a multi-part response.
			if (h->nlmsg_type == NLMSG_DONE) {
				done = true;
				break;
			}

			if (h->nlmsg_type == NLMSG_OVERRUN) {
				cf_warning(CF_SOCKET, "Received netlink overrun message");
				goto cleanup2;
			}

			if (h->nlmsg_type == filter1) {
				if (reset_fn != NULL) {
					reset_fn(cont);
				}

				// The attributes follow the type-specific header, which is
				// size bytes long, plus alignment.
				void *info = NLMSG_DATA(h);
				uint32_t a_len = h->nlmsg_len - NLMSG_LENGTH(size);
				struct rtattr *a = (struct rtattr *)((uint8_t *)info + NLMSG_ALIGN(size));

				// RTA_NEXT() also updates a_len, i.e., the number of remaining
				// attribute bytes.
				while (RTA_OK(a, a_len)) {
					if (a->rta_type == filter2a || a->rta_type == filter2b ||
							a->rta_type == filter2c || a->rta_type == filter2d) {
						data_fn(cont, info, a->rta_type, RTA_DATA(a), RTA_PAYLOAD(a));
					}

					a = RTA_NEXT(a, a_len);
				}

				if (post_fn != NULL) {
					post_fn(cont);
				}
			}

			// A message without NLM_F_MULTI is not part of a multi-part
			// response, so there is nothing more to wait for.
			if ((h->nlmsg_flags & NLM_F_MULTI) == 0) {
				done = true;
				break;
			}

			h = NLMSG_NEXT(h, len);
		}
	}

	res = 0;

// Releases the response buffer.
cleanup2:
	cf_free(resp);

// Closes the netlink socket.
cleanup1:
	close(nls);

cleanup0:
	return res;
}
+ memcpy(entry->name, data, len); + cf_detail(CF_SOCKET, "Collected interface name %s", entry->name); + } + else if (type == IFLA_ADDRESS) { + if (len > sizeof(entry->mac_addr)) { + cf_crash(CF_SOCKET, "MAC address too long"); + } + + entry->mac_addr_len = (uint32_t)len; + memcpy(entry->mac_addr, data, len); + } + else if (type == IFLA_MTU) { + if (len != 4) { + cf_crash(CF_SOCKET, "MTU value has invalid length: %zu", len); + } + + memcpy(&entry->mtu, data, len); + cf_detail(CF_SOCKET, "Collected interface MTU %s -> %u", entry->name, entry->mtu); + } + else if (type == IFLA_MASTER) { + if (len != 4) { + cf_crash(CF_SOCKET, "Master index has invalid length: %zu", len); + } + + memcpy(&entry->master.index, data, len); + cf_detail(CF_SOCKET, "Collected interface master index %s -> %u", + entry->name, entry->master.index); + } +} + +static void +addr_fn(cb_context *cont, void *info_, int32_t type, void *data, size_t len) +{ + struct ifaddrmsg *info = info_; + + if (cont->curr_index == 0) { + cont->curr_index = info->ifa_index; + } + + if (type == IFA_LABEL) { + if (len > sizeof(cont->curr_label)) { + cf_crash(CF_SOCKET, "Interface label too long: %s", (char *)data); + } + + // Length includes terminating NUL. + memcpy(cont->curr_label, data, len); + cont->has_label = true; + cf_detail(CF_SOCKET, "Collected interface label %s", cont->curr_label); + } + else if (type == IFA_ADDRESS) { + // IFA_LOCAL takes precedence over IFA_ADDRESS. 
+ if (cont->has_local) { + cf_detail(CF_SOCKET, "Prioritizing local address"); + return; + } + + if (cf_socket_parse_netlink(cont->allow_v6, info->ifa_family, info->ifa_flags, + data, len, &cont->curr_address) < 0) { + return; + } + + cont->has_address = true; + cf_detail(CF_SOCKET, "Considering interface address %s", + cf_ip_addr_print(&cont->curr_address)); + } + else if (type == IFA_LOCAL) { + if (cf_socket_parse_netlink(cont->allow_v6, info->ifa_family, info->ifa_flags, + data, len, &cont->curr_address) < 0) { + return; + } + + cont->has_local = true; + cf_detail(CF_SOCKET, "Considering local interface address %s", + cf_ip_addr_print(&cont->curr_address)); + } +} + +static void +addr_fix_fn(cb_context *cont) +{ + if (!cont->has_address && !cont->has_local) { + return; + } + + inter_info *inter = cont->inter; + inter_entry *by_index = NULL; + inter_entry *by_label = NULL; + + for (uint32_t i = 0; i < inter->n_inters; ++i) { + if (inter->inters[i].index == cont->curr_index) { + by_index = &inter->inters[i]; + break; + } + } + + if (by_index == NULL) { + cf_crash(CF_SOCKET, "Invalid interface index: %u", cont->curr_index); + } + + if (cont->has_label) { + for (uint32_t i = 0; i < inter->n_inters; ++i) { + if (strcmp(inter->inters[i].name, cont->curr_label) == 0) { + by_label = &inter->inters[i]; + break; + } + } + + if (by_label == NULL) { + cf_detail(CF_SOCKET, "New interface for label %s", cont->curr_label); + uint32_t i = inter->n_inters; + + if (i >= MAX_INTERS) { + cf_crash(CF_SOCKET, "Too many interfaces"); + } + + by_label = &inter->inters[i]; + ++inter->n_inters; + + by_label->index = by_index->index; + by_label->up = by_index->up; + memcpy(&by_label->mac_addr, &by_index->mac_addr, sizeof(by_label->mac_addr)); + by_label->mac_addr_len = by_index->mac_addr_len; + by_label->mtu = by_index->mtu; + + memcpy(&by_label->name, cont->curr_label, sizeof(by_label->name)); + } + } + + inter_entry *entry = by_label != NULL ? 
by_label : by_index; + uint32_t i = entry->n_addrs; + + if (i >= MAX_ADDRS) { + cf_crash(CF_SOCKET, "Too many addresses for interface %s", entry->name); + } + + cf_ip_addr *addr = &entry->addrs[i]; + cf_ip_addr_copy(&cont->curr_address, addr); + + ++entry->n_addrs; + cf_detail(CF_SOCKET, "Collected interface address %s -> %s", + entry->name, cf_ip_addr_print(addr)); +} + +static void +route_fn(cb_context *cont, void *info_, int32_t type, void *data, size_t len) +{ + struct rtmsg *info = info_; + + // Ignore entries with RTM_F_CLONED, because they are route cache entries. + if ((info->rtm_flags & RTM_F_CLONED) != 0) { + return; + } + + if (type == RTA_DST) { + if (cf_socket_parse_netlink(cont->allow_v6, info->rtm_family, 0, + data, len, &cont->curr_address) < 0) { + // If the address is not allowed, set to a non-zero address, because + // zero means default route. + cf_ip_addr_set_local(&cont->curr_address); + } + + cont->has_address = true; + } + else if (type == RTA_OIF) { + if (len != 4) { + cf_detail(CF_SOCKET, "Invalid interface index"); + return; + } + + cont->curr_index = *(uint32_t *)data; + cont->has_index = true; + } + else if (type == RTA_PRIORITY) { + if (len != 4) { + cf_detail(CF_SOCKET, "Invalid route priority"); + return; + } + + cont->curr_priority = *(uint32_t *)data; + cont->has_priority = true; + } +} + +static void +route_fix_fn(cb_context *cont) +{ + // It's not a default route, if it has an address and the address isn't zero. + if (cont->has_address && !cf_ip_addr_is_any(&cont->curr_address)) { + return; + } + + // It's one of the catch-all entries. + if (cont->has_priority && cont->curr_priority == UINT32_MAX) { + return; + } + + // It doesn't have an interface index. 
+ if (!cont->has_index) { + return; + } + + inter_info *inter = cont->inter; + bool found = false; + + for (uint32_t i = 0; i < inter->n_inters; ++i) { + inter_entry *entry = &inter->inters[i]; + + if (inter->inters[i].index == cont->curr_index) { + found = true; + entry->def_route = true; + cf_detail(CF_SOCKET, "Collected default route %s -> %s", + entry->name, cf_ip_addr_print(&cont->curr_address)); + // Don't stop after the first match. Aliases share the same index. + } + } + + if (!found) { + cf_crash(CF_SOCKET, "Invalid interface index: %u", cont->curr_index); + } +} + +static void +enumerate_inter(inter_info *inter, bool allow_v6) +{ + cb_context cont; + memset(&cont, 0, sizeof(cont)); + cont.inter = inter; + cont.allow_v6 = allow_v6; + + reset_fn(&cont); + + if (netlink_dump(RTM_GETLINK, RTM_NEWLINK, IFLA_IFNAME, IFLA_ADDRESS, IFLA_MTU, IFLA_MASTER, + sizeof(struct ifinfomsg), NULL, link_fn, NULL, &cont) < 0) { + cf_crash(CF_SOCKET, "Error while enumerating network links"); + } + + if (netlink_dump(RTM_GETADDR, RTM_NEWADDR, IFA_LABEL, IFA_ADDRESS, IFA_LOCAL, -1, + sizeof(struct ifaddrmsg), reset_fn, addr_fn, addr_fix_fn, &cont) < 0) { + cf_crash(CF_SOCKET, "Error while enumerating network addresses"); + } + + if (netlink_dump(RTM_GETROUTE, RTM_NEWROUTE, RTA_DST, RTA_OIF, RTA_PRIORITY, -1, + sizeof(struct rtmsg), reset_fn, route_fn, route_fix_fn, &cont) < 0) { + cf_crash(CF_SOCKET, "Error while enumerating network routes"); + } + + for (int32_t i = 0; i < inter->n_inters; ++i) { + inter_entry *entry = &inter->inters[i]; + cf_ip_addr_sort(entry->addrs, entry->n_addrs); + + if (entry->master.index == 0) { + entry->master.entry = NULL; + continue; + } + + inter_entry *master = NULL; + + for (int32_t k = 0; k < inter->n_inters; ++k) { + inter_entry *cand = &inter->inters[k]; + + if (cand->index == entry->master.index) { + master = cand; + break; + } + } + + if (master == NULL) { + cf_crash(CF_SOCKET, "Invalid master index: %u", entry->master.index); + } + + 
entry->master.entry = master; + } + + if (cf_fault_filter[CF_SOCKET] >= CF_DETAIL) { + cf_detail(CF_SOCKET, "%u interface(s)", inter->n_inters); + + for (uint32_t i = 0; i < inter->n_inters; ++i) { + inter_entry *entry = &inter->inters[i]; + cf_detail(CF_SOCKET, "Name = %s", entry->name); + cf_detail(CF_SOCKET, "MAC address = %02x:%02x:%02x:%02x:%02x:%02x", + entry->mac_addr[0], entry->mac_addr[1], entry->mac_addr[2], + entry->mac_addr[3], entry->mac_addr[4], entry->mac_addr[5]); + cf_detail(CF_SOCKET, "Default route = %d", (int32_t)entry->def_route); + cf_detail(CF_SOCKET, "Up = %d", (int32_t)entry->up); + cf_detail(CF_SOCKET, "MTU = %u", entry->mtu); + + for (int32_t k = 0; k < entry->n_addrs; ++k) { + cf_ip_addr *addr = &entry->addrs[k]; + cf_detail(CF_SOCKET, "Address = %s", cf_ip_addr_print(addr)); + } + + cf_detail(CF_SOCKET, "Master = %s", + entry->master.entry != NULL ? entry->master.entry->name : "(none)"); + } + } +} + +static int32_t +inter_get_addr(cf_ip_addr *addrs, uint32_t *n_addrs, inter_filter *filter) +{ + inter_info inter; + memset(&inter, 0, sizeof(inter)); + enumerate_inter(&inter, filter->allow_v6); + + uint32_t count = 0; + + for (uint32_t i = 0; i < inter.n_inters; ++i) { + inter_entry *entry = &inter.inters[i]; + + if (filter->def_route && !entry->def_route) { + continue; + } + + if (filter->up && !entry->up) { + continue; + } + + if (filter->if_name != NULL && strcmp(filter->if_name, entry->name) != 0) { + continue; + } + + for (uint32_t k = 0; k < entry->n_addrs; ++k) { + cf_ip_addr *addr = &entry->addrs[k]; + + if (count >= *n_addrs) { + cf_warning(CF_SOCKET, "Buffer overflow while enumerating interface addresses"); + return -1; + } + + cf_ip_addr_copy(addr, &addrs[count]); + ++count; + } + } + + *n_addrs = count; + return 0; +} + +int32_t +cf_inter_get_addr_all(cf_ip_addr *addrs, uint32_t *n_addrs) +{ + static inter_filter filter = { + .allow_v6 = true, .def_route = false, .up = true, .if_name = NULL + }; + + return 
inter_get_addr(addrs, n_addrs, &filter); +} + +int32_t +cf_inter_get_addr_all_legacy(cf_ip_addr *addrs, uint32_t *n_addrs) +{ + static inter_filter filter = { + .allow_v6 = false, .def_route = false, .up = true, .if_name = NULL + }; + + return inter_get_addr(addrs, n_addrs, &filter); +} + +int32_t +cf_inter_get_addr_def(cf_ip_addr *addrs, uint32_t *n_addrs) +{ + static inter_filter filter = { + .allow_v6 = true, .def_route = true, .up = true, .if_name = NULL + }; + + return inter_get_addr(addrs, n_addrs, &filter); +} + +int32_t +cf_inter_get_addr_def_legacy(cf_ip_addr *addrs, uint32_t *n_addrs) +{ + static inter_filter filter = { + .allow_v6 = false, .def_route = true, .up = true, .if_name = NULL + }; + + return inter_get_addr(addrs, n_addrs, &filter); +} + +int32_t +cf_inter_get_addr_name(cf_ip_addr *addrs, uint32_t *n_addrs, const char *if_name) +{ + inter_filter filter = { + .allow_v6 = true, .def_route = false, .up = false, .if_name = if_name + }; + + return inter_get_addr(addrs, n_addrs, &filter); +} + +bool +cf_inter_is_inter_name(const char *if_name) +{ + inter_info inter; + memset(&inter, 0, sizeof(inter)); + enumerate_inter(&inter, true); + + for (uint32_t i = 0; i < inter.n_inters; ++i) { + if (strcmp(inter.inters[i].name, if_name) == 0) { + return true; + } + } + + return false; +} + +int32_t +cf_inter_addr_to_index_and_name(const cf_ip_addr *addr, int32_t *index, char **name) +{ + inter_info inter; + memset(&inter, 0, sizeof(inter)); + enumerate_inter(&inter, true); + + for (uint32_t i = 0; i < inter.n_inters; ++i) { + inter_entry *entry = &inter.inters[i]; + + for (uint32_t k = 0; k < entry->n_addrs; ++k) { + if (cf_ip_addr_compare(&entry->addrs[k], addr) == 0) { + if (name != NULL) { + *name = cf_strdup(entry->name); + } + + if (index != NULL) { + *index = (int32_t)entry->index; + } + + return 0; + } + } + } + + return -1; +} + +void +cf_inter_expand_bond(const char *if_name, char **out_names, uint32_t *n_out) +{ + inter_info inter; + memset(&inter, 
0, sizeof(inter)); + enumerate_inter(&inter, true); + + uint32_t n = 0; + + for (uint32_t i = 0; i < inter.n_inters; ++i) { + inter_entry *entry = &inter.inters[i]; + + if (entry->master.entry == NULL || strcmp(entry->master.entry->name, if_name) != 0) { + continue; + } + + if (n >= *n_out) { + cf_crash(CF_SOCKET, "Output buffer overflow"); + } + + out_names[n] = cf_strdup(entry->name); + ++n; + } + + if (n == 0) { + out_names[0] = cf_strdup(if_name); + n = 1; + } + + *n_out = n; +} + +int32_t +cf_inter_mtu(const cf_ip_addr *inter_addr) +{ + inter_info inter; + memset(&inter, 0, sizeof(inter)); + enumerate_inter(&inter, true); + + for (uint32_t i = 0; i < inter.n_inters; ++i) { + inter_entry *entry = &inter.inters[i]; + + for (uint32_t k = 0; k < entry->n_addrs; ++k) { + cf_ip_addr *entry_addr = &entry->addrs[k]; + + if (cf_ip_addr_compare(inter_addr, entry_addr) == 0) { + return entry->mtu; + } + } + } + + return -1; +} + +int32_t +cf_inter_min_mtu(void) +{ + uint32_t min = UINT32_MAX; + inter_info inter; + memset(&inter, 0, sizeof(inter)); + enumerate_inter(&inter, true); + + for (uint32_t i = 0; i < inter.n_inters; ++i) { + inter_entry *entry = &inter.inters[i]; + + if (entry->up && entry->mtu < min) { + min = entry->mtu; + } + } + + return (int32_t)min; +} + +static bool +detect_changes(bool legacy, cf_ip_addr *addrs, uint32_t *n_addrs, uint32_t limit) +{ + cf_ip_addr curr[CF_SOCK_CFG_MAX]; + uint32_t n_curr = CF_SOCK_CFG_MAX; + int32_t res; + + if (legacy) { + res = cf_inter_get_addr_all_legacy(curr, &n_curr); + } + else { + res = cf_inter_get_addr_all(curr, &n_curr); + } + + if (res < 0) { + cf_crash(AS_INFO, "Error while getting interface addresses"); + } + + if (n_curr > limit) { + cf_crash(AS_INFO, "Too many network interface addresses: %d", n_curr); + } + + cf_ip_addr_sort(curr, n_curr); + uint32_t n_filter = 0; + + for (uint32_t i = 0; i < n_curr; ++i) { + if (cf_ip_addr_is_local(&curr[i])) { + continue; + } + + if (i > n_filter) { + 
cf_ip_addr_copy(&curr[i], &curr[n_filter]); + } + + ++n_filter; + } + + n_curr = n_filter; + bool change = false; + + if (n_curr != *n_addrs) { + change = true; + } + else { + for (uint32_t i = 0; i < n_curr; ++i) { + if (cf_ip_addr_compare(&addrs[i], &curr[i]) != 0) { + change = true; + break; + } + } + } + + if (change) { + for (uint32_t i = 0; i < n_curr; ++i) { + cf_ip_addr_copy(&curr[i], &addrs[i]); + } + + *n_addrs = n_curr; + } + + return change; +} + +bool +cf_inter_detect_changes(cf_ip_addr *addrs, uint32_t *n_addrs, uint32_t limit) +{ + return detect_changes(false, addrs, n_addrs, limit); +} + +bool +cf_inter_detect_changes_legacy(cf_ip_addr *addrs, uint32_t *n_addrs, uint32_t limit) +{ + return detect_changes(true, addrs, n_addrs, limit); +} + +static const char *if_in_order[] = { + "eth", "bond", "wlan", + NULL +}; + +static const char *if_default[] = { + "^eth[[:digit:]]+$", "^bond[[:digit:]]+$", "^wlan[[:digit:]]+$", + "^em[[:digit:]]+_[[:digit:]]+$", "^p[[:digit:]]+p[[:digit:]]+_[[:digit:]]+$", + NULL +}; + +static const char *if_default2[] = { + "^em[[:digit:]]+$", "^p[[:digit:]]+p[[:digit:]]+$", NULL +}; + +static const char *if_any[] = { + "^.*$", + NULL +}; + +static bool +validate_inter(inter_entry *entry) +{ + cf_debug(CF_SOCKET, "Validating interface %s", entry->name); + + if (entry->n_addrs == 0) { + cf_debug(CF_SOCKET, "No IP addresses"); + return false; + } + + if (entry->mac_addr_len < 6) { + cf_debug(CF_SOCKET, "Invalid MAC address length: %d", entry->mac_addr_len); + return false; + } + + static const uint8_t all0[6] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; + static const uint8_t all1[6] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; + + if (memcmp(entry->mac_addr, all0, 6) == 0 || memcmp(entry->mac_addr, all1, 6) == 0) { + cf_debug(CF_SOCKET, "Invalid MAC address: %02x:%02x:%02x:%02x:%02x:%02x", + entry->mac_addr[0], entry->mac_addr[1], entry->mac_addr[2], + entry->mac_addr[3], entry->mac_addr[4], entry->mac_addr[5]); + return false; + } + 
+ cf_debug(CF_SOCKET, "Interface OK"); + return true; +} + +static inter_entry * +find_inter(inter_info *inter, const char *name, bool validate) +{ + cf_debug(CF_SOCKET, "Looking for %s", name); + + for (uint32_t i = 0; i < inter->n_inters; ++i) { + inter_entry *entry = &inter->inters[i]; + cf_debug(CF_SOCKET, "Checking %s", entry->name); + + if (strcmp(entry->name, name) == 0 && (!validate || validate_inter(entry))) { + return entry; + } + } + + return NULL; +} + +static inter_entry * +match_inter(inter_info *inter, const char **patterns) +{ + for (uint32_t i = 0; i < inter->n_inters; ++i) { + inter_entry *entry = &inter->inters[i]; + cf_debug(CF_SOCKET, "Matching %s", entry->name); + + for (uint32_t k = 0; patterns[k] != NULL; ++k) { + cf_debug(CF_SOCKET, "Matching with %s", patterns[k]); + regex_t rex; + + if (regcomp(&rex, patterns[k], REG_EXTENDED | REG_NOSUB) != 0) { + cf_crash(CF_SOCKET, "Error while compiling regular expression %s", patterns[k]); + } + + bool ok = regexec(&rex, entry->name, 0, NULL, 0) == 0 && validate_inter(entry); + regfree(&rex); + + if (ok) { + return entry; + } + } + } + + return NULL; +} + +int32_t +cf_node_id_get(cf_ip_port port, const char *if_hint, cf_node *id) +{ + cf_debug(CF_SOCKET, "Getting node ID"); + inter_info inter; + memset(&inter, 0, sizeof(inter)); + enumerate_inter(&inter, true); + + inter_entry *entry; + + if (if_hint != NULL) { + cf_debug(CF_SOCKET, "Checking user-specified interface %s", if_hint); + entry = find_inter(&inter, if_hint, false); + + if (entry != NULL) { + goto success; + } + + cf_warning(CF_SOCKET, "Unable to find interface %s specified in configuration file", + if_hint); + return -1; + } + + cf_debug(CF_SOCKET, "Trying default interfaces in order"); + + for (int32_t i = 0; if_in_order[i] != NULL; ++i) { + for (int32_t k = 0; k < 11; ++k) { + char tmp[100]; + snprintf(tmp, sizeof(tmp), "%s%d", if_in_order[i], k); + entry = find_inter(&inter, tmp, true); + + if (entry != NULL) { + goto success; + } + } 
+ } + + cf_debug(CF_SOCKET, "Trying default interfaces"); + entry = match_inter(&inter, if_default); + + if (entry != NULL) { + goto success; + } + + cf_debug(CF_SOCKET, "Trying secondary default interfaces"); + entry = match_inter(&inter, if_default2); + + if (entry != NULL) { + goto success; + } + + cf_debug(CF_SOCKET, "Trying any interface"); + entry = match_inter(&inter, if_any); + + if (entry != NULL) { + goto success; + } + + cf_warning(CF_SOCKET, "Unable to find any suitable network device for node ID"); + return -1; + +success: + ; + uint8_t *buff = (uint8_t *)id; + + if (entry->mac_addr_len == 6) { + memcpy(buff, entry->mac_addr, 6); + } + else { + cf_digest dig; + cf_digest_compute(entry->mac_addr, entry->mac_addr_len, &dig); + memcpy(buff, dig.digest, 6); + } + + memcpy(buff + 6, &port, 2); + + cf_info(CF_SOCKET, "Node port %d, node ID %" PRIx64, port, *id); + return 0; +} diff --git a/cf/src/socket_ce.c b/cf/src/socket_ce.c new file mode 100644 index 00000000..0fa6c72f --- /dev/null +++ b/cf/src/socket_ce.c @@ -0,0 +1,459 @@ +/* + * socket_ce.c + * + * Copyright (C) 2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. 
If not, see http://www.gnu.org/licenses/ + */ + +#define CF_SOCKET_PRIVATE +#include "socket.h" + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "fault.h" + +#include "citrusleaf/alloc.h" + +static char * +safe_strndup(const char *string, size_t length) +{ + char *res = cf_strndup(string, length); + + if (res == NULL) { + cf_crash(CF_SOCKET, "Out of memory"); + } + + return res; +} + +void +cf_socket_set_advertise_ipv6(bool advertise) +{ + cf_warning(CF_SOCKET, "'advertise-ipv6' is relevant for enterprise only"); +} + +bool +cf_socket_advertises_ipv6(void) +{ + return false; +} + +int32_t +cf_ip_addr_from_string_multi(const char *string, cf_ip_addr *addrs, uint32_t *n_addrs) +{ + if (strcmp(string, "any") == 0) { + if (*n_addrs < 1) { + cf_warning(CF_SOCKET, "Too many IP addresses"); + return -1; + } + + cf_ip_addr_set_any(&addrs[0]); + *n_addrs = 1; + return 0; + } + + if (strcmp(string, "local") == 0) { + if (*n_addrs < 1) { + cf_warning(CF_SOCKET, "Too many IP addresses"); + return -1; + } + + cf_ip_addr_set_local(&addrs[0]); + *n_addrs = 1; + return 0; + } + + if (cf_inter_is_inter_name(string)) { + cf_ip_addr if_addrs[CF_SOCK_CFG_MAX]; + uint32_t n_if_addrs = CF_SOCK_CFG_MAX; + + if (cf_inter_get_addr_name(if_addrs, &n_if_addrs, string) < 0) { + cf_warning(CF_SOCKET, "Error while getting interface addresses for '%s'", string); + return -1; + } + + if (n_if_addrs == 0) { + cf_warning(CF_SOCKET, "Interface %s does not have any IP addresses", string); + return -1; + } + + if (n_if_addrs > *n_addrs) { + cf_warning(CF_SOCKET, "Too many IP addresses"); + return -1; + } + + for (uint32_t i = 0; i < n_if_addrs; ++i) { + cf_ip_addr_copy(&if_addrs[i], &addrs[i]); + } + + *n_addrs = n_if_addrs; + return 0; + } + + int32_t res = -1; + struct addrinfo *info = NULL; + static struct addrinfo hints = { + .ai_flags = 0, + .ai_family = AF_INET + }; + + int32_t x = getaddrinfo(string, NULL, &hints, &info); + + if 
(x != 0) { + cf_warning(CF_SOCKET, "Error while converting address '%s': %s", string, gai_strerror(x)); + goto cleanup0; + } + + uint32_t i = 0; + + for (struct addrinfo *walker = info; walker != NULL; walker = walker->ai_next) { + if (walker->ai_socktype == SOCK_STREAM) { + if (i >= *n_addrs) { + cf_warning(CF_SOCKET, "Too many IP addresses"); + goto cleanup1; + } + + struct sockaddr_in *sai = (struct sockaddr_in *)walker->ai_addr; + addrs[i].v4 = sai->sin_addr; + ++i; + } + } + + cf_ip_addr_sort(addrs, i); + *n_addrs = i; + res = 0; + +cleanup1: + freeaddrinfo(info); + +cleanup0: + return res; +} + +bool +cf_ip_addr_str_is_legacy(const char *string) +{ + (void)string; + return true; +} + +bool +cf_ip_addr_is_legacy(const cf_ip_addr* addr) +{ + (void)addr; + return true; +} + +bool +cf_ip_addr_legacy_only(void) +{ + return true; +} + +int32_t +cf_ip_addr_to_string(const cf_ip_addr *addr, char *string, size_t size) +{ + if (inet_ntop(AF_INET, &addr->v4, string, size) == NULL) { + cf_warning(CF_SOCKET, "Output buffer overflow"); + return -1; + } + + return strlen(string); +} + +int32_t +cf_ip_addr_from_binary(const uint8_t *binary, size_t size, cf_ip_addr *addr) +{ + if (size != 4) { + cf_debug(CF_SOCKET, "Input buffer size incorrect."); + return -1; + } + + memcpy(&addr->v4, binary, 4); + return 4; +} + +int32_t +cf_ip_addr_to_binary(const cf_ip_addr *addr, uint8_t *binary, size_t size) +{ + if (size < 4) { + cf_warning(CF_SOCKET, "Output buffer overflow"); + return -1; + } + + memcpy(binary, &addr->v4, 4); + return 4; +} + +void +cf_ip_addr_to_rack_aware_id(const cf_ip_addr *addr, uint32_t *id) +{ + *id = ntohl(addr->v4.s_addr); +} + +int32_t +cf_ip_addr_compare(const cf_ip_addr *lhs, const cf_ip_addr *rhs) +{ + return memcmp(&lhs->v4, &rhs->v4, 4); +} + +void +cf_ip_addr_copy(const cf_ip_addr *from, cf_ip_addr *to) +{ + to->v4 = from->v4; +} + +void +cf_ip_addr_set_local(cf_ip_addr *addr) +{ + addr->v4.s_addr = htonl(0x7f000001); +} + +bool 
+cf_ip_addr_is_local(const cf_ip_addr *addr) +{ + return (ntohl(addr->v4.s_addr) & 0xff000000) == 0x7f000000; +} + +void +cf_ip_addr_set_any(cf_ip_addr *addr) +{ + addr->v4.s_addr = 0; +} + +bool +cf_ip_addr_is_any(const cf_ip_addr *addr) +{ + return addr->v4.s_addr == 0; +} + +int32_t +cf_sock_addr_to_string(const cf_sock_addr *addr, char *string, size_t size) +{ + int32_t total = 0; + int32_t count = cf_ip_addr_to_string(&addr->addr, string, size); + + if (count < 0) { + return -1; + } + + total += count; + + if (size - total < 2) { + cf_warning(CF_SOCKET, "Output buffer overflow"); + return -1; + } + + string[total++] = ':'; + string[total] = 0; + + count = cf_ip_port_to_string(addr->port, string + total, size - total); + + if (count < 0) { + return -1; + } + + total += count; + return total; +} + +int32_t +cf_sock_addr_from_string(const char *string, cf_sock_addr *addr) +{ + int32_t res = -1; + const char *colon = strchr(string, ':'); + + if (colon == NULL) { + cf_warning(CF_SOCKET, "Missing ':' in socket address '%s'", string); + goto cleanup0; + } + + const char *host = safe_strndup(string, colon - string); + + if (cf_ip_addr_from_string(host, &addr->addr) < 0) { + cf_warning(CF_SOCKET, "Invalid host address '%s' in socket address '%s'", host, string); + goto cleanup1; + } + + if (cf_ip_port_from_string(colon + 1, &addr->port) < 0) { + cf_warning(CF_SOCKET, "Invalid port '%s' in socket address '%s'", colon + 1, string); + goto cleanup1; + } + + res = 0; + +cleanup1: + cf_free((void *)host); + +cleanup0: + return res; +} + +void +cf_sock_addr_from_native(const struct sockaddr *native, cf_sock_addr *addr) +{ + if (native->sa_family != AF_INET) { + cf_crash(CF_SOCKET, "Invalid address family: %d", native->sa_family); + } + + struct sockaddr_in *sai = (struct sockaddr_in *)native; + addr->addr.v4 = sai->sin_addr; + addr->port = ntohs(sai->sin_port); +} + +void +cf_sock_addr_to_native(const cf_sock_addr *addr, struct sockaddr *native) +{ + struct sockaddr_in *sai 
= (struct sockaddr_in *)native; + memset(sai, 0, sizeof(struct sockaddr_in)); + sai->sin_family = AF_INET; + sai->sin_addr = addr->addr.v4; + sai->sin_port = htons(addr->port); +} + +int32_t +cf_mserv_cfg_add_combo(cf_mserv_cfg *serv_cfg, cf_sock_owner owner, cf_ip_port port, + cf_ip_addr *addr, cf_ip_addr *if_addr, uint8_t ttl) +{ + cf_msock_cfg sock_cfg; + cf_msock_cfg_init(&sock_cfg, owner); + sock_cfg.port = port; + cf_ip_addr_copy(addr, &sock_cfg.addr); + cf_ip_addr_copy(if_addr, &sock_cfg.if_addr); + sock_cfg.ttl = ttl; + + return cf_mserv_cfg_add_msock_cfg(serv_cfg, &sock_cfg); +} + +int32_t +cf_socket_mcast_set_inter(cf_socket *sock, const cf_ip_addr *iaddr) +{ + struct ip_mreqn mr; + memset(&mr, 0, sizeof(mr)); + mr.imr_address = iaddr->v4; + + if (setsockopt(sock->fd, IPPROTO_IP, IP_MULTICAST_IF, &mr, sizeof(mr)) < 0) { + cf_warning(CF_SOCKET, "setsockopt(IP_MULTICAST_IF) failed on FD %d: %d (%s)", + sock->fd, errno, cf_strerror(errno)); + return -1; + } + + return 0; +} + +int32_t +cf_socket_mcast_set_ttl(cf_socket *sock, int32_t ttl) +{ + if (setsockopt(sock->fd, IPPROTO_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl)) < 0) { + cf_warning(CF_SOCKET, "setsockopt(IP_MULTICAST_TTL) failed on FD %d: %d (%s)", + sock->fd, errno, cf_strerror(errno)); + return -1; + } + + return 0; +} + +int32_t +cf_socket_mcast_join_group(cf_socket *sock, const cf_ip_addr *iaddr, const cf_ip_addr *gaddr) +{ + struct ip_mreqn mr; + memset(&mr, 0, sizeof(mr)); + + if (!cf_ip_addr_is_any(iaddr)) { + mr.imr_address = iaddr->v4; + } + + mr.imr_multiaddr = gaddr->v4; + + if (setsockopt(sock->fd, IPPROTO_IP, IP_ADD_MEMBERSHIP, &mr, sizeof(mr)) < 0) { + cf_warning(CF_SOCKET, "setsockopt(IP_ADD_MEMBERSHIP) failed on FD %d: %d (%s)", + sock->fd, errno, cf_strerror(errno)); + return -1; + } + +#ifdef IP_MULTICAST_ALL + // Only receive traffic from multicast groups this socket actually joins. 
+ // Note: Bind address filtering takes precedence, so this is simply an extra level of + // restriction. + static const int32_t no = 0; + + if (setsockopt(sock->fd, IPPROTO_IP, IP_MULTICAST_ALL, &no, sizeof(no)) < 0) { + cf_warning(CF_SOCKET, "setsockopt(IP_MULTICAST_ALL) failed on FD %d: %d (%s)", + sock->fd, errno, cf_strerror(errno)); + return -1; + } +#endif + + return 0; +} + +size_t +cf_socket_addr_len(const struct sockaddr *sa) +{ + switch (sa->sa_family) { + case AF_INET: + return sizeof(struct sockaddr_in); + + default: + cf_crash(CF_SOCKET, "Invalid address family: %d", sa->sa_family); + return 0; + } +} + +int32_t +cf_socket_parse_netlink(bool allow_ipv6, uint32_t family, uint32_t flags, + const void *data, size_t len, cf_ip_addr *addr) +{ + (void)allow_ipv6; + (void)flags; + + if (family != AF_INET || len != 4) { + return -1; + } + + memcpy(&addr->v4, data, 4); + return 0; +} + +void +cf_socket_fix_client(cf_socket *sock) +{ + (void)sock; +} + +void +cf_socket_fix_bind(cf_serv_cfg *serv_cfg) +{ + (void)serv_cfg; +} + +void +cf_socket_fix_server(cf_socket *sock) +{ + (void)sock; +} diff --git a/cf/src/tls_ce.c b/cf/src/tls_ce.c new file mode 100644 index 00000000..c86438cf --- /dev/null +++ b/cf/src/tls_ce.c @@ -0,0 +1,159 @@ +/* + * tls.c + * + * Copyright (C) 2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. 
+ * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ +#include + +#include "fault.h" +#include "socket.h" +#include "tls.h" + +void +tls_check_init() +{ +} + +void +tls_cleanup() +{ +} + +void +tls_thread_cleanup() +{ +} + +void +tls_socket_init(cf_socket *sock) +{ + sock->ssl = NULL; +} + +void +tls_socket_term(cf_socket *sock) +{ + if (sock->ssl) { + cf_crash(CF_TLS, "unexpected TLS state"); + } +} + +int +tls_socket_shutdown(cf_socket *sock) +{ + if (sock->ssl) { + cf_crash(CF_TLS, "unexpected TLS state"); + } + return -1; +} + +void +tls_socket_close(cf_socket *sock) +{ + if (sock->ssl) { + cf_crash(CF_TLS, "unexpected TLS state"); + } +} + +cf_tls_info * +tls_config_server_context(cf_tls_spec *tspec, bool auth_client, uint32_t n_peer_names, char **peer_names) +{ + cf_crash(CF_TLS, "unexpected TLS state"); + return NULL; +} + +cf_tls_info * +tls_config_intra_context(cf_tls_spec *tspec, const char *which) +{ + cf_crash(CF_TLS, "unexpected TLS state"); + return NULL; +} + +void +tls_socket_prepare_server(cf_tls_info *info, cf_socket *sock) +{ + cf_crash(CF_TLS, "unexpected TLS state"); +} + +void +tls_socket_prepare_client(cf_tls_info *info, cf_socket *sock) +{ + cf_crash(CF_TLS, "unexpected TLS state"); +} + +void +tls_socket_must_not_have_data(cf_socket *sock, const char *caller) +{ + if (sock->state == CF_SOCKET_STATE_NON_TLS) { + return; + } + + cf_crash(CF_TLS, "unexpected TLS state"); +} + +int +tls_socket_accept(cf_socket *sock) +{ + cf_crash(CF_TLS, "unexpected TLS state"); + return 1; +} + +int +tls_socket_connect(cf_socket *sock) +{ + cf_crash(CF_TLS, "unexpected TLS state"); + return 1; +} + +int +tls_socket_accept_block(cf_socket *sock) +{ + cf_crash(CF_TLS, "unexpected TLS state"); + return 1; +} + +int +tls_socket_connect_block(cf_socket *sock) +{ + cf_crash(CF_TLS, "unexpected TLS state"); + return 1; +} + +int +tls_socket_recv(cf_socket *sock, 
void *buf, size_t sz, int32_t flags, + uint64_t deadline_msec) +{ + cf_crash(CF_TLS, "unexpected TLS state"); + return 1; +} + +int +tls_socket_send(cf_socket *sock, void const *buf, size_t sz, int32_t flags, + uint64_t deadline_msec) +{ + cf_crash(CF_TLS, "unexpected TLS state"); + return 1; +} + +int +tls_socket_pending(cf_socket *sock) +{ + return 0; +} + diff --git a/cf/src/vmapx.c b/cf/src/vmapx.c new file mode 100644 index 00000000..c2ad4a9b --- /dev/null +++ b/cf/src/vmapx.c @@ -0,0 +1,398 @@ +/* + * vmapx.c + * + * Copyright (C) 2012-2016 Aerospike, Inc. + * + * Portions may be licensed to Aerospike, Inc. under one or more contributor + * license agreements. + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more + * details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ + */ + +//========================================================== +// Includes. +// + +#include "vmapx.h" + +#include +#include +#include +#include +#include + +#include "fault.h" + +#include "citrusleaf/alloc.h" +#include "citrusleaf/cf_hash_math.h" + + +//========================================================== +// Forward declarations. +// + +bool vhash_get(const vhash* h, const char* key, size_t key_len, uint32_t* p_value); + + +//========================================================== +// Public API. +// + +// Return memory size needed - includes cf_vmapx struct plus values vector. 
+size_t
+cf_vmapx_sizeof(uint32_t value_size, uint32_t max_count)
+{
+	// Both operands are widened first, so the product is computed in size_t.
+	size_t values_size = (size_t)value_size * (size_t)max_count;
+
+	return sizeof(cf_vmapx) + values_size;
+}
+
+// Set up a cf_vmapx whose memory the caller has already allocated: check the
+// sizing parameters, record them, start with zero entries, and create the
+// write lock and the name hash.
+void
+cf_vmapx_init(cf_vmapx* vmap, uint32_t value_size, uint32_t max_count,
+		uint32_t hash_size, uint32_t max_name_size)
+{
+	cf_assert(vmap, CF_VMAPX, "null vmap pointer");
+	cf_assert((value_size & 3) == 0, CF_VMAPX, "bad value_size");
+	cf_assert(max_count != 0, CF_VMAPX, "bad max_count");
+	cf_assert(hash_size != 0, CF_VMAPX, "bad hash_size");
+	cf_assert(max_name_size != 0 && max_name_size <= value_size, CF_VMAPX,
+			"bad max_name_size");
+
+	vmap->count = 0;
+	vmap->max_count = max_count;
+	vmap->value_size = value_size;
+	vmap->key_size = max_name_size;
+
+	pthread_mutex_init(&vmap->write_lock, NULL);
+
+	vmap->hash = vhash_create(max_name_size, hash_size);
+}
+
+// Tear down what cf_vmapx_init() created. Not to be called after a failed
+// cf_vmapx_create() or cf_vmapx_resume() - those clean up after themselves.
+void
+cf_vmapx_release(cf_vmapx* vmap)
+{
+	// A null vmap is tolerated - the bins vmap doesn't exist in single-bin
+	// mode.
+	if (vmap != NULL) {
+		pthread_mutex_destroy(&vmap->write_lock);
+		vhash_destroy(vmap->hash);
+	}
+}
+
+// Number of values currently in the vmap.
+uint32_t
+cf_vmapx_count(const cf_vmapx* vmap)
+{
+	return vmap->count;
+}
+
+// Hand back a pointer to the value stored at index. Always returns
+// CF_VMAPX_OK.
+cf_vmapx_err
+cf_vmapx_get_by_index(const cf_vmapx* vmap, uint32_t index, void** pp_value)
+{
+	// The index is trusted - the range check below is disabled for now to
+	// avoid the volatile access.
+	// TODO - ultimately, caller code can be simplified. (Especially if this
+	// just returned the value pointer.) And if necessary, we could make a
+	// "safe" version of this that does the check.
+
+//	if (index >= vmap->count) {
+//		return CF_VMAPX_ERR_BAD_PARAM;
+//	}
+
+	void* value = vmapx_value_ptr(vmap, index);
+
+	*pp_value = value;
+
+	return CF_VMAPX_OK;
+}
+
+// Get value by null-terminated name.
+cf_vmapx_err
+cf_vmapx_get_by_name(const cf_vmapx* vmap, const char* name, void** pp_value)
+{
+	size_t name_len = strlen(name);
+
+	// Names this long are never stored - cf_vmapx_put_unique_w_len() rejects
+	// them with the same test.
+	if (name_len >= vmap->key_size) {
+		return CF_VMAPX_ERR_NAME_NOT_FOUND;
+	}
+
+	uint32_t index;
+
+	if (! vhash_get(vmap->hash, name, name_len, &index)) {
+		return CF_VMAPX_ERR_NAME_NOT_FOUND;
+	}
+
+	*pp_value = vmapx_value_ptr(vmap, index);
+
+	return CF_VMAPX_OK;
+}
+
+// Get index by null-terminated name. May pass null p_index to check existence.
+// Returns CF_VMAPX_OK if found, CF_VMAPX_ERR_NAME_NOT_FOUND otherwise.
+cf_vmapx_err
+cf_vmapx_get_index(const cf_vmapx* vmap, const char* name, uint32_t* p_index)
+{
+	size_t name_len = strlen(name);
+
+	if (name_len >= vmap->key_size) {
+		return CF_VMAPX_ERR_NAME_NOT_FOUND;
+	}
+
+	return vhash_get(vmap->hash, name, name_len, p_index) ?
+			CF_VMAPX_OK : CF_VMAPX_ERR_NAME_NOT_FOUND;
+}
+
+// Same as above, but non-null-terminated name.
+cf_vmapx_err
+cf_vmapx_get_index_w_len(const cf_vmapx* vmap, const char* name,
+		size_t name_len, uint32_t* p_index)
+{
+	if (name_len >= vmap->key_size) {
+		return CF_VMAPX_ERR_NAME_NOT_FOUND;
+	}
+
+	return vhash_get(vmap->hash, name, name_len, p_index) ?
+			CF_VMAPX_OK : CF_VMAPX_ERR_NAME_NOT_FOUND;
+}
+
+// The value must begin with a string which is its name. (The hash map is not
+// stored in persistent memory, so names must be in the vector to enable us to
+// rebuild the hash map on warm or cool restart.)
+//
+// If name is not found, add new name, clear rest of value in vector, and return
+// newly assigned index (and CF_VMAPX_OK). If name is found, return index for
+// existing value (with CF_VMAPX_ERR_NAME_EXISTS). May pass null p_index.
+cf_vmapx_err
+cf_vmapx_put_unique(cf_vmapx* vmap, const char* name, uint32_t* p_index)
+{
+	return cf_vmapx_put_unique_w_len(vmap, name, strlen(name), p_index);
+}
+
+// Same as above, but with known name length.
+//
+// Returns CF_VMAPX_ERR_BAD_PARAM if the name is too long for a key or contains
+// a null character within name_len, and CF_VMAPX_ERR_FULL if the vmap already
+// holds max_count values.
+cf_vmapx_err
+cf_vmapx_put_unique_w_len(cf_vmapx* vmap, const char* name, size_t name_len,
+		uint32_t* p_index)
+{
+	// Make sure name fits in key's allocated size.
+	if (name_len >= vmap->key_size) {
+		return CF_VMAPX_ERR_BAD_PARAM;
+	}
+
+	pthread_mutex_lock(&vmap->write_lock);
+
+	// If name is found, return existing name's index (via p_index, if it was
+	// passed).
+	if (vhash_get(vmap->hash, name, name_len, p_index)) {
+		pthread_mutex_unlock(&vmap->write_lock);
+		return CF_VMAPX_ERR_NAME_EXISTS;
+	}
+
+	// Make sure name has no illegal premature null-terminator.
+	for (uint32_t i = 0; i < name_len; i++) {
+		if (name[i] == 0) {
+			pthread_mutex_unlock(&vmap->write_lock);
+			return CF_VMAPX_ERR_BAD_PARAM;
+		}
+	}
+
+	uint32_t count = vmap->count;
+
+	// If vmap is full, can't add more.
+	if (count >= vmap->max_count) {
+		pthread_mutex_unlock(&vmap->write_lock);
+		return CF_VMAPX_ERR_FULL;
+	}
+
+	// Add name to vector (and clear rest of value). Zeroing the whole value
+	// first also null-terminates the copied name.
+	char* value_ptr = (char*)vmapx_value_ptr(vmap, count);
+
+	memset((void*)value_ptr, 0, vmap->value_size);
+	memcpy((void*)value_ptr, name, name_len);
+
+	// Increment count here so indexes returned by other public API calls (just
+	// after adding to hash below) are guaranteed to be valid.
+	vmap->count++;
+
+	// Add to hash.
+	vhash_put(vmap->hash, value_ptr, name_len, count);
+
+	pthread_mutex_unlock(&vmap->write_lock);
+
+	if (p_index) {
+		*p_index = count;
+	}
+
+	return CF_VMAPX_OK;
+}
+
+
+//==========================================================
+// Private API - for enterprise separation only.
+//
+
+// Return value pointer at trusted index.
+void*
+vmapx_value_ptr(const cf_vmapx* vmap, uint32_t index)
+{
+	// Widen before multiplying, as cf_vmapx_sizeof() does, so the byte offset
+	// can't wrap when value_size * index doesn't fit in 32 bits.
+	return (void*)(vmap->values + ((size_t)vmap->value_size * index));
+}
+
+
+//==========================================================
+// vhash "scoped class".
+//
+
+// Custom hashmap for cf_vmapx usage.
+// - Elements are added but never removed.
+// - It's thread safe yet lockless. (Relies on cf_vmapx's write_lock.)
+// - Element keys are null-terminated strings.
+// - Element values are uint32_t's.
+ +struct vhash_s { + uint32_t key_size; + uint32_t ele_size; + uint32_t n_rows; + uint8_t* table; + bool row_usage[]; +}; + +typedef struct vhash_ele_s { + struct vhash_ele_s* next; + uint8_t data[]; // key_size bytes of key, 4 bytes of value +} vhash_ele; + +#define VHASH_ELE_KEY_PTR(_e) ((char*)_e->data) +#define VHASH_ELE_VALUE_PTR(_h, _e) ((uint32_t*)(_e->data + _h->key_size)) + +// Copy null-terminated key into hash, then pad with non-null characters. +// Padding ensures quicker compare in vhash_get() when key in hash is shorter, +// and prevents accidental match if key param has illegal null character(s). +static inline void +vhash_set_ele_key(char* ele_key, size_t key_size, const char* zkey, + size_t zkey_size) +{ + memcpy((void*)ele_key, (const void*)zkey, zkey_size); + memset((void*)(ele_key + zkey_size), 'x', key_size - zkey_size); +} + +// Create vhash with specified key size (max) and number or rows. +vhash* +vhash_create(uint32_t key_size, uint32_t n_rows) +{ + size_t row_usage_size = n_rows * sizeof(bool); + vhash* h = (vhash*)cf_malloc(sizeof(vhash) + row_usage_size); + + h->key_size = key_size; + h->ele_size = sizeof(vhash_ele) + key_size + sizeof(uint32_t); + h->n_rows = n_rows; + + size_t table_size = n_rows * h->ele_size; + + h->table = (uint8_t*)cf_malloc(table_size); + + memset((void*)h->row_usage, 0, row_usage_size); + memset((void*)h->table, 0, table_size); + + return h; +} + +// Destroy vhash. (Assumes it was fully created.) +void +vhash_destroy(vhash* h) +{ + vhash_ele* e_table = (vhash_ele*)h->table; + + for (uint32_t i = 0; i < h->n_rows; i++) { + if (e_table->next) { + vhash_ele* e = e_table->next; + + while (e) { + vhash_ele* t = e->next; + + cf_free(e); + e = t; + } + } + + e_table = (vhash_ele*)((uint8_t*)e_table + h->ele_size); + } + + cf_free(h->table); + cf_free(h); +} + +// Add element. Key must be null-terminated, although its length is known. 
+void +vhash_put(vhash* h, const char* zkey, size_t key_len, uint32_t value) +{ + uint64_t hashed_key = cf_hash_fnv32((const uint8_t*)zkey, key_len); + uint32_t row_i = (uint32_t)(hashed_key % h->n_rows); + + vhash_ele* e = (vhash_ele*)(h->table + (h->ele_size * row_i)); + + if (! h->row_usage[row_i]) { + vhash_set_ele_key(VHASH_ELE_KEY_PTR(e), h->key_size, zkey, key_len + 1); + *VHASH_ELE_VALUE_PTR(h, e) = value; + // TODO - need barrier? + h->row_usage[row_i] = true; + + return; + } + + vhash_ele* e_head = e; + + // This function is always called under write lock, after get, so we'll + // never encounter the key - don't bother checking it. + while (e) { + e = e->next; + } + + e = (vhash_ele*)cf_malloc(h->ele_size); + + vhash_set_ele_key(VHASH_ELE_KEY_PTR(e), h->key_size, zkey, key_len + 1); + *VHASH_ELE_VALUE_PTR(h, e) = value; + + e->next = e_head->next; + // TODO - need barrier? + e_head->next = e; +} + +// Get element value. Key may or may not be null-terminated. +bool +vhash_get(const vhash* h, const char* key, size_t key_len, uint32_t* p_value) +{ + uint64_t hashed_key = cf_hash_fnv32((const uint8_t*)key, key_len); + uint32_t row_i = (uint32_t)(hashed_key % h->n_rows); + + if (! h->row_usage[row_i]) { + return false; + } + + // TODO - need barrier? + vhash_ele* e = (vhash_ele*)(h->table + (h->ele_size * row_i)); + + while (e) { + if (VHASH_ELE_KEY_PTR(e)[key_len] == 0 && + memcmp(VHASH_ELE_KEY_PTR(e), key, key_len) == 0) { + if (p_value) { + *p_value = *VHASH_ELE_VALUE_PTR(h, e); + } + + return true; + } + + e = e->next; + } + + return false; +} diff --git a/make_in/Makefile.in b/make_in/Makefile.in new file mode 100644 index 00000000..de078017 --- /dev/null +++ b/make_in/Makefile.in @@ -0,0 +1,97 @@ +# Aerospike Server +# Makefile.in +# +# Define project global settings for compiler options. +# + +# [Note: "DEPTH" must be defined by the "include"ing Makefile.] 
+ +# Common variable definitions: +include $(DEPTH)/make_in/Makefile.vars + +CF_INCLUDE_DIR = $(DEPTH)/common/src/include +CF_LIB_DIR = $(DEPTH)/common/target/$(PLATFORM)/lib + +AS_CFLAGS += -D_FILE_OFFSET_BITS=64 -std=gnu99 -D_REENTRANT -D_GNU_SOURCE + +# Use the enhanced memory allocator (rather than the default version in the Common module.) +AS_CFLAGS += -DENHANCED_ALLOC + +LIBRARIES += -lcrypto + +LIBRARIES += -lpthread -lrt -ldl -lz -lm + +# Location of source files being compiled: +# "" (the default) means "src". Automatically set to "../m4/" during non-zero macro expansion build phases. +SRCDIR = + +INCLUDES += -I$(JEMALLOC)/include +LIBRARIES := $(JEMALLOC)/lib/libjemalloc.a $(LIBRARIES) + +# Popular values: +# x86_64 for 64-bit Intel +# i686 for 32-bit Intel +MARCH_NATIVE = $(shell uname -m) + +# If GCC v4.4.7 or later, use DWARF version 4, otherwise use version 2: +ifeq ($(shell $(DEPTH)/build/VersionCheck.py 'gcc -dumpversion' 4.4.7), 1) + DWARF_VERSION=4 +else + DWARF_VERSION=2 +endif + +# Override optimizations via: make O=n +O = 3 + +ifeq ($(DOPROFILE),1) + OPTFLAGS = -O$(O) -fPIE -pg -fprofile-arcs -ftest-coverage -DDOPROFILE +else + OPTFLAGS = -O$(O) +endif + +# Strict aliasing is really interesting. -fno-strict-aliasing relaxes the coding rules, but makes +# some code run a little slower. I'm not seeing a real difference at the moment, so turning it off +# I think to get the real speed, you turn on -fstrict-aliasing, and the appropriate -W, which +# generates a few warnings in our code where type punning is used for printf sanity. This +# also interacts with the restrict keyword, which I understand in theory, but attempts to use +# it throw errors out of the compiler so I don't understand it yet. +# +# Removing the frame pointers does add a few percent in speed, too, but we need better debugging +# at this point... +# +# And the jury's a little out on -mss3 and -msse4.
They aren't turned on by -march=native, +# even though native should understand that those are the preferred types, and available. +# +# Tree vectorize is turned on in the O3 mechanism. It's fascinating to turn on the tree vectorize +# debugs. Very rarely do we have loops that vectorize, because we often use functions in our loops. +# and, sometimes loops are vectorized but will need lengths greater than 20 or 30 to show speed improvements, +# loops of this size are unlikely in our code. +# +# O3 also enables -finline-functions, among other things. +COMMON_CFLAGS = -gdwarf-$(DWARF_VERSION) -g3 $(OPTFLAGS) -fno-common -fno-strict-aliasing -Wall $(AS_CFLAGS) $(AS_EE_CFLAGS) + +# Code generated for the "nocona" architecture has been determined to run well on a wide variety of current machines. +ifneq ($(ARCH),$(filter $(ARCH),ppc64 ppc64le)) + COMMON_CFLAGS += -march=nocona +endif + +# Generate dependency files. +COMMON_CFLAGS += -MMD + +# Require strict warning-free compilation. +COMMON_CFLAGS += -Werror + +CFLAGS = $(COMMON_CFLAGS) -DMARCH_$(MARCH_NATIVE) + +# Define a macro for the base source file name. +DEF_FN += -D__FILENAME__=\"$(notdir $<)\" + +# Alternative Compiler Flags Settings: +# Note: "native" is optimized for the build environment, which might not be the same as the deployment environment: +#CFLAGS_NATIVE = $(COMMON_CFLAGS) -march=native +#CFLAGS_64 = $(COMMON_CFLAGS) -DMARCH_x86_64 +#CFLAGS_32 = $(COMMON_CFLAGS) -DMARCH_i686 + +LDFLAGS = -rdynamic -L$(CF_LIB_DIR) + +STRIP = strip -p -v -s diff --git a/make_in/Makefile.targets b/make_in/Makefile.targets new file mode 100644 index 00000000..92f0904f --- /dev/null +++ b/make_in/Makefile.targets @@ -0,0 +1,16 @@ +# Aerospike Server +# Makefile.targets +# +# Common Makefile targets, dependencies, and pattern-matching rules.
+# + +strip: $(SERVER) + $(STRIP) $(SERVER) -o $(SERVER).stripped + +-include $(DEPENDENCIES) + +$(OBJECT_DIR)/%.o: %.c + $(CC) $(CFLAGS) $(DEF_FN) -o $@$(SUFFIX) -c $(INCLUDES) $(SRCDIR)$< + +$(OBJECT_DIR)/%.o: %.cc + $(CXX) $(CXXFLAGS) $(CFLAGS) $(DEF_FN) -o $@$(SUFFIX) -c $(INCLUDES) $(SRCDIR)$< diff --git a/make_in/Makefile.vars b/make_in/Makefile.vars new file mode 100644 index 00000000..54296e80 --- /dev/null +++ b/make_in/Makefile.vars @@ -0,0 +1,98 @@ +# Aerospike Server +# Makefile.vars +# +# Common Makefile variables. +# +# To enable or disable the following features, add <VARIABLE>=(1|0) to the "make" command line. +# E.g., to build without JEMalloc support, use: +# +# prompt% make USE_JEM=0 +# +# To link with the static or dynamic version of a library, add "LD_<LIBRARY>=(static|dynamic)", +# where <LIBRARY> is "CRYPTO", "LUA", "LUAJIT", "JANSSON" or "JEM", to the "make" command line. +# E.g., to build with JEMalloc dynamically linked, use: +# +# prompt% make LD_JEM=dynamic +# +# [Note: "EXT_CFLAGS" contains "external" CFLAGS passed to sub-module builds.] +# + +ifneq ($(EEREPO),) + include $(EEREPO)/make_in/Makefile.vars +endif + +# By default, build the community edition. +EDITION = community + +# Build host machine architecture. +ARCH = $(shell uname -m) + +# Use LuaJIT instead of Lua? [By default, yes.] +USE_LUAJIT = 1 +ifeq ($(ARCH),$(filter $(ARCH),ppc64 ppc64le)) + USE_LUAJIT = 0 +endif + +# Default mode used for linking the Jansson JSON API Library: +LD_JANSSON = static + +# Default mode used for linking the LuaJIT library: +LD_LUAJIT = static + +# Default mode used for linking the Lua library: +LD_LUA = static + +# Options to pass to Jansson's "configure" script. +JANSSON_CONFIG_OPT = + +# Options to pass to JEMalloc's "configure" script. +JEM_CONFIG_OPT = "EXTRA_CFLAGS=-I/opt/valgrind/include -I/usr/local/include" --with-jemalloc-prefix=jem_ + +EXT_CFLAGS += -DENHANCED_ALLOC + +# Set the default depth to the top level unless overridden: +DEPTH ?= .
+ +# Directory structure for build products: + +TARGET_DIR = $(DEPTH)/target + +PLATFORM = $(shell uname)-$(ARCH) +BUILD_DIR = $(TARGET_DIR)/$(PLATFORM) + +GEN_DIR = $(BUILD_DIR)/gen +INCLUDE_DIR = ../include $(GEN_DIR) +OBJECT_DIR = $(BUILD_DIR)/obj +LIBRARY_DIR = $(BUILD_DIR)/lib +BIN_DIR = $(BUILD_DIR)/bin + +# Auto-generated version files: +VERSION_SRC = $(GEN_DIR)/version.c +VERSION_OBJ = $(VERSION_SRC:$(GEN_DIR)/%.c=$(OBJECT_DIR)/%.o) + +# Paths to the submodules: +AI_PATH := $(realpath $(DEPTH)/ai) +AS_PATH := $(realpath $(DEPTH)/as) +CF_PATH := $(realpath $(DEPTH)/cf) +COMMON_PATH := $(realpath $(DEPTH)/modules/common) +JANSSON_PATH := $(realpath $(DEPTH)/modules/jansson) +MOD_LUA_PATH := $(realpath $(DEPTH)/modules/mod-lua) +LUA_CORE_PATH := $(realpath $(DEPTH)/modules/lua-core) +JEMALLOC_PATH := $(realpath $(DEPTH)/modules/jemalloc) +LUAJIT_PATH := $(realpath $(DEPTH)/modules/luajit) +S2_PATH := $(realpath $(DEPTH)/modules/s2-geometry-library/geometry) + +# Overridable values used by sub-makefiles: +AI = $(AI_PATH) +AS = $(AS_PATH) +CF = $(CF_PATH) +COMMON = $(COMMON_PATH) +JANSSON = $(JANSSON_PATH) +MOD_LUA = $(MOD_LUA_PATH) +LUA_CORE = $(LUA_CORE_PATH) +JEMALLOC = $(JEMALLOC_PATH) +LUAJIT = $(LUAJIT_PATH) +S2 = $(S2_PATH) + +# Programs, for which GNU Make doesn't define implicit variables: +OBJCOPY := objcopy diff --git a/modules/common b/modules/common new file mode 160000 index 00000000..fc2dd1df --- /dev/null +++ b/modules/common @@ -0,0 +1 @@ +Subproject commit fc2dd1df4268f15674752440d568918a58b40eb7 diff --git a/modules/jansson b/modules/jansson new file mode 160000 index 00000000..5cc594c9 --- /dev/null +++ b/modules/jansson @@ -0,0 +1 @@ +Subproject commit 5cc594c9e8bc01f9531f80aba82c9775bba94c18 diff --git a/modules/jemalloc b/modules/jemalloc new file mode 160000 index 00000000..92192432 --- /dev/null +++ b/modules/jemalloc @@ -0,0 +1 @@ +Subproject commit 921924328a0bf4204feb2a315415170b8370223c diff --git a/modules/lua-core 
b/modules/lua-core new file mode 160000 index 00000000..acb9eb1e --- /dev/null +++ b/modules/lua-core @@ -0,0 +1 @@ +Subproject commit acb9eb1ec2dda2c64375b28c7fb08a0518aadd27 diff --git a/modules/luajit b/modules/luajit new file mode 160000 index 00000000..6c4a1825 --- /dev/null +++ b/modules/luajit @@ -0,0 +1 @@ +Subproject commit 6c4a18258631ff01f963e9a1e64df57d7a453fd6 diff --git a/modules/mod-lua b/modules/mod-lua new file mode 160000 index 00000000..5293a5c1 --- /dev/null +++ b/modules/mod-lua @@ -0,0 +1 @@ +Subproject commit 5293a5c10567269ac194c705a6910277aff1d2a1 diff --git a/modules/s2-geometry-library b/modules/s2-geometry-library new file mode 160000 index 00000000..97562341 --- /dev/null +++ b/modules/s2-geometry-library @@ -0,0 +1 @@ +Subproject commit 975623412e292079b962bf73983bfb6ac63f3ba9 diff --git a/modules/telemetry b/modules/telemetry new file mode 160000 index 00000000..611e169a --- /dev/null +++ b/modules/telemetry @@ -0,0 +1 @@ +Subproject commit 611e169a7d60d803e0de0fc92c35364a3a94f33c diff --git a/pkg/deb/Makefile b/pkg/deb/Makefile new file mode 100644 index 00000000..45d327fb --- /dev/null +++ b/pkg/deb/Makefile @@ -0,0 +1,148 @@ +# Build Aerospike Server ".deb" Distribution. + +DEPTH = ../.. 
+include $(DEPTH)/make_in/Makefile.vars + +PKG = $(realpath $(DEPTH)/pkg) +SOURCE_ROOT = $(PKG)/dist +BUILD_ROOT = $(SOURCE_ROOT)/BUILD +OPT_AS = $(BUILD_ROOT)/opt/aerospike + +REV = $(shell $(DEPTH)/build/version) +OS = $(shell $(DEPTH)/build/os_version) +SIZE = $(shell du -k $(BIN_DIR)/asd | cut -f1) +DEPS = + +comma:= , +empty:= +space:= $(empty) $(empty) + +ifeq ($(OS),$(filter $(OS),debian8 ubuntu16.04)) + USE_SYSTEMD = 1 + CONF_VERSION = _systemd + DEPS = +else + DEPS = logrotate +endif + +ifeq ($(EDITION),community) + DEPS += python +endif + +DEB = $(PKG)/packages/aerospike-server-$(EDITION)-$(REV).$(OS).x86_64.deb + +ifeq ($(USE_EE),1) +all: dist-xdr package clean +else +all: dist package clean +endif + +.PHONY: dist +dist: + install -d $(BUILD_ROOT)/DEBIAN + install -d $(BUILD_ROOT)/etc/aerospike + install -d $(BUILD_ROOT)/etc/aerospike/sample +ifeq ($(USE_SYSTEMD),1) + install -d $(BUILD_ROOT)/usr/lib/systemd/system + install -d $(BUILD_ROOT)/etc/systemd/system/aerospike.service.d +else + install -d $(BUILD_ROOT)/etc/init.d + install -d $(BUILD_ROOT)/etc/logrotate.d + install -d $(BUILD_ROOT)/var/log/aerospike + install -d $(BUILD_ROOT)/var/run/aerospike +endif + install -d $(BUILD_ROOT)/usr/bin + + install -pm 644 $(PKG)/deb/conffiles $(BUILD_ROOT)/DEBIAN +ifeq ($(EDITION),community) + cat $(PKG)/deb/conffiles.telemetry >> $(BUILD_ROOT)/DEBIAN/conffiles +endif + install -pm 755 $(PKG)/deb/postinst.server $(BUILD_ROOT)/DEBIAN/postinst + + install -pm 755 $(BIN_DIR)/asd $(BUILD_ROOT)/usr/bin/asd +ifeq ($(USE_SYSTEMD),1) + install -pm 755 $(DEPTH)/tools/bin/asd-coldstart $(BUILD_ROOT)/usr/bin/asd-coldstart +endif + install -pm 755 $(DEPTH)/tools/citrus2aero/upgrade2to3 $(BUILD_ROOT)/usr/bin/asmigrate2to3 + install -pm 755 $(DEPTH)/tools/fixownership/fixownership.py $(BUILD_ROOT)/usr/bin/asfixownership + install -pm 755 $(DEPTH)/as/etc/irqbalance-ban.sh $(BUILD_ROOT)/etc/aerospike/irqbalance-ban.sh + install -pm 644 
$(DEPTH)/as/etc/aerospike$(CONF_VERSION).conf $(BUILD_ROOT)/etc/aerospike/aerospike.conf + cat $(DEPTH)/as/etc/README.sample.conf.md > $(BUILD_ROOT)/etc/aerospike/sample/README.md + install -pm 644 $(DEPTH)/as/etc/aerospike_ssd$(CONF_VERSION).conf $(BUILD_ROOT)/etc/aerospike/sample/aerospike_ssd.conf + install -pm 644 $(DEPTH)/as/etc/aerospike_mesh$(CONF_VERSION).conf $(BUILD_ROOT)/etc/aerospike/sample/aerospike_mesh.conf + +ifeq ($(USE_SYSTEMD),1) + cat $(DEPTH)/as/etc/aerospike.service.head >> $(PKG)/deb/aerospike.service + ifeq ($(EDITION),community) + cat $(DEPTH)/as/etc/aerospike.service.telemetry >> $(PKG)/deb/aerospike.service + endif + cat $(DEPTH)/as/etc/aerospike.service.tail >> $(PKG)/deb/aerospike.service + install -p -D -m 644 $(PKG)/deb/aerospike.service $(BUILD_ROOT)/usr/lib/systemd/system/aerospike.service + install -p -D -m 644 $(DEPTH)/as/etc/aerospike-server.tmpfiles $(BUILD_ROOT)/etc/tmpfiles.d/aerospike.conf + install -p -D -m 644 $(DEPTH)/as/etc/aerospike-server.sysconfig $(BUILD_ROOT)/etc/sysconfig/aerospike + install -p -D -m 755 $(DEPTH)/as/etc/asd-systemd-helper $(BUILD_ROOT)/usr/bin/asd-systemd-helper + install -p -D -m 644 $(DEPTH)/as/etc/aerospike.service.d/* $(BUILD_ROOT)/etc/systemd/system/aerospike.service.d +else + install -pm 755 $(DEPTH)/as/etc/init-script.deb $(BUILD_ROOT)/etc/init.d/aerospike + sed -i 's/@EDITION@/$(EDITION)/g' $(BUILD_ROOT)/etc/init.d/aerospike + install -pm 644 $(DEPTH)/as/etc/logrotate_asd $(BUILD_ROOT)/etc/logrotate.d/aerospike +endif + + install -d $(OPT_AS)/doc +ifeq ($(EDITION),community) + install -pm 644 $(DEPTH)/LICENSE.CE $(OPT_AS)/doc/LICENSE + install -pm 644 $(DEPTH)/LICENSE-AGPL $(OPT_AS)/doc + install -pm 644 $(DEPTH)/LICENSE-APACHE $(OPT_AS)/doc +else + install -pm 644 $(EEREPO)/LICENSE.EE $(OPT_AS)/doc/LICENSE +endif + cat $(DEPTH)/LICENSE.3rdParty >> $(OPT_AS)/doc/LICENSE + +ifeq ($(EDITION),community) + ifeq ($(USE_SYSTEMD),1) + install -pm 755 $(DEPTH)/as/etc/aerospike_telemetry.service 
$(BUILD_ROOT)/usr/lib/systemd/system/aerospike_telemetry.service + install -pm 644 $(DEPTH)/as/etc/aerospike_telemetry.sysconfig $(BUILD_ROOT)/etc/sysconfig/aerospike_telemetry + else + install -pm 755 $(DEPTH)/as/etc/init-telemetry-script.deb $(BUILD_ROOT)/etc/init.d/aerospike_telemetry + install -pm 644 $(DEPTH)/as/etc/logrotate_telemetry $(BUILD_ROOT)/etc/logrotate.d/aerospike_telemetry + endif + install -d $(OPT_AS)/telemetry + install -d $(OPT_AS)/telemetry/phonehome + install -d $(OPT_AS)/telemetry/daemon + install -pm 644 $(DEPTH)/as/etc/telemetry.conf $(BUILD_ROOT)/etc/aerospike + install -pm 644 $(DEPTH)/modules/telemetry/README.md $(OPT_AS)/doc/TELEMETRY.md + install -pm 755 $(DEPTH)/modules/telemetry/telemetry.py $(OPT_AS)/telemetry + install -pm 755 $(DEPTH)/modules/telemetry/phonehome/*.py $(OPT_AS)/telemetry/phonehome + install -pm 755 $(DEPTH)/modules/telemetry/daemon/*.py $(OPT_AS)/telemetry/daemon +endif + + install -d $(OPT_AS)/data + install -d $(OPT_AS)/smd + install -d $(OPT_AS)/sys/udf/lua/external + install -d $(OPT_AS)/usr/udf/lua + install -pm 644 $(DEPTH)/modules/lua-core/src/*.lua $(OPT_AS)/sys/udf/lua + for FILE in `find $(DEPTH)/modules/lua-core/src/external -type f` ; do \ + install -pm 644 $$FILE $(OPT_AS)/sys/udf/lua/external ; \ + done + + install -d $(OPT_AS)/bin + install -pm 755 $(DEPTH)/tools/memacct/asparsemem $(OPT_AS)/bin + + sed 's/@VERSION@/'$(REV)'/g' < $(PKG)/deb/server-64 > $(BUILD_ROOT)/DEBIAN/control + sed -i 's/@EDITION@/'$(EDITION)'/g' $(BUILD_ROOT)/DEBIAN/control + sed -i 's/@SIZE@/'$(SIZE)'/g' $(BUILD_ROOT)/DEBIAN/control + sed -i 's/@DEPS@/$(addprefix $(comma), $(subst $(space),$(comma),$(strip $(DEPS))))/g' $(BUILD_ROOT)/DEBIAN/control + +package: + install -pm 644 $(OPT_AS)/doc/LICENSE $(PKG)/packages + fakeroot dpkg-deb --build $(BUILD_ROOT) $(DEB) + +clean: + rm -rf $(SOURCE_ROOT)/* +ifeq ($(USE_SYSTEMD),1) + rm -rf $(PKG)/deb/aerospike.service +endif + +ifeq ($(USE_EE),1) + include 
$(XDR)/make_in/Makefile.deb.in
+endif
diff --git a/pkg/deb/asinstall b/pkg/deb/asinstall
new file mode 100755
index 00000000..2fe45fec
--- /dev/null
+++ b/pkg/deb/asinstall
@@ -0,0 +1,66 @@
+#!/bin/bash
+# Install Aerospike server and tools on Debian6/Debian7/Ubuntu12.
+# This script must be run as root or sudo.
+#
+# Any arguments are passed to dpkg in place of the default "-i". The
+# aerospike-tools-*.deb and aerospike-server-*.deb packages are looked up in
+# the current directory.
+
+#------------------
+# Verify User
+#------------------
+
+if [ $EUID -ne 0 ]
+then
+	echo "This script requires root or sudo privileges."
+	exit 1
+fi
+
+#---------------
+# Check argparse
+#---------------
+
+# This script runs as root, so the probe is written to a freshly created,
+# unpredictable temporary file rather than to a fixed name in /tmp.
+fn=$(mktemp) || exit 1
+
+# The probe prints 1 if the argparse module can be imported, 0 if not.
+cat <<EOF >"$fn"
+try:
+    import argparse
+    print(1)
+except:
+    print(0)
+EOF
+
+has_argparse=`python "$fn"`
+rm -f "$fn"
+
+if [ "$has_argparse" = "0" ]
+then
+	echo Installing python-argparse
+	apt-get -y install python-argparse
+fi
+
+#---------------
+# Install tools
+#---------------
+echo Installing tools
+
+# Use default arguments if none passed in.
+if [ $# -eq 0 ]
+then
+	echo dpkg -i aerospike-tools-*.deb
+	dpkg -i aerospike-tools-*.deb
+else
+	echo dpkg "$@" aerospike-tools-*.deb
+	dpkg "$@" aerospike-tools-*.deb
+fi
+
+#---------------
+# Install server
+#---------------
+echo Installing server
+
+# Use default arguments if none passed in.
+if [ $# -eq 0 ]
+then
+	echo dpkg -i aerospike-server-*.deb
+	dpkg -i aerospike-server-*.deb
+else
+	echo dpkg "$@" aerospike-server-*.deb
+	dpkg "$@" aerospike-server-*.deb
+fi
+
diff --git a/pkg/deb/conffiles b/pkg/deb/conffiles
new file mode 100644
index 00000000..897d3824
--- /dev/null
+++ b/pkg/deb/conffiles
@@ -0,0 +1 @@
+/etc/aerospike/aerospike.conf
diff --git a/pkg/deb/conffiles.telemetry b/pkg/deb/conffiles.telemetry
new file mode 100644
index 00000000..d34ff127
--- /dev/null
+++ b/pkg/deb/conffiles.telemetry
@@ -0,0 +1 @@
+/etc/aerospike/telemetry.conf
diff --git a/pkg/deb/copyright b/pkg/deb/copyright
new file mode 100644
index 00000000..2cc8da93
--- /dev/null
+++ b/pkg/deb/copyright
@@ -0,0 +1,9 @@
+Aerospike Server
+
+Copyright: Aerospike, Inc
+
+These files are owned by Aerospike, Inc.
+
+Permission to use is covered by customer agreements signed by Aerospike.
+Please see your customer agreements or non-disclosure agreements
+for distribution and re-distribution rights.
diff --git a/pkg/deb/postinst.server b/pkg/deb/postinst.server
new file mode 100755
index 00000000..70847520
--- /dev/null
+++ b/pkg/deb/postinst.server
@@ -0,0 +1,31 @@
+#!/bin/sh
+
+set -e
+
+case "$1" in
+	configure)
+
+		# create aerospike group if it isn't already there
+		if ! getent group aerospike >/dev/null; then
+			groupadd -r aerospike
+		fi
+
+		# create aerospike user if it isn't already there
+		if !
getent passwd aerospike >/dev/null; then + useradd -r -d /opt/aerospike -c 'Aerospike server' -g aerospike -s /sbin/nologin aerospike + fi + + for dir in /opt/aerospike /var/log/aerospike /var/run/aerospike ; do + if [ -d $dir ]; then + chown -R aerospike:aerospike $dir + fi + done + + if [ -d /run/systemd/system ]; then + systemctl --system daemon-reload >/dev/null 2>&1 || true + fi + + ;; +esac + +exit 0 diff --git a/pkg/deb/server-64 b/pkg/deb/server-64 new file mode 100644 index 00000000..3ec899b5 --- /dev/null +++ b/pkg/deb/server-64 @@ -0,0 +1,9 @@ +Package: aerospike-server-@EDITION@ +Version: @VERSION@-1 +Section: Databases +Priority: optional +Architecture: amd64 +Depends: libc6 (>= 2.7)@DEPS@ +Maintainer: Aerospike, Inc. +Description: The Aerospike distributed datastore allows fully scalable and reliable data storage with elastic server properties. +Installed-Size: @SIZE@ diff --git a/pkg/dist/.gitignore b/pkg/dist/.gitignore new file mode 100644 index 00000000..13e4d83e --- /dev/null +++ b/pkg/dist/.gitignore @@ -0,0 +1 @@ +[^.]* diff --git a/pkg/packages/.gitignore b/pkg/packages/.gitignore new file mode 100644 index 00000000..13e4d83e --- /dev/null +++ b/pkg/packages/.gitignore @@ -0,0 +1 @@ +[^.]* diff --git a/pkg/rpm/Makefile b/pkg/rpm/Makefile new file mode 100644 index 00000000..9d7cc6e0 --- /dev/null +++ b/pkg/rpm/Makefile @@ -0,0 +1,158 @@ +# Build Aerospike Server RPM Distribution. + +DEPTH = ../.. 
+include $(DEPTH)/make_in/Makefile.vars + +PKG = $(realpath $(DEPTH)/pkg) +SOURCE_ROOT = $(PKG)/dist +BUILD_ROOT = $(SOURCE_ROOT)/BUILD +OPT_AS = $(BUILD_ROOT)/opt/aerospike + +REV = $(shell $(DEPTH)/build/version | sed 's/-/_/g') +OS = $(shell $(DEPTH)/build/os_version) + +ifeq ($(OS),el7) + USE_SYSTEMD = 1 + CONF_VERSION = _systemd +endif + +ifeq ($(USE_EE),1) +all: dist-xdr package clean +else +all: dist package clean +endif + +.PHONY:dist +dist: + install -d $(BUILD_ROOT)/etc/aerospike + install -d $(BUILD_ROOT)/etc/aerospike/sample + +ifeq ($(USE_SYSTEMD),1) + install -d $(BUILD_ROOT)/usr/lib/systemd/system + install -d $(BUILD_ROOT)/etc/systemd/system/aerospike.service.d +else + install -d $(BUILD_ROOT)/var/log/aerospike + install -d $(BUILD_ROOT)/var/run/aerospike + install -d $(BUILD_ROOT)/etc/init.d + install -d $(BUILD_ROOT)/etc/logrotate.d +endif + install -d $(BUILD_ROOT)/usr/bin + + install -pm 755 $(BIN_DIR)/asd $(BUILD_ROOT)/usr/bin/asd +ifeq ($(USE_SYSTEMD),1) + install -pm 755 $(DEPTH)/tools/bin/asd-coldstart $(BUILD_ROOT)/usr/bin/asd-coldstart +endif + install -pm 755 $(DEPTH)/tools/citrus2aero/upgrade2to3 $(BUILD_ROOT)/usr/bin/asmigrate2to3 + install -pm 755 $(DEPTH)/tools/fixownership/fixownership.py $(BUILD_ROOT)/usr/bin/asfixownership + install -pm 755 $(DEPTH)/as/etc/irqbalance-ban.sh $(BUILD_ROOT)/etc/aerospike/irqbalance-ban.sh + install -pm 644 $(DEPTH)/as/etc/aerospike$(CONF_VERSION).conf $(BUILD_ROOT)/etc/aerospike/aerospike.conf + cat $(DEPTH)/as/etc/README.sample.conf.md > $(BUILD_ROOT)/etc/aerospike/sample/README.md + install -pm 644 $(DEPTH)/as/etc/aerospike_ssd$(CONF_VERSION).conf $(BUILD_ROOT)/etc/aerospike/sample/aerospike_ssd.conf + install -pm 644 $(DEPTH)/as/etc/aerospike_mesh$(CONF_VERSION).conf $(BUILD_ROOT)/etc/aerospike/sample/aerospike_mesh.conf + +ifeq ($(USE_SYSTEMD),1) + cat $(DEPTH)/as/etc/aerospike.service.head >> $(PKG)/rpm/aerospike.service + ifeq ($(EDITION),community) + cat 
$(DEPTH)/as/etc/aerospike.service.telemetry >> $(PKG)/rpm/aerospike.service + endif + cat $(DEPTH)/as/etc/aerospike.service.tail >> $(PKG)/rpm/aerospike.service + install -p -D -m 644 $(PKG)/rpm/aerospike.service $(BUILD_ROOT)/usr/lib/systemd/system/aerospike.service + install -p -D -m 644 $(DEPTH)/as/etc/aerospike-server.tmpfiles $(BUILD_ROOT)/etc/tmpfiles.d/aerospike.conf + install -p -D -m 644 $(DEPTH)/as/etc/aerospike-server.sysconfig $(BUILD_ROOT)/etc/sysconfig/aerospike + install -p -D -m 755 $(DEPTH)/as/etc/asd-systemd-helper $(BUILD_ROOT)/usr/bin/asd-systemd-helper + install -p -D -m 644 $(DEPTH)/as/etc/aerospike.service.d/* $(BUILD_ROOT)/etc/systemd/system/aerospike.service.d +else + install -pm 755 $(DEPTH)/as/etc/init-script $(BUILD_ROOT)/etc/init.d/aerospike + sed -i 's/@EDITION@/$(EDITION)/g' $(BUILD_ROOT)/etc/init.d/aerospike + install -pm 644 $(DEPTH)/as/etc/logrotate_asd $(BUILD_ROOT)/etc/logrotate.d/aerospike +endif + + install -d $(OPT_AS)/doc +ifeq ($(EDITION),community) + install -pm 644 $(DEPTH)/LICENSE.CE $(OPT_AS)/doc/LICENSE + install -pm 644 $(DEPTH)/LICENSE-AGPL $(OPT_AS)/doc + install -pm 644 $(DEPTH)/LICENSE-APACHE $(OPT_AS)/doc +else + install -pm 644 $(EEREPO)/LICENSE.EE $(OPT_AS)/doc/LICENSE +endif + cat $(DEPTH)/LICENSE.3rdParty >> $(OPT_AS)/doc/LICENSE + +ifeq ($(EDITION),community) + ifeq ($(USE_SYSTEMD),1) + install -pm 644 $(DEPTH)/as/etc/aerospike_telemetry.service $(BUILD_ROOT)/usr/lib/systemd/system/aerospike_telemetry.service + install -pm 644 $(DEPTH)/as/etc/aerospike_telemetry.sysconfig $(BUILD_ROOT)/etc/sysconfig/aerospike_telemetry + else + install -pm 755 $(DEPTH)/as/etc/init-telemetry-script $(BUILD_ROOT)/etc/init.d/aerospike_telemetry + install -pm 644 $(DEPTH)/as/etc/logrotate_telemetry $(BUILD_ROOT)/etc/logrotate.d/aerospike_telemetry + endif + install -d $(OPT_AS)/telemetry + install -d $(OPT_AS)/telemetry/phonehome + install -d $(OPT_AS)/telemetry/daemon + install -pm 644 $(DEPTH)/as/etc/telemetry.conf
$(BUILD_ROOT)/etc/aerospike + install -pm 644 $(DEPTH)/modules/telemetry/README.md $(OPT_AS)/doc/TELEMETRY.md + install -pm 755 $(DEPTH)/modules/telemetry/telemetry.py $(OPT_AS)/telemetry + install -pm 755 $(DEPTH)/modules/telemetry/phonehome/*.py $(OPT_AS)/telemetry/phonehome + install -pm 755 $(DEPTH)/modules/telemetry/daemon/*.py $(OPT_AS)/telemetry/daemon +endif + + install -d $(OPT_AS)/data + install -d $(OPT_AS)/smd + install -d $(OPT_AS)/sys/udf/lua/external + install -d $(OPT_AS)/usr/udf/lua + install -pm 644 $(DEPTH)/modules/lua-core/src/*.lua $(OPT_AS)/sys/udf/lua + for FILE in `find $(DEPTH)/modules/lua-core/src/external -type f` ; do \ + install -pm 644 $$FILE $(OPT_AS)/sys/udf/lua/external ; \ + done + + install -d $(OPT_AS)/bin + install -pm 755 $(DEPTH)/tools/memacct/asparsemem $(OPT_AS)/bin + +package: + install -pm 644 $(OPT_AS)/doc/LICENSE $(PKG)/packages + install -d $(SOURCE_ROOT)/RPMS/x86_64 + + sed 's/@VERSION@/'$(REV)'/g' < $(PKG)/rpm/server-spec-base > $(PKG)/rpm/aerospike.spec +ifneq ($(USE_SYSTEMD),1) + cat $(PKG)/rpm/server-spec-logrotate >> $(PKG)/rpm/aerospike.spec +endif + cat $(PKG)/rpm/server-spec-files >> $(PKG)/rpm/aerospike.spec +ifeq ($(USE_SYSTEMD),1) + cat $(PKG)/rpm/server-spec-systemd >> $(PKG)/rpm/aerospike.spec +else + cat $(PKG)/rpm/server-spec-sysv >> $(PKG)/rpm/aerospike.spec +endif + +ifeq ($(USE_EE),1) + cat $(EEREPO)/pkg/rpm/xdr-files >> $(PKG)/rpm/aerospike.spec +endif + cat $(PKG)/rpm/server-spec-config >> $(PKG)/rpm/aerospike.spec +ifeq ($(EDITION),community) + cat $(PKG)/rpm/server-spec-telemetry >> $(PKG)/rpm/aerospike.spec + ifeq ($(USE_SYSTEMD),1) + cat $(PKG)/rpm/server-spec-telemetry-systemd >> $(PKG)/rpm/aerospike.spec + else + cat $(PKG)/rpm/server-spec-telemetry-sysv >> $(PKG)/rpm/aerospike.spec + endif +endif + cat $(PKG)/rpm/server-spec-scripts >> $(PKG)/rpm/aerospike.spec +ifeq ($(USE_SYSTEMD),1) + cat $(PKG)/rpm/server-spec-scripts-systemd >> $(PKG)/rpm/aerospike.spec +endif + + sed -i 
's/@RELEASE@/'$(OS)'/g' $(PKG)/rpm/aerospike.spec + sed -i 's/@EDITION@/'$(EDITION)'/g' $(PKG)/rpm/aerospike.spec + + cd $(DEPTH); rpmbuild -bb -vv --define "dist .$(OS)" --buildroot $(BUILD_ROOT) $(PKG)/rpm/aerospike.spec + + find $(SOURCE_ROOT)/RPMS -type f -exec mv {} $(PKG)/packages \; + +clean: + rm -rf $(PKG)/rpm/aerospike.spec +ifeq ($(USE_SYSTEMD),1) + rm -rf $(PKG)/rpm/aerospike.service +endif + rm -rf $(SOURCE_ROOT)/* + +ifeq ($(USE_EE),1) + include $(EEREPO)/xdr/make_in/Makefile.rpm.in +endif diff --git a/pkg/rpm/asinstall b/pkg/rpm/asinstall new file mode 100755 index 00000000..ab592763 --- /dev/null +++ b/pkg/rpm/asinstall @@ -0,0 +1,66 @@ +#!/bin/bash +# Install Aerospike server and tools on RHEL/Fedora/Centos 6. +# This script must be run as root or sudo. + +#------------------ +# Verify User +#------------------ + +if [ $EUID -ne 0 ] +then + echo "This script requires root or sudo privileges." + exit 1 +fi + +#--------------- +# Check argparse +#--------------- +fn=/tmp/pkgexists + +cat <<EOF >$fn +try: + import argparse + print(1) +except: + print(0) +EOF + +has_argparse=`python $fn` +rm $fn + +if [ "$has_argparse" = "0" ] +then + echo Installing python-argparse + rpm -Uvh python-argparse-1.2.1-2.el6.noarch.rpm +fi + +#--------------- +# Install tools +#--------------- +echo Installing tools + +# Use default arguments if none passed in. +if [ $# -eq 0 ] +then + echo rpm -Uvh aerospike-tools-*.rpm + rpm -Uvh aerospike-tools-*.rpm +else + echo rpm "$@" aerospike-tools-*.rpm + rpm "$@" aerospike-tools-*.rpm +fi + +#--------------- +# Install server +#--------------- +echo Installing server + +# Use default arguments if none passed in.
+if [ $# -eq 0 ] +then + echo rpm -Uvh aerospike-server-*.rpm + rpm -Uvh aerospike-server-*.rpm +else + echo rpm "$@" aerospike-server-*.rpm + rpm "$@" aerospike-server-*.rpm +fi + diff --git a/pkg/rpm/server-spec-base b/pkg/rpm/server-spec-base new file mode 100644 index 00000000..ed89d8eb --- /dev/null +++ b/pkg/rpm/server-spec-base @@ -0,0 +1,20 @@ +Name: aerospike +Version: @VERSION@ +Release: 1%{?dist} +Summary: The Aerospike Database +License: Proprietary +Group: Application +BuildArch: x86_64 +Vendor: Aerospike, Inc. + +%description +The Aerospike distributed datastore allows fully scalable +and reliable data storage with elastic server properties. + +%define _topdir pkg/dist +%define __spec_install_post /usr/lib/rpm/brp-compress +%package server-@EDITION@ +Summary: Aerospike server +Group: Applications +%description server-@EDITION@ +This package contains all of the code for running the Aerospike server. diff --git a/pkg/rpm/server-spec-config b/pkg/rpm/server-spec-config new file mode 100644 index 00000000..8aa3e1b6 --- /dev/null +++ b/pkg/rpm/server-spec-config @@ -0,0 +1,10 @@ +%defattr(-,aerospike,aerospike) +/opt/aerospike +%defattr(-,root,root) +%config(noreplace) /etc/aerospike/aerospike.conf +%dir /etc/aerospike +/etc/aerospike/irqbalance-ban.sh +%dir /etc/aerospike/sample +/etc/aerospike/sample/README.md +/etc/aerospike/sample/aerospike_ssd.conf +/etc/aerospike/sample/aerospike_mesh.conf diff --git a/pkg/rpm/server-spec-files b/pkg/rpm/server-spec-files new file mode 100644 index 00000000..3ed2b03a --- /dev/null +++ b/pkg/rpm/server-spec-files @@ -0,0 +1,5 @@ +%files server-@EDITION@ +%defattr(-,root,root) +/usr/bin/asd +/usr/bin/asmigrate2to3 +/usr/bin/asfixownership diff --git a/pkg/rpm/server-spec-logrotate b/pkg/rpm/server-spec-logrotate new file mode 100644 index 00000000..5e0079e4 --- /dev/null +++ b/pkg/rpm/server-spec-logrotate @@ -0,0 +1 @@ +Requires: logrotate diff --git a/pkg/rpm/server-spec-scripts b/pkg/rpm/server-spec-scripts new file 
mode 100644 index 00000000..31475f10 --- /dev/null +++ b/pkg/rpm/server-spec-scripts @@ -0,0 +1,9 @@ +%pre server-@EDITION@ +if ! getent group aerospike >/dev/null 2>&1; then + echo "Adding group aerospike" + /usr/sbin/groupadd -r aerospike +fi +if ! id -u aerospike >/dev/null 2>&1; then + echo "Adding user aerospike" + /usr/sbin/useradd -r -d /opt/aerospike -c 'Aerospike server' -g aerospike -s /sbin/nologin aerospike +fi diff --git a/pkg/rpm/server-spec-scripts-systemd b/pkg/rpm/server-spec-scripts-systemd new file mode 100644 index 00000000..78f1cd53 --- /dev/null +++ b/pkg/rpm/server-spec-scripts-systemd @@ -0,0 +1,4 @@ +%post server-@EDITION@ +/bin/systemctl --system daemon-reload &> /dev/null || : +%postun server-@EDITION@ +/bin/systemctl --system daemon-reload &> /dev/null || : diff --git a/pkg/rpm/server-spec-systemd b/pkg/rpm/server-spec-systemd new file mode 100644 index 00000000..f5921d39 --- /dev/null +++ b/pkg/rpm/server-spec-systemd @@ -0,0 +1,8 @@ +/etc/systemd/system/aerospike.service.d/aerospike.conf +/etc/systemd/system/aerospike.service.d/aerospike.conf.coldstart +/etc/systemd/system/aerospike.service.d/aerospike.conf.default +/usr/bin/asd-coldstart +/usr/bin/asd-systemd-helper +/usr/lib/systemd/system/aerospike.service +%config /etc/tmpfiles.d/aerospike.conf +%config(noreplace) /etc/sysconfig/aerospike diff --git a/pkg/rpm/server-spec-sysv b/pkg/rpm/server-spec-sysv new file mode 100644 index 00000000..107f2443 --- /dev/null +++ b/pkg/rpm/server-spec-sysv @@ -0,0 +1,4 @@ +%config(noreplace) /etc/logrotate.d/aerospike +/etc/init.d/aerospike +%dir /var/log/aerospike +%dir /var/run/aerospike diff --git a/pkg/rpm/server-spec-telemetry b/pkg/rpm/server-spec-telemetry new file mode 100644 index 00000000..d5ab2650 --- /dev/null +++ b/pkg/rpm/server-spec-telemetry @@ -0,0 +1 @@ +%config(noreplace) /etc/aerospike/telemetry.conf diff --git a/pkg/rpm/server-spec-telemetry-systemd b/pkg/rpm/server-spec-telemetry-systemd new file mode 100644 index
00000000..061fa986 --- /dev/null +++ b/pkg/rpm/server-spec-telemetry-systemd @@ -0,0 +1,2 @@ +/usr/lib/systemd/system/aerospike_telemetry.service +%config(noreplace) /etc/sysconfig/aerospike_telemetry diff --git a/pkg/rpm/server-spec-telemetry-sysv b/pkg/rpm/server-spec-telemetry-sysv new file mode 100644 index 00000000..c08cb9ad --- /dev/null +++ b/pkg/rpm/server-spec-telemetry-sysv @@ -0,0 +1,2 @@ +/etc/init.d/aerospike_telemetry +%config(noreplace) /etc/logrotate.d/aerospike_telemetry diff --git a/pkg/src/Makefile b/pkg/src/Makefile new file mode 100644 index 00000000..99262638 --- /dev/null +++ b/pkg/src/Makefile @@ -0,0 +1,29 @@ +# Build Aerospike source distribution. + +DEPTH = ../.. +include $(DEPTH)/make_in/Makefile.vars + +REPO=$(realpath $(DEPTH)) +PKG = $(REPO)/pkg +DIST = $(PKG)/dist +SOURCE = $(DIST)/SOURCE +SOURCE_ASD = $(SOURCE)/aerospike-server + +REV = $(shell $(DEPTH)/build/version) + +# Name of the source package: +ARCHIVE = $(DEPTH)/pkg/packages/aerospike-server-$(EDITION)-$(REV).src.tar.bz2 + +all: dist package clean + + +.PHONY:dist +dist: + bash git-cp-files.sh $(REPO) $(SOURCE_ASD) + +.PHONY: package +package: dist + tar cvfj $(ARCHIVE) -C $(SOURCE) aerospike-server + +clean: + rm -rf $(SOURCE) \ No newline at end of file diff --git a/pkg/src/git-cp-files.sh b/pkg/src/git-cp-files.sh new file mode 100755 index 00000000..f5516d66 --- /dev/null +++ b/pkg/src/git-cp-files.sh @@ -0,0 +1,47 @@ +#!/usr/bin/env bash + +CWD=$(pwd) +SCRIPT=${BASH_SOURCE[0]} +SCRIPT_PATH=$( cd "$( dirname "${SCRIPT}" )" && pwd ) +SCRIPT_BASE=$( basename "${SCRIPT}" ) + +SOURCE=${1} +TARGET=${2} +DEPTH=${3} + +if [ -z "${SOURCE}" ]; then + echo "ERROR: Missing SOURCE argument." >&2 + exit 1 +fi +if [ -z "${TARGET}" ]; then + echo "ERROR: Missing TARGET argument." >&2 + exit 1 +fi +if [ ! 
-d ${SOURCE} ]; then + echo "ERROR: SOURCE not found: ${SOURCE}" >&2 + exit 1 +fi + +if [ -n "${DEPTH}" ]; then + if [ ${DEPTH} -eq 0 ]; then + exit 0 + fi + DEPTH=$((DEPTH - 1)) +fi + +mkdir -p ${TARGET} + +IFS=$'\n' +for file in $(cd ${SOURCE} && git ls-files --abbrev); do + if [ -f ${SOURCE}/${file} ]; then + dir=$(dirname ${file}) + if [ ! -z "${dir}" ] && [ ! -d ${TARGET}/${dir} ]; then + mkdir -p "${TARGET}/${dir}" + fi + cp -a "${SOURCE}/${file}" "${TARGET}/${file}" + fi +done + +for module in $(cd ${SOURCE} && git submodule status | awk '{print $2}'); do + bash ${SCRIPT_PATH}/${SCRIPT_BASE} ${SOURCE}/${module} ${TARGET}/${module} ${DEPTH} +done diff --git a/pkg/tar/Makefile b/pkg/tar/Makefile new file mode 100644 index 00000000..260838d4 --- /dev/null +++ b/pkg/tar/Makefile @@ -0,0 +1,71 @@ +# Build Aerospike TAR distribution. + +DEPTH = ../.. +include $(DEPTH)/make_in/Makefile.vars + +PKG = $(realpath $(DEPTH)/pkg) +SOURCE = $(PKG)/tar +SOURCE_ROOT = $(PKG)/dist +BUILD_ROOT = $(SOURCE_ROOT)/BUILD +TARGET = $(BUILD_ROOT)/aerospike-server + +REV = $(shell $(DEPTH)/build/version) + +ARCHIVE = $(PKG)/packages/aerospike-server-$(EDITION)-$(REV).tar + +all: dist package clean + +.PHONY: dist +dist: + @# create directory + install -pm 755 -Dd $(TARGET) + + @# docs + install -pm 644 -D $(SOURCE)/README $(TARGET)/README + install -pm 644 -D $(DEPTH)/LICENSE $(TARGET)/LICENSE + + @# binaries + install -pm 755 -Dd $(TARGET)/bin + install -pm 755 -D $(BIN_DIR)/asd $(TARGET)/bin/asd + install -pm 755 -D $(SOURCE)/bin/aerospike $(TARGET)/bin/aerospike + + @# share binaries + install -pm 755 -Dd $(TARGET)/share/bin + install -pm 755 -D $(SOURCE)/share/bin/aerospike $(TARGET)/share/bin/aerospike + + @# share libraries + install -pm 755 -Dd $(TARGET)/share/lib + install -pm 644 -D $(SOURCE)/share/lib/* $(TARGET)/share/lib/. + + @# share libexecs + install -pm 755 -Dd $(TARGET)/share/libexec + install -pm 644 -D $(SOURCE)/share/libexec/* $(TARGET)/share/libexec/. 
+ + @# share manpages + install -pm 755 -Dd $(TARGET)/share/man + install -pm 644 -D $(SOURCE)/share/man/* $(TARGET)/share/man/. + + @# share configs + install -pm 755 -Dd $(TARGET)/share/etc + install -pm 644 -D $(SOURCE)/share/etc/aerospike.conf $(TARGET)/share/etc/aerospike.conf + install -pm 755 -D $(DEPTH)/as/etc/irqbalance-ban.sh $(TARGET)/share/etc/irqbalance-ban.sh + + @# lua files + install -pm 755 -Dd $(TARGET)/share/udf/lua + install -pm 644 -D $(DEPTH)/modules/lua-core/src/*.lua $(TARGET)/share/udf/lua/. + + install -pm 755 -Dd $(TARGET)/share/udf/lua/external + for FILE in `find $(DEPTH)/modules/lua-core/src/external -type f` ; do \ + install -pm 644 $$FILE $(TARGET)/share/udf/lua/external ; \ + done + +tar: + tar cvf $(ARCHIVE) -C $(BUILD_ROOT) aerospike-server + +gzip: tar + gzip -f $(ARCHIVE) + +package: gzip + +clean: + rm -rf $(SOURCE_ROOT)/* diff --git a/pkg/tar/README b/pkg/tar/README new file mode 100644 index 00000000..3110049a --- /dev/null +++ b/pkg/tar/README @@ -0,0 +1,101 @@ +AEROSPIKE README +================ + +SYNOPSIS + + ./bin/aerospike init + sudo ./bin/aerospike start + ./bin/aerospike status + ./bin/aerospike stop + +DESCRIPTION + + This package contains the Aerospike Server Daemon (asd), scripts, + configuration files, and other resources. + +QUICK START + + The `aerospike` script, located in the `bin` directory, provides the ability + to initialize a directory for running `asd` and managing an instance of `asd` + from that directory. + + The following is a quick walk-through to help you get started: + + 1. Initialize a directory for hosting your aerospike instance. + + ./bin/aerospike init + + 2. Start the aerospike server + + sudo ./bin/aerospike start + + Superuser privileges are required because it attempts to set upper + limits on system resources. + + 3. Check the status of the aerospike server + + ./bin/aerospike status + + 4.
Stop the aerospike server + + sudo ./bin/aerospike stop + + +AEROSPIKE SCRIPT + + The `aerospike` script, located in the `bin` directory, provides the ability + to manage an instance of `asd`. + + For help with the script, use the `--help` option: + + ./bin/aerospike --help + +AEROSPIKE INSTANCE DIRECTORY + + The directory created by the `aerospike init` command will contain: + + bin/aerospike - The management script to manage this instance. + bin/asd - The aerospike server daemon. + etc/aerospike.conf - The configuration file used by this instance. + share/ - Contains read-only files, used by this instance. + var/ - Contains runtime files generated by `asd`, including + logs and data files. + +NOTES + + 1. The `aerospike init` script can be used to initialize any directory to host + an aerospike instance by specifying the `--home <PATH>` option + + ./bin/aerospike init --home ~/myaerospike + + For running multiple instances, see below. + + 2. Running Multiple Instances + + NOTE: For production environments, it is not recommended to run multiple + instances on a single host. The reason is that you would ideally allocate + as many resources as possible to a production instance. + + If you want to run multiple instances of aerospike on a single machine, then + each instance should be initialized with a different instance id and port + number. The default instance id is "1" and the default port is "3000". + + To initialize two instances, you can use: + + ./bin/aerospike init --home ~/a --instance 1 -p 3000 + ./bin/aerospike init --home ~/b --instance 2 -p 3010 + + The aerospike.conf for each instance cannot share resources. If you + define storage engines other than in-memory, then each must have + resources (file, device, etc.) dedicated to that instance. There is + a limit of 15 possible instances which can be created. + + 3.
Running as non-root Users + + If you want to run instances of aerospike as non-root users, you can provide + the user and group ids during initialization time: + + ./bin/aerospike init --home ~/aerobob --user bob --group bobs + + This will set up the home directory to be owned by "bob". Also, + the server will run as the user "bob". diff --git a/pkg/tar/bin/aerospike b/pkg/tar/bin/aerospike new file mode 100644 index 00000000..78432537 --- /dev/null +++ b/pkg/tar/bin/aerospike @@ -0,0 +1,4 @@ +#!/bin/bash +SCRIPT_PATH=$0 +SCRIPT_HOME=$(cd $(dirname ${SCRIPT_PATH})/..; pwd) +AEROSPIKE_DAEMON=${SCRIPT_HOME}/bin/asd ${SCRIPT_HOME}/share/bin/aerospike $@ diff --git a/pkg/tar/share/bin/aerospike b/pkg/tar/share/bin/aerospike new file mode 100755 index 00000000..8cd08880 --- /dev/null +++ b/pkg/tar/share/bin/aerospike @@ -0,0 +1,224 @@ +#!/bin/bash +################################################################################ +# +# Run Script for Aerospike +# +################################################################################ + +SCRIPT_PATH=$0 +SCRIPT_NAME=$(basename $SCRIPT_PATH) +SCRIPT_HOME=$(cd $(dirname $SCRIPT_PATH)/..; pwd) +SCRIPT_BIN=${SCRIPT_HOME}/bin +SCRIPT_LIB=${SCRIPT_HOME}/lib +SCRIPT_LIBEXEC=${SCRIPT_HOME}/libexec +SCRIPT_MAN=${SCRIPT_HOME}/man + +if [ !
"${AEROSPIKE_DAEMON}" ]; then + AEROSPIKE_DAEMON=${SCRIPT_BIN}/asd +fi +AEROSPIKE_HOME=$(pwd) + +DEBUG=0 + +################################################################################ +# +# FUNCTIONS +# +################################################################################ + +print() { + printf "$1\n" +} + +debug() { + [ $DEBUG == 1 ] && print "$(tput setaf 0)$(tput bold)debug:$(tput sgr0) $1" +} + +info() { + print "$(tput setaf 4)$(tput bold)info:$(tput sgr0) $1" +} + +warning() { + print "$(tput setaf 3)$(tput bold)warning:$(tput sgr0) $1" >&2 +} + +error() { + print "$(tput setaf 1)$(tput bold)error:$(tput sgr0) $1" +} + +# try an operation and log the result +try() { + local cmd="$1" + local msg="$2" + + if [[ -z $msg ]]; then + msg="$cmd" + fi + + rc=0 + debug "$msg" + if [ $DEBUG ] && [ $DEBUG -gt 0 ]; then + eval "$cmd" + rc=$? + else + eval "$cmd" &>/dev/null + rc=$? + fi + + if [[ $rc -eq 0 ]]; then + debug "$msg" + else + error "$msg" + fi + + return $rc +} + +default_parseopt() { + case $1 in + "--debug" ) + DEBUG=1 + debug "DEBUG MODE" + return 1 + ;; + "--help" ) + help + exit 0 + ;; + "--home" ) + if [ -z $2 ]; then + error "--home requires a PATH." + exit 1 + fi + if [ ! -d $2 ]; then + error "--home requires a valid PATH: $2" + exit 1 + fi + AEROSPIKE_HOME=$(cd $2; pwd) + return 2 + ;; + * ) + error "Unknown option: $1" + exit 1 + ;; + esac +} + +parseopt() { + default_parseopt $* + return $? +} + +parseopts() { + while (( "$#" )); do + case $1 in + "--"* | "-"* ) + parseopt $* + shift $? + ;; + + * ) + COMMAND=$1 + debug "running ${COMMAND}" + if [ ! ${COMMAND} ]; then + error "Command is not specified." + usage >&2 + exit 1 + elif [ ! -f ${SCRIPT_LIBEXEC}/aerospike-${COMMAND} ]; then + error "'$COMMAND' is not a valid command."
+ usage >&2 + exit 1 + else + source ${SCRIPT_LIBEXEC}/aerospike-${COMMAND} + fi + shift + ;; + + esac + done +} + +process_running() { + debug "process running" + return 0 +} + +process_stopped() { + debug "process stopped" + return 0 +} + +process_died() { + debug "process died" + return 0 +} + +process_check() { + local pid_file=${AEROSPIKE_HOME}/var/run/aerospike.pid + if [ -f ${pid_file} ]; then + debug "${pid_file} exists." + pid=$(cat ${pid_file}) + debug "${pid_file} => ${pid}" + pline=$(ps -p $pid -o "command=") + if [ $? -eq 0 ]; then + debug "pid ${pid} found: ${pline}" + if [[ "${pline}" == *${AEROSPIKE_HOME}/etc/aerospike.conf* ]]; then + debug "aerospike is running as ${pid}" + process_running ${pid} ${pid_file} + return $? + else + debug "pid ${pid} does not match expected command." + process_died ${pid} ${pid_file} + return $? + fi + else + debug "pid ${pid} not found" + process_died ${pid} ${pid_file} + return $? + fi + else + process_stopped + return $? + fi +} + +usage() { + # |--------------------------------------------------------------------------------| + print "" + print "usage: aerospike COMMAND [OPTIONS]" + print + print "$(tput bold)COMMANDS$(tput sgr0)" + for script in ${SCRIPT_LIBEXEC}/aerospike-*; do + s=$(basename ${script}) + s=${s#aerospike-} + s=${s%.sh} + print " $s" + done + print + print "Use 'aerospike COMMAND --help' for command specific help." + print + # |--------------------------------------------------------------------------------| +} + +help() { + if [ -f ${SCRIPT_MAN}/aerospike-${COMMAND}.man ]; then + man ${SCRIPT_MAN}/aerospike-${COMMAND}.man + elif [ -z ${COMMAND} ]; then + usage + else + print "No help available for '${COMMAND}'" + fi +} + +################################################################################ +# +# MAIN +# +################################################################################ + +parseopts $* +if [ ! 
${COMMAND} ]; then + usage +else + main +fi diff --git a/pkg/tar/share/etc/aerospike.conf b/pkg/tar/share/etc/aerospike.conf new file mode 100644 index 00000000..e79555c0 --- /dev/null +++ b/pkg/tar/share/etc/aerospike.conf @@ -0,0 +1,54 @@ +# Aerospike database configuration file. + +# This stanza must come first. +service { + user ${user} + group ${group} + paxos-single-replica-limit 1 # Number of nodes where the replica count is automatically reduced to 1. + pidfile ${home}/var/run/aerospike.pid + proto-fd-max 15000 + work-directory ${home}/var +} + +logging { + # Log file must be an absolute path. + file ${home}/var/log/aerospike.log { + context any info + } +} + +mod-lua { + system-path ${home}/share/udf/lua + user-path ${home}/var/udf/lua +} + +network { + service { + address ${service_addr} + port ${service_port} + } + + heartbeat { + mode multicast + multicast-group ${multicast_addr} + port ${multicast_port} + + interval 150 + timeout 10 + } + + fabric { + port ${fabric_port} + } + + info { + port ${info_port} + } +} + +namespace test { + replication-factor 2 + memory-size 4G + default-ttl 30d # 30 days, use 0 to never expire/evict. + storage-engine memory +} diff --git a/pkg/tar/share/lib/aerospike-render.py b/pkg/tar/share/lib/aerospike-render.py new file mode 100644 index 00000000..2c4a2044 --- /dev/null +++ b/pkg/tar/share/lib/aerospike-render.py @@ -0,0 +1,38 @@ +#/usr/bin/python +''' +SYNOPSIS + + python aerospike-render.py